In [20]:
import pandas as pd
import nltk
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split

In [22]:
# Load the labelled information elements; lines that cannot be parsed are skipped.
df = pd.read_csv(
    filepath_or_buffer="classifier2.csv",
    on_bad_lines="skip",
)
df

Unnamed: 0,information element,classe
0,personal data,data
1,additional data,data
2,why collecting account information,policy
3,where delivered messages are stored,data
4,how to deal with undelivered messages,process
...,...,...
144,other information users provide,data
145,financial information,data
146,internet activity information,data
147,professional and employment related information,data


In [23]:
# Pull the text column and its label column out of the frame as plain Python lists.
IEs, classes = (
    df[column].tolist() for column in ('information element', 'classe')
)

In [24]:
# Spot-check one entry: the text at index 1 shown next to the label at index 1.
IEs [1], classes [1]

('additional data', 'data')

In [25]:
# Split the dataset once, with a fixed seed, so that every model is trained and
# evaluated on exactly the same rows. If each model got its own random split,
# their scores could not be compared with each other.
# (Saving the two subsets to separate files would make this even more robust.)
X1_train, X1_test, y_train, y_test = train_test_split(
    df["information element"],
    df["classe"],
    test_size=0.2,
    random_state=42,
)

# Learn the TF-IDF vocabulary (unigrams and bigrams) from the training texts,
# then encode the held-out texts with that same fitted vectorizer.
Tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_train = Tfidf_vectorizer.fit_transform(X1_train)
X_test = Tfidf_vectorizer.transform(X1_test)

In [26]:
# Train a multinomial Naive Bayes classifier on the TF-IDF training matrix.
naive_bayes = MultinomialNB().fit(X_train, y_train)
naive_bayes

MultinomialNB()

In [27]:
# Evaluate Naive Bayes: predict a class label for every held-out test sample.

Y_naive_bayes = naive_bayes.predict(X_test)

In [28]:
# Per-class precision / recall / F1 on the held-out split.
# zero_division=0 reports 0.0 for a class that is never predicted (the case for
# "process" in the recorded run) instead of raising an undefined-metric warning
# for each affected average.
print(classification_report(y_test, Y_naive_bayes, zero_division=0))

              precision    recall  f1-score   support

        data       0.62      0.89      0.73        18
      policy       0.50      0.50      0.50         4
     process       0.00      0.00      0.00         8

    accuracy                           0.60        30
   macro avg       0.37      0.46      0.41        30
weighted avg       0.44      0.60      0.50        30



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [29]:
# Prediction helper.
# New text must be encoded by the same fitted vectorizer that produced the
# training matrix before the Naive Bayes model can classify it.

def predict(text_str):
    """Classify a single piece of text with the fitted Naive Bayes model.

    The predicted label array is printed as a side effect.

    Args:
        text_str: The text to classify.

    Returns:
        The first row of ``naive_bayes.predict_proba`` for the encoded text,
        i.e. the probabilities computed for this one input.
    """
    encoded = Tfidf_vectorizer.transform([text_str])
    label = naive_bayes.predict(encoded)
    print(label)
    class_probabilities = naive_bayes.predict_proba(encoded)
    return class_probabilities[0]

In [30]:
#=====================================================================
# MANUAL SPOT CHECK (nothing is asserted; read the printed label)
#=====================================================================
# Label printed in the recorded run: process
# (narrow margin in the recorded probabilities: 0.389 vs 0.368)
#---------------------------------------------------------------------
predict("deal with forwarded media")

['process']


array([0.36806448, 0.24245664, 0.38947888])

In [33]:
#=====================================================================
# MANUAL SPOT CHECK (nothing is asserted; read the printed label)
#=====================================================================
# Label printed in the recorded run: policy
# (narrow margin in the recorded probabilities: 0.454 vs 0.441)
#---------------------------------------------------------------------
predict("cookies information")

['policy']


array([0.44094079, 0.45426022, 0.10479899])

In [35]:
#=====================================================================
# MANUAL SPOT CHECK (nothing is asserted; read the printed label)
#=====================================================================
# Label printed in the recorded run: data
#---------------------------------------------------------------------
predict("payment information")

['data']


array([0.83972969, 0.09814457, 0.06212573])

In [36]:
import pickle

# Persist the trained classifier and the fitted vectorizer: both are needed to
# classify new text later. The context managers make sure each file is flushed
# and closed as soon as its dump finishes.
with open('./naive_bayes_2.pkl', 'wb') as model_file:
    pickle.dump(naive_bayes, model_file)

# NOTE: despite the "count_vectorizer" file name, the object written here is the
# TF-IDF vectorizer. The file name is kept unchanged so existing loaders still
# find it.
with open('./count_vectorizer_2.pkl', 'wb') as vectorizer_file:
    pickle.dump(Tfidf_vectorizer, vectorizer_file)

In [37]:
# Reload the classifier from disk; the file is closed once loading completes.
# Only unpickle files you produced yourself: pickle.load can execute arbitrary
# code embedded in an untrusted file.
with open('naive_bayes_2.pkl', 'rb') as model_file:
    naive_bayes = pickle.load(model_file)
naive_bayes

MultinomialNB()

In [38]:
# Reload the fitted TF-IDF vectorizer (stored under a "count_vectorizer" file
# name); the file is closed once loading completes.
# Only unpickle files you produced yourself: pickle.load can execute arbitrary
# code embedded in an untrusted file.
with open('count_vectorizer_2.pkl', 'rb') as vectorizer_file:
    Tfidf_vectorizer = pickle.load(vectorizer_file)
Tfidf_vectorizer

TfidfVectorizer(ngram_range=(1, 2))

In [39]:
#===========================================================================================
# Checking the prediction pipeline with the reloaded objects (ML model + TF-IDF vectorizer)
#===========================================================================================
# Label displayed in the recorded run: policy
#-------------------------------------------------------------------------------------------
from scipy.sparse import csr_matrix  # not needed by this cell any more; kept in case other cells use it

# Do not call this list `str`: that would shadow the built-in `str` for the
# rest of the kernel session and break any later `str(...)` call.
sample_texts = ['using end to end encryption']
x_input = Tfidf_vectorizer.transform([text.lower() for text in sample_texts])

# The classifier is given the sparse matrix directly, exactly as it is when the
# test split is scored, so no dense conversion is performed here.
naive_bayes.predict(x_input)[0]

'policy'