In [18]:
import pandas as pd
import nltk
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split

In [19]:
# Load the labelled information elements; rows pandas cannot parse are skipped.
df = pd.read_csv(
    "classifier2.csv",
    on_bad_lines="skip",
)
df

Unnamed: 0,information element,classe
0,the purpose of collecting data,policy
1,the purpose of collecting additional data,policy
2,why collecting the account information,policy
3,where delivered messages are stored,data
4,how to deal with undelivered messages,process
...,...,...
91,the purpose of sharing data with advertisement...,policy
92,the purpose of sharing information with softwa...,policy
93,the purpose of sharing information in connecti...,policy
94,the purpose of sharing nonpersonal dara for in...,policy


In [20]:
# Pull the text column and the label column out of the frame as plain Python lists.
IEs, classes = (
    df[column].tolist() for column in ("information element", "classe")
)

In [21]:
# Spot-check one (text, label) pair from the two lists.
(IEs[1], classes[1])

('the purpose of collecting additional data', 'policy')

In [50]:
# Hold out 20% of the rows for evaluation. The seed is fixed so that every
# model is trained and scored on the same partition; if each experiment drew
# its own samples, results from different models could not be compared.
# (Splitting once and storing the two partitions in separate files would give
# the same guarantee.)
X1_train, X1_test, y_train, y_test = train_test_split(
    df["information element"],
    df["classe"],
    test_size=0.2,
    random_state=42,
)

# Fit the unigram + bigram TF-IDF vocabulary on the training texts only, then
# project the held-out texts onto that same vocabulary.
Tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_train = Tfidf_vectorizer.fit_transform(X1_train)
X_test = Tfidf_vectorizer.transform(X1_test)

In [44]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF training matrix.
naive_bayes = MultinomialNB()
naive_bayes.fit(X=X_train, y=y_train)

MultinomialNB()

In [45]:
# Predict a label for every held-out sample.
Y_naive_bayes = naive_bayes.predict(X=X_test)

In [51]:
# Per-class precision / recall / F1 of the predictions against the true labels.
print(classification_report(y_true=y_test, y_pred=Y_naive_bayes))

              precision    recall  f1-score   support

        data       1.00      0.43      0.60         7
      policy       0.71      1.00      0.83        10
     process       1.00      1.00      1.00         3

    accuracy                           0.80        20
   macro avg       0.90      0.81      0.81        20
weighted avg       0.86      0.80      0.78        20



In [52]:
# The prediction function.
# New text must go through the same fitted vectorizer that produced the
# training features before it is handed to the classifier.

def predict(text_str, vectorizer=None, model=None):
    """Classify a single text and return its class probabilities.

    The predicted label array is printed as a side effect; the return value
    is the probability row for the text.

    Parameters
    ----------
    text_str : str
        The text to classify. It is wrapped in a one-element list before
        being vectorized.
    vectorizer : object, optional
        Fitted vectorizer exposing ``transform(list_of_texts)``. When omitted,
        the notebook-level ``Tfidf_vectorizer`` is used.
    model : object, optional
        Fitted classifier exposing ``predict`` and ``predict_proba``. When
        omitted, the notebook-level ``naive_bayes`` is used.

    Returns
    -------
    The first (and only) row of ``model.predict_proba(...)`` for the text.
    """
    # Fall back to the notebook globals at call time, so that re-binding them
    # (e.g. after reloading from disk) is picked up automatically.
    if vectorizer is None:
        vectorizer = Tfidf_vectorizer
    if model is None:
        model = naive_bayes

    x_input = vectorizer.transform([text_str])
    print(model.predict(x_input))
    return model.predict_proba(x_input)[0]

In [53]:
#=====================================================================
# UNIT TEST
#=====================================================================
# EXPECTED : process
#---------------------------------------------------------------------
class_probabilities = predict("how to deal with forwarded media")
# predict_proba columns follow naive_bayes.classes_, so the arg-max column
# names the predicted class. Fail loudly if the expectation is not met.
assert naive_bayes.classes_[class_probabilities.argmax()] == "process"
class_probabilities

['process']


array([0.19509166, 0.2850151 , 0.51989324])

In [54]:
#=====================================================================
# UNIT TEST
#=====================================================================
# EXPECTED : policy
#---------------------------------------------------------------------
class_probabilities = predict("the purpose of using end to end encryption")
# predict_proba columns follow naive_bayes.classes_, so the arg-max column
# names the predicted class. Fail loudly if the expectation is not met.
assert naive_bayes.classes_[class_probabilities.argmax()] == "policy"
class_probabilities

['policy']


array([0.0965845 , 0.80992805, 0.09348744])

In [55]:
#=====================================================================
# UNIT TEST
#=====================================================================
# EXPECTED : data
#---------------------------------------------------------------------
class_probabilities = predict("where delivered messages are stored")
# predict_proba columns follow naive_bayes.classes_, so the arg-max column
# names the predicted class. Fail loudly if the expectation is not met.
assert naive_bayes.classes_[class_probabilities.argmax()] == "data"
class_probabilities

['data']


array([0.60532183, 0.24422657, 0.1504516 ])

In [56]:
import pickle

# Persist the fitted classifier and the fitted vectorizer. The context
# managers guarantee each file is flushed and closed, even if pickling fails.
with open('./naive_bayes_2.pkl', 'wb') as model_file:
    pickle.dump(naive_bayes, model_file)

# NOTE: despite the "count_vectorizer" file name, the object written here is
# the fitted TF-IDF vectorizer.
with open('./count_vectorizer_2.pkl', 'wb') as vectorizer_file:
    pickle.dump(Tfidf_vectorizer, vectorizer_file)

In [57]:
# Reload the persisted classifier; the context manager closes the file handle.
# Only unpickle files you trust: pickle.load can execute arbitrary code.
with open('naive_bayes_2.pkl', 'rb') as model_file:
    naive_bayes = pickle.load(model_file)
naive_bayes

MultinomialNB()

In [58]:
# Reload the persisted TF-IDF vectorizer (stored under a "count_vectorizer"
# file name); the context manager closes the file handle.
# Only unpickle files you trust: pickle.load can execute arbitrary code.
with open('count_vectorizer_2.pkl', 'rb') as vectorizer_file:
    Tfidf_vectorizer = pickle.load(vectorizer_file)
Tfidf_vectorizer

TfidfVectorizer(ngram_range=(1, 2))

In [59]:
#===========================================================================================
# Testing prediction using the reloaded objects (ML model + TF-IDF vectorizer)
#===========================================================================================
# RESULTS : policy
#-------------------------------------------------------------------------------------------
from scipy.sparse import csr_matrix  # no longer needed by this cell; kept in case other cells rely on it

# Named `texts` rather than `str`, so the builtin `str` is not shadowed for
# the rest of the kernel session.
texts = ['the purpose of using end to end encryption']

# No manual lower-casing: the vectorizer was built with the default
# lowercase=True, so it already lower-cases its input.
x_input = Tfidf_vectorizer.transform(texts)

# MultinomialNB accepts the sparse matrix directly; no dense conversion is needed.
naive_bayes.predict(x_input)[0]

'policy'