In [1]:
import pandas as pd
import nltk
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [2]:
# Load the labelled examples; lines the CSV parser cannot handle are skipped
# instead of raising.
df = pd.read_csv(
    "classifier2.csv",
    on_bad_lines='skip',
)
# Preview the first rows.
df.head()

Unnamed: 0,information element,classe
0,the purpose of collecting data,policy
1,the purpose of collecting additional data,policy
2,why collecting the account information,policy
3,where delivered messages are stored,data
4,how to deal with undelivered messages,process


In [3]:
# Pull the information-element texts and their class labels out of the
# DataFrame as plain Python lists.
IEs, classes = (
    df[column].tolist() for column in ('information element', 'classe')
)

In [4]:
# Spot-check one text / label pair.
(IEs[1], classes[1])

('the purpose of collecting additional data', 'policy')

In [6]:
# Split the dataset once, with a fixed seed, so that every model is trained and
# evaluated on the same samples: if each model were given a different split,
# their scores could not be compared. (Better still: store the two subsets in
# two separate files and reuse them.)
TEST_FRACTION = 0.2
SPLIT_SEED = 42

X1_train, X1_test, y_train, y_test = train_test_split(
    df["information element"],
    df["classe"],
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

# Fit the bag-of-words vocabulary on the training texts only, then encode both
# subsets with that same fitted vectorizer.
count_vectorizer = CountVectorizer()
X_train = count_vectorizer.fit_transform(X1_train)
X_test = count_vectorizer.transform(X1_test)

In [7]:
# Train the Naive Bayes classifier on the vectorized training texts.
# `fit` returns the estimator itself, so construction and fitting can be chained.
naive_bayes = MultinomialNB().fit(X_train, y_train)
naive_bayes

MultinomialNB()

In [8]:
# Predict a class label for every vectorized test sample.
Y_naive_bayes = naive_bayes.predict(X=X_test)

In [9]:
# Per-class precision / recall / F1 of the predictions against the true test labels.
report_text = classification_report(y_test, Y_naive_bayes)
print(report_text)

              precision    recall  f1-score   support

        data       1.00      1.00      1.00         1
      policy       1.00      1.00      1.00         2
     process       1.00      1.00      1.00         1

    accuracy                           1.00         4
   macro avg       1.00      1.00      1.00         4
weighted avg       1.00      1.00      1.00         4



In [10]:
# The prediction function.
# New text has to be encoded by the same fitted CountVectorizer that produced
# the training matrix before it is handed to the Naive Bayes model.

def predict(text_str):
    """Classify a single text and return its class probabilities.

    The text is wrapped in a one-element list, encoded with the module-level
    ``count_vectorizer`` and scored by the module-level ``naive_bayes`` model.
    The predicted label array is printed as a side effect.

    Parameters
    ----------
    text_str
        The raw text to classify.

    Returns
    -------
    The first (and only) row of ``naive_bayes.predict_proba`` for the input.
    """
    encoded = count_vectorizer.transform([text_str])
    predicted_label = naive_bayes.predict(encoded)
    print (predicted_label)
    class_probabilities = naive_bayes.predict_proba(encoded)
    return class_probabilities[0]

In [11]:
#=====================================================================
# UNIT TEST
#=====================================================================
# EXPECTED : process
#---------------------------------------------------------------------
probabilities = predict("how to deal with forwarded media")
# Fail loudly if the most probable class is not the expected one, instead of
# relying on a reader comparing the printed label with the comment above.
assert naive_bayes.classes_[probabilities.argmax()] == "process"
probabilities

['process']


array([0.0070974 , 0.03116045, 0.96174215])

In [12]:
#=====================================================================
# UNIT TEST
#=====================================================================
# EXPECTED : policy
#---------------------------------------------------------------------
probabilities = predict("the purpose of using end to end encryption")
# Fail loudly if the most probable class is not the expected one, instead of
# relying on a reader comparing the printed label with the comment above.
assert naive_bayes.classes_[probabilities.argmax()] == "policy"
probabilities

['policy']


array([8.21144550e-04, 9.97168023e-01, 2.01083200e-03])

In [13]:
#=====================================================================
# UNIT TEST
#=====================================================================
# EXPECTED : data
#---------------------------------------------------------------------
probabilities = predict("where delivered messages are stored")
# Fail loudly if the most probable class is not the expected one, instead of
# relying on a reader comparing the printed label with the comment above.
assert naive_bayes.classes_[probabilities.argmax()] == "data"
probabilities

['data']


array([0.79448022, 0.07630188, 0.1292179 ])

In [14]:
import pickle

# Persist the fitted classifier and the fitted vectorizer. The context
# managers close each file as soon as it is written, so the data is flushed to
# disk before the following cells read it back.
with open('./naive_bayes_2.pkl', 'wb') as model_file:
    pickle.dump(naive_bayes, model_file)
with open('./count_vectorizer_2.pkl', 'wb') as vectorizer_file:
    pickle.dump(count_vectorizer, vectorizer_file)

In [15]:
# Reload the classifier from disk, closing the file handle once it is read.
# NOTE: unpickling can execute arbitrary code - only load pickle files that
# were produced by this notebook or another trusted source.
with open('naive_bayes_2.pkl', 'rb') as model_file:
    naive_bayes = pickle.load(model_file)
naive_bayes

MultinomialNB()

In [16]:
# Reload the fitted vectorizer from disk, closing the file handle once it is read.
# NOTE: unpickling can execute arbitrary code - only load pickle files that
# were produced by this notebook or another trusted source.
with open('count_vectorizer_2.pkl', 'rb') as vectorizer_file:
    count_vectorizer = pickle.load(vectorizer_file)
count_vectorizer

CountVectorizer()

In [17]:
#===========================================================================================
# Checking prediction with the reloaded objects (ML model + Count Vectorizer)
#===========================================================================================
# EXPECTED : policy
#-------------------------------------------------------------------------------------------
# Import is no longer used in this cell; kept only in case other cells rely on the name.
from scipy.sparse import csr_matrix

# Do not call this variable `str`: that would shadow the builtin for the rest
# of the kernel session and break any later `str(...)` call.
texts = ['the purpose of using end to end encryption']
# The vectorizer was created with default settings, which already lower-case
# the input, so no manual `.lower()` is needed.
x_input = count_vectorizer.transform(texts)
# The model accepts the sparse matrix directly (as `predict` above does), so no
# dense conversion is required.
naive_bayes.predict(x_input)[0]

'policy'