In [1]:
import pandas as pd
import nltk
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [2]:
# Load the labelled paragraphs.
# NOTE: on_bad_lines='skip' drops malformed CSV rows without raising or warning,
# so the number of rows loaded can be smaller than the number of lines in the file.
df = pd.read_csv("classifier1.csv", on_bad_lines='skip')

# Fail early, with a readable message, if the columns used by the later cells are missing.
assert {"paragraph", "classe"}.issubset(df.columns), (
    "classifier1.csv must contain the 'paragraph' and 'classe' columns"
)

# Only the last expression of a cell is rendered, so a single preview is enough.
df.head()

Unnamed: 0,paragraph,classe
0,if you live in the european region whatsapp ir...,False
1,whatsapp legal info,False
2,if you live outside the european region whatsa...,False
3,for example our privacy policy talks about wha...,False
4,we are one of the facebook companies you can l...,False


In [3]:
# Materialise the label column and the text column as plain Python lists.
classes, paragraphs = (df[column].tolist() for column in ('classe', 'paragraph'))

In [5]:
# Peek at one (label, text) pair to sanity-check the two lists line up.
(classes[1], paragraphs[1])

(False, 'whatsapp legal info')

In [6]:
# Ideally the dataset is split only once and the two subsets are stored in two
# separate files: if every model were trained and evaluated on a different
# sample, their scores could not be compared with each other.
# Here the fixed random_state keeps the split the same from run to run.
X1_train, X1_test, y_train, y_test = train_test_split(
    df["paragraph"],
    df["classe"],
    test_size=0.2,
    random_state=42,
)

# The vocabulary is fitted on the training paragraphs only; the test paragraphs
# are then encoded with that same vocabulary.
count_vectorizer = CountVectorizer()
X_train = count_vectorizer.fit_transform(X1_train)
X_test = count_vectorizer.transform(X1_test)

In [7]:
# Train the Naive Bayes classifier on the bag-of-words training matrix.
# fit() returns the estimator itself, so it can be bound in one step; the
# trailing expression keeps the estimator repr as the cell output.
naive_bayes = MultinomialNB().fit(X_train, y_train)
naive_bayes

MultinomialNB()

In [8]:
# Run the trained Naive Bayes model on the held-out test matrix.
Y_naive_bayes = naive_bayes.predict(X=X_test)

In [9]:
# Per-class precision / recall / F1 of the predictions against the true test labels.
print(classification_report(y_true=y_test, y_pred=Y_naive_bayes))

              precision    recall  f1-score   support

       False       0.70      0.64      0.67        33
        True       0.82      0.86      0.84        63

    accuracy                           0.78        96
   macro avg       0.76      0.75      0.75        96
weighted avg       0.78      0.78      0.78        96



In [10]:
# The prediction function
# The classifier was trained on CountVectorizer features, so raw text must go
# through the same vectorizer before it is handed to the model.

def predict(text_str, model=None, vectorizer=None):
    """Classify a single paragraph of raw text.

    Parameters
    ----------
    text_str : str
        The paragraph to classify.
    model : object, optional
        A fitted classifier exposing ``predict``. Defaults to the notebook's
        ``naive_bayes``; pass a reloaded model to check a persisted copy.
    vectorizer : object, optional
        A fitted vectorizer exposing ``transform``. Defaults to the notebook's
        ``count_vectorizer``.

    Returns
    -------
    The first (and only) element of ``model.predict`` for this paragraph.
    """
    # Resolve the defaults at call time so the function always uses the
    # currently fitted notebook objects when none are supplied.
    if vectorizer is None:
        vectorizer = count_vectorizer
    if model is None:
        model = naive_bayes

    # transform() takes a collection of documents, hence the one-element list.
    x_input = vectorizer.transform([text_str])
    return model.predict(x_input)[0]

In [13]:
#=====================================================================
# UNIT TEST
#=====================================================================
# EXPECTED RESULT : True
#---------------------------------------------------------------------
# The expectation is asserted, not just written in a comment, so a regression
# makes this cell fail on "Restart & Run All".
predicted_label = predict("we may share personal data with card networks and payment processors")
assert bool(predicted_label) is True, "expected this paragraph to be classified as True"
predicted_label

True

In [14]:
#=====================================================================
# UNIT TEST
#=====================================================================
# EXPECTED RESULT : False
#---------------------------------------------------------------------
# The expectation is asserted, not just written in a comment, so a regression
# makes this cell fail on "Restart & Run All".
predicted_label = predict("if the new version reduces your rights or increases your responsibilities well post it on the policy updates or privacy statement page of our website at least 21 days before it becomes effective")
assert bool(predicted_label) is False, "expected this paragraph to be classified as False"
predicted_label

False

In [16]:
# Persist the fitted model and the fitted vectorizer with joblib.
# joblib writes Python objects (for example a machine learning model) to a file
# so they can be loaded again later; it is intended to be more efficient than
# pickle for objects that carry numpy arrays.
# Both objects are saved: the model is only usable together with the
# vectorizer that produced its training features.

import joblib
joblib.dump(value=naive_bayes, filename="./naive_bayes.joblib", compress=True)
joblib.dump(value=count_vectorizer, filename="./count_vectorizer.joblib", compress=True)

['./count_vectorizer.joblib']

In [17]:
#=====================================================================
# Check that the persisted model can be loaded back from disk
#=====================================================================
# EXPECTED OUTPUT : MultinomialNB()
#---------------------------------------------------------------------
modelReload = joblib.load(filename="naive_bayes.joblib")
modelReload

MultinomialNB()

In [18]:
#=====================================================================
# Check that the persisted count vectorizer can be loaded back from disk
#=====================================================================
# EXPECTED OUTPUT : CountVectorizer()
#---------------------------------------------------------------------
vectorizer_to_train_data = joblib.load(filename="count_vectorizer.joblib")
vectorizer_to_train_data

CountVectorizer()

In [19]:
#===========================================================================================
# Testing the prediction pipeline using the reloaded objects (ML model + Count Vectorizer)
#===========================================================================================
# EXPECTED RESULT : True
#-------------------------------------------------------------------------------------------

# NOTE(review): csr_matrix is no longer used by this cell; the import is kept only
# in case a later cell relies on it. Consider moving it to the imports cell.
from scipy.sparse import csr_matrix

# Named `sample_texts` rather than `str`: binding `str` would shadow the builtin
# for every cell executed afterwards in the same kernel.
sample_texts = ['make it easy for you to find and connect with others for instance if you let us access your contacts we can suggest connections with people you may know']

# No manual lower-casing: the text is already lower-case, and the reloaded
# vectorizer prints as CountVectorizer(), i.e. default settings, which
# lower-case their input before tokenizing.
x_input = vectorizer_to_train_data.transform(sample_texts)

# The sparse matrix is passed straight to the model, exactly as the original
# model is used on X_test; converting it to a dense array first is unnecessary.
modelReload.predict(x_input)[0]

True