In [1]:
import pathlib
import pandas as pd
import numpy as np
import random

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
#to see where fils are saved locally
BASE_DIR = pathlib.Path().resolve().parent
DATASET_DIR = BASE_DIR / "datasets"
EXPORT_DIR = DATASET_DIR / "exports"
DATASET_CLASSIFIER2_PATH = EXPORT_DIR / "datasetClassifier2.csv"
METADATA_EXPORT_PATH = EXPORT_DIR / "metadata.pkl"
TOKENIZER_EXPORT_PATH = EXPORT_DIR / "tokenizer.pkl"

DATASET_DIR.mkdir(exist_ok=True)
EXPORT_DIR.mkdir(exist_ok=True)

In [3]:
df = pd.read_csv ("D:\ESI\PFE\pfeCode\datasets\dataset_classifier1.csv")
df
df.head()

Unnamed: 0,classe,paragraph
0,False,if you live in the european region whatsapp ir...
1,False,whatsapp legal info
2,False,if you live outside the european region whatsa...
3,False,our privacy policy privacy policy helps explai...
4,False,for example our privacy policy talks about wha...


In [4]:
df.to_csv(DATASET_CLASSIFIER2_PATH, index = False)

In [5]:
# turn the classes and paragraphs into lists 
classes = df['classe'].tolist()
paragraphs = df['paragraph'].tolist()

In [6]:
classes [20], paragraphs [20]

(False,
 'we use information we have subject to choices you make and applicable law to operate provide improve understand customize support and market our services heres how')

In [7]:
classe_legend = {"True" : 1, "False" : 0}
classe_legend_reverted = {f"{v}": k for k,v in classe_legend.items()}

In [8]:
MAX_NUM_WORDS = 370

In [9]:
tokenizer = Tokenizer(num_words = MAX_NUM_WORDS) 
tokenizer.fit_on_texts(paragraphs)
sequences = tokenizer.texts_to_sequences(paragraphs)
sequences

[[16,
  2,
  127,
  19,
  10,
  128,
  129,
  24,
  188,
  189,
  130,
  10,
  4,
  5,
  2,
  93,
  25,
  31,
  12,
  32,
  1,
  21,
  33],
 [24, 190, 191],
 [16,
  2,
  127,
  192,
  10,
  128,
  129,
  24,
  193,
  24,
  3,
  6,
  15,
  20,
  130,
  3,
  4,
  5,
  2,
  93,
  25,
  31,
  12,
  32,
  1,
  21,
  33],
 [3, 21, 33, 21, 33, 194, 195, 3, 94, 131, 27, 10, 8, 6, 132, 5, 17, 3, 4],
 [18,
  45,
  3,
  21,
  33,
  196,
  22,
  197,
  8,
  6,
  55,
  1,
  48,
  25,
  198,
  2,
  72,
  73,
  199,
  10,
  200,
  6,
  201,
  5,
  74,
  7,
  21,
  133,
  202,
  3,
  4,
  203,
  95,
  23,
  134,
  96,
  56,
  20,
  1,
  204,
  2,
  205,
  206,
  97,
  2,
  28,
  11,
  13,
  3,
  4],
 [6,
  34,
  135,
  12,
  10,
  35,
  46,
  2,
  26,
  75,
  57,
  207,
  136,
  19,
  25,
  21,
  33,
  22,
  10,
  208,
  19,
  58,
  6,
  36,
  8,
  209,
  25,
  210,
  12,
  46],
 [25, 21, 33, 211, 5, 137, 12, 3, 4, 212, 213, 214],
 [98,
  73,
  215,
  216,
  31,
  12,
  32,
  31,
  58,
  138,
  10,
  

In [10]:
word_index = tokenizer.word_index
word_index

{'and': 1,
 'you': 2,
 'our': 3,
 'services': 4,
 'to': 5,
 'we': 6,
 'your': 7,
 'information': 8,
 'use': 9,
 'the': 10,
 'with': 11,
 'of': 12,
 'on': 13,
 'a': 14,
 'or': 15,
 'if': 16,
 'provide': 17,
 'for': 18,
 'in': 19,
 'us': 20,
 'privacy': 21,
 'about': 22,
 'messages': 23,
 'whatsapp': 24,
 'this': 25,
 'can': 26,
 'including': 27,
 'communicate': 28,
 'account': 29,
 'as': 30,
 'terms': 31,
 'service': 32,
 'policy': 33,
 'are': 34,
 'facebook': 35,
 'share': 36,
 'such': 37,
 'other': 38,
 'thirdparty': 39,
 'will': 40,
 'not': 41,
 'be': 42,
 'have': 43,
 'businesses': 44,
 'example': 45,
 'companies': 46,
 'may': 47,
 'how': 48,
 'support': 49,
 'when': 50,
 'contacts': 51,
 'them': 52,
 'that': 53,
 'products': 54,
 'collect': 55,
 'by': 56,
 'more': 57,
 'which': 58,
 'receive': 59,
 'operate': 60,
 'improve': 61,
 'understand': 62,
 'market': 63,
 'phone': 64,
 'feature': 65,
 'do': 66,
 'from': 67,
 'through': 68,
 'those': 69,
 'users': 70,
 'transaction': 71,
 'i

In [9]:
MAX_SEQ_LENGTH = 370

In [12]:
X = pad_sequences (sequences, maxlen = MAX_SEQ_LENGTH)
X

array([[  0,   0,   0, ...,   1,  21,  33],
       [  0,   0,   0, ...,  24, 190, 191],
       [  0,   0,   0, ...,   1,  21,  33],
       ...,
       [  0,   0,   0, ...,   8,   2,  36],
       [  0,   0,   0, ...,  12,   3,   4],
       [  0,   0,   0, ...,   3,   1,  31]])

In [13]:
classes_as_int_array = np.asarray(classes)
classes_as_int_array   

array([False, False, False, False, False, False, False, False, False,
        True,  True,  True,  True,  True,  True,  True,  True, False,
        True, False, False,  True,  True,  True, False,  True,  True,
        True,  True,  True, False, False])

In [10]:
y = to_categorical(classes_as_int_array)
y

NameError: name 'to_categorical' is not defined

In [15]:
X1_train, X1_test, y_train, y_test = train_test_split(df["paragraph"],df["classe"],test_size=0.2, random_state=42)

count_vectorizer = CountVectorizer()
X_train = count_vectorizer.fit_transform(X1_train)
X_test = count_vectorizer.transform(X1_test)

In [16]:
training_data = {
    "X_train" : X_train,
    "X_test" : X_test,
    "y_train" : y_train,
    "y_test" : y_test, 
    "max_words" : MAX_NUM_WORDS,
    "max_seq_length" : MAX_SEQ_LENGTH,
    "classe_legend" : classe_legend, 
    "classe_legend_reverted" : classe_legend_reverted,
}

tokenizer_json = tokenizer.to_json()
TOKENIZER_EXPORT_PATH.write_text(tokenizer_json)

32304

In [17]:
with open(METADATA_EXPORT_PATH, 'wb') as f: 
    pickle.dump(training_data, f)

In [18]:
data = {}

with open(METADATA_EXPORT_PATH, 'rb') as f:
    data = pickle.load(f)

In [19]:
# X_test = data['X_test']
# X_train = data['X_train']
# y_test = data['y_test']
# y_train = data['y_train']
# labels_legend_inverted = data['classe_legend']
# # legend = data['legend']
# max_sequence = data['MAX_SEQ_LENGTH']
# max_words = data['MAX_NUM_WORDS']
# # tokenizer = data['tokenizer']

In [20]:
temps_train = {}
temps_test = {}

In [21]:
#entrainement Naive Bayes
# model = Sequential()
naive_bayes = MultinomialNB()
temps_debut = timeit.default_timer()
naive_bayes.fit(X_train, y_train)
temps_train["naive_bayes"] = timeit.default_timer() - temps_debut

In [22]:
#test Naive Bayes
temps_debut = timeit.default_timer()
Y_naive_bayes = naive_bayes.predict(X_test)
temps_test["naive_bayes"] = timeit.default_timer() - temps_debut
Y_naive_bayes

array([ True,  True, False,  True,  True,  True,  True])

In [23]:
rappel = {}
precision = {} 

In [24]:
#Naive Bayes
precision["naive_bayes"] = precision_score(y_test, Y_naive_bayes, pos_label=1)
rappel["naive_bayes"] = recall_score(y_test, Y_naive_bayes, pos_label=1)

In [25]:
pd.DataFrame({
    "actual class" : [y_test], 
    "predicted classe": [Y_naive_bayes]
})

y_test

29     True
15     True
24    False
17    False
8     False
9      True
30    False
Name: classe, dtype: bool

In [26]:
Y_naive_bayes

array([ True,  True, False,  True,  True,  True,  True])

In [27]:
pd.DataFrame({
    "Alogorithme" : ["Naive Bayes"],
    "Temps d'entrainement" : [temps_train["naive_bayes"]],
    "Temps de test" : [temps_test["naive_bayes"]]
})

Unnamed: 0,Alogorithme,Temps d'entrainement,Temps de test
0,Naive Bayes,0.011276,0.003483


In [28]:
pd.DataFrame({
    "Alogorithme" : ["Naive Bayes"],
    "Rappel" : [rappel["naive_bayes"]],
    "Precision" : [precision["naive_bayes"]]
})

Unnamed: 0,Alogorithme,Rappel,Precision
0,Naive Bayes,1.0,0.5


In [29]:
print(classification_report(y_test, Y_naive_bayes))

              precision    recall  f1-score   support

       False       1.00      0.25      0.40         4
        True       0.50      1.00      0.67         3

    accuracy                           0.57         7
   macro avg       0.75      0.62      0.53         7
weighted avg       0.79      0.57      0.51         7



In [70]:
#predict new data 
import numpy as np

def predict(text_str, max_words=370, max_sequence = 370, tokenizer=None):
    if not tokenizer:
        return None
#     tokenizer.fit_on_texts(text_str)
    sequences = tokenizer.texts_to_sequences([text_str])
#     print(sequences)
    x_input = pad_sequences(sequences, maxlen=max_sequence)
    y_output = naive_bayes.predict(x_input)
    top_y_index = np.argmax(y_output)
#     print (top_y_index)
    preds = y_output[top_y_index]
#     print (preds)    
    class_preds = [{f"{preds}"}]
#     print (class_preds)
    return class_preds

In [71]:
predict("we may share personal data with card networks and payment processors", max_words=MAX_NUM_WORDS, max_sequence=MAX_SEQ_LENGTH, tokenizer=tokenizer)

[{'True'}]

In [72]:
predict("if the new version reduces your rights or increases your responsibilities well post it on the policy updates or privacy statement page of our website at least 21 days before it becomes effective", max_words=MAX_NUM_WORDS, max_sequence=MAX_SEQ_LENGTH, tokenizer=tokenizer)

[{'False'}]

In [73]:
predict("prezi inc will be the controller of your personal information provided to or collected by or for or processed in connection with our services the terms of use together with the other policies and agreements listed therein are between you and prezi inc in the case of content information you provide see content information section below you control such information you should be solely liable for it and it is outside our control", max_words=MAX_NUM_WORDS, max_sequence=MAX_SEQ_LENGTH, tokenizer=tokenizer)

[{'False'}]

In [None]:
ACCESS_KEY = "<your_do_spaces_access_key>"
SECRET_KEY = "<your_do_spaces_secret_key>"

# Space Endpoint URL
ENDPOINT = "https://ai-cfe-1.nyc3.digitaloceanspaces.com"

# Space Region (also in your endpoint url)
REGION = 'nyc3'

# Set this to a valid slug (without a "/" )
BUCKET_NAME = 'datasets'

In [None]:
os.environ["AWS_ACCESS_KEY_ID"] = ACCESS_KEY
os.environ["AWS_SECRET_ACCESS_KEY"] = SECRET_KEY

In [None]:
# Upload paths 
MODEL_KEY_NAME = f"exports/spam-sms/{MODEL_EXPORT_PATH.name}"
TOKENIZER_KEY_NAME = f"exports/spam-sms/{TOKENIZER_EXPORT_PATH.name}"
METADATA_KEY_NAME = f"exports/spam-sms/{METADATA_EXPORT_PATH.name}"

In [None]:
session = boto3.session.Session()
client = session.client('s3', region_name=REGION, endpoint_url=ENDPOINT)
client.upload_file(str(MODEL_EXPORT_PATH), BUCKET_NAME,  MODEL_KEY_NAME) 
client.upload_file(str(TOKENIZER_EXPORT_PATH), BUCKET_NAME,  TOKENIZER_KEY_NAME) 
client.upload_file(str(METADATA_EXPORT_PATH), BUCKET_NAME,  METADATA_KEY_NAME)  

In [None]:
client.download_file(BUCKET_NAME, MODEL_KEY_NAME, pathlib.Path(MODEL_KEY_NAME).name)
client.download_file(BUCKET_NAME, TOKENIZER_KEY_NAME, pathlib.Path(TOKENIZER_KEY_NAME).name)
client.download_file(BUCKET_NAME, METADATA_KEY_NAME, pathlib.Path(METADATA_KEY_NAME).name)