In [58]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings('ignore')
data = pd.read_csv("C:/Users/Administrateur/OneDrive/Bureau/ESCPTerm2/NLP/Cours1/assignmentspamdetector/spam.csv",
                   encoding='latin-1', usecols=["v1", "v2"])

data = data.rename(columns={"v1": "label", "v2": "text"})
data.label.value_counts()
data['label_num'] = data.label.map({'ham': 0, 'spam': 1})
print(data)
# so now, after converting label into numerical values, ie label_num column, we are
# working with this newly created column (it becomes our y vector, text => X vector)

     label                                               text  label_num
0      ham  Go until jurong point, crazy.. Available only ...          0
1      ham                      Ok lar... Joking wif u oni...          0
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...          1
3      ham  U dun say so early hor... U c already then say...          0
4      ham  Nah I don't think he goes to usf, he lives aro...          0
...    ...                                                ...        ...
5567  spam  This is the 2nd time we have tried 2 contact u...          1
5568   ham              Will Ì_ b going to esplanade fr home?          0
5569   ham  Pity, * was in mood for that. So...any other s...          0
5570   ham  The guy did some bitching but I acted like i'd...          0
5571   ham                         Rofl. Its true to its name          0

[5572 rows x 3 columns]


In [59]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data["text"], data["label_num"], test_size=0.2, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

from sklearn.feature_extraction.text import CountVectorizer

# Scikit-learn’s CountVectorizer is used to
# convert a collection of text documents to a vector of term/token counts.
# list of text documents

(4457,)
(1115,)
(4457,)
(1115,)


In [60]:
vect = CountVectorizer()
# we are applying CountVectorizer on our spam dataset

vect.fit(X_train)
# tokenize, extract tokens and build vocab, dictionary of all the tokens

print("Vocabulary size: {}".format(len(vect.vocabulary_)))
# print("Vocabulary content:\n {}".format(vect.vocabulary_))
X_train_df = vect.transform(X_train)
# encode your data, represent your text as count vector(s)

X_train_df[:3].nonzero()
prediction = dict()

Vocabulary size: 7735


In [61]:
# choosing the best features/model, here I tested all the model in order to fin the best one (CountVectorizer)
#"raw-count features"
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

# creating a list of models
models = [LogisticRegression(random_state=0), SVC(kernel='rbf'), SVC(kernel='linear'), MultinomialNB(), RandomForestClassifier(n_estimators=100)]

# testing all the models, to find the one with the highest accuracy
for i in models:
    i.fit(X_train_df, y_train)  # X_train_df: count vectors
    X_test_df = vect.transform(X_test)
    prediction[i] = i.predict(X_test_df)
    print(str(i))
    # assessing the model
    from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

    print("accuracy:", accuracy_score(y_test, prediction[i]))
    conf_mat = confusion_matrix(y_test, prediction[i])
    print("confusion matrix", conf_mat, sep='\n')
    tn, fp, fn, tp = conf_mat.ravel()
    fpr = (fp)/(fp+tn)
    print('False Positive Rate : {}'.format(fpr))
    conf_mat_normalized = conf_mat.astype('float') / conf_mat.sum(axis=1)[:, np.newaxis]

    print("normalized confusion matrix",conf_mat_normalized, " ", sep='\n')
    print()
# Here , we have to look which model gives us the lowest FPR because classifying legitimate mail as spam will be a serious error .
#The model with the lowest false positive rate and the highest accuracy rate is SVC()

LogisticRegression(random_state=0)
accuracy: 0.97847533632287
confusion matrix
[[965   0]
 [ 24 126]]
False Positive Rate : 0.0
normalized confusion matrix
[[1.   0.  ]
 [0.16 0.84]]
 

SVC()
accuracy: 0.979372197309417
confusion matrix
[[965   0]
 [ 23 127]]
False Positive Rate : 0.0
normalized confusion matrix
[[1.         0.        ]
 [0.15333333 0.84666667]]
 

SVC(kernel='linear')
accuracy: 0.979372197309417
confusion matrix
[[961   4]
 [ 19 131]]
False Positive Rate : 0.004145077720207254
normalized confusion matrix
[[0.99585492 0.00414508]
 [0.12666667 0.87333333]]
 

MultinomialNB()
accuracy: 0.9838565022421525
confusion matrix
[[963   2]
 [ 16 134]]
False Positive Rate : 0.002072538860103627
normalized confusion matrix
[[0.99792746 0.00207254]
 [0.10666667 0.89333333]]
 

RandomForestClassifier()
accuracy: 0.9757847533632287
confusion matrix
[[965   0]
 [ 27 123]]
False Positive Rate : 0.0
normalized confusion matrix
[[1.   0.  ]
 [0.18 0.82]]
 



In [40]:
# classification model MultinomialNB()
print("MULTINOMIAL REGRESSION")
model = MultinomialNB()
model.fit(X_train_df, y_train)  # X_train_df: count vectors
X_test_df = vect.transform(X_test)
prediction["Multinomial"] = model.predict(X_test_df)

# assessing the model
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

print("accuracy:", accuracy_score(y_test, prediction["Multinomial"]))
print(classification_report(y_test, prediction["Multinomial"]))
conf_mat = confusion_matrix(y_test, prediction['Multinomial'])
print("confusion matrix", conf_mat, sep='\n')
conf_mat_normalized = conf_mat.astype('float') / conf_mat.sum(axis=1)[:, np.newaxis]

print("normalized confusion matrix",conf_mat_normalized, sep='\n')

print("train score:", model.score(X_train_df, y_train), sep='\n')
print("test score:", model.score(X_test_df, y_test), " ", sep='\n')

MULTINOMIAL REGRESSION
accuracy: 0.9838565022421525
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.89      0.94       150

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115

confusion matrix
[[963   2]
 [ 16 134]]
normalized confusion matrix
[[0.99792746 0.00207254]
 [0.10666667 0.89333333]]
train score:
0.9943908458604442
test score:
0.9838565022421525
 


In [41]:
# classification model LogisticRegression()
print("LOGISTIC REGRESSION")
model = LogisticRegression()
model.fit(X_train_df, y_train)  # X_train_df: count vectors
X_test_df = vect.transform(X_test)
prediction["Logistic"] = model.predict(X_test_df)

# assessing the model
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

print("accuracy:", accuracy_score(y_test, prediction["Logistic"]))
print(classification_report(y_test, prediction["Logistic"]))
conf_mat = confusion_matrix(y_test, prediction['Logistic'])
print("confusion matrix", conf_mat, sep='\n')
conf_mat_normalized = conf_mat.astype('float') / conf_mat.sum(axis=1)[:, np.newaxis]

print("normalized confusion matrix",conf_mat_normalized, sep='\n')

print("train score:", model.score(X_train_df, y_train), sep='\n')
print("test score:", model.score(X_test_df, y_test), " ", sep='\n')


LOGISTIC REGRESSION
accuracy: 0.97847533632287
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       1.00      0.84      0.91       150

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115

confusion matrix
[[965   0]
 [ 24 126]]
normalized confusion matrix
[[1.   0.  ]
 [0.16 0.84]]
train score:
0.9984294368409243
test score:
0.97847533632287
 


In [62]:
#TF-IDF
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings('ignore')
data = pd.read_csv("C:/Users/Administrateur/OneDrive/Bureau/ESCPTerm2/NLP/Cours1/assignmentspamdetector/spam.csv",
                   encoding='latin-1', usecols=["v1", "v2"])

data = data.rename(columns={"v1": "label", "v2": "text"})
data.label.value_counts()
data['label_num'] = data.label.map({'ham': 0, 'spam': 1})
print(data)
# so now, after converting label into numerical values, ie label_num column, we are
# working with this newly created column (it becomes our y vector, text => X vector)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data["text"], data["label_num"], test_size=0.2, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

# second method of vectorization, TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
# create a vector transform
vc_tf_idf = TfidfVectorizer()
# tokenize and build vocab
vc_tf_idf.fit(X_train)
print(vc_tf_idf.vocabulary_)
print(vc_tf_idf.idf_)
# encode document
X_train_df = vc_tf_idf.transform(X_train)
print("the dimension of your vector")
print(vector.shape)
print("the encoded vector")
print(vector.toarray())
# encode data

     label                                               text  label_num
0      ham  Go until jurong point, crazy.. Available only ...          0
1      ham                      Ok lar... Joking wif u oni...          0
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...          1
3      ham  U dun say so early hor... U c already then say...          0
4      ham  Nah I don't think he goes to usf, he lives aro...          0
...    ...                                                ...        ...
5567  spam  This is the 2nd time we have tried 2 contact u...          1
5568   ham              Will Ì_ b going to esplanade fr home?          0
5569   ham  Pity, * was in mood for that. So...any other s...          0
5570   ham  The guy did some bitching but I acted like i'd...          0
5571   ham                         Rofl. Its true to its name          0

[5572 rows x 3 columns]
(4457,)
(1115,)
(4457,)
(1115,)
[7.20523094 6.31141306 8.70930833 ... 8.70930833 7.45654536 8.70930

In [63]:
# choosing the best features/model, here I tested all the model in order to fin the best one (TF-IDF)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

# creating a list of models
models = [LogisticRegression(random_state=0), SVC(kernel='rbf'), SVC(kernel='linear'), MultinomialNB(), RandomForestClassifier(n_estimators=100)]

# testing all the models, to find the one with the highest accuracy
for i in models:
    i.fit(X_train_df, y_train)  # X_train_df: count vectors
    X_test_df = vect.transform(X_test)
    prediction[i] = i.predict(X_test_df)
    print(str(i))
    # assessing the model
    from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

    print("accuracy:", accuracy_score(y_test, prediction[i]))
    conf_mat = confusion_matrix(y_test, prediction[i])
    print("confusion matrix", conf_mat, sep='\n')
    tn, fp, fn, tp = conf_mat.ravel()
    fpr = (fp)/(fp+tn)
    print('False Positive Rate : {}'.format(fpr))
    conf_mat_normalized = conf_mat.astype('float') / conf_mat.sum(axis=1)[:, np.newaxis]

    print("normalized confusion matrix",conf_mat_normalized, " ", sep='\n')
    print()
# we found out that the model with the highest accuracy and lowest False Positive Rate is RandomForestClassifier() 

LogisticRegression(random_state=0)
accuracy: 0.9443946188340807
confusion matrix
[[910  55]
 [  7 143]]
False Positive Rate : 0.05699481865284974
normalized confusion matrix
[[0.94300518 0.05699482]
 [0.04666667 0.95333333]]
 

SVC()
accuracy: 0.8654708520179372
confusion matrix
[[965   0]
 [150   0]]
False Positive Rate : 0.0
normalized confusion matrix
[[1. 0.]
 [1. 0.]]
 

SVC(kernel='linear')
accuracy: 0.9560538116591928
confusion matrix
[[921  44]
 [  5 145]]
False Positive Rate : 0.04559585492227979
normalized confusion matrix
[[0.95440415 0.04559585]
 [0.03333333 0.96666667]]
 

MultinomialNB()
accuracy: 0.9668161434977578
confusion matrix
[[964   1]
 [ 36 114]]
False Positive Rate : 0.0010362694300518134
normalized confusion matrix
[[0.99896373 0.00103627]
 [0.24       0.76      ]]
 

RandomForestClassifier()
accuracy: 0.9739910313901345
confusion matrix
[[965   0]
 [ 29 121]]
False Positive Rate : 0.0
normalized confusion matrix
[[1.         0.        ]
 [0.19333333 0.80666667

In [None]:
#The results, comparing the best models

#"raw-count features"
#SVC() model (SVM)
accuracy: 0.979372197309417
confusion matrix
[[965   0]
 [ 23 127]]
False Positive Rate : 0.0
normalized confusion matrix
[[1.         0.        ]
 [0.15333333 0.84666667]]
 

#TF-IDF
#RandomForestClassifier()
accuracy: 0.9757847533632287
confusion matrix
[[965   0]
 [ 27 123]]
False Positive Rate : 0.0
normalized confusion matrix
[[1.   0.  ]
 [0.18 0.82]]

# As I mention earlier , we have to look which model gives us the lowest 
#FPR because classifying legitimate mail as spam will be a serious error. Since the 2 models have a False Positive Rate : 0.0

We will choose the one with the highest accuracy, here the best model is the RandomForestClassifier(),with the "TF-IDF" vectorizer.

