# Example: Email spam detector

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn import svm 

In [2]:
spam = pd.read_csv('data/spam.csv')

In [4]:
spam.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
spam.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], inplace=True)

In [6]:
spam.columns = ["is_spam", "email"]

In [7]:
spam.shape

(5572, 2)

In [8]:
spam = spam[pd.notnull(spam['email'])]

In [9]:
spam.shape

(5572, 2)

In [10]:
spam.head()

Unnamed: 0,is_spam,email
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
spam["is_spam"].value_counts()

ham     4825
spam     747
Name: is_spam, dtype: int64

In [12]:
X = spam['email']
y = spam["is_spam"]
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2)

**Extracting features**

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

In [14]:
# CountVectorizer
pipe = make_pipeline(TfidfVectorizer(norm=None), MultinomialNB())

param_grid = {'tfidfvectorizer__min_df': [1,5,10,15,20,25]}

grid = GridSearchCV(pipe, param_grid, cv=5, n_jobs=-1, verbose=1)

grid.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tfidfvectorizer',
                                        TfidfVectorizer(norm=None)),
                                       ('multinomialnb', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'tfidfvectorizer__min_df': [1, 5, 10, 15, 20, 25]},
             verbose=1)

In [15]:
print(f"Best params:\n{grid.best_params_}\n")

Best params:
{'tfidfvectorizer__min_df': 1}



In [16]:
print(f"Test-set score: {grid.score(X_test, y_test):.2f}")

Test-set score: 0.98


In [17]:
prediction = grid.predict(X_test)

In [18]:
from sklearn.metrics import confusion_matrix

confusion = confusion_matrix(y_test, prediction)

In [19]:
confusion

array([[930,  10],
       [  9, 166]])

In [20]:
from sklearn.metrics import classification_report

print(classification_report(y_test, prediction, target_names=["not spam", "spam"], zero_division=0))

              precision    recall  f1-score   support

    not spam       0.99      0.99      0.99       940
        spam       0.94      0.95      0.95       175

    accuracy                           0.98      1115
   macro avg       0.97      0.97      0.97      1115
weighted avg       0.98      0.98      0.98      1115



### Find best model

In [21]:
def score_pipelines(pipeline, parameters, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)  
    grid = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1, verbose=1)
    grid.fit(X_train, y_train)
    print(f"Best params:\n{grid.best_params_}\n")
    print(f"\nTest-set score: {grid.score(X_test, y_test):.2f}\n")
    # -- predicton --
    prediction = grid.predict(X_test)
    confusion = confusion_matrix(y_test, prediction)
    print(f"\nConfusion matrix:\n {confusion}\n\n")
    print(classification_report(y_test, prediction, target_names=["not spam", "spam"], zero_division=0))

In [22]:
pipe = make_pipeline(CountVectorizer(), MultinomialNB())
param_grid = {'countvectorizer__min_df': [1,5,10,15,20]}

score_pipelines(pipe, param_grid, X, y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best params:
{'countvectorizer__min_df': 1}


Test-set score: 0.99


Confusion matrix:
 [[947   2]
 [ 12 154]]


              precision    recall  f1-score   support

    not spam       0.99      1.00      0.99       949
        spam       0.99      0.93      0.96       166

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [23]:
pipe = make_pipeline(TfidfVectorizer(norm=None), MultinomialNB())
param_grid = {'tfidfvectorizer__min_df': [1,5,10,15,20,25]}

score_pipelines(pipe, param_grid, X, y)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best params:
{'tfidfvectorizer__min_df': 1}


Test-set score: 0.99


Confusion matrix:
 [[945   4]
 [ 10 156]]


              precision    recall  f1-score   support

    not spam       0.99      1.00      0.99       949
        spam       0.97      0.94      0.96       166

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [24]:
from sklearn.svm import SVC

pipe = make_pipeline(TfidfVectorizer(norm=None), SVC())
param_grid = {'tfidfvectorizer__min_df': [1,5]}

score_pipelines(pipe, param_grid, X, y)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best params:
{'tfidfvectorizer__min_df': 5}


Test-set score: 0.98


Confusion matrix:
 [[948   1]
 [ 24 142]]


              precision    recall  f1-score   support

    not spam       0.98      1.00      0.99       949
        spam       0.99      0.86      0.92       166

    accuracy                           0.98      1115
   macro avg       0.98      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115

