# Example: Email spam detector

In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn import svm 

In [None]:
# Load the raw dataset from a path relative to the notebook's working directory.
# NOTE(review): no encoding is passed, so pandas decodes the file as UTF-8 —
# confirm the CSV really is UTF-8, otherwise pass `encoding=` explicitly.
spam = pd.read_csv('data/spam.csv')

In [None]:
# Preview the first rows of the raw frame before any cleanup.
spam.head()

In [None]:
# Drop the three unnamed columns; they are not used anywhere else in the notebook.
spam = spam.drop(
    columns=[
        "Unnamed: 2",
        "Unnamed: 3",
        "Unnamed: 4",
    ]
)

In [None]:
# Give the two remaining columns descriptive names: the label column first,
# then the email text column.
spam.columns = ["is_spam", "email"]

In [None]:
# Row/column count before rows with a missing email are removed.
spam.shape

In [None]:
# Keep only the rows that actually have email text; the original index is preserved.
spam = spam.dropna(subset=["email"])

In [None]:
# Row/column count after the missing-email filter, for comparison with the earlier shape.
spam.shape

In [None]:
# Preview the cleaned frame with its renamed columns.
spam.head()

In [None]:
# Count rows per label value to see how the classes are balanced.
spam["is_spam"].value_counts()

In [None]:
# Features are the raw email text; the target is the label column.
X = spam["email"]
y = spam["is_spam"]

# Hold out 20% of the rows for testing. The seed is fixed so that rerunning the
# notebook reproduces the same split (and therefore the same scores); it matches
# the random_state=0 split that score_pipelines performs on the same X and y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

**Extracting features**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

In [None]:
# TF-IDF features (norm=None, i.e. no per-document normalisation) feeding a
# multinomial naive Bayes classifier.
pipe = make_pipeline(
    TfidfVectorizer(norm=None),
    MultinomialNB(),
)

# make_pipeline names each step after its lowercased class name, hence the
# "tfidfvectorizer__" prefix on the tuned parameter.
param_grid = dict(tfidfvectorizer__min_df=[1, 5, 10, 15, 20, 25])

# 5-fold cross-validated search over min_df; n_jobs=-1 uses all available cores.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
)

grid.fit(X_train, y_train)

In [None]:
# Report the parameter combination that scored best in cross-validation.
print(f"Best params:\n{grid.best_params_}\n")

In [None]:
# Score the refit best pipeline on the held-out split. No custom `scoring` was
# passed to GridSearchCV, so this is the estimator's own score (mean accuracy
# for a classifier).
print(f"Test-set score: {grid.score(X_test, y_test):.2f}")

In [None]:
# Predict labels for the held-out emails with the best pipeline found by the search.
prediction = grid.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix

# Cross-tabulate true labels (rows) against predicted labels (columns).
confusion = confusion_matrix(y_test, prediction)

In [None]:
# Display the confusion matrix computed from the held-out predictions.
confusion

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the held-out split; zero_division=0
# reports 0 (instead of warning) for a class that has no predicted samples.
# NOTE(review): target_names are applied to the labels in sorted order, so this
# assumes the non-spam label value sorts before the spam one — confirm against
# the values shown by value_counts().
print(classification_report(y_test, prediction, target_names=["not spam", "spam"], zero_division=0))

### Find the best model

In [None]:
def score_pipelines(pipeline, parameters, X, y, *, test_size=0.2, random_state=0, cv=5):
    """Grid-search a pipeline and print its evaluation on a held-out split.

    The data is split into train and test parts, ``pipeline`` is tuned over
    ``parameters`` with cross-validation on the training part, and the best
    configuration is then evaluated on the test part. Results (best
    parameters, test-set score, confusion matrix and classification report)
    are printed; nothing is returned.

    Relies on ``train_test_split``, ``GridSearchCV``, ``confusion_matrix`` and
    ``classification_report`` already being imported in the notebook.

    Args:
        pipeline: Estimator (e.g. the result of ``make_pipeline``) to tune.
        parameters: Parameter grid passed to ``GridSearchCV``.
        X: Input samples; split internally into train and test parts.
        y: Labels aligned with ``X``.
        test_size: Fraction of the data held out for testing.
        random_state: Seed for the train/test split, so runs are repeatable
            and different pipelines are compared on the same split.
        cv: Number of cross-validation folds used by the grid search.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # n_jobs=-1 runs the candidate fits on all available cores.
    grid = GridSearchCV(pipeline, parameters, cv=cv, n_jobs=-1, verbose=1)
    grid.fit(X_train, y_train)
    print(f"Best params:\n{grid.best_params_}\n")
    print(f"\nTest-set score: {grid.score(X_test, y_test):.2f}\n")

    # -- prediction --
    prediction = grid.predict(X_test)
    confusion = confusion_matrix(y_test, prediction)
    print(f"\nConfusion matrix:\n {confusion}\n\n")
    # NOTE(review): target_names follow the sorted label order, so this assumes
    # exactly two classes with the non-spam label sorting first — confirm.
    print(classification_report(y_test, prediction, target_names=["not spam", "spam"], zero_division=0))

In [None]:
# Raw token counts with multinomial naive Bayes, tuning the vectorizer's min_df.
pipe = make_pipeline(
    CountVectorizer(),
    MultinomialNB(),
)
param_grid = dict(countvectorizer__min_df=[1, 5, 10, 15, 20])

score_pipelines(pipe, param_grid, X, y)

In [None]:
# Unnormalised TF-IDF features with multinomial naive Bayes, tuning min_df.
pipe = make_pipeline(
    TfidfVectorizer(norm=None),
    MultinomialNB(),
)
param_grid = dict(tfidfvectorizer__min_df=[1, 5, 10, 15, 20, 25])

score_pipelines(pipe, param_grid, X, y)

In [None]:
from sklearn.svm import SVC

# Unnormalised TF-IDF features with a support vector classifier (default SVC
# settings); only two min_df values are searched here.
# NOTE(review): norm=None leaves the TF-IDF vectors unscaled, and SVC is
# sensitive to feature scale — consider comparing against the default norm.
pipe = make_pipeline(
    TfidfVectorizer(norm=None),
    SVC(),
)
param_grid = dict(tfidfvectorizer__min_df=[1, 5])

score_pipelines(pipe, param_grid, X, y)