In [105]:
%matplotlib inline

#text analysis
import os
from collections import Counter
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns

#train
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

from sklearn.pipeline import Pipeline


#text processing
from bs4 import BeautifulSoup
from nltk import word_tokenize, WordNetLemmatizer

In [106]:
# Read file in the same folder.
df = pd.read_csv(r'data/email-dataset.csv')

In [107]:
df.shape

(16183, 2)

In [108]:
df.head(2)

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1


In [109]:
import string
from nltk.corpus import stopwords


# Converting a word to its base form

# Init the Wordnet Lemmatizer
lemmatizer = WordNetLemmatizer()


def purify_text(message):
    soup = BeautifulSoup(message)
    text =  soup.get_text()
    text = text.replace("\n", " ").replace("/", "").replace("|", "").replace("http", "").replace(':', "").replace('\t', "").replace("Subject", "").replace('re', "").strip()
    
    # Split a sentence into words. Ex: "I love you." --> ["I", "love", "you", "."]. Similar to split().
    tokens = word_tokenize(text) 
    
    # Ex: playing or played --> Lemmatization --> play
    temp = [lemmatizer.lemmatize(word.lower()) for word in tokens] 
    return ' '.join(temp)


def text_process(message):
    noPunc = [char for char in message if char not in string.punctuation]
    noPunc = ''.join(noPunc)
    
    return [word for word in noPunc.split() if word not in stopwords.words('english')]

In [110]:
df['text'] = df['text'].apply(purify_text)

In [111]:
df.head(2)

Unnamed: 0,text,spam
0,naturally irsistible your corporate identity l...,1
1,the stock trading gunslinger fanny is merrill ...,1


In [112]:
# apply text_process for each email in column text (take time to process: 4 mins)
df['text'] = df['text'].apply(text_process)

In [113]:
df.head(2)

Unnamed: 0,text,spam
0,"[naturally, irsistible, corporate, identity, l...",1
1,"[stock, trading, gunslinger, fanny, merrill, m...",1


In [114]:
df.tail(2)

Unnamed: 0,text,spam
16181,"[reminder, o2, get, 250, pound, fe, call, cdit...",1
16182,"[2nd, time, tried, 2, contact, u, u, å£750, po...",1


In [115]:
df['text'] = df['text'].apply(lambda x: ' '.join(x))

In [116]:
df.head(2)

Unnamed: 0,text,spam
0,naturally irsistible corporate identity lt all...,1
1,stock trading gunslinger fanny merrill muzo co...,1


# Working With Text Data

## Tokenizing text with scikit-learn

In [117]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
bow_transformer = count_vect.fit(df['text'])

In [118]:
print(len(bow_transformer.vocabulary_))

32464


In [119]:
message_bow = bow_transformer.transform(df['text'])

In [120]:
message_bow.shape

(16183, 32464)

In [121]:
count_vect.vocabulary_.get(u'free')

13773

In [122]:
sparsity = (100.0 * message_bow.nnz/(message_bow.shape[0] * message_bow.shape[1]))
print('sparsity {}'.format(sparsity))

sparsity 0.1324401352483167


## From occurrences to frequencies

In [123]:
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(message_bow)

In [124]:
message_tfidf = tf_transformer.transform(message_bow)

In [125]:
message_tfidf.shape

(16183, 32464)

## Training

In [126]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(message_tfidf, df['spam'])

In [127]:
docs_new = ['I love you', '90% off this purchase, get now!']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

In [128]:
predicted

array([0, 1])

# Pipeline

## Build a Pipeline and use GridSearch to find the best params

In [143]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['spam'], test_size = 0.15) # 15%

In [144]:
pipeline = Pipeline([('vec', CountVectorizer()),
                   ('tfidf', TfidfTransformer()),
                   ('clf', MultinomialNB(alpha=0.01))])

In [145]:
parameters = {'clf__alpha': (0.00001, 0.0001, 0.001, 0.01, 0.1, 0.8, 0.9, 1),'clf__fit_prior': [True, False],}
grid_search = GridSearchCV(estimator=pipeline, param_grid=parameters, scoring='accuracy', cv=10, verbose=1)

In [146]:
grid_search.fit(X_train, y_train)
best_parameters = grid_search.best_params_
best_accuracy = grid_search.best_score_

Fitting 10 folds for each of 16 candidates, totalling 160 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 160 out of 160 | elapsed:  1.9min finished


In [147]:
best_parameters

{'clf__alpha': 0.01, 'clf__fit_prior': True}

In [148]:
best_accuracy

0.9720829809725158

## Apply best parameters to save the best params for future work.

In [173]:
pipeline = Pipeline([('vec', CountVectorizer()),
                   ('tfidf', TfidfTransformer()),
                   ('clf', MultinomialNB(alpha=0.01))])

In [174]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('vec', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB(alpha=0.01))])

### Test input
This is an example when user enter in text box in website the text input will be process and then predict

In [175]:
input_word = ['50% off for any purchase. Get it NOW!']
input_word = text_process(input_word)
input_word = [' '.join(input_word)]

In [176]:
# 0: ham, 1: spam
pipeline.predict(input_word)

array([1])

In [177]:
test_prob = pipeline.predict_proba(input_word)
print(test_prob)
print("Prob is ham:", test_prob[0][0])
print("Prob is spam:", test_prob[0][1])

[[0.28954534 0.71045466]]
Prob is ham: 0.28954534232169127
Prob is spam: 0.7104546576783078


### Test set

In [178]:
y_pred = pipeline.predict(X_test)

In [179]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

In [180]:
cm

array([[1219,   44],
       [   3, 1162]])

## Export file pkl  to save Pipeline

In [182]:
# write-binary. Store data using Python pickle
import pickle
filename = 'emailSpamClf.pkl'
pickle.dump(pipeline, open(filename, 'wb'))