In [23]:
%matplotlib inline

#text analysis
import os
from collections import Counter
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns

#train
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

from sklearn.pipeline import Pipeline


#text processing
from bs4 import BeautifulSoup
from nltk import word_tokenize, WordNetLemmatizer

In [24]:
# Read file in the same folder.
df = pd.read_csv(r'data/email-dataset.csv')

In [25]:
df.shape

(16183, 2)

In [26]:
df.head(2)

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1


In [27]:
import string
from nltk.corpus import stopwords


# Converting a word to its base form

# Init the Wordnet Lemmatizer
lemmatizer = WordNetLemmatizer()


def purify_text(message):
    soup = BeautifulSoup(message)
    text =  soup.get_text()
    text = text.replace("\n", " ").replace("/", "").replace("|", "").replace("http", "").replace(':', "").replace('\t', "").replace("Subject", "").replace('re', "").strip()
    
    # Split a sentence into words. Ex: "I love you." --> ["I", "love", "you", "."]. Similar to split().
    tokens = word_tokenize(text) 
    
    # Ex: playing or played --> Lemmatization --> play
    temp = [lemmatizer.lemmatize(word.lower()) for word in tokens] 
    return ' '.join(temp)


def text_process(message):
    noPunc = [char for char in message if char not in string.punctuation]
    noPunc = ''.join(noPunc)
    
    return [word for word in noPunc.split() if word not in stopwords.words('english')]

In [28]:
df['text'] = df['text'].apply(purify_text)

In [29]:
df.head(2)

Unnamed: 0,text,spam
0,naturally irsistible your corporate identity l...,1
1,the stock trading gunslinger fanny is merrill ...,1


In [31]:
# apply text_process for each email in column text (take time to process: 4 mins)
df['text'] = df['text'].apply(text_process)

In [32]:
df.head(2)

Unnamed: 0,text,spam
0,"[naturally, irsistible, corporate, identity, l...",1
1,"[stock, trading, gunslinger, fanny, merrill, m...",1


In [33]:
df.tail(2)

Unnamed: 0,text,spam
16181,"[reminder, o2, get, 250, pound, fe, call, cdit...",1
16182,"[2nd, time, tried, 2, contact, u, u, å£750, po...",1


In [34]:
df['text'] = df['text'].apply(lambda x: ' '.join(x))

In [36]:
df.head(2)

Unnamed: 0,text,spam
0,naturally irsistible corporate identity lt all...,1
1,stock trading gunslinger fanny merrill muzo co...,1


In [37]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
bow_transformer = vectorizer.fit(df['text'])

In [38]:
print(len(bow_transformer.vocabulary_))

32464


In [39]:
message_bow = bow_transformer.transform(df['text'])

In [40]:
sparsity = (100.0 * message_bow.nnz/(message_bow.shape[0] * message_bow.shape[1]))
print('sparsity {}'.format(sparsity))

sparsity 0.1324401352483167


In [41]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer_bow = TfidfTransformer().fit(message_bow)

In [42]:
message_tfidf = tfidf_transformer_bow.transform(message_bow)

In [43]:
def buildClassifier(alpha):
    text_classifier = MultinomialNB(alpha=alpha)
    return text_classifier

In [44]:
pipeline = Pipeline([('vec', CountVectorizer()),
                   ('tfidf', TfidfTransformer()),
                   ('clf', MultinomialNB(alpha=0.01))])

In [45]:
parameters = {'clf__alpha': (0.00001, 0.0001, 0.001, 0.01, 0.1, 0.8, 0.9, 1)}
grid_search = GridSearchCV(estimator=pipeline, param_grid=parameters, scoring='accuracy', cv=10, verbose=1)

In [46]:
grid_search.fit(df['text'], df['spam'])
best_parameters = grid_search.best_params_
best_accuracy = grid_search.best_score_

Fitting 10 folds for each of 8 candidates, totalling 80 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:  1.1min finished


In [47]:
best_parameters

{'clf__alpha': 0.001}

In [48]:
best_accuracy

0.9729916145646834

## Apply best parameters

In [51]:
pipeline = Pipeline([('vec', CountVectorizer()),
                   ('tfidf', TfidfTransformer()),
                   ('clf', MultinomialNB(alpha=0.001))])

In [52]:
pipeline.fit(df['text'], df['spam'])

Pipeline(steps=[('vec', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB(alpha=0.001))])

### Test input
This is an example when user enter in text box in website the text input will be process and then predic

In [71]:
input_word = ['I miss you soo much']
input_word = text_process(input_word)
input_word = [' '.join(input_word)]

In [72]:
# 0: ham, 1: spam
pipeline.predict(input_word)

array([0])

In [74]:
test_prob = pipeline.predict_proba(input_word)
print(test_prob)
print("Prob is ham:", test_prob[0][0])
print("Prob is spam:", test_prob[0][1])

[[0.89221873 0.10778127]]
Prob is ham: 0.8922187259524372
Prob is spam: 0.10778127404756234


In [66]:
# write-binary. Store data using Python pickle
import pickle
filename = 'emailSpamClf.pkl'
pickle.dump(pipeline, open(filename, 'wb'))