In [2]:
import pandas as pd
# Read file in the same folder.
df = pd.read_csv(r'data/email-dataset.csv')

In [3]:
df.shape

(10381, 2)

In [5]:
df.head(5)

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [6]:
import string
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from nltk import word_tokenize, WordNetLemmatizer

# Converting a word to its base form

# Init the Wordnet Lemmatizer
lemmatizer = WordNetLemmatizer()


def purify_text(message):
    soup = BeautifulSoup(message)
    text =  soup.get_text()
    text = text.replace("\n", " ").replace("/", "").replace("|", "").replace("http", "").replace(':', "").replace('\t', "").replace("Subject", "").replace('re', "").strip()
    
    # Split a sentence into words. Ex: "I love you." --> ["I", "love", "you", "."]. Similar to split().
    tokens = word_tokenize(text) 
    
    # Ex: playing or played --> Lemmatization --> play
    temp = [lemmatizer.lemmatize(word.lower()) for word in tokens] 
    return ' '.join(temp)


def text_process(message):
    noPunc = [char for char in message if char not in string.punctuation]
    noPunc = ''.join(noPunc)
    
    return [word for word in noPunc.split() if word not in stopwords.words('english')]

In [7]:
# Tokenization and lemmatization text. 
df['text'] = df['text'].apply(purify_text)

In [8]:
df.head(5)

Unnamed: 0,text,spam
0,naturally irsistible your corporate identity l...,1
1,the stock trading gunslinger fanny is merrill ...,1
2,unbelievable new home made easy im wanting to ...,1
3,4 color printing special quest additional info...,1
4,"do not have money , get softwa cd from he ! so...",1


In [9]:
# Remove Stop Words (english words).
# apply text_process for each email in column text (4 mins).
df['text'] = df['text'].apply(text_process)

In [10]:
df.head(5)

Unnamed: 0,text,spam
0,"[naturally, irsistible, corporate, identity, l...",1
1,"[stock, trading, gunslinger, fanny, merrill, m...",1
2,"[unbelievable, new, home, made, easy, im, want...",1
3,"[4, color, printing, special, quest, additiona...",1
4,"[money, get, softwa, cd, softwa, compatibility...",1


In [11]:
df.tail(5)

Unnamed: 0,text,spam
10376,"[2nd, time, tried, 2, contact, u, u, å£750, po...",1
10377,"[ì, b, going, esplanade, fr, home]",0
10378,"[pity, wa, mood, suggestion]",0
10379,"[guy, bitching, acted, like, intested, buying,...",0
10380,"[rofl, true, name]",0


In [12]:
df['text'] = df['text'].apply(lambda x: ' '.join(x))

In [14]:
df.head(5)

Unnamed: 0,text,spam
0,naturally irsistible corporate identity lt all...,1
1,stock trading gunslinger fanny merrill muzo co...,1
2,unbelievable new home made easy im wanting sho...,1
3,4 color printing special quest additional info...,1
4,money get softwa cd softwa compatibility gat g...,1


# Working With Text Data

## Tokenizing text with scikit-learn

In [15]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
bow_transformer = count_vect.fit(df['text'])

In [16]:
print(len(bow_transformer.vocabulary_))

32464


In [17]:
message_bow = bow_transformer.transform(df['text'])

In [18]:
message_bow.shape

(10381, 32464)

In [19]:
count_vect.vocabulary_.get(u'free')

13773

In [20]:
sparsity = (100.0 * message_bow.nnz/(message_bow.shape[0] * message_bow.shape[1]))
print('sparsity {}'.format(sparsity))

sparsity 0.12709282972280034


## From occurrences to frequencies

In [21]:
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(message_bow)

In [22]:
message_tfidf = tf_transformer.transform(message_bow)

In [23]:
message_tfidf.shape

(10381, 32464)

## Training

In [35]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(message_tfidf, df['spam'])

In [36]:
docs_new = ['I love you', '100% off this purchase for free and more, get now!']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

In [37]:
predicted

array([0, 1])

# Pipeline

## Build a Pipeline and use GridSearch to find the best params

In [27]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['spam'], test_size = 0.15) # 15%

In [29]:
from sklearn.pipeline import Pipeline
pipeline = Pipeline([('vec', CountVectorizer()),
                   ('tfidf', TfidfTransformer()),
                   ('clf', MultinomialNB(alpha=0.01))])

In [31]:
from sklearn.model_selection import GridSearchCV
parameters = {'clf__alpha': (0.00001, 0.0001, 0.001, 0.01, 0.1, 0.8, 0.9, 1),'clf__fit_prior': [True, False],}
grid_search = GridSearchCV(estimator=pipeline, param_grid=parameters, scoring='accuracy', cv=10, verbose=1)

In [32]:
grid_search.fit(X_train, y_train)
best_parameters = grid_search.best_params_
best_accuracy = grid_search.best_score_

Fitting 10 folds for each of 16 candidates, totalling 160 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 160 out of 160 | elapsed:  1.2min finished


In [33]:
best_parameters

{'clf__alpha': 0.1, 'clf__fit_prior': True}

In [34]:
best_accuracy

0.9691715266703133

## Apply best parameters to save the best params for future work.

In [38]:
pipeline = Pipeline([('vec', CountVectorizer()),
                   ('tfidf', TfidfTransformer()),
                   ('clf', MultinomialNB(alpha=0.01))])

In [39]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('vec', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB(alpha=0.01))])

### Test input
This is an example when user enter in text box in website the text input will be process and then predict

In [43]:
input_word = ['50% off for any purchase free for 1st times. Get it NOW!']
input_word = text_process(input_word)
input_word = [' '.join(input_word)]

In [44]:
# 0: ham, 1: spam
pipeline.predict(input_word)

array([1])

In [45]:
test_prob = pipeline.predict_proba(input_word)
print(test_prob)
print("Prob is ham:", test_prob[0][0])
print("Prob is spam:", test_prob[0][1])

[[0.01235419 0.98764581]]
Prob is ham: 0.01235419095075997
Prob is spam: 0.9876458090492392


### Test set

In [46]:
y_pred = pipeline.predict(X_test)

In [47]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

In [48]:
cm

array([[1265,   15],
       [  24,  254]])

## Export file pkl  to save Pipeline

In [49]:
# write-binary. Store data using Python pickle
import pickle
filename = 'emailSpamClf.pkl'
pickle.dump(pipeline, open(filename, 'wb'))