In [1]:
import pandas as pd

df = pd.read_csv('Data/ham-spam.csv')
df.head()

Unnamed: 0,IsSpam,Text
0,0,key issues going forwarda year end reviews rep...
1,0,congrats contratulations the execution the cen...
2,0,key issues going forwardall under control set...
3,0,epmi files protest entergy transcoattached our...
4,0,california power please contact kristin walsh ...


In [2]:
df.groupby('IsSpam').describe()

Unnamed: 0_level_0,Text,Text,Text,Text
Unnamed: 0_level_1,count,unique,top,freq
IsSpam,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,500,499,paso firm capacity award memorandumlouise del...,2
1,500,500,get discount drugs without prescriptiondiscoun...,1


In [3]:
df = df.drop_duplicates()
df.groupby('IsSpam').describe()

Unnamed: 0_level_0,Text,Text,Text,Text
Unnamed: 0_level_1,count,unique,top,freq
IsSpam,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,499,499,the cre public consultation proposals cera ins...,1
1,500,500,get discount drugs without prescriptiondiscoun...,1


In [4]:
# Build a vocabulary of word counts from the words in the 'Text' column
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(ngram_range=(1, 2), stop_words=['the', 'and', 'are', 'not'])
vectorizer.fit(df['Text'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None,
        stop_words=['the', 'and', 'are', 'not'], strip_accents=None,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, vocabulary=None)

In [5]:
# Show the vocabulary built by the vectorizer
vectorizer.vocabulary_

{'key': 55587,
 'issues': 53692,
 'going': 44140,
 'forwarda': 40891,
 'year': 112798,
 'end': 33600,
 'reviews': 85111,
 'report': 83789,
 'needs': 66557,
 'generating': 43186,
 'like': 57996,
 'mid': 63361,
 'documenting': 29808,
 'business': 14189,
 'unit': 104245,
 'performance': 74332,
 'review': 85033,
 'completion': 20928,
 'david': 25721,
 'john': 54624,
 'work': 111682,
 'out': 71777,
 'plan': 75446,
 'generation': 43195,
 'for': 39642,
 'nim': 67429,
 'employees': 33392,
 'hpl': 49361,
 'transition': 102585,
 'ongoing': 69956,
 'officially': 69346,
 'transferred': 102542,
 'regardsdelainey': 82909,
 'key issues': 55597,
 'issues going': 53719,
 'going forwarda': 44159,
 'forwarda year': 40892,
 'year end': 112825,
 'end reviews': 33636,
 'reviews report': 85118,
 'report needs': 83822,
 'needs generating': 66567,
 'generating like': 43189,
 'like mid': 58078,
 'mid year': 63375,
 'year documenting': 112824,
 'documenting business': 29809,
 'business unit': 14291,
 'unit perfo

In [6]:
# Vectorize the words in the 'Text' column and split the data 80/20
from sklearn.model_selection import train_test_split

vectors = vectorizer.transform(df['Text'])
x_train, x_test, y_train, y_test = train_test_split(vectors, df['IsSpam'], test_size=0.2, random_state=0)  

In [7]:
# Train a Naive Bayes classifier
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(x_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [8]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, model.predict(x_test))

array([[101,   1],
       [  2,  96]])

In [9]:
# Get the overall accuracy of the model
model.score(x_test, y_test)

0.985

In [10]:
# Get the ROC score
from sklearn.metrics import roc_auc_score

probabilities = model.predict_proba(x_test)
roc_auc_score(y_test, probabilities[:, 1])

0.999499799919968

In [11]:
# Define a function for cleaning text by removing punctuation characters, converting multiple
# spaces to single spaces, and converting characters to lowercase
import string, re
table = str.maketrans('', '', string.punctuation)

def clean_text(text):
    return re.sub(' +', ' ', text.translate(table).lower())

In [12]:
# Test a message that is not spam
message = vectorizer.transform([clean_text('Can you attend a code review on Tuesday? Need to make sure the logic is rock solid.')])
print(model.predict(message))

[0]


In [13]:
print(model.predict_proba(message))

[[9.99982697e-01 1.73031242e-05]]


In [14]:
# Test a message that is spam
message = vectorizer.transform([clean_text('Why pay more for expensive meds when you can order them online and save $$$?')])
print(model.predict(message))

[1]


In [15]:
print(model.predict_proba(message))

[[6.76841828e-06 9.99993232e-01]]
