In [22]:
# import the necessary libraries
import pandas as pd
import nltk
import numpy as np
import statsmodels.api as sm
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [23]:
# load the corpus and drop the missing rows
df = pd.read_csv("~/Desktop/email_corpus.csv")
df = df.dropna()
df = df.drop(['Unnamed: 0'], axis=1)
df = df.set_index('id')

In [24]:
df

Unnamed: 0_level_0,spam,file,words
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,00249.5f45607c1bffe89f60ba1ec9f878039a,zzzzasonorg mailsweeperabc-arbitragecom mailsw...
1,1,00373.ebe8670ac56b04125c25100a36ab0510,zzzzasonorg webnotenet serverhub webnotenet mx...
2,1,00214.1367039e50dc6b7adb0f2aa8aba83216,zzzzasonorg mail pickup service microsoft smtp...
3,1,00210.050ffd105bd4e006771ee63cabc59978,zzzzasonorg x-authentication-warning didn't us...
4,1,00033.9babb58d9298daa2963d4f514193d7d6,phoboslabsspamassassintaintorg mailwebnotenet ...
...,...,...,...
2997,0,01037.6b42b5f3d3d9e6293bf24af66b250655,phoboslabsnetnoteinccom phobos egwnnet egwnnet...
2998,0,02056.7bc7703e40a24dda665d4ce7b0cba710,gamasutra ryder cup url europe united states e...
2999,0,01782.278f53b8f65fcd422cb26c5bbe74599d,[use perl] stories precedence list x-bulkmail ...
3000,0,00043.d2673a72d215cbdd747dc98cde41fbd2,phoboslabsnetnoteinccom phobos lughtuathaorg l...


In [25]:
Xs = df['words'].values
Ys = df['spam'].values

X_train, X_test, y_train, y_test = train_test_split(Xs,
                                                    Ys,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=0,
                                                    stratify=Ys)

In [26]:
feature_names = vectorizer.get_feature_names()
print("Number of different words: {0}".format(len(feature_names)))
print("Word example: {0}".format(feature_names[5369]))

# Check the split printing the shape of each set.
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

NotFittedError: Vocabulary not fitted or provided

In [7]:


email = ["Hello George, how about a game of tennis tomorrow?",
         "Hello, click here if you want to satisfy your wife tonight",
         "We offer free viagra!!! Click here now!!!",
         "Dear Sara, I prepared the annual report. Please check the attachment.",
         "Hi David, will we go for cinema tonight?",
         "Best holidays offers only here!!!"]

vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                       stop_words='english')

examples = vectorizer.transform(email)
predictions = clf.predict(examples)
predictions

NotFittedError: The TF-IDF vectorizer is not fitted

In [8]:
# split the sample into 80% for training and 20% for testing
df['mask'] = [np.random.uniform(0,1)  for k in df.index]
train = df[df['mask'] < 0.8]
test = df[df['mask']>= 0.8]

In [9]:
# create two lists of words that bring together all of the words found
# in the spam and ham training rows
spamwords = []
spamcount = 0
hamwords = []
hamcount = 0
for index, row in train.iterrows():
    li = list(row['words'].split(' '))
    if row['spam']==1:
        spamcount = spamcount + 1
        spamwords = spamwords + li
    else:
        hamcount = hamcount + 1
        hamwords = hamwords + li

In [10]:
# getting all of the words found in the ham emails
hamdist = nltk.FreqDist(hamwords)
hamset = []
for each in hamdist:
    hamset.append(each)
print("The number of words in all of the ham emails: ", len(hamset))

The number of words in all of the ham emails:  27921


In [11]:
# getting all of the words found in the spam emails
spamdist = nltk.FreqDist(spamwords)
spamset = []
for each in spamdist:
    spamset.append(each)
print("The number of words in all of the spam emails: ",len(spamset))

The number of words in all of the spam emails:  10852


In [12]:
# from a Venn Diagram perspective, we create two functions to help us
# understand the set of words

#this function returns all of the words shared between two sets
def intersection(lst1, lst2): 
    return list(set(lst1) & set(lst2))

# this function returns all of the words found only in the first set
def difference (lst1, lst2):
    result = []
    for element in lst1:
        if element not in lst2:
            result.append(element)
    return result

In [13]:
# we now look at taking the two sets and making three sets
# intersection_set:  words found in both ham and spam
# spamonly_set:  words found in spam but not ham
# hamonly_set:  words found in ham but not spam
    
intersection_set = intersection(hamset, spamset)
print("The number of words shared by ham and spam: ",len(intersection_set))

spamonly_set = difference (spamset, hamset)
print("The number of words only in spam: ", len(spamonly_set))

hamonly_set = difference (hamset, spamset)
print("The number of words only in ham: ", len(hamonly_set))

The number of words shared by ham and spam:  5761
The number of words only in spam:  5091
The number of words only in ham:  22160


In [14]:
# let's go through the full dataframe now and get a count of ham, spam,
# and shared words

# create a column in the dataframe for spamcount
df['spamcount']=0
df['hamcount']=0
df['sharedcount']=0

for ind in df.index:
    li = list(df.loc[ind, 'words'].split(' '))
    df.loc[ind, 'spamcount'] = len(intersection(li, spamonly_set))
    df.loc[ind, 'hamcount'] = len(intersection(li, hamonly_set))
    df.loc[ind, 'sharedcount'] = len(intersection(li, intersection_set))


In [27]:
# one approach we can take is to look at hamcount and spamcount
# and just see which one is larger.  If spamcount>=hamcount,
# then it seem logical that it is a spam email.  However,
# when spamcount<hamcount, then it probably is ham.  Let's
# try this on our testing data set.

df['simple_prediction']=0 #default is that email is ham unless the condition is met
df.loc[df['spamcount']>=df['hamcount'], 'simple_prediction']= 1

df['result']='unknown'
df.loc[(df['simple_prediction']==1) & (df['spam']==1), 'result']= "true positive"
df.loc[(df['simple_prediction']==0) & (df['spam']==1), 'result']= "false negative"
df.loc[(df['simple_prediction']==1) & (df['spam']==0), 'result']= "false positive"
df.loc[(df['simple_prediction']==0) & (df['spam']==0), 'result']= "true negative"

train = df[df['mask'] < 0.8]
test = df[df['mask']>= 0.8]
results = confusion_matrix(test['spam'], test['simple_prediction'])
print ('Confusion Matrix :')
print(results) 

Confusion Matrix :
[[495   2]
 [  7 101]]


In [28]:
# let's do a deep dive into the errors and see if we can find words that could be
# added to the intersection_set
df_errors = df[(df['result']=='false positive') | (df['result']=='false negative')]
for index, row in df_errors.iterrows():
    li = list(row['words'].split(' '))
    print('Is it spam?: ', row['spam'], "\n", row['result'])
    print("SPAM ", row['spamcount'], ": ", intersection(li, spamonly_set))
    print("HAM ", row['hamcount'], ": ", intersection(li, hamonly_set))
    print("\n")

Is it spam?:  1 
 false negative
SPAM  2 :  ['zzzzasonorg', 'hallo']
HAM  3 :  ['english', 'bother', 'russian']


Is it spam?:  1 
 false negative
SPAM  2 :  ['zzzzasonorg', 'adv']
HAM  4 :  ['enterprises', 'ass', 'x-accept-language', 'beach']


Is it spam?:  1 
 false negative
SPAM  9 :  ['purchasing', 'traditionally', 'mlm', "'remove'", 'multi-level', 'intrusion', 'zzzzasonorg', 'risk-free', 'miracle']
HAM  24 :  ['leg', 'carbon', '"look', 'exception', 'ingenious', 'scream', 'winner', 'smiling', 'flaws', 'cautious', 'hoopla', 'likes', 'bones', 'overly', 'deep', 'definitely', 'awhile', 'positions', 'initially', 'arm', 'convinced', 'excited', 'magic', 'dubious']


Is it spam?:  1 
 false negative
SPAM  4 :  ['attained', 'zzzzasonorg', 'cocks', 'overlook']
HAM  6 :  ['sucking', 'scared', 'gauge', 'showed', 'hall', 'tons']


Is it spam?:  1 
 false negative
SPAM  14 :  ['helvetica', 'llc"', 'sans-serif', 'warehouse', 'padding-right', 'arial', 'padding-left', 'none"', 'lbs', 'font-size', 

In [29]:
# based on the analysis above, we can create a set of words
# we want to move from the spamonly_set into the intersection_set

move_list = [] # put words in this list to be moved
for each in move_list:
    spamonly_set.remove(each)
    intersection_set.append(each)

In [30]:
# redoing the analysis with the updated word sets
for ind in df.index:
    li = list(df.loc[ind, 'words'].split(' '))
    df.loc[ind, 'spamcount'] = len(intersection(li, spamonly_set))
    df.loc[ind, 'hamcount'] = len(intersection(li, hamonly_set))
    df.loc[ind, 'sharedcount'] = len(intersection(li, intersection_set))

df['simple_prediction']=0 #default is that email is ham unless the condition is met
df.loc[df['spamcount']>=df['hamcount'], 'simple_prediction']= 1
df['result']='unknown'
df.loc[(df['simple_prediction']==1) & (df['spam']==1), 'result']= "true positive"
df.loc[(df['simple_prediction']==0) & (df['spam']==1), 'result']= "false negative"
df.loc[(df['simple_prediction']==1) & (df['spam']==0), 'result']= "false positive"
df.loc[(df['simple_prediction']==0) & (df['spam']==0), 'result']= "true negative"

train = df[df['mask'] < 0.8]
test = df[df['mask']>= 0.8]

print("True positives: ", test[test['result'] == "true positive"].count()["result"])
print("True negatives: ", test[test['result'] == "true negative"].count()["result"])
print("False positives: ", test[test['result'] == "false positive"].count()["result"])
print("False negatives: ", test[test['result'] == "false negative"].count()["result"])

True positives:  101
True negatives:  495
False positives:  2
False negatives:  7
