In [None]:
# import the necessary libraries
import pandas as pd
import nltk
import numpy as np
import statsmodels.api as sm

In [None]:
# Read the email corpus from disk, discard every row that has a missing
# value, and then remove the 'Unnamed: 0' column.
df = (
    pd.read_csv("./email_corpus.csv")
    .dropna()
    .drop(columns=['Unnamed: 0'])
)

In [None]:
# split the sample into 80% for training and 20% for testing
#
# A dedicated, seeded generator makes the split reproducible: the same rows
# land in train/test on every fresh run, so the vocabulary built from the
# training rows and the reported confusion matrix can be compared across
# runs. Change the seed to draw a different split.
split_rng = np.random.default_rng(42)

# One uniform draw in [0, 1) per row, produced by a single vectorized call.
# The draws are stored on df so that the same split can be re-derived later.
df['mask'] = split_rng.uniform(0, 1, size=len(df))

# Rows whose draw falls below 0.8 are training rows; the rest are test rows.
train = df[df['mask'] < 0.8]
test = df[df['mask'] >= 0.8]

In [None]:
# create two lists of words that bring together all of the words found
# in the spam and ham training rows, and count the training emails in
# each class along the way
spamwords = []
spamcount = 0
hamwords = []
hamcount = 0
for text, label in zip(train['words'], train['spam']):
    # str.split already returns a list, so no extra list() copy is needed
    li = text.split(' ')
    if label == 1:
        spamcount += 1
        # extend in place: `spamwords = spamwords + li` would copy the whole
        # accumulated list for every email, which is quadratic overall
        spamwords.extend(li)
    else:
        # every row whose 'spam' value is not 1 is treated as ham
        hamcount += 1
        hamwords.extend(li)

In [None]:
# Tally how often each word occurs across the ham training words; iterating
# the distribution yields each distinct word once, which gives the ham
# vocabulary as a list.
hamdist = nltk.FreqDist(hamwords)
hamset = list(hamdist)
print("The number of words in all of the ham emails: ", len(hamset))

In [None]:
# Tally how often each word occurs across the spam training words; iterating
# the distribution yields each distinct word once, which gives the spam
# vocabulary as a list.
spamdist = nltk.FreqDist(spamwords)
spamset = list(spamdist)
print("The number of words in all of the spam emails: ",len(spamset))

In [None]:
def intersection(lst1, lst2):
    """Return a list of the distinct items that occur in both inputs.

    Both inputs are converted to sets first, so duplicates collapse to a
    single entry and the order of the returned list is not defined.
    """
    first_items = set(lst1)
    second_items = set(lst2)
    return list(first_items & second_items)

def difference(lst1, lst2):
    """Return the items of lst1 that do not occur in lst2.

    The order of lst1 is preserved and repeated items are kept, so the
    result is a filtered copy of lst1 rather than a set.

    Membership is tested against a set built once from lst2, which keeps the
    cost linear in the combined input size instead of scanning lst2 for
    every item of lst1. If either input holds unhashable items, the function
    falls back to scanning lst2 directly.
    """
    try:
        excluded = set(lst2)
        return [element for element in lst1 if element not in excluded]
    except TypeError:
        # unhashable items cannot be placed in (or looked up in) a set
        return [element for element in lst1 if element not in lst2]
    
# Partition the training vocabulary into three groups: words seen in both
# classes, words seen only in spam, and words seen only in ham.
intersection_set = intersection(hamset, spamset)
spamonly_set = difference(spamset, hamset)
hamonly_set = difference(hamset, spamset)

# Report the size of each group.
vocabulary_report = [
    ("The number of words shared by ham and spam: ", intersection_set),
    ("The number of words only in spam: ", spamonly_set),
    ("The number of words only in ham: ", hamonly_set),
]
for caption, group in vocabulary_report:
    print(caption, len(group))

In [None]:
# let's go through the full dataframe now and get a count of ham, spam,
# and shared words

# Build each lookup set once. Converting the vocabulary lists to sets inside
# the per-email loop would repeat the same conversion for every row.
spam_vocab = set(spamonly_set)
ham_vocab = set(hamonly_set)
shared_vocab = set(intersection_set)

# The distinct words of every email; a word repeated within one email is
# counted once.
email_tokens = [set(text.split(' ')) for text in df['words']]

# For each email, count how many of its distinct words fall in each group.
# Whole columns are assigned at once (aligned on df.index, integer dtype)
# instead of writing one cell at a time.
df['spamcount'] = pd.Series(
    [len(tokens & spam_vocab) for tokens in email_tokens],
    index=df.index, dtype='int64')
df['hamcount'] = pd.Series(
    [len(tokens & ham_vocab) for tokens in email_tokens],
    index=df.index, dtype='int64')
df['sharedcount'] = pd.Series(
    [len(tokens & shared_vocab) for tokens in email_tokens],
    index=df.index, dtype='int64')


In [None]:
# A simple rule: compare hamcount with spamcount and see which is larger.
# If spamcount >= hamcount, it seems logical to call the email spam;
# when spamcount < hamcount, it is probably ham.  The rule is applied to
# every row here so it can be evaluated on the testing rows afterwards.

# Start from "ham" (0) and flip to "spam" (1) wherever the rule fires.
df['simple_prediction'] = 0
flagged_as_spam = df['spamcount'] >= df['hamcount']
df.loc[flagged_as_spam, 'simple_prediction'] = 1

# Label each row by comparing the prediction with the true 'spam' value.
# Rows matching none of the four combinations keep the 'unknown' label.
df['result'] = 'unknown'
outcome_labels = {
    (1, 1): "true positive",
    (0, 1): "false negative",
    (1, 0): "false positive",
    (0, 0): "true negative",
}
for (predicted, actual), outcome in outcome_labels.items():
    matches = (df['simple_prediction'] == predicted) & (df['spam'] == actual)
    df.loc[matches, 'result'] = outcome

In [None]:
# The dataframe now carries everything needed for analysis, so take the
# training and testing rows from it again using the stored mask values
# (below 0.8 is training, 0.8 and above is testing).
in_training = df['mask'].lt(0.8)
in_testing = df['mask'].ge(0.8)
train = df.loc[in_training]
test = df.loc[in_testing]

In [None]:
# Confusion matrix: count the testing rows that carry each outcome label.
confusion_rows = [
    ("True positives: ", "true positive"),
    ("True negatives: ", "true negative"),
    ("False positives: ", "false positive"),
    ("False negatives: ", "false negative"),
]
for caption, outcome in confusion_rows:
    print(caption, (test['result'] == outcome).sum())

In [None]:
# Deep dive into the misclassified emails: for each one, show which of its
# words are spam-only and which are ham-only, to help spot words that could
# be added to the intersection_set.
error_labels = ['false positive', 'false negative']
df_errors = df[df['result'].isin(error_labels)]
for _, email in df_errors.iterrows():
    tokens = email['words'].split(' ')
    print('Is it spam?: ', email['spam'], "\n", email['result'])
    print("SPAM ", email['spamcount'], ": ", intersection(tokens, spamonly_set))
    print("HAM ", email['hamcount'], ": ", intersection(tokens, hamonly_set))
    print("\n")

In [None]:
move_list = [] # put words in this list to be moved
# Move each listed word out of the class-exclusive vocabularies and into the
# shared vocabulary.
for each in move_list:
    # The word may sit in either exclusive list, or in neither if this cell
    # has already been run. Checking first avoids the ValueError that
    # list.remove raises for a missing item.
    for exclusive_words in (spamonly_set, hamonly_set):
        if each in exclusive_words:
            exclusive_words.remove(each)
    # Avoid adding the same word to the shared list more than once.
    if each not in intersection_set:
        intersection_set.append(each)

In [None]:
# Recompute the per-email counts, predictions and outcome labels with the
# current vocabularies, then report the confusion matrix on the test rows.

# Build each lookup set once. Converting the vocabulary lists to sets inside
# the per-email loop would repeat the same conversion for every row.
spam_vocab = set(spamonly_set)
ham_vocab = set(hamonly_set)
shared_vocab = set(intersection_set)

# The distinct words of every email; a word repeated within one email is
# counted once.
email_tokens = [set(text.split(' ')) for text in df['words']]

# Whole columns are assigned at once (aligned on df.index, integer dtype)
# instead of writing one cell at a time.
df['spamcount'] = pd.Series(
    [len(tokens & spam_vocab) for tokens in email_tokens],
    index=df.index, dtype='int64')
df['hamcount'] = pd.Series(
    [len(tokens & ham_vocab) for tokens in email_tokens],
    index=df.index, dtype='int64')
df['sharedcount'] = pd.Series(
    [len(tokens & shared_vocab) for tokens in email_tokens],
    index=df.index, dtype='int64')

df['simple_prediction']=0 #default is that email is ham unless the condition is met
df.loc[df['spamcount']>=df['hamcount'], 'simple_prediction']= 1

# Label each row by comparing the prediction with the true 'spam' value.
# Rows matching none of the four combinations keep the 'unknown' label.
df['result']='unknown'
outcome_labels = {
    (1, 1): "true positive",
    (0, 1): "false negative",
    (1, 0): "false positive",
    (0, 0): "true negative",
}
for (predicted, actual), outcome in outcome_labels.items():
    matches = (df['simple_prediction'] == predicted) & (df['spam'] == actual)
    df.loc[matches, 'result'] = outcome

# Take the train/test rows again so they include the refreshed columns.
train = df[df['mask'] < 0.8]
test = df[df['mask']>= 0.8]

# Confusion matrix on the testing rows.
print("True positives: ", (test['result'] == "true positive").sum())
print("True negatives: ", (test['result'] == "true negative").sum())
print("False positives: ", (test['result'] == "false positive").sum())
print("False negatives: ", (test['result'] == "false negative").sum())