In [1]:
import glob
import string
import numpy as np
from pandas import DataFrame
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn import metrics
from collections import Counter

import nltk
nltk.download('popular')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /Users/ferd/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /Users/ferd/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /Users/ferd/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /Users/ferd/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /Users/ferd/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /Users/ferd/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to-date!

In [2]:
directory = [('enronTrain/ham','ham'),('enronTrain/spam','spam')]

In [3]:
trainHamFiles = glob.glob('enronTrain/ham/*.txt')
trainSpamFiles = glob.glob('enronTrain/spam/*.txt')

### Helper Functions - Pre-Processing

In [53]:
#Remove punctuations from txt file
def removePunc(emailText):
    puncResult = ''.join(x for x in emailText if x not in string.punctuation)
    return puncResult

#Remove captial letters from the txt file
def removeCap(emailText):
    lowerResult = emailText.lower()
    return lowerResult

#Remove common stop words using nltk stop word list
def removeStop(emailText):
    stop_list = stopwords.words('english')
    stop_list.append('subject')

    tokens = word_tokenize(emailText)
    stopResult = ' '.join(y for y in tokens if not y in stop_list)
    return stopResult

#Find the most common owrds of given corpus
def mostCommonWords(contentDf,num):
    sumContents = contentDf.content.str.cat(sep=' ')
    tokens = word_tokenize(sumContents)
    frequency_dist = nltk.FreqDist(tokens)
    return frequency_dist.most_common(num)

### Helper Function - Data Collection

In [50]:
#Read the contents of all txt files of a given path 
def readFile(path,noStop):
    trainFiles = glob.glob(path+'/*.txt')
    
    for file in trainFiles:
        fileObject = open(file,'rb')
        fileContents = fileObject.read().decode('utf-8','ignore')
        fileContents = fileContents.replace('\n', ' ')
        fileContents = removePunc(fileContents)
        fileContents = removeCap(fileContents)
        if noStop is True:
            fileContents = removeStop(fileContents)
        fileObject.close()
        yield fileContents

#Create a pandas data frame of the email contnets with labels
def createDataFrame(path,label,noStop):
    rows = []
    
    for fileContent in readFile(path,noStop):
        rows.append({'content':fileContent, 'label':label})
    df = DataFrame(rows)

    return df

### Train Model: No Capitilization, No punctuation

In [6]:
#Build data frame of all the training emails enron1-enron5
trainData1 = DataFrame({'content':[],'label':[]})
for path, label in directory:
    trainData1 = trainData1.append(createDataFrame(path,label,False))
    
#Create a bag of words representation based on word frequency of training data
trainVectorizer1 = CountVectorizer()
bagOfWordsTrain1 = trainVectorizer1.fit_transform(trainData1.content)

In [8]:
#Fit Naive Bayes model
clf1 = MultinomialNB()
clf2 = MultinomialNB(alpha=0)
clf1.fit(bagOfWordsTrain1,trainData1.label)
clf2.fit(bagOfWordsTrain1,trainData1.label)

  'setting alpha = %.1e' % _ALPHA_MIN)


MultinomialNB(alpha=0, class_prior=None, fit_prior=True)

In [9]:
#Create a data frame to extract features of test data
testDirectory = [('enronTest/ham','ham'),('enronTest/spam','spam')]
testData1 = DataFrame({'content':[],'label':[]})

for path, label in testDirectory:
    testData1 = testData1.append(createDataFrame(path,label,False))
    
bagOfWordsTest1 = trainVectorizer1.transform(testData1.content)
resultsWithStopLap = clf1.predict(bagOfWordsTest1)
resultsWithStopNoLap = clf2.predict(bagOfWordsTest1)


In [10]:
print('Accuracy With Stop Words & Laplace Smoothing: ', metrics.accuracy_score(testData1.label,resultsWithStopLap))
print('Accuracy With Stop Words & No Lapalace Smoothing: ', metrics.accuracy_score(testData1.label,resultsWithStopNoLap))

Accuracy With Stop Words & Laplace Smoothing:  0.983
Accuracy With Stop Words & No Lapalace Smoothing:  0.9791666666666666


### Most Frequent Words

In [54]:
#Find the most common 10 words form ham and spam emails
hamTrainContent1 = trainData1.iloc[:15045,:]
spamTrainContent1 = trainData1.iloc[15045:,:]

print('Top 10 Most Frequent Ham Words With Stop Words:')
print(mostCommonWords(hamTrainContent1,10),end='\n\n')
print('Top 10 Most Frequent Spam Words With Stop Words:')
print(mostCommonWords(spamTrainContent1,10))

Top 10 Most Frequent Ham Words With Stop Words:
[('the', 166481), ('to', 119021), ('and', 76114), ('of', 70479), ('a', 60482), ('in', 55667), ('enron', 55334), ('for', 45435), ('on', 38392), ('i', 35588)]

Top 10 Most Frequent Spam Words With Stop Words:
[('the', 74749), ('to', 59967), ('and', 51802), ('of', 47514), ('a', 38100), ('you', 35064), ('in', 31755), ('your', 25821), ('for', 25593), ('this', 23851)]


### Most Discriminative Words

In [48]:
logProb = clf1.feature_log_prob_

discrimHam = logProb[0,:] - logProb[1,:]
discrimSpam = logProb[1,:] - logProb[0,:]

In [49]:
indexMaxHam = np.argsort(discrimHam)[::-1]
indexMaxSpam = np.argsort(discrimSpam)[::-1]

topHam = np.array(trainVectorizer1.get_feature_names())[indexMaxHam[0:10]]
topSpam = np.array(trainVectorizer1.get_feature_names())[indexMaxSpam[0:10]]

print('Top 5 Most Discriminative Ham Words:',topHam)
print('Top 5 Most Discriminative Spam Words',topSpam)

Top 5 Most Discriminative Ham Words: ['enron' 'kaminski' 'dynegy' 'ect' 'ees' 'ena' 'dbcaps' 'hourahead'
 'fastow' 'mmbtu']
Top 5 Most Discriminative Spam Words ['pills' 'viagra' 'computron' 'cialis' 'nbsp' 'photoshop' 'width' 'href'
 'voip' 'paypal']


### Train Model: No Capitilization, No Punctuation, No Stop Words

In [12]:
#Build data frame of all the training emails enron1-enron5
trainData2 = DataFrame({'content':[],'label':[]})
for path, label in directory:
    trainData2 = trainData2.append(createDataFrame(path,label,True))
    

In [58]:
print(len(trainVectorizer2.vocabulary_))

136031


In [13]:
#Create a bag of words representation based on word frequency of training data
trainVectorizer2 = CountVectorizer()
bagOfWordsTrain2 = trainVectorizer2.fit_transform(trainData2.content)

In [14]:
#Fit Naive Bayes model
clf3 = MultinomialNB()
clf4 = MultinomialNB(alpha=0)
clf3.fit(bagOfWordsTrain2,trainData2.label)
clf4.fit(bagOfWordsTrain2,trainData2.label)

  'setting alpha = %.1e' % _ALPHA_MIN)


MultinomialNB(alpha=0, class_prior=None, fit_prior=True)

### Most Frequent Words

In [55]:
# Get the top 10 most common words for ham and spam emails without stop words
#Find the most common 10 words form ham and spam emails
hamTrainContent2 = trainData2.iloc[:15045,:]
spamTrainContent2 = trainData2.iloc[15045:,:]

print('Top 10 Most Frequent Ham Words Without Stop Words:')
print(mostCommonWords(hamTrainContent2,10),end='\n\n')
print('Top 10 Most Frequent Spam Words Without Stop Words:')
print(mostCommonWords(spamTrainContent2,10))

Top 10 Most Frequent Ham Words Without Stop Words:
[('enron', 55334), ('ect', 35223), ('hou', 16893), ('2001', 14700), ('2000', 12851), ('1', 12638), ('please', 11867), ('would', 11833), ('company', 11321), ('com', 11265)]

Top 10 Most Frequent Spam Words Without Stop Words:
[('com', 8842), ('1', 7794), ('3', 7661), ('company', 6938), ('2', 6793), ('http', 6729), ('e', 6647), ('email', 6001), ('information', 5492), ('5', 5388)]


### Most Discriminative Words

In [52]:
logProb = clf3.feature_log_prob_

discrimHam = logProb[0,:] - logProb[1,:]
discrimSpam = logProb[1,:] - logProb[0,:]

In [45]:
indexMaxHam = np.argsort(discrimHam)[::-1]
indexMaxSpam = np.argsort(discrimSpam)[::-1]

topHam = np.array(trainVectorizer2.get_feature_names())[indexMaxHam[0:10]]
topSpam = np.array(trainVectorizer2.get_feature_names())[indexMaxSpam[0:10]]

print('Top 5 Most Discriminative Ham Words:',topHam)
print('Top 5 Most Discriminative Spam Words',topSpam)

Top 5 Most Discriminative Ham Words: ['enron' 'kaminski' 'dynegy' 'ect' 'ees' 'ena' 'dbcaps' 'hourahead'
 'fastow' 'mmbtu']
Top 5 Most Discriminative Spam Words ['pills' 'viagra' 'computron' 'cialis' 'nbsp' 'photoshop' 'width' 'href'
 'voip' 'paypal']


In [27]:
#Create a data frame to extract features of test data
testDirectory = [('enronTest/ham','ham'),('enronTest/spam','spam')]
testData2 = DataFrame({'content':[],'label':[]})

for path, label in testDirectory:
    testData2 = testData2.append(createDataFrame(path,label,True))
    
bagOfWordsTest2 = trainVectorizer2.transform(testData2.content)
resultsNoStopLap = clf3.predict(bagOfWordsTest2)
resultsNoStopNoLap = clf4.predict(bagOfWordsTest2)



In [28]:
print('Accuracy No Stop Words & Laplace Smoothing: ', metrics.accuracy_score(testData2.label,resultsNoStopLap))
print('Accuracy No Stop Words & No Lapalace Smoothing: ', metrics.accuracy_score(testData2.label,resultsNoStopNoLap))

Accuracy No Stop Words & Laplace Smoothing:  0.9821666666666666
Accuracy No Stop Words & No Lapalace Smoothing:  0.9781666666666666
