In [1]:
#import necessary packages
import os
import sys
import random
import nltk
from nltk.corpus import stopwords
import time
import numpy as np
from sklearn.model_selection import KFold
import collections
from nltk.metrics.scores import (precision, recall)

In [2]:
# function to read spam and ham files, train and test a classifier 
def processspamham(limitStr):
      # convert the limit argument from a string to an int
    limit = int(limitStr)

      # start lists for spam and ham email texts
    hamtexts = []
    spamtexts = []
    #os.chdir(dirPath)
      # process all files in directory that end in .txt up to the limit
      #    assuming that the emails are sufficiently randomized
    for file in os.listdir("./corpus/spam"):
        if (file.endswith(".txt")) and (len(spamtexts) < limit):
          # open file for reading and read entire file into a string
            f = open("./corpus/spam/"+file, 'r', encoding="latin-1")
            spamtexts.append (f.read())
            f.close()
    for file in os.listdir("./corpus/ham"):
        if (file.endswith(".txt")) and (len(hamtexts) < limit):
          # open file for reading and read entire file into a string
            f = open("./corpus/ham/"+file, 'r', encoding="latin-1")
            hamtexts.append (f.read())
            f.close()

      # print number emails read
    print ("Number of spam files:",len(spamtexts))
    print ("Number of ham files:",len(hamtexts))
    print

      # create list of mixed spam and ham email documents as (list of words, label)
    global emaildocs
    emaildocs = []
      # add all the spam
    for spam in spamtexts:
        tokens = nltk.word_tokenize(spam)
        emaildocs.append((tokens, 'spam'))
      # add all the regular emails
    for ham in hamtexts:
        tokens = nltk.word_tokenize(ham)
        emaildocs.append((tokens, 'ham'))

      # randomize the list
    random.shuffle(emaildocs)

      # print a few token lists
    for email in emaildocs[:4]:
        print (email)
        print
        
      # possibly filter tokens

      # continue as usual to get all words and create word features

      # feature sets from a feature definition function

      # train classifier and show performance in cross-validation

In [3]:
processspamham(1000)

Number of spam files: 1000
Number of ham files: 1000
(['Subject', ':', 'sitara', '#', '92886', 'for', 'febo', '0', ',', 'any', 'gas', 'taken', 'by', 'global', 'octanes', 'will', 'be', 'priced', 'off', 'of', 'gas', 'daily', 'rather', 'than', 'if', '.', '.', '.', '.', 'i', "'", 've', 'changed', 'the', 'sitara', 'ticket', '.', '.', '.', '.', 'but', 'one', 'of', 'you', 'gentlemen', 'needs', 'to', 'take', 'care', 'of', 'risk', 'assignment', 'impact', '.', '.', '.', '.', 'lee'], 'ham')
(['Subject', ':', 'entex', 'increase', 'rom', ':', 'huntsville', '/', 'woodlands', '/', 'conroe', '11500', 'by', '8', 'days', '=', '92', ',', '000', 'big', 'ticket', '50000', 'by', '8', 'days', '=', '400', ',', '000', 'ami'], 'ham')
(['Subject', ':', 'discreet', 'cheapest', 'pr', '!', 'escription', 'd', '\\', 'rugs', 'online', '!'], 'spam')
(['Subject', ':', 're', ':', '3', '-', 'rivers', '/', 'king', 'ranch', '/', 'hplr', 'liquids', 'i', 'have', 'identified', 'the', 'additional', 'hplc', 'wellhead', 'purchase

In [4]:
#defining set of words that will be used for features
#we'll find the 2000 most common words and used them as an important feature of the whole corpus

all_words = [word for (email,category) in emaildocs for word in email]
top_words = nltk.FreqDist(all_words)
most_common_words = top_words.most_common(2000)
word_features = [word for (word,count) in most_common_words]

#checking if we have the 2000 words we need
len(set(all_words))

37261

In [5]:
word_features[:20]

['-',
 '.',
 '/',
 ',',
 ':',
 'the',
 'to',
 'ect',
 'and',
 '@',
 'of',
 'a',
 'for',
 'you',
 'in',
 'is',
 'this',
 'hou',
 'on',
 '=']

In [6]:
#now we will use that list of most frequent words in the entire corpus
#to iterate over each sentence and check if any of those words are present
#in that way, we will see if this unigram corpus feature is present on that particular sentence
#using Boolean logic that matches values and returns 'True' or 'False'
#we do this by defining a Python "function," i.e.a piece of code writen to be reused
def document_features(document, word_features):
    document_words = set(document)
    #we open a Python dictionary instead of a list
    features = {}
    for word in word_features:
        #checking if the word from word_features matches a word in the document
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [7]:
#now we apply the function to the document dataset
featuresets = [(document_features(d, word_features), c) for (d, c) in emaildocs]

#we print the list of features matches for the first document ([0]) in the corpus
#we'll see a Python dictionary with the key being the feature word
#and the value being 'True' or 'False' according to that word being matched in the present document or not
#we'll se a lot of 'False' values because (of course) not all 2000 words will be on each sentence!
featuresets[0]

({'contains(-)': False,
  'contains(.)': True,
  'contains(/)': False,
  'contains(,)': True,
  'contains(:)': True,
  'contains(the)': True,
  'contains(to)': True,
  'contains(ect)': False,
  'contains(and)': False,
  'contains(@)': False,
  'contains(of)': True,
  'contains(a)': False,
  'contains(for)': True,
  'contains(you)': True,
  'contains(in)': False,
  'contains(is)': False,
  'contains(this)': False,
  'contains(hou)': False,
  'contains(on)': False,
  'contains(=)': False,
  'contains(?)': False,
  'contains(i)': True,
  "contains(')": True,
  'contains())': False,
  'contains(()': False,
  'contains(Subject)': True,
  'contains(!)': False,
  'contains(your)': False,
  'contains(that)': False,
  'contains(be)': True,
  'contains(2000)': False,
  'contains(enron)': False,
  'contains(with)': False,
  'contains(will)': True,
  'contains(we)': False,
  'contains(have)': False,
  'contains(3)': False,
  'contains(from)': False,
  'contains($)': False,
  'contains(s)': False,


In [8]:
#We are ready now to do machine learning using the unigram list we just created
#we use a Naive Bayes classifier with 5-fold cross validation for training on sentiments using unigrams

start = time.time()
num_split = 5
kf = KFold(n_splits = num_split)
sum = 0
prec = 0
recall = 0

for train, test in kf.split(featuresets):
    train_data = np.array(featuresets)[train]
    test_data = np.array(featuresets)[test]
    classifier = nltk.NaiveBayesClassifier.train(train_data)
    sum += nltk.classify.accuracy(classifier, test_data)
    
    #calculating precision and recall 
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (feats, label) in enumerate(test_data):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)
    
    prec += nltk.precision(refsets['spam'], testsets['spam'])
    recall += nltk.recall(refsets['spam'], testsets['spam'])

#storing the score in a variable 
acc = sum/num_split
avg_precision = prec/num_split
avg_recall = recall/num_split

end = time.time()
print("Processing Time: {0:.2f} seconds".format(end - start))

#let's see the accuracy score for this new classifier
print("Accuracy:", round(acc,4))
print("Precision:", round(avg_precision,4))
print("Recall:", round(avg_recall,4))

Processing Time: 29.18 seconds
Accuracy: 0.9485
Precision: 0.9079
Recall: 0.999


In [9]:
## EXPERIMENT 1: USING BIGRAMS INSTEAD OF UNIGRAMS ##

# We will now create a new feature: bigrams
#we'll use the code we already know from class labs 

from nltk.collocations import *
import re

#data cleaning and preprocessing
stopwords = nltk.corpus.stopwords.words('english')

def alpha(w):
    pattern = re.compile('^[^a-z]+$')
    if(pattern.match(w)):
        return True
    else:
        return False

#creating bigrams features for the corpus and applying cleaning steps
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(all_words)
finder.apply_word_filter(alpha)
finder.apply_word_filter(lambda w: w in stopwords)
scored = finder.score_ngrams(bigram_measures.raw_freq)
scored[:10]

[(('ect', 'cc'), 0.001047708058603369),
 (('ect', 'subject'), 0.0005868018658165508),
 (('daren', 'j'), 0.00044170176808736733),
 (('j', 'farmer'), 0.00044170176808736733),
 (('ami', 'chokshi'), 0.00043743411815415606),
 (('please', 'let'), 0.00037982084405580377),
 (('would', 'like'), 0.0002325869213600147),
 (('xls', 'Subject'), 0.00023045309639340905),
 (('e', 'lloyd'), 0.00020911484672735264),
 (('robert', 'e'), 0.00020911484672735264)]

In [10]:
#extracting clean bigrams (no frequency information)
bigram_features = [bigram for (bigram, count) in scored[:2000]]
#printing the first 30 for confirmation
bigram_features[:30]

[('ect', 'cc'),
 ('ect', 'subject'),
 ('daren', 'j'),
 ('j', 'farmer'),
 ('ami', 'chokshi'),
 ('please', 'let'),
 ('would', 'like'),
 ('xls', 'Subject'),
 ('e', 'lloyd'),
 ('robert', 'e'),
 ('enron', 'cc'),
 ('b', 'camp'),
 ('howard', 'b'),
 ('anita', 'luong'),
 ('attached', 'file'),
 ('north', 'america'),
 ('td', 'td'),
 ('see', 'attached'),
 ('pat', 'clynes'),
 ('brenda', 'f'),
 ('f', 'herod'),
 ('enron', 'north'),
 ('looking', 'statements'),
 ('jackie', 'young'),
 ('julie', 'meyers'),
 ('america', 'corp'),
 ('aimee', 'lannou'),
 ('tr', 'td'),
 ('jpg', 'width'),
 ('cotton', 'valley')]

In [11]:
#after finding all bigrams of the corpus (i.e. the bigram feature of the corpus)
#we create a function that checks if those feature bigrams are present on each specific document
#exactly as we did with unigrams

def bi_document_features(document, bigram_features):
    document_words = list(nltk.bigrams(document))
    features = {}
    for word in bigram_features:
        #boolean logic will retunt 'True' if there is a match, or 'False' if not
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [12]:
#applying the function to our documents
featuresets2 = [(bi_document_features(d, bigram_features), c) for (d, c) in emaildocs]

#seeing the featureset for the first document
featuresets2[0]

({"contains(('ect', 'cc'))": False,
  "contains(('ect', 'subject'))": False,
  "contains(('daren', 'j'))": False,
  "contains(('j', 'farmer'))": False,
  "contains(('ami', 'chokshi'))": False,
  "contains(('please', 'let'))": False,
  "contains(('would', 'like'))": False,
  "contains(('xls', 'Subject'))": False,
  "contains(('e', 'lloyd'))": False,
  "contains(('robert', 'e'))": False,
  "contains(('enron', 'cc'))": False,
  "contains(('b', 'camp'))": False,
  "contains(('howard', 'b'))": False,
  "contains(('anita', 'luong'))": False,
  "contains(('attached', 'file'))": False,
  "contains(('north', 'america'))": False,
  "contains(('td', 'td'))": False,
  "contains(('see', 'attached'))": False,
  "contains(('pat', 'clynes'))": False,
  "contains(('brenda', 'f'))": False,
  "contains(('f', 'herod'))": False,
  "contains(('enron', 'north'))": False,
  "contains(('looking', 'statements'))": False,
  "contains(('jackie', 'young'))": False,
  "contains(('julie', 'meyers'))": False,
  "cont

In [13]:
#how we run the classifier to see if we get a good accuracy score
#all this is just to find out which feature is more informative to predict sentiment
#Naive Bayes classifier with 5-fold cross validation for training on sentiments using bigram features
start = time.time()
kf = KFold(n_splits = num_split)
sum = 0
prec = 0
recall = 0

for train, test in kf.split(featuresets2):
    train_data = np.array(featuresets2)[train]
    test_data = np.array(featuresets2)[test]
    classifier = nltk.NaiveBayesClassifier.train(train_data)
    sum += nltk.classify.accuracy(classifier, test_data)
    
    #calculating precision and recall 
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (feats, label) in enumerate(test_data):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)
    
    prec += nltk.precision(refsets['spam'], testsets['spam'])
    recall += nltk.recall(refsets['spam'], testsets['spam'])

#storing the score in a variable 
acc2 = sum/num_split
avg_precision2 = prec/num_split
avg_recall2 = recall/num_split

end = time.time()
print("Processing Time: {0:.2f} seconds".format(end - start))

#let's see the accuracy score for this new classifier
print("Accuracy:", round(acc2,4))
print("Precision:", round(avg_precision2,4))
print("Recall:", round(avg_recall2,4))

Processing Time: 30.12 seconds
Accuracy: 0.93
Precision: 0.8835
Recall: 0.991


In [14]:
## EXPERIMENT 2 - REMOVE STOPWORDS AND CERTAIN PUNCTATION ##

all_words = [word for (email,category) in emaildocs for word in email]
nltkstopwords = nltk.corpus.stopwords.words('english')
punctuation = ['-', '.', '/', ',', ':', '@', '=', '?', "'", ')', '(', '_', ';', '*']
stopwords = nltkstopwords + punctuation
all_words_stopped = [w for w in all_words if not w in stopwords]
top_words = nltk.FreqDist(all_words_stopped)
most_common_words = top_words.most_common(2000)
word_features3 = [word for (word,count) in most_common_words]

#checking if we have the 2000 words we need
len(set(all_words_stopped))

37097

In [15]:
word_features3[:20]

['ect',
 'hou',
 'Subject',
 '!',
 '2000',
 'enron',
 '3',
 '$',
 'please',
 '``',
 '1',
 'com',
 '2',
 'e',
 '%',
 '00',
 '#',
 '>',
 'subject',
 'meter']

In [16]:
#now we apply the function to the document dataset
featuresets3 = [(document_features(d, word_features3), c) for (d, c) in emaildocs]

#we print the list of features matches for the first document ([0]) in the corpus
#we'll see a Python dictionary with the key being the feature word
#and the value being 'True' or 'False' according to that word being matched in the present document or not
#we'll se a lot of 'False' values because (of course) not all 2000 words will be on each sentence!
featuresets3[0]

({'contains(ect)': False,
  'contains(hou)': False,
  'contains(Subject)': True,
  'contains(!)': False,
  'contains(2000)': False,
  'contains(enron)': False,
  'contains(3)': False,
  'contains($)': False,
  'contains(please)': False,
  'contains(``)': False,
  'contains(1)': False,
  'contains(com)': False,
  'contains(2)': False,
  'contains(e)': False,
  'contains(%)': False,
  'contains(00)': False,
  'contains(#)': True,
  'contains(>)': False,
  'contains(subject)': False,
  'contains(meter)': False,
  'contains(pm)': False,
  'contains(gas)': True,
  'contains(cc)': False,
  'contains(deal)': False,
  'contains(0)': True,
  'contains(http)': False,
  'contains(99)': False,
  'contains(000)': False,
  'contains(corp)': False,
  'contains(new)': False,
  'contains(thanks)': False,
  'contains(get)': False,
  'contains(10)': False,
  'contains(4)': False,
  'contains(5)': False,
  'contains(&)': False,
  'contains(know)': False,
  'contains(|)': False,
  'contains(need)': False,


In [17]:
#Naive Bayes classifier with 5-fold cross validation for training on sentiments using
#unigrams without stopwords and certain punctuation

start = time.time()
kf = KFold(n_splits = num_split)
sum = 0
prec = 0
recall = 0

for train, test in kf.split(featuresets3):
    train_data = np.array(featuresets3)[train]
    test_data = np.array(featuresets3)[test]
    classifier = nltk.NaiveBayesClassifier.train(train_data)
    sum += nltk.classify.accuracy(classifier, test_data)
    
    #calculating precision and recall 
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (feats, label) in enumerate(test_data):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)
    
    prec += nltk.precision(refsets['spam'], testsets['spam'])
    recall += nltk.recall(refsets['spam'], testsets['spam'])

#storing the score in a variable 
acc3 = sum/num_split
avg_precision3 = prec/num_split
avg_recall3 = recall/num_split

end = time.time()
print("Processing Time: {0:.2f} seconds".format(end - start))

#let's see the accuracy score for this new classifier
print("Accuracy:", round(acc3,4))
print("Precision:", round(avg_precision3,4))
print("Recall:", round(avg_recall3,4))

Processing Time: 29.85 seconds
Accuracy: 0.9615
Precision: 0.9297
Recall: 0.999


In [18]:
## EXPERIMENT 3 - INCREASE THE FEATURE VOCABULARY##
top_words = nltk.FreqDist(all_words)
most_common_words = top_words.most_common(5000)
word_features4 = [word for (word,count) in most_common_words]
word_features4[:20]

['-',
 '.',
 '/',
 ',',
 ':',
 'the',
 'to',
 'ect',
 'and',
 '@',
 'of',
 'a',
 'for',
 'you',
 'in',
 'is',
 'this',
 'hou',
 'on',
 '=']

In [19]:
#now we apply the function to the document dataset
featuresets4 = [(document_features(d, word_features4), c) for (d, c) in emaildocs]

#we print the list of features matches for the first document ([0]) in the corpus
featuresets4[0]

({'contains(-)': False,
  'contains(.)': True,
  'contains(/)': False,
  'contains(,)': True,
  'contains(:)': True,
  'contains(the)': True,
  'contains(to)': True,
  'contains(ect)': False,
  'contains(and)': False,
  'contains(@)': False,
  'contains(of)': True,
  'contains(a)': False,
  'contains(for)': True,
  'contains(you)': True,
  'contains(in)': False,
  'contains(is)': False,
  'contains(this)': False,
  'contains(hou)': False,
  'contains(on)': False,
  'contains(=)': False,
  'contains(?)': False,
  'contains(i)': True,
  "contains(')": True,
  'contains())': False,
  'contains(()': False,
  'contains(Subject)': True,
  'contains(!)': False,
  'contains(your)': False,
  'contains(that)': False,
  'contains(be)': True,
  'contains(2000)': False,
  'contains(enron)': False,
  'contains(with)': False,
  'contains(will)': True,
  'contains(we)': False,
  'contains(have)': False,
  'contains(3)': False,
  'contains(from)': False,
  'contains($)': False,
  'contains(s)': False,


In [20]:
#Naive Bayes classifier with 5-fold cross validation for training on sentiments using
#expanded unigram features vocabulary

start = time.time()
kf = KFold(n_splits = num_split)
sum = 0
prec = 0
recall = 0

for train, test in kf.split(featuresets4):
    train_data = np.array(featuresets4)[train]
    test_data = np.array(featuresets4)[test]
    classifier = nltk.NaiveBayesClassifier.train(train_data)
    sum += nltk.classify.accuracy(classifier, test_data)
    
    #calculating precision and recall 
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (feats, label) in enumerate(test_data):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)
    
    prec += nltk.precision(refsets['spam'], testsets['spam'])
    recall += nltk.recall(refsets['spam'], testsets['spam'])

#storing the score in a variable 
acc4 = sum/num_split
avg_precision4 = prec/num_split
avg_recall4 = recall/num_split

end = time.time()
print("Processing Time: {0:.2f} seconds".format(end - start))

#let's see the accuracy score for this new classifier
print("Accuracy:", round(acc4,4))
print("Precision:", round(avg_precision4,4))
print("Recall:", round(avg_recall4,4))

Processing Time: 76.23 seconds
Accuracy: 0.9625
Precision: 0.9309
Recall: 1.0


In [21]:
## BONUS EXPERIMENT - COMBINE THE STOPWORD REMOVAL WITH INCREASED VOCABULARY ##

top_words = nltk.FreqDist(all_words_stopped)
most_common_words = top_words.most_common(5000)
word_features5 = [word for (word,count) in most_common_words]
word_features5[:20]

['ect',
 'hou',
 'Subject',
 '!',
 '2000',
 'enron',
 '3',
 '$',
 'please',
 '``',
 '1',
 'com',
 '2',
 'e',
 '%',
 '00',
 '#',
 '>',
 'subject',
 'meter']

In [22]:
#now we apply the function to the document dataset
featuresets5 = [(document_features(d, word_features5), c) for (d, c) in emaildocs]

#we print the list of features matches for the first document ([0]) in the corpus
featuresets5[0]

({'contains(ect)': False,
  'contains(hou)': False,
  'contains(Subject)': True,
  'contains(!)': False,
  'contains(2000)': False,
  'contains(enron)': False,
  'contains(3)': False,
  'contains($)': False,
  'contains(please)': False,
  'contains(``)': False,
  'contains(1)': False,
  'contains(com)': False,
  'contains(2)': False,
  'contains(e)': False,
  'contains(%)': False,
  'contains(00)': False,
  'contains(#)': True,
  'contains(>)': False,
  'contains(subject)': False,
  'contains(meter)': False,
  'contains(pm)': False,
  'contains(gas)': True,
  'contains(cc)': False,
  'contains(deal)': False,
  'contains(0)': True,
  'contains(http)': False,
  'contains(99)': False,
  'contains(000)': False,
  'contains(corp)': False,
  'contains(new)': False,
  'contains(thanks)': False,
  'contains(get)': False,
  'contains(10)': False,
  'contains(4)': False,
  'contains(5)': False,
  'contains(&)': False,
  'contains(know)': False,
  'contains(|)': False,
  'contains(need)': False,


In [23]:
#Naive Bayes classifier with 5-fold cross validation for training on sentiments using
#expanded unigram features vocabulary without stopword and punctuation

start = time.time()
kf = KFold(n_splits = num_split)
sum = 0
prec = 0
recall = 0

for train, test in kf.split(featuresets5):
    train_data = np.array(featuresets5)[train]
    test_data = np.array(featuresets5)[test]
    classifier = nltk.NaiveBayesClassifier.train(train_data)
    sum += nltk.classify.accuracy(classifier, test_data)
    
    #calculating precision and recall 
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (feats, label) in enumerate(test_data):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)
    
    prec += nltk.precision(refsets['spam'], testsets['spam'])
    recall += nltk.recall(refsets['spam'], testsets['spam'])

#storing the score in a variable 
acc5 = sum/num_split
avg_precision5 = prec/num_split
avg_recall5 = recall/num_split

end = time.time()
print("Processing Time: {0:.2f} seconds".format(end - start))

#let's see the accuracy score for this new classifier
print("Accuracy:", round(acc5,4))
print("Precision:", round(avg_precision5,4))
print("Recall:", round(avg_recall5,4))

Processing Time: 77.58 seconds
Accuracy: 0.976
Precision: 0.9561
Recall: 0.998
