# Spam Filter (NLP Project)

From Chapter 2 of Getting Started with Natural Language Processing (2022, Kochmar)

# Preliminary Steps

In [1]:
# import libraries

# general
import os # to iterate through folders
import codecs # helps with different text encodings
import random # to shuffle the order of emails (to prep for selecting train and test sets)

# processing
import nltk # nlp toolkit
from nltk import word_tokenize # nltk's word tokenizer
nltk.download('punkt') # nltk's sentence tokenizer

# modeling
from nltk import NaiveBayesClassifier, classify # NLTK's Naive Bayes Classifier
from nltk.text import Text # NLTK's Text data structure

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yang0108\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Step 1: Define data and classes

Read in the data to train and test the spam filter. Shuffle the data to prep it for splitting into train and test sets.

Data: 
Enron email dataset (www.cs.cmu.edu/~enron/)

Download subsets at: http://mng.bz/WxYg

Subset and data collection processes described: http://www2.aueb.gr/users/ion/docs/ceas2006_paper.pdf

In [2]:
# read in the contents of the ham files

# initialize empty ham_list
ham_list = []

# use os.walk to access files that have been uploaded to drive
for root, dirs, files in os.walk("Data/enron1/ham"):
    
    # iterate through each separate file
    for file in files:

        # if the file is a text file
        if file.endswith('.txt'):

            # open the file as f
            with open(os.path.join(root, file), 'r', 
                      encoding = "ISO-8859-1", 
                      errors="ignore") as f:
                
                # read the file and save as variable text
                text = f.read()

                # append text to ham_list
                ham_list.append(text)



In [3]:
# repeat step above for spam files

spam_list = []
for root, dirs, files in os.walk("Data/enron1/spam"):
    for file in files:
        if file.endswith('.txt'):
            with open(os.path.join(root, file), 'r', 
                      encoding = "ISO-8859-1", 
                      errors="ignore") as f:
                text = f.read()
                spam_list.append(text)


In [4]:
# check that files have been uploaded correctly

# print the length of spam_list and ham_list
# spam_list should contain 1,500 and ham_list should contain 3,672
print(f"Length of spam_list: {len(spam_list)} \n\nLength of ham_list: \
      {len(ham_list)}") 

# print extra lines for readability
print()

# print the first example of both lists to verify contents
print(f"First example of spam: \n{spam_list[0]} \n\nFirst example of ham: \n \
      {ham_list[0]}")

Length of spam_list: 1500 

Length of ham_list:       3672

First example of spam: 
Subject: dobmeos with hgh my energy level has gone up! Stukm
Introducing
Doctor - formulated
Hgh
Human growth hormone - also called hgh
Is referred to in medical science as the master hormone. It is very plentiful
When we are young, but near the age of twenty - one our bodies begin to produce
Less of it. By the time we are forty nearly everyone is deficient in hgh,
And at eighty our production has normally diminished at least 90 - 95%.
Advantages of hgh:
- increased muscle strength
- loss in body fat
- increased bone density
- lower blood pressure
- quickens wound healing
- reduces cellulite
- improved vision
- wrinkle disappearance
- increased skin thickness texture
- increased energy levels
- improved sleep and emotional stability
- improved memory and mental alertness
- increased sexual potency
- resistance to common illness
- strengthened heart muscle
- controlled cholesterol
- controlled mood swing

In [5]:
# read both list contents into single list all_emails, storing email content 
# and content label (ham or spam) in tuples

# create all_emails with spam_list
all_emails = [(email_content, "spam") for email_content in spam_list]

# add ham_list to all_emails
all_emails += [(email_content, "ham") for email_content in ham_list]

In [6]:
# select the seed of the random operator to make sure that all future runs 
# will shuffle the data in the same way
random.seed(42)

In [7]:
# shuffle the list to prepare for splitting into train and test sets
random.shuffle(all_emails)

In [8]:
# check list size
print(f"Dataset size = {len(all_emails)} emails") 
# 5,172 is the correct number (1,500 spam + 3,672 ham)

Dataset size = 5172 emails


# Step 2: Split text into words

Use NLTK's word tokenizer to split the text.

In [9]:
# define tokenize function
def tokenize(text):

  # create word_list that contains all lowercase tokenized words from input
  word_list = [word for word in word_tokenize(text)]

  # return word_list
  return word_list

In [10]:
# test tokenize function

# give it some text as input
input = "What's the best way to split a sentence into words?"

# print results of tokenize function
print(tokenize(input))

['What', "'s", 'the', 'best', 'way', 'to', 'split', 'a', 'sentence', 'into', 'words', '?']


# Step 3: Extract and normalize features

This step uses the functionality built in the previous step for adding tokenized words into word_list.

In [11]:
# define get_features function to extract features from a text
# this function takes in a text and returns a dictionary
# keys in the dictionary are the words in the text
# values in the dictionary are True
def get_features(text):

  # initialize features dictionary
  features = {}

  # create word_list, which is a list of all tokenized words in the text
  # copy the tokenize functionality from the tokenize function above
  # add in normalization of characters into lowercase
  word_list = [word for word in word_tokenize(text.lower())]

  # for each word in word_list
  for word in word_list:
    
    # switch on the "flag" that this word is contained in this text (email)
    features[word] = True
  
  return features

In [12]:
# check the get_features function

# give get_features some text
print(get_features("Participate In Our New Lottery NOW!"))

{'participate': True, 'in': True, 'our': True, 'new': True, 'lottery': True, 'now': True, '!': True}


In [13]:
# create a list of tuples called all_features
# that iterates over all texts (emails) in the all_emails list
# and adds a tuple for each email
# the tuple contains the features dictionary (from get_features) and the 
# label for each email (ham or spam) (from all_emails)

all_features = [(get_features(email), label) for (email, label) in all_emails]

In [14]:
# check the length and type of all_features (length should be same as number 
# of emails, 5,172; type should be list)
print(f"all_features length: {len(all_features)} \nall_features type: \
{type(all_features)}")

all_features length: 5172 
all_features type: <class 'list'>


In [15]:
# check some items in all_features to verify contents
print(all_features[0]) 
# should return tuple with dictionary and label of first item

print()
print(all_features[99]) 
# should return tuple with dictionary and label of hundredth item

print()
print(all_features[0][0]) 
# should return feature dictionary of first item

print()
print(all_features[99][0]) 
# should return feature dictionary of hundredth item

print()
print(all_features[0][1]) 
# should return label of first item

print()
print(all_features[99][1]) 
# should return label of hundredth item

print()
print(len(all_features[0][0])) 
# should return length of feature dictionary of first item (number of unique 
# words in email)

print()
print(len(all_features[99][0])) 
# should return length of feature dictionary of hundredth item (number of 
# unique words in email)


({'subject': True, ':': True, 'bloodline': True, ',': True, 'ahead': True, 'of': True, 'the': True, 'street': True, 'microcap': True, 'alert': True, 'when': True, 'living': True, 'with': True, 'sheriff': True, 'is': True, 'obsequious': True, 'blood': True, 'clot': True, 'beyond': True, 'deficit': True, 'reach': True, 'an': True, 'understanding': True, 'toward': True, '.': True, '[': True, '3': True}, 'spam')

({'subject': True, ':': True, 'mobil': True, 'beaumont': True, '-': True, 'marol': True, 'rebecca': True, 'for': True, 'in': True, 'march': True, ',': True, 'beginning': True, 'on': True, '21': True, 'hpl': True, 'started': True, 'delivering': True, '30': True, '000/d': True, 'midcon': True, '(': True, 'just': True, 'like': True, 'we': True, 'did': True, 'dec': True, 'and': True, 'jan': True, 'maybe': True, 'feb': True, 'too': True, ')': True, 'check': True, 'with': True, 'daren': True, 'farmer': True, 'when': True, 'you': True, 'get': True, 'ready': True, 'to': True, 'do': True, 

# Step 4: Train classifier

In [16]:
def train(features, proportion):

    # get the size of the training set based on the proportion
    train_size = int(len(features) * proportion)

    # select out the number of items you need for training from the features 
    # dictionary
    train_set = features[:train_size]

    # select out the rest of the items you need for testing from the features 
    # dictionary
    test_set = features[train_size:]

    # make sure the data split correctly, print the number of training and 
    # test items
    print(f"Training set size: {len(train_set)} emails")
    print(f"Test set size: {len(test_set)} emails")

    # initialize the classifier
    classifier = NaiveBayesClassifier.train(train_set)

    # return
    return train_set, test_set, classifier

In [17]:
# apply the train function using 80% of emails for training
train_set, test_set, classifier = train(all_features, 0.8)

Training set size: 4137 emails
Test set size: 1035 emails


# Step 5: Evaluate classifier

NLTK's classifier returns an accuracy score for the train and test sets. It also allows you to inspect the more informative features (words).

In [18]:
# create evaluate function to test accuracy of classifier
def evaluate(train_set, test_set, classifier):

    # accuracy on training set
    print(f"Accuracy on the training set: \
{classify.accuracy(classifier, train_set)}")
  
    # accuracy on test set
    print(f"Accuracy on the test set: \
{classify.accuracy(classifier, test_set)}")
  
    # select top 50 most informative features to show
    print(classifier.show_most_informative_features(50)) 

In [19]:
# use evaluate function on classifier
evaluate(train_set, test_set, classifier)

Accuracy on the training set: 0.9608411892675852
Accuracy on the test set: 0.9420289855072463
Most Informative Features
               forwarded = True              ham : spam   =    198.3 : 1.0
                    2004 = True             spam : ham    =    143.8 : 1.0
                     nom = True              ham : spam   =    126.0 : 1.0
            prescription = True             spam : ham    =    122.9 : 1.0
                    pain = True             spam : ham    =     98.8 : 1.0
                  health = True             spam : ham    =     82.7 : 1.0
                     ect = True              ham : spam   =     76.8 : 1.0
                    2001 = True              ham : spam   =     75.8 : 1.0
                featured = True             spam : ham    =     74.7 : 1.0
              nomination = True              ham : spam   =     72.1 : 1.0
             medications = True             spam : ham    =     69.9 : 1.0
                  differ = True             spam : ham 

In [20]:
# create function to check context of specific words

def concordance(data_list, search_word):
    for email in data_list:
        word_list = [word for word in word_tokenize(email.lower())]
        text_list = Text(word_list)
        if search_word in word_list:
            # default prints out 36 characters before and after search_word
            text_list.concordance(search_word)

print("STOCKS in HAM:")
concordance(ham_list, "stocks")

print()
print("STOCKS in SPAM:")
concordance(spam_list, "stocks")

STOCKS in HAM:
Displaying 1 of 1 matches:
ad my portfolio is diversified into stocks that have lost even more money than
Displaying 1 of 1 matches:
ur member directory . * follow your stocks and news headlines , exchange files
Displaying 1 of 1 matches:
ur member directory . * follow your stocks and news headlines , exchange files
Displaying 1 of 1 matches:
ur member directory . * follow your stocks and news headlines , exchange files

STOCKS in SPAM:
Displaying 2 of 2 matches:
ims and do your own due diligence . stocks to play ( s 2 p ) profiles are not 
s obtained . investing in micro cap stocks is extremely risky and , investors 
Displaying 1 of 1 matches:
cautions that small and micro - cap stocks are high - risk investments and tha
Displaying 1 of 1 matches:
s obtained . investing in micro cap stocks is extremely risky and , investors 
Displaying 3 of 3 matches:
ancements but may be one of the few stocks left in this industry group that is
his email pertaining to investing , stock

Displaying 2 of 2 matches:
his email pertaining to investing , stocks , securities must be understood as 
ntative before deciding to trade in stocks featured within this email . none o
Displaying 4 of 4 matches:
ck monday some of these little voip stocks have been rea | | y moving lately .
 statements . as with many microcap stocks , today ' s company has additiona |
is report pertaining to investing , stocks , securities must be understood as 
ntative before deciding to trade in stocks featured within this report . none 
Displaying 1 of 1 matches:
the | ast 12 months , many of these stocks made triple and even quadruple retu
Displaying 1 of 1 matches:
or information puposes only . penny stocks are considered highly speculative a
Displaying 1 of 1 matches:
 one trade monday ! go wysk . penny stocks are considered highiy specuiative a
Displaying 1 of 1 matches:
 the last 12 months , many of these stocks made tripie and even quadruple retu
Displaying 1 of 1 matches:
 one trade monday ! g

# Step 6: Run classifier on new data

Read in new data (enron2) and apply the trained classifier to it.

In [21]:
# read in the contents of the ham files

# initialize empty ham_list2
ham_list2 = []

# use os.walk to access files that have been uploaded to drive
for root, dirs, files in os.walk("Data/enron2/ham"):
    # iterate through each separate file
    for file in files:

        # if the file is a text file
        if file.endswith('.txt'):

            # open the file as f
            with open(os.path.join(root, file), 'r', 
                      encoding = "ISO-8859-1", 
                      errors="ignore") as f:
                
                # read the file and save as variable text
                text = f.read()

                # append text to ham_list
                ham_list2.append(text)

In [22]:
# repeat step above for spam_list2

spam_list2 = []
for root, dirs, files in os.walk("Data/enron2/spam"):
    for file in files:
        if file.endswith('.txt'):
            with open(os.path.join(root, file), 'r', 
                      encoding = "ISO-8859-1", 
                      errors="ignore") as f:
                text = f.read()
                spam_list2.append(text)

In [23]:
# read both list contents into single list all_emails2, storing email content 
# and content label (ham or spam) in tuples

# create all_emails2 with spam_list2
all_emails2 = [(email_content, "spam") for email_content in spam_list2]

# add ham_list2 to all_emails2
all_emails2 += [(email_content, "ham") for email_content in ham_list2]

In [24]:
# create a list of tuples called all_features2
# that iterates over all texts (emails) in the all_emails2 list
# and adds a tuple for each email
# the tuple contains the features dictionary (from get_features) and the 
# label for each email (ham or spam) (from all_emails2)

all_features2 = [(get_features(email), label) for (email, label) in all_emails2]

In [25]:
# accuracy on new test set enron2
print(f"Accuracy on the test set: \
{classify.accuracy(classifier, all_features2)}")

Accuracy on the test set: 0.759433156906266


# Step 7: More data

Combine both enron1 and enron2 data sets into one set, create a new classifier and train it on 80% of the larger set and test it on 20%. This will show you results from a classifier that has trained on a lot more data. 

In [26]:
# read in the contents of the spam files

# initialize empty spam_list_all
spam_list_all = []

# use os.walk to access files that have been uploaded to drive
for root, dirs, files in os.walk("Data/enron_all/spam"):
    
    # iterate through each separate file
    for file in files:

        # if the file is a text file
        if file.endswith('.txt'):

            # open the file as f
            with open(os.path.join(root, file), 'r', 
                      encoding = "ISO-8859-1", 
                      errors="ignore") as f:
                
                # read the file and save as variable text
                text = f.read()

                # append text to ham_list
                spam_list_all.append(text)

In [27]:
# repeat step above for ham_list_all

ham_list_all = []
for root, dirs, files in os.walk("Data/enron_all/ham"):
    for file in files:
        if file.endswith('.txt'):
            with open(os.path.join(root, file), 'r', 
                      encoding = "ISO-8859-1", 
                      errors="ignore") as f:
                text = f.read()
                ham_list_all.append(text)

In [28]:
# read both list contents into single list all_emails3
# and content label (ham or spam) in tuples

# create all_emails2 with spam_list2
all_emails3 = [(email_content, "spam") for email_content in spam_list_all]

# add ham_list2 to all_emails2
all_emails3 += [(email_content, "ham") for email_content in ham_list_all]

In [29]:
# create a list of tuples called all_features3
# that iterates over all texts (emails) in the all_emails3 list
# and adds a tuple for each email
# the tuple contains the features dictionary (from get_features) and the 
# label for each email (ham or spam) (from all_emails3)

all_features3 = [(get_features(email), label) 
                 for (email, label) 
                 in all_emails3]

In [30]:
# apply the train function using 80% of emails for training
train_set3, test_set3, classifier3 = train(all_features3, 0.8)

Training set size: 7015 emails
Test set size: 1754 emails


In [31]:
# accuracy on new test set enron_all
print(f"Accuracy on the test set: \
{classify.accuracy(classifier3, all_features3)}")

Accuracy on the test set: 0.9786748774090547
