In [None]:
# import necessary libraries
# may need to add the nltk library
# !pip install nltk
import os
from nltk.corpus import stopwords
import string
import pandas as pd

In [None]:
# if you have issues reading in the stopwords, you may need to run the
# following two lines of code
# import nltk
# nltk.download('stopwords')

In [None]:
# before we get going, be sure to download the data (i.e. the corpus)
#
# this file gets unzipped and put into a subfolder within this directory called easy_ham
# https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham.tar.bz2
#
# this file gets unzipped and put into a subfolder within this directory called spam
# https://spamassassin.apache.org/old/publiccorpus/20030228_spam.tar.bz2
#
# module-level settings shared by the cells below
#
# folders that hold the unpacked corpora (relative to this notebook)
SPAMPATH = "./spam/"
HAMPATH = "./easy_ham/"
# STOPS stays a list because a later cell extends it with corpus-specific words
STOPS = stopwords.words("english")
# tokens that are never kept: every punctuation character plus a few
# markup/header leftovers
EXCLUDE = {"''", "BR", "--", "/td", "nbsp", "2002", "localhost", "&nbsp"}.union(string.punctuation)

In [None]:
# these stopwords are added because of the nature of the corpus
# basically, the email text includes words or tags that are part of the
# email header, so we don't want to use them when analyzing the
# text of the emails
# NOTE: re-running this cell appends the same words again; that only adds
# duplicates to the list and does not change which words get filtered out
STOPS.extend([
    # header-related tokens seen in the corpus
    'delivery-date', 'jalapeno', 'return-path', 'delivered-to', 'received', 'esmtp', 'smtp',
    'message-id', 'imap', 'date', 'subject', 'list-help', 'list-post', 'list-subscribe', 'list-archive',
    'x-beenthere', 'mime-version', 'errors-to', 'list-unsubscribe', 'x-mailman-version',
    'content-type', 'list-id', 'dogmaslashnullorg', 'xentcom', 'in-reply-to',
    'jmasonorg', 'sender', 'bulk', 'content-transfer-encoding',
    # month abbreviations
    'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
    # weekday abbreviations
    'sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat',
])

In [None]:
# this function takes a list of words and makes them a string
def ListToString(inputlist):
    """
    Join an iterable of strings into one space-separated string.

    Parameters:
        inputlist: iterable of str, the words to combine.

    Returns:
        A single string with the words separated by one space each and with
        leading/trailing whitespace removed ("" for an empty input).

    Raises:
        TypeError: if any element is not a string.
    """
    # str.join builds the result in one pass instead of re-copying the growing
    # string on every iteration; strip() keeps the original trimming behavior
    return " ".join(inputlist).strip()

In [None]:
def FilterWord(s):
    """
    Normalize one token and decide whether it should be kept for analysis.

    The token is lower-cased and the characters , . : ; ! are removed from it.
    The cleaned token is rejected when it still contains a digit or one of the
    symbols ><#$/=+()&*@?, when it is shorter than three characters, or when
    it appears in the module-level STOPS or EXCLUDE collections.

    Parameters:
        s: str, the raw token.

    Returns:
        (keep, cleanstring): keep is 1 if the token should be used and 0
        otherwise; cleanstring is the cleaned token (returned either way).
    """
    # delete the trailing/embedded punctuation in a single pass
    token = s.lower().translate(str.maketrans("", "", ",.:;!"))
    rejected = (
        any(symbol in token for symbol in "><#$/=0123456789+()&*@?")
        or len(token) < 3
        or token in STOPS
        or token in EXCLUDE
    )
    return (0 if rejected else 1), token

In [None]:
def CleanWords(wordlist):
    """
    Clean every word in a list and keep only the ones worth analyzing.

    Each word is passed through FilterWord; words flagged with keep == 1 are
    returned in their cleaned form, in their original order.

    Parameters:
        wordlist: iterable of str, the raw words.

    Returns:
        list of str, the cleaned words that passed the filter.
    """
    checked = (FilterWord(word) for word in wordlist)
    return [cleaned for keep_flag, cleaned in checked if keep_flag == 1]

In [None]:
# this is the main part of the code
# it will go through all of the documents
# found within each of the folders and add each file to the dataframe
#
# the dataframe has the following variables:
# id:  this is just a number that gets incremented for each record
# spam:  binary variable; coded as 1 when the email is spam
# file:  the name of the file
# words:  a string that contains all of the filtered words in the email
#
def _read_email_words(path):
    """Read one email file and return its filtered words as a single string."""
    # the context manager closes the file even if reading raises
    with open(path, 'r', encoding="Latin-1") as handle:
        text = handle.read()
    return ListToString(CleanWords(text.split()))


def _collect_records(folder, spam_flag, start_id=0):
    """
    Walk `folder` and build one record (dict) per file found.

    Parameters:
        folder: str, top-level directory to walk.
        spam_flag: int, value stored in the 'spam' field (1 = spam, 0 = ham).
        start_id: int, id assigned to the first record; later records count up.

    Returns:
        list of dicts with the keys 'id', 'spam', 'file' and 'words'.
    """
    records = []
    for (dirpath, dirnames, filenames) in os.walk(folder):
        for filename in filenames:
            # join with dirpath (not the top-level folder) so files that
            # os.walk finds in nested subfolders can actually be opened
            words = _read_email_words(os.path.join(dirpath, filename))
            records.append({'id': start_id + len(records), 'spam': spam_flag,
                            'file': filename, 'words': words})
    return records


# collect all rows first and build the dataframe once: DataFrame.append was
# removed in pandas 2.0, and appending row by row copies the frame every time
records = _collect_records(SPAMPATH, 1)
records.extend(_collect_records(HAMPATH, 0, start_id=len(records)))
df = pd.DataFrame(records, columns=['id', 'spam', 'file', 'words'])
# total number of emails processed (kept for any later cell that reads it)
count = len(records)

In [None]:
# persist the finished dataframe as a csv file for the next step
# (with the default to_csv settings the row index is written as well)
df.to_csv(path_or_buf='email_corpus.csv')