In [108]:
import numpy as np
import pandas as pd
import email
import re
import os
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import nltk
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

## Load the Data ##
Two datasets will be used, the Enron email corpus, for not-spam emails, and a collection of "419" fraudulent emails, which are spam.

In [24]:
filepath1 = "emails.csv"
filepath2 = "fraudulent_emails.txt"

In [23]:
# Load emails from Enron email corpus
emails = pd.read_csv(filepath1)
print("Successfully loaded {} rows and {} columns!".format(emails.shape[0],emails.shape[1]))
print(emails.head(n=5))

Successfully loaded 517401 rows and 2 columns!
                       file                                            message
0     allen-p/_sent_mail/1.  Message-ID: <18782981.1075855378110.JavaMail.e...
1    allen-p/_sent_mail/10.  Message-ID: <15464986.1075855378456.JavaMail.e...
2   allen-p/_sent_mail/100.  Message-ID: <24216240.1075855687451.JavaMail.e...
3  allen-p/_sent_mail/1000.  Message-ID: <13505866.1075863688222.JavaMail.e...
4  allen-p/_sent_mail/1001.  Message-ID: <30922949.1075863688243.JavaMail.e...


In [17]:
# Separates and extracts bodies of messages from header information
def extract_messages(df):
    messages = []
    for msg in df["message"]:
        e = email.message_from_string(msg)
        msg_body = e.get_payload()
        messages.append(msg_body)
    print("Success!")
    return messages

In [25]:
# Create DataFrame with email bodies
bodies = extract_messages(emails)
bodies_df = pd.DataFrame(bodies)
print(bodies_df.head())

Success!
                                                   0
0                          Here is our forecast\n\n 
1  Traveling to have a business meeting takes the...
2                     test successful.  way to go!!!
3  Randy,\n\n Can you send me a schedule of the s...
4                Let's shoot for Tuesday at 11:45.  


In [26]:
# Load fraudulent (spam) emails
with open(filepath2, 'r', encoding="latin1") as file:
    data = file.read()
    
fraud_emails = data.split("From r")

print("Successfully loaded {} spam emails.".format(len(fraud_emails)))

Successfully loaded 3978 spam emails.


In [27]:
# Convert fraudulent email data into a pandas DataFrame
fraud_bodies = extract_messages(pd.DataFrame(fraud_emails, columns=["message"], dtype=str))
fraud_bodies_df = pd.DataFrame(fraud_bodies[1:]) # DataFrame with just the message bodies (no header)
print(fraud_bodies_df.head())

Success!
                                                   0
0  FROM:MR. JAMES NGOLA.\nCONFIDENTIAL TEL: 233-2...
1  Dear Friend,\n\nI am Mr. Ben Suleman a custom ...
2  FROM HIS ROYAL MAJESTY (HRM) CROWN RULER OF EL...
3  FROM HIS ROYAL MAJESTY (HRM) CROWN RULER OF EL...
4  Dear sir, \n \nIt is with a heart full of hope...


## Data Preprocessing ##
We convert the emails into sample of roughly equal size and take `Nsamp` emails from each category. Every sample will be `maxtokens` length and each token will be no longer than `maxtokenlen`. We also perform the following steps:
1. Tokenization
2. Punctuation removal
3. Lowercasing
2. Stop word removal

In [28]:
# Tokenization of emails
Nsamp = 1000 # number of samples to generate in each class (spam and non-spam)
maxtokens = 50 # maximum number of tokens in each email
maxtokenlen = 20 # max length of each token

def tokenize(row):
    if row in (None, ''):
        tokens = ""
    else:
        tokens = str(row).split()[:maxtokens] # Split on whitespace
    return tokens

In [29]:
# Preprocess: remove punctuation, and uppercase
#             remove stop words
def reg_expressions(row):
    tokens = []
    try:
        for token in row:
            token = token.lower()
            token = re.sub(r'[\W\d]', "", token)
            token = token[:maxtokenlen]
            tokens.append(token)
    except:
        token = ""
        tokens.append(token)
    return tokens

def stop_word_removal(row):
    token = [token for token in row if token not in stopwords]
    token = filter(None, token)
    return token

In [30]:
# Build dataset of emails by performing all preprocessing for the two sets of email data
EnronEmails = bodies_df.iloc[:,0].apply(tokenize)
EnronEmails = EnronEmails.apply(stop_word_removal)
EnronEmails = EnronEmails.apply(reg_expressions)
EnronEmails = EnronEmails.sample(Nsamp)

SpamEmails = fraud_bodies_df.iloc[:,0].apply(tokenize)
SpamEmails = SpamEmails.apply(stop_word_removal)
SpamEmails = SpamEmails.apply(reg_expressions)
SpamEmails = SpamEmails.sample(Nsamp)

# Convert the data to a single dataset by concatenating the two sets
raw_data = pd.concat([SpamEmails, EnronEmails], axis=0).to_numpy() # convert to numpy array

In [31]:
print("Shape of combined data represented as numpy array is:")
print(raw_data.shape)
print("Data represented as numpy array is")
print(raw_data)

Shape of combined data represented as numpy array is:
(2000,)
Data represented as numpy array is
[list(['attnbrbrmy', 'name', 'is', 'kelvin', 'brown', 'i', 'am', 'from', 'british', 'and', 'also', 'a', 'special', 'adviser', 'to', 'the', 'former', 'liberian', 'president', 'in', 'africa', 'mr', 'charles', 'taylor', 'before', 'he', 'left', 'office', 'he', 'instructed', 'me', 'being', 'his', 'special', 'adviser', 'to', 'look', 'for', 'capable', 'hand', 'some', 'one', 'who', 'will', 'invest', 'his', 'money', '', 'fifteen', 'million'])
 list(['emailmessagemessage', 'object', 'xfbdeb', 'emailmessagemessage', 'object', 'xfbddab', 'emailmessagemessage', 'object', 'xfbde'])
 list(['emailmessagemessage', 'object', 'xfbd', 'emailmessagemessage', 'object', 'xfbde'])
 ...
 list(['if', 'consenting', 'corporate', 'action', 'habitually', 'yes', 'original', 'message', 'from', 'haedicke', 'mark', 'sent', 'tuesday', 'may', '', '', '', 'pm', 'to', 'douglas', 'stephen', 'h', 'subject', 're', 'canadian', 'ret

In [32]:
# Create headers corresponding to these emails 
Categories = ['spam','notspam']

# Create a list of size 2*Nsamp
header = ([1]*Nsamp)
header.extend(([0]*Nsamp))
print(header)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [37]:
# Assembling bag of words
def assemble_bow(data):
    used_tokens = []
    all_tokens = []
    
    for item in data:
        for token in item:
            if token in all_tokens:
                if token not in used_tokens:
                    used_tokens.append(token)
            else:
                all_tokens.append(token)
                
    df = pd.DataFrame(0, index = np.arange(len(data)), columns = used_tokens)
    
    for i, item in enumerate(data):
        for token in item:
            if token in used_tokens:
                df.iloc[i][token] += 1
                
    return df

In [38]:
# EnronSpamBag is a dataframe where columns are all unique words, rows are email samples
# EnronSpamBag contains information about the occurrence of each word
EnronSpamBag = assemble_bow(raw_data)
print(EnronSpamBag)
predictors = [column for column in EnronSpamBag.columns] # A list containing all unique tokens

      he  special  adviser  to  his  emailmessagemessage  object  xfbde  i  \
0      2        2        2   2    2                    0       0      0  1   
1      0        0        0   0    0                    3       3      1  0   
2      0        0        0   0    0                    2       2      1  0   
3      0        0        0   0    0                    0       0      0  1   
4      0        0        0   0    0                    0       0      0  2   
...   ..      ...      ...  ..  ...                  ...     ...    ... ..   
1995   0        0        0   0    0                    0       0      0  0   
1996   0        0        0   0    0                    0       0      0  0   
1997   0        0        0   2    0                    0       0      0  0   
1998   0        0        0   0    0                    0       0      0  0   
1999   0        0        0   0    0                    0       0      0  0   

      federal  ...  iso  noticedoc  feature  picks  jobs  rober

In [133]:
# We need to shuffle the data before we split it into training and test sets
data, headers = shuffle(EnronSpamBag.values, header, random_state=44)

In [135]:
print("There are", data.shape[0], "samples in the dataset.")
slice = int(0.7*data.shape[0])
train_data = data[:slice]
train_target = headers[:slice]
test_data = data[slice:]
test_target = headers[slice:]

There are 2000 samples in the dataset.
[0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 

### IMDB Dataset ###
Next, we load the IMDB Moview Review Dataset, which contains pre-tagged positive and negative reviews. 

In [94]:
maxtokens = 200 # the maximum number of tokens per document
maxtokenlen = 100 # the maximum length of each token

In [99]:
def load_data(path):
    imdb_data, sentiments = [], []
    for folder, sentiment in (('neg', 0), ('pos', 1)):
        folder = os.path.join(path, folder)
        for name in os.listdir(folder):
            with open(os.path.join(folder, name), 'r') as reader:
                  text = reader.read()
            text = tokenize(text)
            text = stop_word_removal(text)
            text = reg_expressions(text)
            imdb_data.append(text)
            sentiments.append(sentiment)
    data_np = np.array(imdb_data, dtype=object)
    imdb_data, sentiments = shuffle(data_np, sentiments)
    
    return imdb_data, sentiments

train_path = os.path.join('aclImdb', 'train')
test_path = os.path.join('aclImdb', 'test')
raw_data, raw_header = load_data(train_path)

# raw_data is an numpt dtype array of variously sized lists, each containing an imdb review
print(raw_data.shape)
print(len(raw_header))

(25000,)
25000


In [100]:
# Sample the tokens in the first review
raw_data[0]

['my',
 'favorite',
 'movie',
 'what',
 'great',
 'story',
 'really',
 'was',
 'id',
 'like',
 'able',
 'buy',
 'copy',
 'seem',
 'possible']

In [138]:
# Subsample required number of samples
random_indices = np.random.choice(range(len(raw_header)), size=(Nsamp*2, ), replace=False)
random_indices = random_indices.tolist()

imdb_dataset = [raw_data[i] for i in random_indices]
imdb_headers = [raw_header[i] for i in random_indices]

In [139]:
print(imdb_dataset[0])

['ive', 'rarely', 'annoyed', 'leading', 'performance', 'i', 'ali', 'mcgraws', 'movie', 'god', 'bothersome', 'what', 'she', 'says', 'everything', 'tone', 'horrible', 'horrible', 'fact', 'that', 'contrast', 'ryan', 'oneal', 'brilliant', 'br', 'br', 'there', 'much', 'story', 'hes', 'rich', 'wooden', 'sacrifice', 'a', 'lot', 'love', 'his', 'father', 'stonewall', 'jackson', 'called', 'first', 'name', 'case', 'notice', 'difference', 'the', 'two', 'them', 'they', 'overcame', 'name', 'love', 'br', 'br', 'the', 'oscar', 'nominations', 'movie', 'indicate', 'bad', 'year', 'john', 'marley', 'fine', 'woodens', 'father', 'supporting', 'nomination', 'at', 'least', 'ali', 'win', 'br', 'br', 'i', 'still', 'think', 'katharine', 'ross', 'played', 'jennifer', 'again', 'me', 'katharine', 'ross', 'would', 'lot', 'movies', 'shes', 'certainly', 'better', 'actress', 'mcgraw', 'br', 'br', 'i', 'even', 'cry', 'got', 'sick', 'never', 'occured', 'even', 'feel', 'sad', 'br', 'br', 'it', 'nice', 'see', 'tommy', 'lee

In [140]:
# Check the balance of the resulting data to ensure we haven't overwhelmingly selected from one of the labels
# The data should be roughly split 50/50
unique_elements, counts_elements = np.unique(imdb_headers, return_counts=True)
print("Sentiments and their frequencies:")
print(unique_elements)
print(counts_elements)

Sentiments and their frequencies:
[0 1]
[1024  976]


In [141]:
BagOfReviews = assemble_bow(imdb_dataset)
print(BagOfReviews)

      horrible  br  name  love  the  movie  father  ali  i  katharine  ...  \
0            2  10     2     2    2      2       2    2  3          2  ...   
1            0   0     0     0    1      0       0    0  0          0  ...   
2            0   1     0     0    2      0       0    0  7          0  ...   
3            0   0     0     0    1      0       0    0  1          0  ...   
4            0   1     0     0    1      2       0    0  0          0  ...   
...        ...  ..   ...   ...  ...    ...     ...  ... ..        ...  ...   
1995         0   5     0     0    0      0       0    0  5          0  ...   
1996         0   0     0     0    0      2       0    0  2          0  ...   
1997         0   0     0     0    0      1       0    0  1          0  ...   
1998         0   0     0     0    0      0       0    0  8          0  ...   
1999         0   2     0     0    0      5       0    0  4          0  ...   

      studded  ranging  gleeson  cartoonsbr  selfawareness  spa

In [142]:
# Split the DataFrame into training and test sets
print("There are", BagOfReviews.shape[0], "samples in the dataset.")
slice = int(0.7*BagOfReviews.shape[0])
data = BagOfReviews.values

imdb_train_data = data[:slice]
imdb_train_target = imdb_headers[:slice]
imdb_test_data = data[slice:]
imdb_test_target = imdb_headers[slice:]

There are 2000 samples in the dataset.


## Generalized Linear Models ##
We will use logistic regression and Singular Value Decomposition (SVD) to build a classifier for our email and movie datasets.

In [144]:
email_model = LogisticRegression(max_iter=5000)
email_model.fit(train_data, train_target)
predicted = email_model.predict(test_data)
accuracy = accuracy_score(test_target, predicted)
print("The accuracy score for logistic regression on the email dataset is", accuracy)

The accuracy score for logistic regression on the email dataset is 0.9816666666666667


In [145]:
movie_model = LogisticRegression(max_iter=1000)
movie_model.fit(imdb_train_data, imdb_train_target)
predicted = movie_model.predict(imdb_test_data)
accuracy = accuracy_score(imdb_test_target, predicted)
print("The accuracy score for logistic regression on the IMDB dataset is", accuracy)


The accuracy score for logistic regression on the IMDB dataset is 0.795
