In [2]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import email
import re
from collections import Counter

#sklearn libraries
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix 

## Preprocessing

In [3]:
#import dataset and the stop words
#put items from the labels file to the labels dictionary separating the label and its corresponding data file
email_labels_dict = {'data_file_path':[], 'label':[]}

with open("trec06/trec06p-cs280/labels") as labels:
    for line in labels:
        #splitting ham/spam and the data file path item in labels file. Example: label= ham and file= ../data/000/000
        label, file = line.split()
        if label == 'spam':
            email_labels_dict['data_file_path'].append(file.replace("../data/",""))
            email_labels_dict['label'].append('spam')
        elif label == 'ham':
            email_labels_dict['data_file_path'].append(file.replace("../data/",""))
            email_labels_dict['label'].append('ham')
            
#convert the email_label_dict dictionary as a dataframe             
emails = pd.DataFrame.from_dict(email_labels_dict)
emails.head(5)

Unnamed: 0,data_file_path,label
0,000/000,ham
1,000/001,spam
2,000/002,spam
3,000/003,ham
4,000/004,spam


In [4]:
#import stop words from stop_words file and put these words in an array.

with open('stop_words.txt','r') as words:
    stop_words = words.read().splitlines()

#display only a few elements in the stop_words. Just enough to show that each words are separated in the array
stop_words[:6]

['a', 'able', 'about', 'above', 'abst', 'accordance']

In [5]:
#removing stop words and other unecessary characters from email content

email_message = []

def clean_email_content(content):
    cleaned_words = []
    words = content.split()
    
    for word in words:
        uncleaned_word = re.sub(r'<[^<>]+>','', word)
        uncleaned_word = re.sub(r'[^a-zA-Z\n ]','', uncleaned_word)
        uncleaned_word = re.sub(r'http\S+', '', uncleaned_word)
        uncleaned_word = re.sub(r'www', '', uncleaned_word)
        uncleaned_word = re.sub(r'goo\S+', '', uncleaned_word)
        
        if uncleaned_word not in stop_words:
            cleaned_words.append(uncleaned_word)

    cleaned_content = list(filter(None, cleaned_words))
    str_cleaned_content = ' '.join(cleaned_content)
    return str_cleaned_content
    
    
# go through the files in data
for file_path in emails['data_file_path']:
    
    # open the file with encoiding set to ISO-8859-1 
    with open(f"trec06/trec06p-cs280/data/{file_path}", "r", encoding="ISO-8859-1") as email_content:
        message = ""   
        email_msg = email.message_from_file(email_content)

        #if emails are multipart
        if email_msg.is_multipart():
            #loop through the email's parts
            for part in email_msg.walk():
                #if the content-type is text/plain then get content
                if part.get_content_type() == "text/plain": 
                    message = part.get_payload() 
                    break
        #if the email is not a multipart
        else:
            message = email_msg.get_payload()
        
        #remove stop words and unnecessary characters in email content
        cleaned_msg= clean_email_content(message)
        
        #the cleaned message is converted to lowercase before being appended to the email_message list
        email_message.append(cleaned_msg.lower())

In [6]:
emails['email_content'] = email_message

emails.head(5)

Unnamed: 0,data_file_path,label,email_content
0,000/000,ham,the mailing list i queried weeks ago running i...
1,000/001,spam,luxury watches buy your own rolex for only rol...
2,000/002,spam,academic qualifications prestigious nonacc red...
3,000/003,ham,greetings this verify subscription planfans li...
4,000/004,spam,chauncey conferred luscious continued tonsillitis


## Splitting datasets for training and testing

In [7]:
#train datasets. Folders 0-70
email_train = emails[emails['data_file_path'] < '071']

#test datasets. Folders 0
email_test = emails[emails['data_file_path'] >= '071']

#check if shape is corect base on what is stated in the instructions
print("The shape of the training set is: ", email_train.shape)
print("The shape of testing set is: ", email_test.shape)

The shape of the training set is:  (21300, 3)
The shape of testing set is:  (16522, 3)


In [8]:
#Split training dataset for ham and spam

ham_train = emails[emails['label'] == 'ham'].reset_index()
spam_train = emails[emails['label']== 'spam'].reset_index()

print("The shape of the training set is: ", ham_train.shape)
print("The shape of testing set is: ", spam_train.shape)

The shape of the training set is:  (12910, 4)
The shape of testing set is:  (24912, 4)


In [9]:
#extract the most common (1000) words from the training dataset
most_used_words = Counter(" ".join(email_train['email_content']).split()).most_common(1000)

top_used_words = pd.DataFrame(most_used_words, columns = ['words', 'num_of_occurences'])
top_used_words.head(5)

Unnamed: 0,words,num_of_occurences
0,i,37617
1,bb,18521
2,the,14554
3,a,12342
4,td,11367


## Creating the feature matrices

In [10]:
#creating the feature dictionary for ham and spam
ham_word_counts = {unique_words: [0] * len(ham_train) for unique_words, _ in most_used_words}
spam_word_counts = {unique_words: [0] * len(spam_train) for unique_words, _ in most_used_words}

top_words_list = [key for key, _ in most_used_words]

In [11]:
# Spam Feature Set
spam_word_count = pd.DataFrame(spam_word_counts)

#loop through the spam train set index
for i in spam_train.index:
        # count the word frequency per row in the train ham set
        frequency = dict(Counter(spam_train['email_content'][i].split()))
        for key, val in frequency.items():
            if key in top_words_list:
                spam_word_count.loc[i, key] += val

spam_feat_matrix = spam_word_count.to_numpy()
spam_feat_matrix

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [3, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [12]:
# Ham Feature Set
ham_word_count = pd.DataFrame(ham_word_counts)

for i in ham_train.index:
        frequency = dict(Counter(spam_train['email_content'][i].split()))
        for key, val in frequency.items():
            if key in top_words_list:  # add the word frequency to the row and column where the word is found
                ham_word_count.loc[i, key] += val

ham_feat_matrix = ham_word_count.to_numpy()
ham_feat_matrix

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

## Computing priors

In [13]:
spam_train_size = spam_train.shape[0]        #number of spam emails
ham_train_size = ham_train.shape[0]          #number of ham emails
total_train_size = email_train.shape[0]      #total number of emails for training

spam_prior =  spam_train_size / total_train_size
ham_prior = ham_train_size / total_train_size

print("Prior probabilities for spam: ", spam_prior)
print("Prior probabilities for ham: ", ham_prior)

Prior probabilities for spam:  1.1695774647887325
Prior probabilities for ham:  0.6061032863849766


## Computing the Likelihood of each word

In [14]:
#sum of each words in spam
spam_train_words_sum = np.sum(spam_feat_matrix, axis=0)
#sum of each words in ham
ham_train_words_sum = np.sum(ham_feat_matrix, axis=0)

#total sum of words in spam
spam_train_word_total = spam_train_words_sum.sum()
#total sum of words in ham
ham_train_word_total = ham_train_words_sum.sum()

#spelled as lamda because lambda is a given function
lamda = 1 #laplace smoothing
ham_likelihood = {}
spam_likelihood = {}
#formula based on the given formula in the instructions

for i in range(len(top_words_list)):
    
    c_ham = (ham_train_words_sum[i]+lamda) / (ham_train_word_total + lamda*len(top_words_list))
    c_spam = (spam_train_words_sum[i]+lamda) / (spam_train_word_total + lamda*len(top_words_list))
    
    ham_likelihood[top_words_list[i]] = c_ham
    spam_likelihood[top_words_list[i]] = c_spam

## Classifying Emails

In [15]:
#function for classifying the emails

def classifying_emails(email_content, spam_prior, ham_prior, spam_likelihood, ham_likelihood, top_words_list):
    
    #get log values of spam and ham probabilities
    spam_prob_log = np.log(spam_prior)
    ham_prob_log = np.log(ham_prior)
    
    email_words = str(email_content).split()
    
    for word in email_words:

        if word in top_words_list:
            ham_prob_log += np.log(ham_likelihood[word])
            spam_prob_log += np.log(spam_likelihood[word])
    
    if spam_prob_log > ham_prob_log:
        return "spam"
    else:
        return "ham"

## Testing the classifier

In [16]:
#creating a dictionary with the predicted label with the corresponding data file path
predicted_dict = {'data_file_path':[], 'predicted_label': []}

for path, content in zip(email_test['data_file_path'], email_test['email_content']):
    predicted_dict['data_file_path'].append(path) 
    classification = classifying_emails(content, spam_prior, ham_prior, spam_likelihood, ham_likelihood, top_words_list)
    #add the predicted label by the classfying_emails function
    predicted_dict['predicted_label'].append(classification) 

In [17]:
predicted_test = pd.DataFrame.from_dict(predicted_dict)

merged_emails = pd.merge(email_test, predicted_test, on='data_file_path')
merged_emails

Unnamed: 0,data_file_path,label,email_content,predicted_label
0,071/000,spam,where hesitantly derive perverse satisfaction ...,spam
1,071/001,ham,there things perform experiment first display ...,spam
2,071/002,spam,best offer month viggra ci ialis vaiium xa naa...,spam
3,071/003,spam,de ar home o wne your cr doesnt matter if ow n...,ham
4,071/004,spam,special offer adobe video collection adobe pre...,ham
...,...,...,...,...
16517,126/017,spam,great news expec ted infinex ventures inc infx...,ham
16518,126/018,spam,the oil sector going crazy this weekly gift ge...,ham
16519,126/019,spam,suffering pain depression heartburn well help ...,spam
16520,126/020,spam,u n i v e r s i t y d i p l o m a s do prosper...,ham


## Performance Evaluation

In [18]:
#performance evaluation for accuracy, recall, precision and f1_score.
true_label = merged_emails['label'].to_numpy()
predicted_label = merged_emails['predicted_label'].to_numpy()
print("The test set accuracy score:", accuracy_score(true_label, predicted_label))
print("The test set recall score:", recall_score(true_label, predicted_label, pos_label="spam"))
print("The test set precision score:", precision_score(true_label, predicted_label, pos_label="spam"))
print("The test set F1 score:", f1_score(true_label, predicted_label, pos_label="spam"))

The test set accuracy score: 0.5430940564096356
The test set recall score: 0.7900314324202964
The test set precision score: 0.6279982866933181
The test set F1 score: 0.6997573877421152


In [19]:
#confusion_matrix
c_matrix = confusion_matrix(merged_emails['label'].to_numpy(), merged_emails['predicted_label'].to_numpy(), labels=["spam", "ham"])
c_matrix

array([[8797, 2338],
       [5211,  176]], dtype=int64)