# Your name: Krissy Wong
## Assignment Name: CA02 - Spam eMail Detection using Naive Bayes Classification Algorithm 

# Program Initialization Section
## Enter your import packages here

In [85]:
# import packages 
import os 
# The os package is used here to list the contents of a directory and to build file paths
import numpy as np 
from collections import Counter 
from sklearn.naive_bayes import GaussianNB
# Gaussian model training is used for classification and 
# it assumes the features follow a normal distribution
from sklearn.metrics import accuracy_score
# Accuracy score is used to calculate the percentage of correct
# predictions

# Cleaning and Preparing the data
## Build a Dictionary of most common 3500 words from all the email content.

In [86]:
# The function first collects every whitespace-separated token from all the
# emails into the dictionary. Next it removes every token that contains a
# non-alphabetic character and every single-character token. Then it shrinks
# the dictionary to keep only the 3500 most common words.
# Finally, the dictionary is returned. 


In [87]:
# Create a function named make_Dictionary to initially include 
# all words
def make_Dictionary(root_dir, max_words=3500):
    """Build the vocabulary of the most common words in a folder of emails.

    Every regular file directly inside ``root_dir`` is read line by line and
    split on whitespace. Tokens that contain any non-alphabetic character and
    tokens that are a single character long are ignored; the remaining words
    are ranked by how often they occur.

    Args:
        root_dir: Path of the directory that holds the email text files.
        max_words: Maximum number of words to keep. Defaults to 3500.

    Returns:
        A list of ``(word, count)`` tuples ordered from the most common word
        to the least common, with at most ``max_words`` entries.
    """
    word_counts = Counter()
    # Sort the listing so the result (including the order of equally common
    # words) does not depend on the arbitrary order os.listdir returns.
    for name in sorted(os.listdir(root_dir)):
        mail = os.path.join(root_dir, name)
        # Only regular files can be opened as emails; skip sub-directories.
        if not os.path.isfile(mail):
            continue
        with open(mail) as m:
            for line in m:
                # Count only purely alphabetic words longer than one
                # character, so nothing has to be deleted afterwards.
                word_counts.update(
                    word
                    for word in line.split()
                    if word.isalpha() and len(word) > 1
                )
    # most_common returns (word, count) pairs, most frequent first.
    return word_counts.most_common(max_words)



# Extract Feature columns and populate values 

## Feature Matrix of 3500 columns and rows equal to the number of email files. 

In [88]:
# The function looks at the file name of each email and decides whether it is spam
# based on the naming convention (spam file names start with "spmsg"). From this it builds the label column.
# The function is run on both the training and the testing dataset and
# returns the feature dataset and the label column. 


In [89]:
def extract_features(mail_dir, vocabulary=None, num_features=3500):
    """Turn each email in a folder into a word-count row and a spam label.

    Only the third line of each file (line index 2) is tokenized; every other
    line is ignored. NOTE(review): presumably that line holds the message
    body in this corpus - confirm against the data files.

    Args:
        mail_dir: Path of the directory that holds the email text files.
        vocabulary: Sequence of ``(word, count)`` tuples; the position of a
            word in the sequence is its column in the feature matrix. When
            omitted, the module-level ``dictionary`` is used.
        num_features: Number of columns in the feature matrix. Defaults to
            3500. Vocabulary entries beyond this position are ignored.

    Returns:
        A tuple ``(features_matrix, train_labels)``. ``features_matrix`` is a
        NumPy array of shape ``(number_of_files, num_features)`` holding how
        many times each vocabulary word occurs in the tokenized line.
        ``train_labels`` is a NumPy array with 1 for files whose name starts
        with ``"spmsg"`` and 0 for all other files. Rows follow the sorted
        order of the file paths.
    """
    if vocabulary is None:
        # Fall back to the vocabulary the main program stores at module level.
        vocabulary = dictionary

    # Map each word to its column once, instead of scanning the whole
    # vocabulary again for every word of every email.
    word_to_column = {
        entry[0]: column
        for column, entry in enumerate(vocabulary[:num_features])
    }

    # Keep regular files only, in sorted order so the row order is stable.
    files = sorted(
        path
        for path in (os.path.join(mail_dir, fi) for fi in os.listdir(mail_dir))
        if os.path.isfile(path)
    )
    features_matrix = np.zeros((len(files), num_features))
    train_labels = np.zeros(len(files))

    for doc_id, fil in enumerate(files):
        with open(fil) as fi:
            for line_number, line in enumerate(fi):
                if line_number == 2:
                    # Count each distinct word once rather than calling
                    # list.count for every occurrence.
                    for word, occurrences in Counter(line.split()).items():
                        column = word_to_column.get(word)
                        if column is not None:
                            features_matrix[doc_id, column] = occurrences
                    # Nothing after this line is used, so stop reading.
                    break
        # basename works with any path separator, unlike splitting on '/'.
        if os.path.basename(fil).startswith("spmsg"):
            train_labels[doc_id] = 1
    return features_matrix, train_labels
            

In [90]:
# The section is the main program that calls the above two functions and 
# gets executed first. First it "trains" the model using model.fit function
# and Training Dataset. Next it scores the Test Data set by running the 
# Trained Model with the Test Data set. Finally, it prints the model performance 
# in terms of the accuracy score. 

In [91]:
# Mount Google Drive at /gdrive so this Colab session can reach the folder
# that holds the train-mails and test-mails data.
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [92]:
# Folder on the mounted drive that holds the training emails.
train_dir = '/gdrive/MyDrive/BSAN6070/CA02/train-mails'

In [93]:
# Folder on the mounted drive that holds the test emails.
test_dir = '/gdrive/MyDrive/BSAN6070/CA02/test-mails'

In [94]:
print("Create a dictionary using the train data.")

Create a dictionary using the train data.


In [95]:
# Build the vocabulary from the training emails only. extract_features reads
# this module-level name, so it must be assigned before that function runs.
dictionary = make_Dictionary(train_dir)

In [96]:
print("Read and process emails from train and test folders. The training dictionary is put into a matrix labeled features_matrix. The testing dictionary is put into a matrix named test_features_matrix.")

Read and process emails from train and test folders. The training dictionary is put into a matrix labeled features_matrix. The testing dictionary is put into a matrix named test_features_matrix.


In [97]:
# Word-count matrix and spam labels for the training emails.
features_matrix, labels = extract_features(train_dir)

In [98]:
# Word-count matrix and spam labels for the test emails, built with the same
# training vocabulary so the columns line up with the training matrix.
test_features_matrix, test_labels = extract_features(test_dir)

In [99]:
# Create an untrained Gaussian Naive Bayes classifier.
model = GaussianNB()

In [100]:
print('"The Training Model will use a Gaussian Naive Bayes algorithm which assumes a normal distribution of the 3500 most common words. "')

"The Training Model will use a Gaussian Naive Bayes algorithm which assumes a normal distribution of the 3500 most common words. "


In [101]:
# Train the classifier on the training word counts and their labels.
model.fit(features_matrix, labels)

GaussianNB()

In [102]:
print ("The training is complete once the features_matrix and labels are fitted into the Gaussian Naive Bayes Model.")

The training is complete once the features_matrix and labels are fitted into the Gaussian Naive Bayes Model.


In [103]:
print ("Now we are going to test the trained model to predict Test Data labels using model.predict() function.")

Now we are going to test the trained model to predict Test Data labels using model.predict() function.


In [104]:
# Predict a label for every test email with the trained classifier.
predicted_labels = model.predict(test_features_matrix)

In [105]:
print ("Completed classification of the Test Data. Now we are going to print the Accuracy Score by comparing the Predicted Labels with the Test Labels.")

Completed classification of the Test Data. Now we are going to print the Accuracy Score by comparing the Predicted Labels with the Test Labels.


In [107]:
# Print the fraction of test emails whose predicted label matches the true label.
print (accuracy_score(test_labels, predicted_labels))

0.9615384615384616


In [108]:
# NOTE(review): the 96% figure in this message is hard-coded from the score
# printed by the previous cell; update the text if that score changes.
print("The model has an accuracy score of 96% in predicting labels for the testing dataset from the training dataset using the Gaussian Naive Bayes Model. This indicates that the set of samples predicted using the model matches 96% of the corresponding set of labels in the test dataset. ")

The model has an accuracy score of 96% in predicting labels for the testing dataset from the training dataset using the Gaussian Naive Bayes Model. This indicates that the set of samples predicted using the model matches 96% of the corresponding set of labels in the test dataset. 
