In [0]:
import os
import numpy as np
from collections import Counter
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [0]:
# Goal of this cell: build a dictionary of the 3000 most common words across all the emails in a folder.
def make_Dictionary(root_dir, vocab_size=3000):
  """Build a vocabulary of the most frequent words found in a folder of emails.

  Every regular file directly inside ``root_dir`` is read line by line and
  split on whitespace. Only tokens that are purely alphabetic and longer than
  one character are counted.

  Args:
    root_dir: Path of the folder that contains the email files.
    vocab_size: Maximum number of words to keep (3000 by default).

  Returns:
    A list of ``(word, count)`` tuples, most frequent first, holding at most
    ``vocab_size`` entries.
  """
  word_counts = Counter()
  # Sort the listing so that ties between equally frequent words are broken
  # the same way on every run (os.listdir returns entries in arbitrary order).
  for name in sorted(os.listdir(root_dir)):
    mail = os.path.join(root_dir, name)
    if not os.path.isfile(mail):
      # Skip sub-folders (e.g. .ipynb_checkpoints); open() would raise on them.
      continue
    with open(mail) as m:
      for line in m:
        # Count incrementally instead of first collecting every token of
        # every email in one big list.
        word_counts.update(
            word for word in line.split()
            if word.isalpha() and len(word) > 1)
  return word_counts.most_common(vocab_size)

In [0]:
# Goal of this cell: build the feature matrix (one column per dictionary word) and the label vector for a folder of emails.
def extract_features(mail_dir, vocabulary=None, num_features=3000):
  """Turn a folder of emails into a word-count matrix and a label vector.

  Only the third line of each file (line index 2) is used as the message
  text. A file is labelled 1 when its file name starts with "spmsg" and 0
  otherwise.

  Args:
    mail_dir: Path of the folder that contains the email files.
    vocabulary: Sequence of ``(word, count)`` tuples; the position of an entry
      is the column index of its word. Words are expected to be unique. When
      omitted, the module-level ``dictionary`` is used.
    num_features: Number of columns of the feature matrix (3000 by default).
      Vocabulary entries at positions >= ``num_features`` are ignored.

  Returns:
    A tuple ``(features_matrix, train_labels)``: a float array of shape
    ``(number of files, num_features)`` holding per-message word counts, and a
    float array with one 0/1 label per file, in the same row order.
  """
  if vocabulary is None:
    # Backward-compatible fallback: the vocabulary built by the main cell.
    vocabulary = dictionary

  # One-time word -> column lookup instead of scanning the whole vocabulary
  # for every word of every message.
  word_to_column = {}
  for column, entry in enumerate(vocabulary):
    if column >= num_features:
      break
    word_to_column[entry[0]] = column

  # Sorted for a reproducible row order; sub-folders are skipped because
  # open() would raise on them.
  files = [os.path.join(mail_dir, name) for name in sorted(os.listdir(mail_dir))]
  files = [path for path in files if os.path.isfile(path)]

  features_matrix = np.zeros((len(files), num_features))
  train_labels = np.zeros(len(files))

  for doc_id, path in enumerate(files):
    with open(path) as fi:
      for line_no, line in enumerate(fi):
        if line_no == 2:
          # The message content is taken from the third line only.
          for word, occurrences in Counter(line.split()).items():
            column = word_to_column.get(word)
            if column is not None:
              features_matrix[doc_id, column] = occurrences
          # Nothing after the third line is used, so stop reading.
          break
    # basename handles the platform's path separator, unlike split('/').
    if os.path.basename(path).startswith("spmsg"):
      train_labels[doc_id] = 1
  return features_matrix, train_labels

In [0]:
# Main program: builds the vocabulary, extracts features for both folders,
# trains a Gaussian Naive Bayes classifier and prints its accuracy on the test set.
TRAIN_DIR = '/content/drive/My Drive/MSBA_Colab_2020/ML_Algorithms/CA02/Data/train-mails'
TEST_DIR = '/content/drive/My Drive/MSBA_Colab_2020/ML_Algorithms/CA02/Data/test-mails'
# Folders holding the training and the test emails (absolute paths under /content/drive; adjust when running elsewhere)

dictionary = make_Dictionary(TRAIN_DIR)
# Vocabulary of frequent words, built from the training emails only.
# NOTE: extract_features reads this module-level name, so this line must run before the calls below.

print ("reading and processing emails from TRAIN and TEST folders")
features_matrix, labels = extract_features(TRAIN_DIR)
# Feature matrix and labels for the training data
test_features_matrix, test_labels = extract_features(TEST_DIR)
# Feature matrix and labels for the test data

model = GaussianNB()
# Gaussian Naive Bayes classifier
# It assumes each feature follows a normal distribution within each class

print ("Training Model using Gaussian Naibe Bayes algorithm .....")
model.fit(features_matrix, labels)
# Train the model on the training dataset
print ("Training completed")
print ("testing trained model to predict Test Data labels")
predicted_labels = model.predict(test_features_matrix)
# Predict a label for every test email
print ("Completed classification of the Test Data .... now printing Accuracy Score by comparing the Predicted Labels with the Test Labels:")
print (accuracy_score(test_labels, predicted_labels))
# Accuracy score: fraction of test emails whose predicted label matches the true label


reading and processing emails from TRAIN and TEST folders
Training Model using Gaussian Naibe Bayes algorithm .....
Training completed
testing trained model to predict Test Data labels
Completed classification of the Test Data .... now printing Accuracy Score by comparing the Predicted Labels with the Test Labels:
0.9615384615384616
