# Project: Spam Email Filtering

In [61]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
import os

### Feature extraction

In [2]:
def mail_dictionary(path):

    def mail_aggregate(path, folder):
        folder_path = path + '/' + folder
        file_list = os.listdir(folder_path)

        mails = []
        for file_name in file_list:
            file_path = folder_path + '/' + file_name
            with open(file_path) as f:
                mail = f.read()
                mails.append(mail)

        return mails
    
    nonspam_emails_train = mail_aggregate(path, 'nonspam-train')
    nonspam_emails_test = mail_aggregate(path, 'nonspam-test')
    spam_emails_train = mail_aggregate(path, 'spam-train')
    spam_emails_test = mail_aggregate(path, 'spam-test')

    nonspam_email = nonspam_emails_train + nonspam_emails_test
    spam_email = spam_emails_train + spam_emails_test
    emails = nonspam_email + spam_email

    def build_dictionary(emails):
        vectorizer = CountVectorizer()
        term_document_vectorize = vectorizer.fit_transform(emails)
        term_document_array = term_document_vectorize.toarray()

        # Count the occurrences of each word in total
        word_counts = term_document_array.sum(axis=0)

        # Sorting list of word in descending order of the occurrences
        sorted_indices = np.argsort(word_counts)[::-1]
        sorted_word_counts_array = word_counts[sorted_indices]
        
        # Get list of words
        feature_names = vectorizer.get_feature_names_out()

        # Generate the dictionary
        word_dict = {feature_names[sorted_indices[i]]: sorted_word_counts_array[i] for i in range(len(sorted_indices))}
        
        return word_dict
    
    word_dictionary = build_dictionary(emails)
    most_frequency_dictionary = dict(list(word_dictionary.items())[:2500])

    with open('dictionary.txt', 'w') as dict_f:
        for key, value in most_frequency_dictionary.items():
            dict_f.write(f'{key}: {value}\n')

        # Truncate the final abundant line in the file
        dict_f.seek(0, os.SEEK_END)
        dict_f.seek(dict_f.tell() - 2, os.SEEK_SET)
        dict_f.truncate()

    return most_frequency_dictionary

dictionary = mail_dictionary('ex6DataEmails')

In [39]:
def extract_features(path, dataset_folder, dictionary):
    def find_word_index(word, dictionary):
        for index, (key,value) in enumerate(dictionary.items()):
            if(word == key):
                return index
        return -1

    file_list = os.listdir(path + '/' + dataset_folder)
    tokens_features = np.zeros((len(file_list), 2501), dtype=int)

    document_index = 0
    for file in file_list:
        with open(path + '/' + dataset_folder + '/' + file, 'r') as f:
            document = [f.read()]

            vectorizer = CountVectorizer()
            term_document_vectorizer = vectorizer.fit_transform(document)
            word_features = vectorizer.get_feature_names_out()
            word_counts = term_document_vectorizer.toarray()

            # Occurrences of the non-existent word in the dictionary
            others = 0

            # Fill the occurrences of the word of the document in the corresponding postion in the feature vector
            for i in range(word_counts.shape[1]):
                index = find_word_index(word_features[i], dictionary)
                if index != -1:
                    tokens_features[document_index, index] = word_counts[0,i]
                else:
                    others = word_counts[0,i]
            # Fill the total occurrences of all the non-existent words in the dictionary
            tokens_features[document_index, -1] = others

            document_index += 1

    return tokens_features

nonspam_train_feature = extract_features('ex6DataEmails', 'nonspam-train', dictionary)
spam_train_feature = extract_features('ex6DataEmails', 'spam-train', dictionary)
X_train = np.concatenate((nonspam_train_feature, spam_train_feature), axis = 0)
y_train = np.concatenate((np.zeros(350), np.ones(350)))

nonspam_test_feature = extract_features('ex6DataEmails', 'nonspam-test', dictionary)
spam_test_feature = extract_features('ex6DataEmails', 'spam-test', dictionary)
X_test = np.concatenate((nonspam_test_feature, spam_test_feature), axis = 0)
y_test = np.concatenate((np.zeros(130), np.ones(130)))

### Model selection

In [58]:
seed = 0
models = [
    MultinomialNB(),
    SVC(random_state=seed)
]
kfold = StratifiedKFold(shuffle=True, random_state=seed)

data_compare = []
for model in models:
    model_name = model.__class__.__name__
    score = cross_val_score(estimator=model, X=X_train, y=y_train, scoring='accuracy', cv=kfold)
    data_compare.append((model_name, score.mean(), score.std()))

df = pd.DataFrame(data=data_compare, columns=['Model name', 'Mean accuracy', 'Standard deviation'])
df

Unnamed: 0,Model name,Mean accuracy,Standard deviation
0,MultinomialNB,0.982857,0.01069
1,SVC,0.975714,0.017261


### Tuning hyperparameters

In [62]:
NBC = MultinomialNB()
param_dist = {
    'alpha' : [1e-1, 1, 10]
}

tuner = GridSearchCV(estimator=NBC, param_grid=param_dist, scoring='accuracy', cv=kfold, verbose=3)
tuner.fit(X_train, y_train)
tuner.best_params_

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV 1/5] END .........................alpha=0.1;, score=1.000 total time=   0.0s
[CV 2/5] END .........................alpha=0.1;, score=0.993 total time=   0.0s
[CV 3/5] END .........................alpha=0.1;, score=0.986 total time=   0.0s
[CV 4/5] END .........................alpha=0.1;, score=0.986 total time=   0.0s
[CV 5/5] END .........................alpha=0.1;, score=0.986 total time=   0.0s
[CV 1/5] END ...........................alpha=1;, score=1.000 total time=   0.0s
[CV 2/5] END ...........................alpha=1;, score=0.971 total time=   0.0s
[CV 3/5] END ...........................alpha=1;, score=0.986 total time=   0.0s
[CV 4/5] END ...........................alpha=1;, score=0.986 total time=   0.0s
[CV 5/5] END ...........................alpha=1;, score=0.971 total time=   0.0s
[CV 1/5] END ..........................alpha=10;, score=0.993 total time=   0.0s
[CV 2/5] END ..........................alpha=10;,

{'alpha': 0.1}

### Prediction

In [64]:
y_pred = tuner.predict(X_test)
print(f'Accuracy score of tuned model: {(accuracy_score(y_test, y_pred)*100):.2f}%')

NBC.fit(X_train, y_train)
y_pred = NBC.predict(X_test)
print(f'Accuracy score of original model: {(accuracy_score(y_test, y_pred)*100):.2f}%')

Accuracy score of tuned model: 98.46%
Accuracy score of original model: 98.08%
