# Spam Filter

This is an implementation of a Spam Filter using a Naïve Bayes classifier.

In [1]:
import os
import re
import csv
import random

In [2]:
master_vocabulary = []
ham_line_paths = []
spam_line_paths = []
with open('./labels', 'r') as file:
    for line in file.readlines():
        if re.match(r'ham', line):
            ham_line_paths.append(line)
        elif re.match(r'spam', line):
            spam_line_paths.append(line)

In [3]:
def write_word_vectors(filename, line_paths, classification, master_vocabulary):
    full_vocabulary = {}
    full_file_vocabs = []
    with open(filename, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        for line_path in line_paths:
            path = os.path.join(os.getcwd(), re.sub(r'{} \.\.\/'.format(classification), '', line_path)).strip()
            with open(path, 'rb') as file:
                file_vocab = {'[[file_path]]': line_path.strip()}

                for line in file.readlines():
                    try:
                        split_words = re.findall(r'[A-Za-z]+', line.decode('utf-8'))
                    except UnicodeDecodeError:
                        continue
                    for w in split_words:
                        if full_vocabulary.get(w.lower(), None):
                            full_vocabulary[w.lower()] += 1
                        else:
                            full_vocabulary[w.lower()] = 1

                        if file_vocab.get(w.lower(), None):
                            file_vocab[w.lower()] += 1
                        else:
                            file_vocab[w.lower()] = 1
                        master_vocabulary.append(w)
                full_file_vocabs.append(file_vocab)

#         full_vocabulary_wordset = sorted(full_vocabulary.keys())
#         rows_to_add = [['DOCUMENT', *full_vocabulary_wordset]]
        print('vocabulary sorted!')
#         for file_path in line_paths:
#             row = [file_path]
#             print(file_path)
#             vocab_dict = next((vocab for vocab in full_file_vocabs if vocab['[[file_path]]'] == file_path), None)
#             if vocab_dict:
#                 row.extend([vocab_dict.get(word, 0) for word in full_vocabulary_set])
#             else:
#                 row.extend([0] * len(full_vocabulary))
#             rows_to_add.append(row)
#         print('rows loaded!')
#         writer.writerows(rows_to_add)
    
    return full_vocabulary, full_file_vocabs, list(set(master_vocabulary))

In [4]:
ham_vocabulary, ham_file_vocabs, master_vocabulary = write_word_vectors('hamvectors.csv', ham_line_paths, 'ham', master_vocabulary)
print(len(ham_vocabulary.values()))
ham_full = [*ham_file_vocabs]

vocabulary sorted!
2130014


In [5]:
spam_vocabulary, spam_file_vocabs, master_vocabulary = write_word_vectors('spamvectors.csv', spam_line_paths, 'spam', master_vocabulary)
print(len(spam_vocabulary.values()))
spam_full = [*spam_file_vocabs]

vocabulary sorted!
260921


In [6]:
def take_split(dataset, training_percentage):
    dataset_length = len(dataset)
    sample_count = (dataset_length * (training_percentage / 100)) // 1
    training_sample = []

    while sample_count > 0:
        current_index = random.randint(0, len(dataset) - 1)
        training_sample.append(dataset[current_index])
        del dataset[current_index]
        sample_count -= 1

    return training_sample, dataset

In [7]:
# Split into 70% training and 30% test set
ham_training, ham_test = take_split(ham_full, 70)
print(len(ham_file_vocabs))
print(len(ham_training), len(ham_test))

spam_training, spam_test = take_split(spam_full, 70)
print(len(spam_file_vocabs))
print(len(spam_training), len(spam_test))

12910
9037 3873
24912
17438 7474


In [8]:
print('prior probabilities for Spam and Ham: ')
print('ham class: ', len(ham_training), '/', len(ham_training) + len(spam_training))
print('spam class: ', len(spam_training),  ' / ', len(ham_training) + len(spam_training))

prior probabilities for Spam and Ham: 
ham class:  9037 / 26475
spam class:  17438  /  26475


In [28]:
# We only care about how many times this word occurred across different document Ds
rebuilt_ham_vocabulary_probabilities = {}
for item in ham_training:
    for word in item:
        if rebuilt_ham_vocabulary_probabilities.get(word, None):
            rebuilt_ham_vocabulary_probabilities[word] += 1
        else:
            rebuilt_ham_vocabulary_probabilities[word] = 1

for key in rebuilt_ham_vocabulary_probabilities.keys():
    rebuilt_ham_vocabulary_probabilities[key] = (rebuilt_ham_vocabulary_probabilities[key] + 1) / (len(ham_training) + len(master_vocabulary))
    
ham_vocabulary_size = len(rebuilt_ham_vocabulary_probabilities.keys())

rebuilt_spam_vocabulary_probabilities = {}
for item in spam_training:
    for word in item:
        if rebuilt_spam_vocabulary_probabilities.get(word, None):
            rebuilt_spam_vocabulary_probabilities[word] += 1
        else:
            rebuilt_spam_vocabulary_probabilities[word] = 1

for key in rebuilt_spam_vocabulary_probabilities:
    rebuilt_spam_vocabulary_probabilities[key] = (rebuilt_spam_vocabulary_probabilities[key] + 1) / (len(spam_training) + len(master_vocabulary))

spam_vocabulary_size = len(rebuilt_spam_vocabulary_probabilities.keys())

full_vocabulary_size = len(set(list(rebuilt_ham_vocabulary_probabilities.keys()) + list(rebuilt_spam_vocabulary_probabilities.keys())))

In [32]:

def apply_to_test(test_set, ham_training, spam_training, lamb):
    classified_as_ham = 0
    classified_as_spam = 0
    for item in test_set:
        ham_probability = len(ham_training) / (len(ham_training) + len(ham_test))
        spam_probability = len(spam_training) / (len(spam_training) + len(spam_test))

        ham_default = 1 / (len(ham_training) + lamb * len(ham_training))
        spam_default = 1 / (len(spam_training) + lamb * len(spam_training))
        for word in item.keys():
            ham_component = rebuilt_ham_vocabulary_probabilities.get(word)
            spam_component = rebuilt_spam_vocabulary_probabilities.get(word)
            ham_percent = ham_component if ham_component else ham_default
            spam_percent = spam_component if spam_component else spam_default

            if ham_probability * ham_percent == 0:
                break
            if spam_probability * spam_percent == 0:
                break
            ham_probability *= ham_percent
            spam_probability *= spam_percent

        if ham_probability > spam_probability:
            classified_as_ham += 1
        else:
            classified_as_spam += 1

    return classified_as_ham, classified_as_spam

In [33]:
classified_as_ham = 0
classified_as_spam = 0

classified_as_ham, classified_as_spam = apply_to_test(ham_test, ham_training, spam_training, 1)
print('Classified ham (TN): ', classified_as_ham, '/', len(spam_test))
tn = classified_as_ham
print('Classified spam (FP): ', classified_as_spam, '/', len(spam_test))
fp = classified_as_spam

classified_as_ham, classified_as_spam = apply_to_test(spam_test, ham_training, spam_training, 1)
print('Classified ham (FN): ', classified_as_ham, '/', len(spam_test))
fn = classified_as_ham
print('Classified spam (TP): ', classified_as_spam, '/', len(spam_test))
tp = classified_as_spam

Classified ham (TN):  1701 / 7474
Classified spam (FP):  2172 / 7474
Classified ham (FN):  181 / 7474
Classified spam (TP):  7293 / 7474


In [34]:
print('precision: ', tp / (tp + fp))
print('recall: ', tp / (tp + fn))

precision:  0.7705229793977812
recall:  0.9757827134064758


# Lambda Smoothing tests

We will now be applying different values of lambda into different runs of our classifier to measure differences in precision and recall.

## At λ = 1

In [35]:
classified_as_ham = 0
classified_as_spam = 0

classified_as_ham, classified_as_spam = apply_to_test(ham_test, ham_training, spam_training, lamb=1)
print('Classified ham (TN): ', classified_as_ham, '/', len(spam_test))
tn = classified_as_ham
print('Classified spam (FP): ', classified_as_spam, '/', len(spam_test))
fp = classified_as_spam

classified_as_ham, classified_as_spam = apply_to_test(spam_test, ham_training, spam_training, lamb=1)
print('Classified ham (FN): ', classified_as_ham, '/', len(spam_test))
fn = classified_as_ham
print('Classified spam (TP): ', classified_as_spam, '/', len(spam_test))
tp = classified_as_spam

print('precision: ', tp / (tp + fp))
print('recall: ', tp / (tp + fn))

Classified ham (TN):  1701 / 7474
Classified spam (FP):  2172 / 7474
Classified ham (FN):  181 / 7474
Classified spam (TP):  7293 / 7474
precision:  0.7705229793977812
recall:  0.9757827134064758


## At λ = 2

In [36]:
classified_as_ham = 0
classified_as_spam = 0

classified_as_ham, classified_as_spam = apply_to_test(ham_test, ham_training, spam_training, lamb=2)
print('Classified ham (TN): ', classified_as_ham, '/', len(spam_test))
tn = classified_as_ham
print('Classified spam (FP): ', classified_as_spam, '/', len(spam_test))
fp = classified_as_spam

classified_as_ham, classified_as_spam = apply_to_test(spam_test, ham_training, spam_training, lamb=2)
print('Classified ham (FN): ', classified_as_ham, '/', len(spam_test))
fn = classified_as_ham
print('Classified spam (TP): ', classified_as_spam, '/', len(spam_test))
tp = classified_as_spam

print('precision: ', tp / (tp + fp))
print('recall: ', tp / (tp + fn))

Classified ham (TN):  1881 / 7474
Classified spam (FP):  1992 / 7474
Classified ham (FN):  114 / 7474
Classified spam (TP):  7360 / 7474
precision:  0.7869974337040205
recall:  0.9847471233609848


## At λ = 0.5

In [37]:
classified_as_ham = 0
classified_as_spam = 0

classified_as_ham, classified_as_spam = apply_to_test(ham_test, ham_training, spam_training, lamb=0.5)
print('Classified ham (TN): ', classified_as_ham, '/', len(spam_test))
tn = classified_as_ham
print('Classified spam (FP): ', classified_as_spam, '/', len(spam_test))
fp = classified_as_spam

classified_as_ham, classified_as_spam = apply_to_test(spam_test, ham_training, spam_training, lamb=0.5)
print('Classified ham (FN): ', classified_as_ham, '/', len(spam_test))
fn = classified_as_ham
print('Classified spam (TP): ', classified_as_spam, '/', len(spam_test))
tp = classified_as_spam

print('precision: ', tp / (tp + fp))
print('recall: ', tp / (tp + fn))

Classified ham (TN):  1599 / 7474
Classified spam (FP):  2274 / 7474
Classified ham (FN):  235 / 7474
Classified spam (TP):  7239 / 7474
precision:  0.760958688111006
recall:  0.9685576665774686


## At λ = 0.1

In [38]:
classified_as_ham = 0
classified_as_spam = 0

classified_as_ham, classified_as_spam = apply_to_test(ham_test, ham_training, spam_training, lamb=0.1)
print('Classified ham (TN): ', classified_as_ham, '/', len(spam_test))
tn = classified_as_ham
print('Classified spam (FP): ', classified_as_spam, '/', len(spam_test))
fp = classified_as_spam

classified_as_ham, classified_as_spam = apply_to_test(spam_test, ham_training, spam_training, lamb=0.1)
print('Classified ham (FN): ', classified_as_ham, '/', len(spam_test))
fn = classified_as_ham
print('Classified spam (TP): ', classified_as_spam, '/', len(spam_test))
tp = classified_as_spam

print('precision: ', tp / (tp + fp))
print('recall: ', tp / (tp + fn))

Classified ham (TN):  1478 / 7474
Classified spam (FP):  2395 / 7474
Classified ham (FN):  292 / 7474
Classified spam (TP):  7182 / 7474
precision:  0.7499216873760051
recall:  0.960931228257961


## At λ = 0.005

In [None]:
classified_as_ham = 0
classified_as_spam = 0

classified_as_ham, classified_as_spam = apply_to_test(ham_test, ham_training, spam_training, lamb=0.005)
print('Classified ham (TN): ', classified_as_ham, '/', len(spam_test))
tn = classified_as_ham
print('Classified spam (FP): ', classified_as_spam, '/', len(spam_test))
fp = classified_as_spam

classified_as_ham, classified_as_spam = apply_to_test(spam_test, ham_training, spam_training, lamb=0.005)
print('Classified ham (FN): ', classified_as_ham, '/', len(spam_test))
fn = classified_as_ham
print('Classified spam (TP): ', classified_as_spam, '/', len(spam_test))
tp = classified_as_spam

print('precision: ', tp / (tp + fp))
print('recall: ', tp / (tp + fn))