In [1]:
# `HTML` and `display` belong to the public `IPython.display` API.  Importing
# `display` from `IPython.core.display` has been deprecated since IPython 7.14
# and may fail on newer releases.
from IPython.display import HTML, display

# Let the notebook cells use the full width of the browser window.
display(HTML('<style>.container { width:100%; } </style>'))

# Spam Detection Using the Naive Bayes Algorithm

## Step 1: Create Word Dictionary

We need the module `os` for reading directories, the module `re` for 
<em style='color:blue;'>regular expressions</em>, and the module `math` for computing logarithms.

In [2]:
import os
import re
import numpy as np
import math

An object of class <a href='https://docs.python.org/3/library/collections.html#collections.Counter'>`Counter`</a> is a special form of a `dictionary` that is used for counting.

In [3]:
from collections import Counter

In [4]:
# Relative paths of the directories holding the e-mails, one directory per
# class (spam / ham) and per data split (training / test).
spam_dir_train = 'EmailData/spam-train/'
ham__dir_train = 'EmailData/ham-train/'
spam_dir_test  = 'EmailData/spam-test/'
ham__dir_test  = 'EmailData/ham-test/'

# All four directories collected in one list.
Directories = [
    spam_dir_train,
    ham__dir_train,
    spam_dir_test,
    ham__dir_test,
]

In [5]:
# Number of training messages per class.
no_spam = len(os.listdir(spam_dir_train))
no_ham  = len(os.listdir(ham__dir_train))

# Prior probability of each class: its share of all training messages.
no_mails   = no_spam + no_ham
spam_prior = no_spam / no_mails
ham__prior = no_ham  / no_mails
spam_prior, ham__prior

(0.5, 0.5)

The function $\texttt{get_words}(\texttt{fn})$ takes a filename $\texttt{fn}$ as its argument.  It reads the file and returns a set of all words that are found in the file.  The words are transformed to lower case.

In [6]:
def get_words(fn):
    """Return the set of distinct lower-case words contained in the file `fn`.

    A word is a maximal run of word characters (letters, digits, underscore)
    and apostrophes, as matched by the regular expression ``[\\w']+``.

    Parameters:
        fn: path of the text file to read.

    Returns:
        A `set` of strings; every word occurs once, regardless of how often
        it appears in the file.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    # The `with` statement closes the file even if reading fails; the
    # previous version left the file handle open.
    with open(fn) as file:
        text = file.read()
    return set(re.findall(r"[\w']+", text.lower()))

In [7]:
# Example: the set of distinct lower-case words found in one ham training message.
get_words('EmailData/ham-train/3-380msg4.txt')

{'anyone',
 'article',
 'berkeley',
 'book',
 'consonant',
 'edu',
 'english',
 'garnet',
 'hard',
 'helpful',
 'hi',
 'interest',
 'irish',
 'laurel',
 'm',
 'modern',
 'palatal',
 'phonetics',
 'posting',
 'project',
 'recommend',
 'slender',
 'source',
 'specifically',
 'sutton',
 'thank',
 'too',
 'work'}

Given a list of `Directories`, the function `read_all_files` reads all files contained in these directories.  It returns a `Counter` that for every word $w$ contains the number of those files that contain $w$. 

In [8]:
def read_all_files(Directories):
    """Count, for every word, the number of files that contain it.

    Parameters:
        Directories: iterable of directory paths.  Every entry of each
            directory is read with `get_words`.

    Returns:
        A `Counter` mapping each word to the number of files (over all given
        directories) in which the word occurs at least once.  Each file
        contributes at most 1 per word because `get_words` returns a set.
    """
    Words = Counter()
    for directory in Directories:
        for file_name in os.listdir(directory):
            # `os.path.join` also works when `directory` has no trailing
            # path separator; plain string concatenation did not.
            Words.update(get_words(os.path.join(directory, file_name)))
    return Words

In [9]:
# Count, for every word, the number of files (over all directories listed in
# `Directories`) that contain it.
Word_Counter = read_all_files(Directories)
# Keep the 2500 words that are contained in the most files as the vocabulary.
# NOTE(review): `Directories` also lists the test directories, so the test
# messages influence the vocabulary -- confirm that this is intended.
Common_Words = { w for w, _ in Word_Counter.most_common(2500) }

The function $\texttt{get_common_words}(\texttt{fn})$ takes a filename $\texttt{fn}$ as its argument.
It reads the file and returns the set of all words from the global set $\texttt{Common_Words}$ that are found in the file.  The words are transformed to lower case.

In [10]:
def get_common_words(fn):
    """Return the words of the file `fn` that belong to the vocabulary.

    The file is read with `get_words`; the result is the intersection of its
    words with the global set `Common_Words`.
    """
    words_in_file = get_words(fn)
    return words_in_file.intersection(Common_Words)

In [11]:
# Example: the words of one ham training message that also occur in `Common_Words`.
get_common_words('EmailData/ham-train/3-380msg4.txt')

{'anyone',
 'article',
 'berkeley',
 'book',
 'consonant',
 'edu',
 'english',
 'hard',
 'helpful',
 'hi',
 'interest',
 'm',
 'modern',
 'phonetics',
 'project',
 'recommend',
 'source',
 'specifically',
 'thank',
 'too',
 'work'}

The function `count_commmon_words` takes a string specifying a `directory`.  It returns a 
`Counter` that, for every word in `Common_Words`, counts the number of files in `directory` that contain this word.

In [12]:
def count_commmon_words(directory):
    """Count, for every common word, the files in `directory` containing it.

    Parameters:
        directory: path of a directory; every entry is read with
            `get_common_words`.

    Returns:
        A `Counter` mapping each word of `Common_Words` that occurs in at
        least one file to the number of files containing it.  Each file
        contributes at most 1 per word because `get_common_words` returns
        a set.
    """
    Words = Counter()
    for file_name in os.listdir(directory):
        # `os.path.join` also works when `directory` has no trailing path
        # separator; plain string concatenation did not.
        Words.update(get_common_words(os.path.join(directory, file_name)))
    return Words

In [13]:
# For every common word: the number of spam / ham training files that contain it.
spam_counter = count_commmon_words(spam_dir_train)
ham__counter = count_commmon_words(ham__dir_train)

In [14]:
# Build, for every common word, the per-class probabilities that the
# classifier uses for "the word occurs in a message of this class".
#
# NOTE(review): the per-class file counts are clamped to at most 1 and the
# denominator is the size of the *whole* training set plus 2, so every value
# is either 1/(N+2) or 2/(N+2).  A Laplace-smoothed estimate would presumably
# be (count + 1) / (class size + 2) -- confirm whether the clamping is intended.
Ham__Probability = {}
Spam_Probability = {}
smoothed_total = no_spam + no_ham + 2
for w in Common_Words:
    seen_in_ham  = min(ham__counter[w], 1)   # 1 if any ham file contains w, else 0
    seen_in_spam = min(spam_counter[w], 1)   # 1 if any spam file contains w, else 0
    Ham__Probability[w] = (seen_in_ham  + 1) / smoothed_total
    Spam_Probability[w] = (seen_in_spam + 1) / smoothed_total

Given a file name `fn`, the function `spam_probability` returns the probability that the message contained in the file is spam.

In [15]:
def spam_probability(fn):
    """Return the probability that the message stored in file `fn` is spam.

    For every word of `Common_Words` the log-probability of its presence or
    absence in the message is added up per class, using `Spam_Probability`
    and `Ham__Probability`.  The two sums are then combined with the class
    priors `spam_prior` and `ham__prior` and normalised.

    Parameters:
        fn: path of the message file.

    Returns:
        A float between 0 and 1.
    """
    words = get_common_words(fn)
    log_p_spam = 0.0
    log_p_ham  = 0.0
    for w in Common_Words:
        if w in words:
            log_p_spam += math.log(Spam_Probability[w])
            log_p_ham  += math.log(Ham__Probability[w])
        else:
            log_p_spam += math.log(1.0 - Spam_Probability[w])
            log_p_ham  += math.log(1.0 - Ham__Probability[w])
    # Shift both sums by the larger one before exponentiating.  The larger
    # term becomes exp(0) = 1 and the smaller one lies in [0, 1], so
    # `math.exp` cannot overflow.  The previous shift (by the magnitude of the
    # smaller sum) made the larger exponent positive and could raise
    # OverflowError once the two sums differed by more than about 709.
    # The common shift cancels in the final ratio.
    shift  = max(log_p_spam, log_p_ham)
    p_spam = math.exp(log_p_spam - shift) * spam_prior
    p_ham  = math.exp(log_p_ham  - shift) * ham__prior
    return p_spam / (p_spam + p_ham)

In [16]:
# Spot check on one message taken from the ham training directory.
spam_probability('EmailData/ham-train/3-430msg1.txt')

0.003830658606113412

In [17]:
def precission_recall(spam_dir, ham_dir):
    """Compute precision, recall and accuracy of the classifier.

    Every file in `spam_dir` and `ham_dir` is classified with
    `spam_probability`; a message counts as spam if the result exceeds 0.5.
    Ham is treated as the *positive* class here: a ham message classified as
    ham is a true positive, a spam message classified as spam is a true
    negative.

    Parameters:
        spam_dir: directory containing only spam messages.
        ham_dir:  directory containing only ham messages.

    Returns:
        The triple `(precision, recall, accuracy)`.

    Raises:
        ZeroDivisionError: if a denominator is zero, e.g. when both
            directories are empty.
    """
    TN = 0 # true negatives:  spam classified as spam
    FP = 0 # false positives: spam classified as ham
    for email in os.listdir(spam_dir):
        # `os.path.join` also works when the directory has no trailing path
        # separator; plain string concatenation did not.
        if spam_probability(os.path.join(spam_dir, email)) > 0.5:
            TN += 1
        else:
            FP += 1
    FN = 0 # false negatives: ham classified as spam
    TP = 0 # true positives:  ham classified as ham
    for email in os.listdir(ham_dir):
        if spam_probability(os.path.join(ham_dir, email)) > 0.5:
            FN += 1
        else:
            TP += 1
    precision = TP / (TP + FP)
    recall    = TP / (TP + FN)
    accuracy  = (TN + TP) / (TN + TP + FN + FP)
    return precision, recall, accuracy

In [18]:
# (precision, recall, accuracy) on the training directories.
precission_recall(spam_dir_train, ham__dir_train)

(0.938337801608579, 1.0, 0.9671428571428572)

In [19]:
# (precision, recall, accuracy) on the test directories.
precission_recall(spam_dir_test, ham__dir_test)

(0.9020979020979021, 0.9923076923076923, 0.9423076923076923)