# M2608.001300 Machine Learning <br> Assignment #4 Naïve Bayes Classifier for Spam Filtering

Copyright (C) Data Science Laboratory, Seoul National University. This material is for educational uses only. Some contents are based on the material provided by other paper/book authors and may be copyrighted by them. Written by Jangho Lee, May 2018

**Note**: certain details are missing or ambiguous on purpose, in order to test your knowledge of the related material. However, if you really feel that something essential is missing and you cannot proceed to the next step, then contact the teaching staff with a clear description of your problem.

### Submitting your work:
<font color=red>**DO NOT clear the final outputs**</font> so that TAs can grade both your code and results.  
Once you have done **all parts**, run the *CollectSubmission.sh* script with your **student_id** as input argument. <br>
This will produce a zipped file called *[student_id].zip*. Please submit this file on ETL. &nbsp;&nbsp; (Usage: ./*CollectSubmission.sh* &nbsp; student_id)

In [849]:
import os
import sys
import collections
import re
import math
import copy
import codecs

In [850]:
# Prepare data: file-system locations used by the rest of the notebook

# Training partition, one directory per class label
train_ham_dir = './data/train/ham'
train_spam_dir = './data/train/spam'

# Test partition, one directory per class label
test_ham_dir = './data/test/ham'
test_spam_dir = './data/test/spam'

# Stop-word list read by set_stop_words()
stop_words_path = './data/stop_words.txt'

In [851]:
# Containers that the later steps fill in

# Emails keyed by file path; populated by generate_dataset()
train_data, test_data = {}, {}

# Class labels (index 0 is ham, index 1 is spam)
classes = ['ham', 'spam']

# Per-class conditional probability tables; filled in by train()
cond_prob = {'ham': {}, 'spam': {}}

# Class label -> prior; filled in by train()
prior = {}

In [852]:
# 'Document' class to store one email instance
class Document:
    """
    One email together with its word counts and its class labels.

    Attributes:
        text: email content as passed to the constructor
        word_frequency: dictionary (key: word, value: count); the dictionary
                        passed to the constructor is stored as-is, not copied
        true_class: label given at construction time (spam / ham)
        learned_class: predicted label; '' until set_predicted_class() is called
    """

    # Constructor
    def __init__(self, text, word_count, true_class):
        # All state is created per instance. Declaring a mutable
        # ``word_frequency = {}`` in the class body would create one
        # dictionary shared by every instance that never assigns its own.
        self.text = text
        self.word_frequency = word_count

        # spam / ham
        self.true_class = true_class
        self.learned_class = ''

    # return email content
    def get_text(self):
        return self.text

    # return word frequency
    def get_word_frequency(self):
        return self.word_frequency

    # return true class
    def get_true_class(self):
        return self.true_class

    # return predicted class ('' when no prediction has been stored yet)
    def get_predicted_class(self):
        return self.learned_class

    # set the prediction
    def set_predicted_class(self, prediction):
        self.learned_class = prediction
    

In [853]:
# Load every regular file found directly under a directory into the dataset, D
def generate_dataset(storage_dict, path, true_class):
    """
    Input: storage_dict (dictionary that is filled in place),
           path (directory to scan),
           true_class (label given to every file read from path)
    Each file adds one entry of the form
    {file_path: Document(text, bag_of_words(text), true_class)}.
    Entries of path that are not regular files are skipped.
    Return: nothing
    """
    for entry in os.listdir(path):
        file_path = os.path.join(path, entry)
        if not os.path.isfile(file_path):
            continue

        # errors='ignore' drops undecodable bytes instead of raising
        with codecs.open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
            content = handle.read()

        # update storage
        storage_dict[file_path] = Document(content, bag_of_words(content), true_class)

In [854]:
# Count how often each whitespace-separated word occurs in a text
def bag_of_words(text):
    """
    Input: text (email contents)
    Return: dictionary (key: word, value: count), keys in order of first occurrence
    """
    # Counter does the tallying; convert back so callers still get a plain dict
    return dict(collections.Counter(text.split()))

In [855]:
# Extract the vocabulary of all the text in a data set
def extract_vocabulary(data_set):
    """
    Input: dictionary of data whose values provide get_text()
          (updated dictionary after generate_dataset and remove_stop_words functions)
    Return: vocabulary list (each distinct whitespace-separated word once,
            in order of first occurrence)
    """
    # A dictionary is used as an insertion-ordered set. Splitting each text on
    # its own yields the same words as splitting one space-joined
    # concatenation, without building that concatenation by repeated '+'.
    seen = {}
    for document in data_set.values():
        for word in document.get_text().split():
            seen.setdefault(word, None)

    return list(seen)

In [856]:
# Set the stop words using the 'stop_words.txt'
# Open the txt file and put each stop word into the stop words list
def set_stop_words():
    """
    Input: none (the file is located through the global stop_words_path)
    Return: list of the whitespace-separated words found in the file
    """
    # errors='ignore' drops undecodable bytes instead of raising
    with codecs.open(stop_words_path, 'r', encoding='utf-8', errors='ignore') as text_file:
        return text_file.read().split()

In [857]:
# Remove stop words from data using the property of 'Document' class
def remove_stop_words(stops, data_set):
    """
    Input: stops (list of stop words),
           data_set (dictionary {key: Document})
    Return: new dictionary with the same keys; every Document is rebuilt from
            the words of its text that are not stop words. data_set itself is
            left untouched.
    """
    # Build the lookup set once: membership tests against a list would scan
    # the whole stop-word list for every single word of every email.
    stop_set = set(stops)

    new_data = {}
    for key, document in data_set.items():
        kept = [word for word in document.get_text().split() if word not in stop_set]

        # Each kept word is wrapped in one leading and one trailing space,
        # which is the exact text layout this function has always produced.
        txt = ''.join(' ' + word + ' ' for word in kept)

        new_data[key] = Document(txt, bag_of_words(txt), document.get_true_class())

    return new_data

In [858]:
# Training
def train(training, priors, cond_prob, alpha=0.1):
    """
    Estimate class priors and smoothed per-class word probabilities.

    Input: training  - dictionary {key: Document} of labelled emails
           priors    - dictionary, filled in place with {class: n_c / n}
           cond_prob - dictionary, filled in place with
                       {class: {word: log10((count + alpha) / (N + alpha * |V|))}}
                       where N is the number of word occurrences in the class
                       and |V| is the size of the training vocabulary
           alpha     - additive smoothing constant
    Return: nothing (priors and cond_prob are updated in place)

    A class without any training document gets the prior 0.0.
    Raises ZeroDivisionError when training is empty (or when a smoothing
    denominator is zero) and ValueError from math.log10 when alpha is 0 and
    a vocabulary word never occurs in some class.
    """
    # the vocabulary of the training set
    v = extract_vocabulary(training)  # type: list
    vocabulary_size = len(v)

    # the number of documents
    n = len(training)

    # for each class in classes (spam / ham)
    for c in classes:
        # texts of all docs in class (D, c)
        class_texts = [document.get_text() for document in training.values()
                       if document.get_true_class() == c]

        # float() keeps this a true division whatever the integer semantics
        priors[c] = float(len(class_texts)) / n

        # Count frequencies of each term of the class (i.e. token : frequency);
        # joining once avoids rebuilding the string for every document
        token_freqs = bag_of_words(" ".join(class_texts))

        # total number of word occurrences in the class
        N = float(sum(token_freqs.values()))

        # the smoothing denominator is the same for every word of the class
        denominator = N + alpha * vocabulary_size

        # Reuse the caller's table when there is one, but empty it first so
        # that words from an earlier call on different data cannot survive.
        table = cond_prob.setdefault(c, {})
        table.clear()

        # smoothed conditional probabilities in log-scale; unseen words count 0
        for t in v:
            table[t] = math.log10((token_freqs.get(t, 0) + alpha) / denominator)

In [859]:
# Test
def test(data_instance, priors, cond_prob):
    """
    Input: data_instance (object providing get_word_frequency()),
           priors ({class: prior}),
           cond_prob ({class: {word: log10 conditional probability}})
    Return: "spam" when the spam score is strictly greater than the ham score,
            otherwise "ham"
    """
    # the word counts of the instance are the same for every class
    counts = data_instance.get_word_frequency()

    score = {}
    for label in classes:
        # start from the log prior, then add count * log-probability for
        # every word of the instance that the class table knows about
        total = math.log10(float(priors[label]))
        table = cond_prob[label]
        for word in counts:
            if word in table:
                total += table[word] * counts[word]
        score[label] = total

    return "spam" if score["spam"] > score["ham"] else "ham"

In [860]:
# Prepare data
# Fill train_data / test_data in place from the four corpus directories;
# classes[1] is 'spam' and classes[0] is 'ham'.
generate_dataset(train_data, train_spam_dir, classes[1])
generate_dataset(train_data, train_ham_dir, classes[0])
generate_dataset(test_data, test_spam_dir, classes[1])
generate_dataset(test_data, test_ham_dir, classes[0])


# Words read from stop_words_path, to be filtered out of the emails
stop_words = set_stop_words() # type: list


In [842]:
# Remove stop words
# remove_stop_words returns a new dictionary, so both names are rebound to
# the filtered data.

train_data = remove_stop_words(stop_words, train_data)
test_data = remove_stop_words(stop_words, test_data)


In [844]:
# Smoothing constants to evaluate; the model is retrained once per value
alphas = [0.02]

for a in alphas:
    # Train (fills the global prior and cond_prob in place)
    train(train_data, prior, cond_prob, a)
    # Count the correctly classified emails
    correct_predictions = 0
    for i in test_data:
        # predict the class (spam or not) and store it on the document
        pred = test(test_data[i], prior, cond_prob)
        test_data[i].set_predicted_class(pred)
    
        # calculate the accuracy
        if pred == test_data[i].get_true_class():
            correct_predictions += 1

    # Accuracy in percent; the division raises ZeroDivisionError when test_data is empty
    print("Spam filtering accuracy:\t\t\t%.4f%%" % (100.0 * float(correct_predictions) / float(len(test_data))))

Spam filtering accuracy:			96.0251%
