In [1]:
from collections import defaultdict
import re


def tokenize(message):
    """ Removes all punctuation and returns a simple set
    of all the words in the given string (pushed to lower case)
    """
    message = message.lower()
    all_words = re.findall("[a-z0-9]+", message)
    return set(all_words)


def count_words(training_set):
    """ training set consists of pairs (message, is_spam) """
    counts = defaultdict(lambda: [0, 0])
    for message, is_spam in training_set:
        for word in tokenize(message):
            counts[words][0 if is_spam else 1] += 1
    return counts


In [4]:
# Here is a quick test of tokenize

mess = """Hey Carolyn,
How are you doing? Are things good?
Things are good? Ah, good. Glad to hear it.
Listen if you can loren ipsum, blah blah blah...
-John"""

print(tokenize(mess))

{'hey', 'can', 'how', 'listen', 'john', 'carolyn', 'good', 'doing', 'ipsum', 'glad', 'things', 'it', 'are', 'hear', 'loren', 'blah', 'if', 'to', 'you', 'ah'}


In [3]:
import math


def word_probabilities(counts, total_spams, total_non_spams, k=0.5):
    """ turn the word counts into a list of triples:
    w, p(w | spam) and p(w | ~spam)
    """
    return [(w,
             (spam + k) / (total_spams + 2 * k),
             (non_spam + k) / (total_non_spams + 2 * k))
             for w, (spam, non_spam) in counts.iteritems()]


def spam_probability(word_probs, message):
    message_words = tokenize(message)
    log_prob_is_spam = log_prob_if_not_spam = 0.0
    
    # iterate through each word in our vocab
    for word, prob_if_spam, prob_if_not_spam in word_probs:
        # if "word" in message, add the log prob of seeing it
        if word in message_words:
            log_prob_if_spam += math.log(prob_if_spam)
            log_prob_if_not_spam += math.log(log_prob_if_not_spam)
        else:
            # if "word" not in message, ad log prob of Not seeing it
            log_prob_if_spam += math.log(1.0 - prob_if_spam)
            log_prob_if_not_spam += math.log(1.0 - prob_if_not_spam)
        
    prob_if_spam = math.exp(log_prob_if_spam)
    prob_if_not_spam = math.exp(log_prob_if_not_spam)
    return prob_if_spam / (prob_if_spam + prob_if_not_spam)


class NaiveBayesClassifier:
    
    def __init__(self, k=0.5):
        self.k = k
        self.word_probs = []
    
    def train(self, training_set):
        """train the classifier"""
        # count spam and non-spam messages
        num_spams = len([is_spam for message, is_spam in training_set if is_spam])
        num_non_spams = len(training_set) - num_spams
        
        # run training data through our "pipeline"
        word_counts = count_words(training_set)
        self.word_probs = word_probabilities(word_counts,
                                             num_spams,
                                             num_non_spams,
                                             self.k)
    
    def classify(self, message):
        return spam_probability(self.word_probs, message)



In [5]:
""" We want a lot of test spam emails. The good folks at Apache have us covered:
https://spamassassin.apache.org/publiccorpus/
"""
import glob
import re

path = 'emails/*/*'
data = []

# glob.glob returns file names in a path, allowing for wild cards
for file_path in glob.glob(path):
    is_spam = 'ham' not in file_path
    
    with open(file_path, 'r') as f:
        for line in f:
            if line.startswith('Subject:'):
                subject = re.sub(r"^Subject: ", "", line).strip()
                data.append((subject, is_spam))

# TODO: Learn to deal with UTF-8 text files in Python v3:
#       http://stackoverflow.com/questions/11918512/python-unicodedecodeerror-utf8-codec-cant-decode-byte

print(len(data))

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa3 in position 803: invalid start byte