In [1]:
from collections import Counter, defaultdict
import math, random, re, glob

In [2]:
def tokenize(message):
    message = message.lower()                       # convert to lowercase
    all_words = re.findall("[a-z0-9']+", message)   # extract the words
    return set(all_words)  

In [3]:
tokenize("This is the way we wash our face, wash our face, wash our face")

{'face', 'is', 'our', 'the', 'this', 'wash', 'way', 'we'}

In [4]:
def count_words(training_set):
    """training set consists of pairs (message, is_spam)"""
    counts = defaultdict(lambda: [0, 0])
    for message, is_spam in training_set:
        for word in tokenize(message):
            counts[word][0 if is_spam else 1] += 1
    return counts

In [5]:
def word_probabilities(counts, total_spams, total_non_spams, k=0.5):
    """turn the word_counts into a list of triplets
    w, p(w | spam) and p(w | ~spam)"""
    return [(w,
             (spam + k) / (total_spams + 2 * k),
             (non_spam + k) / (total_non_spams + 2 * k))
             for w, (spam, non_spam) in counts.items()]

In [6]:
def spam_probability(word_probs, message):
    message_words = tokenize(message)
    log_prob_if_spam = log_prob_if_not_spam = 0.0

    for word, prob_if_spam, prob_if_not_spam in word_probs:

        # for each word in the message,
        # add the log probability of seeing it
        if word in message_words:
            log_prob_if_spam += math.log(prob_if_spam)
            log_prob_if_not_spam += math.log(prob_if_not_spam)

        # for each word that's not in the message
        # add the log probability of _not_ seeing it
        else:
            log_prob_if_spam += math.log(1.0 - prob_if_spam)
            log_prob_if_not_spam += math.log(1.0 - prob_if_not_spam)

    prob_if_spam = math.exp(log_prob_if_spam)
    prob_if_not_spam = math.exp(log_prob_if_not_spam)
    return prob_if_spam / (prob_if_spam + prob_if_not_spam)

In [7]:
class NaiveBayesClassifier:

    def __init__(self, k=0.5):
        self.k = k
        self.word_probs = []

    def train(self, training_set):

        # count spam and non-spam messages
        num_spams = len([is_spam
                         for message, is_spam in training_set
                         if is_spam])
        num_non_spams = len(training_set) - num_spams

        # run training data through our "pipeline"
        word_counts = count_words(training_set)
        self.word_probs = word_probabilities(word_counts,
                                             num_spams,
                                             num_non_spams,
                                             self.k)

    def classify(self, message):
        return spam_probability(self.word_probs, message)

In [8]:
def get_subject_data(path):

    data = []

    # regex for stripping out the leading "Subject:" and any spaces after it
    subject_regex = re.compile(r"^Subject:\s+")

    # glob.glob returns every filename that matches the wildcarded path
    for fn in glob.glob(path):
        is_spam = "ham" not in fn

        with open(fn,'r',encoding='ISO-8859-1') as file:
            for line in file:
                if line.startswith("Subject:"):
                    subject = subject_regex.sub("", line).strip()
                    data.append((subject, is_spam))

    return data

In [9]:
def p_spam_given_word(word_prob):
    word, prob_if_spam, prob_if_not_spam = word_prob
    return prob_if_spam / (prob_if_spam + prob_if_not_spam)

In [10]:
def split_data(data, prob):
    """split data into fractions [prob, 1 - prob]"""
    results = [], []
    for row in data:
        results[0 if random.random() < prob else 1].append(row)
    return results

In [11]:
def train_and_test_model(path):

    data = get_subject_data(path)
    random.seed(0)      # just so you get the same answers as me
    train_data, test_data = split_data(data, 0.75)
    classifier = NaiveBayesClassifier()
    classifier.train(train_data)

    classified = [(subject, is_spam, classifier.classify(subject))
              for subject, is_spam in test_data]

    counts = Counter((is_spam, spam_probability > 0.5) # (actual, predicted)
                     for _, is_spam, spam_probability in classified)

    print(counts)

    classified.sort(key=lambda row: row[2])
    spammiest_hams = list(filter(lambda row: not row[1], classified))[-5:]
    hammiest_spams = list(filter(lambda row: row[1], classified))[:5]

    print("spammiest_hams", spammiest_hams)
    print("hammiest_spams", hammiest_spams)

    words = sorted(classifier.word_probs, key=p_spam_given_word)

    spammiest_words = words[-5:]
    hammiest_words = words[:5]

    print("spammiest_words", spammiest_words)
    print("hammiest_words", hammiest_words)

In [12]:
path = r"/root/spam_data/*/*"

In [13]:
data = get_subject_data(path)

In [14]:
train_and_test_model(path)

Counter({(False, False): 724, (True, True): 76, (True, False): 46, (False, True): 30})
spammiest_hams [('FREE SHIPPING! No Minimum Purchase* at Buy.com', False, 0.9971039598526326), ('=?iso-2022-jp?B?GyRCRnxLXDhsJE43b0w+IUolNSVWJTglJyUvJUghSyEhJTkbKEI=?=', False, 0.9994663932534893), ('[ILUG-Social] Re: Important - reenactor insurance needed', False, 0.9998076814974624), ('[ILUG-Social] Re: Important - reenactor insurance needed', False, 0.9998076814974624), ('Four free e-mailers reviewed, Get the gear you need to burn DVDs', False, 0.9998459802644436)]
hammiest_spams [('I was so scared... my very first DP', True, 6.860927085172968e-05), ('Re: Hi', True, 0.0005049046739866188), ('.Message report from your contact page....//ytu855 rkq', True, 0.0034184359874956517), ('Chicago Meeting Site', True, 0.0054877371963912595), ('Lease Deal', True, 0.006482022422482024)]
spammiest_words [('money', 0.0274869109947644, 0.00023073373327180433), ('rates', 0.030104712041884817, 0.0002307337332718043

In [15]:
data = get_subject_data(path)
random.seed(0)      # just so you get the same answers as me
train_data, test_data = split_data(data, 0.75)
classifier = NaiveBayesClassifier()
classifier.train(train_data)

classified = [(subject, is_spam, classifier.classify(subject))
          for subject, is_spam in test_data]

counts = Counter((is_spam, spam_probability > 0.5) # (actual, predicted)
                 for _, is_spam, spam_probability in classified)

In [16]:
classifier.word_probs

[('tech', 0.0013089005235602095, 0.008075680664513151),
 ('caffeine', 0.0013089005235602095, 0.0011536686663590216),
 ('geothermal', 0.0013089005235602095, 0.0006922011998154131),
 ('specialist', 0.003926701570680628, 0.0029995385325334565),
 ('lockergnome', 0.0013089005235602095, 0.009921550530687587),
 ('19', 0.0013089005235602095, 0.002076603599446239),
 ('friday', 0.0013089005235602095, 0.0016151361329026304),
 ('july', 0.0013089005235602095, 0.004845408398707891),
 ('reg', 0.0013089005235602095, 0.004383940932164283),
 ('headlines', 0.0013089005235602095, 0.014997692662667282),
 ('investor', 0.0013089005235602095, 0.003461005999077065),
 ('with', 0.04057591623036649, 0.04407014305491463),
 ('earnings', 0.0013089005235602095, 0.0011536686663590216),
 ('techs', 0.0013089005235602095, 0.0025380710659898475),
 ('seibel', 0.0013089005235602095, 0.0006922011998154131),
 ('lose', 0.014397905759162303, 0.0006922011998154131),
 ('questions', 0.0013089005235602095, 0.0025380710659898475),
 