In [26]:
from collections import Counter, defaultdict
from machine_learning import split_data
import math, random, re, glob

In [2]:
def tokenize(message):
    """Return the set of distinct tokens found in ``message``.

    The message is lowercased first; a token is then any maximal run of
    the characters ``a-z``, ``0-9`` and the apostrophe.  Because a set is
    returned, repeated words collapse to a single entry and the original
    word order is not kept.
    """
    token_pattern = re.compile("[a-z0-9']+")
    lowered = message.lower()
    return {token for token in token_pattern.findall(lowered)}

In [3]:
def count_words(training_set):
    """Tally, per word, how many spam and non-spam messages contain it.

    ``training_set`` is iterated as ``(message, is_spam)`` pairs.  Each
    message is tokenized with ``tokenize``, so a word is counted at most
    once per message.

    Returns a ``defaultdict`` mapping word -> ``[spam_count, non_spam_count]``;
    looking up an unseen word yields (and stores) ``[0, 0]``.
    """
    spam_slot, non_spam_slot = 0, 1
    tallies = defaultdict(lambda: [0, 0])
    for text, flagged_as_spam in training_set:
        slot = spam_slot if flagged_as_spam else non_spam_slot
        for token in tokenize(text):
            tallies[token][slot] += 1
    return tallies

In [4]:
def word_probabilities(counts, total_spams, total_non_spams, k=0.5):
    """Convert per-word message counts into smoothed conditional estimates.

    Args:
        counts: mapping of word -> ``(spam_count, non_spam_count)``.
        total_spams: number of spam messages the counts were taken from.
        total_non_spams: number of non-spam messages the counts were taken from.
        k: pseudocount added to each count (and ``2 * k`` to each total).

    Returns:
        A list of ``(word, p_word_given_spam, p_word_given_non_spam)``
        triplets, in the iteration order of ``counts``.
    """
    triplets = []
    for word, (spam_count, non_spam_count) in counts.items():
        p_given_spam = (spam_count + k) / (total_spams + 2 * k)
        p_given_non_spam = (non_spam_count + k) / (total_non_spams + 2 * k)
        triplets.append((word, p_given_spam, p_given_non_spam))
    return triplets

In [5]:
def spam_probability(word_probs, message):
    """Return the model's probability that ``message`` is spam.

    For every ``(word, prob_if_spam, prob_if_not_spam)`` triplet in
    ``word_probs`` two running log-likelihoods are updated: with the log
    of the word's probability when the word occurs in the tokenized
    message, and with the log of its complement when it does not.

    The two sums are then combined as
    ``1 / (1 + exp(log_not_spam - log_spam))``, which is algebraically
    equal to ``p_spam / (p_spam + p_not_spam)``.  Exponentiating each sum
    on its own underflows to 0.0 once the vocabulary is large enough, and
    the division then raises ``ZeroDivisionError``; working with the
    difference avoids that.

    Args:
        word_probs: iterable of ``(word, prob_if_spam, prob_if_not_spam)``.
        message: text to score; it is split into words by ``tokenize``.

    Returns:
        A float in ``[0.0, 1.0]``; ``0.5`` when ``word_probs`` is empty.

    Raises:
        ValueError: from ``math.log`` when a probability that has to be
            logged (or its complement) is exactly 0.
    """
    message_words = tokenize(message)
    log_prob_if_spam = log_prob_if_not_spam = 0.0

    for word, prob_if_spam, prob_if_not_spam in word_probs:
        if word in message_words:
            # Word seen: add the log-probability of seeing it.
            log_prob_if_spam += math.log(prob_if_spam)
            log_prob_if_not_spam += math.log(prob_if_not_spam)
        else:
            # Word absent: add the log-probability of NOT seeing it.
            log_prob_if_spam += math.log(1.0 - prob_if_spam)
            log_prob_if_not_spam += math.log(1.0 - prob_if_not_spam)

    # Logistic function of the log-likelihood difference.  Each branch
    # only calls exp() on a non-positive argument, so it can neither
    # overflow nor produce a 0/0 division.
    log_odds_against = log_prob_if_not_spam - log_prob_if_spam
    if log_odds_against > 0:
        ratio = math.exp(-log_odds_against)  # p_spam / p_not_spam, in (0, 1)
        return ratio / (1.0 + ratio)
    return 1.0 / (1.0 + math.exp(log_odds_against))

In [49]:
class NaiveBayesClassifier:
    """Spam classifier built on the module's word-probability helpers.

    Attributes:
        k: smoothing pseudocount handed to ``word_probabilities``.
        word_probs: list of ``(word, prob_if_spam, prob_if_not_spam)``
            triplets; empty until ``train`` has been called.
    """

    def __init__(self, k=0.5):
        """Store the smoothing factor and start with no learned words."""
        self.word_probs = []
        self.k = k

    def train(self, training_set):
        """Learn word probabilities from ``(message, is_spam)`` pairs.

        ``training_set`` must support ``len()`` and is iterated more than
        once.  Any previously learned ``word_probs`` are replaced.
        """
        spam_total = sum(1 for _, is_spam in training_set if is_spam)
        non_spam_total = len(training_set) - spam_total

        self.word_probs = word_probabilities(
            count_words(training_set),
            spam_total,
            non_spam_total,
            self.k,
        )

    def classify(self, message):
        """Return ``spam_probability`` of ``message`` under the learned words."""
        return spam_probability(self.word_probs, message)

In [50]:
def get_subject_data(path):
    """Collect ``(subject, is_spam)`` pairs from the files matching ``path``.

    ``path`` is a glob pattern.  Every matching file is read as
    ISO-8859-1 text, and each line that starts with ``"Subject:"``
    yields one pair: the line with the leading ``"Subject:"`` plus
    following whitespace removed and then stripped, together with the
    file's label.

    NOTE: a file is labelled non-spam when the substring ``"ham"`` occurs
    anywhere in its matched path (directories included), and spam
    otherwise.
    """
    # Bound ``sub`` of the regex that drops "Subject:" and the spaces after it.
    strip_prefix = re.compile(r"^Subject:\s+").sub

    labelled = []
    for filename in glob.glob(path):
        spam_flag = not ("ham" in filename)

        with open(filename, 'r', encoding='ISO-8859-1') as handle:
            labelled.extend(
                (strip_prefix("", raw_line).strip(), spam_flag)
                for raw_line in handle
                if raw_line.startswith("Subject:")
            )

    return labelled

In [51]:
def p_spam_given_word(word_prob):
    """Return the spam share of a word's two conditional probabilities.

    ``word_prob`` is a ``(word, prob_if_spam, prob_if_not_spam)`` triplet;
    the result is ``prob_if_spam / (prob_if_spam + prob_if_not_spam)``.
    The word itself is not used.
    """
    _, given_spam, given_not_spam = word_prob
    combined = given_spam + given_not_spam
    return given_spam / combined

In [52]:

def train_and_test_model(path):
    """Train a classifier on subject lines under ``path`` and print a report.

    Steps: load ``(subject, is_spam)`` pairs with ``get_subject_data``,
    split them with ``split_data``, train a ``NaiveBayesClassifier`` on
    the first part and score every subject in the second part.  Printed:

    * a ``Counter`` keyed by ``(actual_is_spam, predicted_is_spam)``,
      where a subject is predicted spam when its score exceeds 0.5;
    * the five highest-scoring non-spam and five lowest-scoring spam subjects;
    * the five words with the highest and the five with the lowest
      ``p_spam_given_word``.

    Returns nothing.
    """
    labelled_subjects = get_subject_data(path)

    # Fixed seed for repeatable runs; presumably split_data draws from the
    # `random` module (its source is not visible here).
    random.seed(0.1)
    train_data, test_data = split_data(labelled_subjects, 0.65)

    model = NaiveBayesClassifier()
    model.train(train_data)

    # (subject, actual label, predicted spam score) for each test subject
    classified = []
    for subject, is_spam in test_data:
        classified.append((subject, is_spam, model.classify(subject)))

    # Confusion counts keyed by (actual, predicted)
    counts = Counter(
        (actual, score > 0.5) for _, actual, score in classified
    )
    print(counts)

    # Ascending by score: least spam-like first, most spam-like last.
    by_score = sorted(classified, key=lambda row: row[2])
    spammiest_hams = [row for row in by_score if not row[1]][-5:]
    hammiest_spams = [row for row in by_score if row[1]][:5]

    print("spammiest_hams", spammiest_hams)
    print("hammiest_spams", hammiest_spams)

    ranked_words = sorted(model.word_probs, key=p_spam_given_word)

    print("spammiest_words", ranked_words[-5:])
    print("hammiest_words", ranked_words[:5])

In [53]:
if __name__ == "__main__":
    import os

    # Glob selecting the corpus files.  The default is the original
    # machine-specific location; set SPAM_CORPUS_GLOB to run the notebook
    # against a corpus stored elsewhere without editing this cell.
    train_and_test_model(
        os.environ.get("SPAM_CORPUS_GLOB", r"/Users/HeX/Untitled Folder/spamm/*/*")
    )

Counter({(False, False): 1014, (True, True): 97, (True, False): 69, (False, True): 30})
spammiest_hams [('Save up to 70% on international calls!', False, 0.9922321437129689), ('[ILUG-Social] Re: Important - reenactor insurance needed', False, 0.9958051113229955), ("A-level student sues for £100,000 over 'grade fixing'", False, 0.9973545693494492), ('=?iso-2022-jp?B?UmU6IBskQjswSSkyPTNYJSglcyU4JUslIiVqJXMlME1NJVcbKEI=?=', False, 0.999487739219205), ('=?iso-8859-1?Q?Matrox_Parhelia=99_now_available?=', False, 0.9995773028202543)]
hammiest_spams [('Invite: Content Management Summit, Oct. 10th New York City', True, 0.0011306915739269403), ('Re: This Weekend', True, 0.0014128036067802506), ('Re: Hi', True, 0.0023240193010965163), ('NEW TECHNOLOGY - DIGITAL VIDEO RECORDER (Smart IP Technology)', True, 0.003315296647926699), ('Industry Forum #136', True, 0.0037038190707676492)]
spammiest_words [('zzzz', 0.02514792899408284, 0.0002663825253063399), ('000', 0.02514792899408284, 0.00026638252530