In [1]:
import string
import math

In [2]:
def tokenize(text):
    """Split ``text`` into lowercase word tokens.

    Every character listed in ``string.punctuation`` is replaced by a single
    space, the result is lowercased, and the string is split on runs of
    whitespace.

    Args:
        text: the raw message string.

    Returns:
        list[str]: tokens in order of appearance; empty when ``text`` holds
        only whitespace and/or punctuation.
    """
    # Punctuation becomes a space (not deleted) so "medium-rare" yields two tokens.
    punctuation_to_space = {ord(symbol): " " for symbol in string.punctuation}
    return text.translate(punctuation_to_space).lower().split()

def tokenize_messages(messages):
    """Tokenize each message with ``tokenize``, keeping the input order.

    Args:
        messages: iterable of raw message strings.

    Returns:
        list[list[str]]: one token list per message.
    """
    return [tokenize(text) for text in messages]

def create_vocabulary(tokenized_messages):
    """Collect the distinct tokens of a tokenized corpus.

    Args:
        tokenized_messages: iterable of token iterables (one per message).

    Returns:
        list: every distinct token, in ascending sorted order.
    """
    distinct_tokens = {token for tokens in tokenized_messages for token in tokens}
    return sorted(distinct_tokens)

def filter_messages(messages, classes):
    """Split messages into ham and spam according to their labels.

    Messages and labels are paired positionally. A label whose string form is
    ``"0"`` marks ham and ``"1"`` marks spam, so both ``"1"`` and the integer
    ``1`` are accepted. Messages with any other label are left out of both
    lists. If the two inputs differ in length, the extra items of the longer
    one are ignored (``zip`` semantics).

    Args:
        messages: iterable of messages.
        classes: iterable of labels, parallel to ``messages``.

    Returns:
        tuple[list, list]: ``(hams, spams)``, each in input order.
    """
    hams = []
    spams = []
    # Single pass: the inputs may be one-shot iterators, which a second
    # zip() over them would find already exhausted.
    for message, label in zip(messages, classes):
        label_key = str(label)
        if label_key == "0":
            hams.append(message)
        elif label_key == "1":
            spams.append(message)
    return hams, spams

def count_word(messages, word):
    """Count how many times ``word`` occurs across tokenized messages.

    Args:
        messages: iterable of token sequences; each must support ``.count``.
        word: the token to look for.

    Returns:
        int: total number of occurrences over all messages (0 if none).
    """
    occurrences = 0
    for tokens in messages:
        occurrences += tokens.count(word)
    return occurrences

In [3]:
class NaiveBayesSpamFilter(object):
    """Two-class (ham / spam) multinomial Naive Bayes message classifier.

    Train with ``set_parameters`` and score a message with ``classify``.

    Attributes set by ``set_parameters``:
        ham_msgs, spam_msgs: raw messages of each class.
        tokenized_ham, tokenized_spam: token lists of those messages.
        vocab: sorted list of distinct tokens over all training messages.
        p_ham, p_spam: smoothed class priors.
        total_ham_words, total_spam_words: number of tokens seen per class.
        log_p_ham, log_p_spam: ``word -> log P(word | class)`` for every
            vocabulary word.
    """

    def __init__(self, alpha = 1):
        """Create an untrained filter.

        Args:
            alpha: additive smoothing strength used for both the class priors
                and the word likelihoods. It should be positive: with 0, a
                word unseen in one class gets probability 0 and ``math.log``
                raises ``ValueError`` during training.
        """
        self.alpha = alpha
        self.ham_msgs = []
        self.spam_msgs = []
        self.tokenized_spam = []
        self.tokenized_ham = []
        self.vocab = set()
        self.p_ham = 0
        self.p_spam = 0
        self.log_p_ham = {}
        self.log_p_spam = {}
        self.total_ham_words = 0
        self.total_spam_words = 0

    @staticmethod
    def _count_tokens(tokenized_messages):
        """Return ``{token: occurrences}`` summed over all token lists."""
        counts = {}
        for tokens in tokenized_messages:
            for token in tokens:
                counts[token] = counts.get(token, 0) + 1
        return counts

    def set_parameters(self, messages, labels):
        """Estimate priors and per-word log-likelihoods from labelled data.

        Calling this again retrains from scratch; nothing from an earlier
        call is kept.

        Args:
            messages: sequence of raw message strings (``len`` is taken).
            labels: labels parallel to ``messages``; ``filter_messages``
                decides which messages count as ham and which as spam.
        """
        self.ham_msgs, self.spam_msgs = filter_messages(messages, labels)
        total_msgs = len(messages)

        # Class priors, smoothed over the two classes.
        prior_denominator = total_msgs + 2 * self.alpha
        self.p_ham = (len(self.ham_msgs) + self.alpha) / prior_denominator
        self.p_spam = (len(self.spam_msgs) + self.alpha) / prior_denominator

        self.tokenized_ham = tokenize_messages(self.ham_msgs)
        self.tokenized_spam = tokenize_messages(self.spam_msgs)
        self.vocab = create_vocabulary(tokenize_messages(messages))

        # One counting pass per class instead of rescanning the corpus for
        # every vocabulary word.
        ham_counts = self._count_tokens(self.tokenized_ham)
        spam_counts = self._count_tokens(self.tokenized_spam)

        # Totals are token counts (not message counts) so that they share a
        # unit with the per-word occurrence counts in the numerators.
        self.total_ham_words = sum(ham_counts.values())
        self.total_spam_words = sum(spam_counts.values())

        # Additive smoothing adds alpha to each of the |V| word counts, so the
        # denominator grows by alpha * |V|; the likelihoods then sum to 1.
        smoothing_mass = self.alpha * len(self.vocab)
        ham_denominator = self.total_ham_words + smoothing_mass
        spam_denominator = self.total_spam_words + smoothing_mass

        # Build fresh tables so words from a previous fit do not linger.
        log_p_ham = {}
        log_p_spam = {}
        for word in self.vocab:
            p_ham_word = (ham_counts.get(word, 0) + self.alpha) / ham_denominator
            p_spam_word = (spam_counts.get(word, 0) + self.alpha) / spam_denominator
            log_p_ham[word] = math.log(p_ham_word)
            log_p_spam[word] = math.log(p_spam_word)
        self.log_p_ham = log_p_ham
        self.log_p_spam = log_p_spam

    def classify(self, message):
        """Classify ``message`` and print both posterior probabilities.

        Words that are not in the trained vocabulary are ignored.

        Args:
            message: raw message string.

        Returns:
            int: 1 if P(spam | message) > P(ham | message), otherwise 0.

        Raises:
            ValueError: if a class prior is not positive (for example when
                the model has not been trained yet).
        """
        if self.p_ham <= 0 or self.p_spam <= 0:
            raise ValueError(
                "class priors are not positive; call set_parameters() with "
                "training data and a positive alpha first"
            )

        words = tokenize(message)
        log_ham = math.log(self.p_ham)
        log_spam = math.log(self.p_spam)

        for word in words:
            if word in self.log_p_ham and word in self.log_p_spam:
                log_ham += self.log_p_ham[word]
                log_spam += self.log_p_spam[word]

        # Normalize relative to the larger log score: exponentiating the raw
        # scores underflows to 0.0 for long messages and the division by
        # their sum would then fail.
        peak = max(log_ham, log_spam)
        ham_weight = math.exp(log_ham - peak)
        spam_weight = math.exp(log_spam - peak)
        total = ham_weight + spam_weight
        p_ham_msg = ham_weight / total
        p_spam_msg = spam_weight / total

        print(f"P(spam | message): {p_spam_msg}")
        print(f"P(ham | message): {p_ham_msg}")
        return 1 if p_spam_msg > p_ham_msg else 0

In [None]:
# Fit the filter on a two-message toy corpus: label "1" marks spam and "0"
# marks ham. The odd spacing and punctuation exercise the tokenizer.
naiveBayesModel = NaiveBayesSpamFilter(alpha=1)
naiveBayesModel.set_parameters(
    messages=[
        "  Winner! Claim rare secret prize now!  ",
        "'Medium-rare,' she said NOW!!",
    ],
    labels=["1", "0"],
)
# classify() prints both posteriors; its return value (1 = spam, 0 = ham) is
# the cell's displayed output.
naiveBayesModel.classify("secret secret secret rare medium")