# Naive Bayes

Author: Justin Ventura λ

## Description: A machine learning algorithm that filters spam by estimating, from the words a message contains, the probability that the message is spam rather than ham (not spam).

In [1]:
import re
import math
import numpy as np
from collections import defaultdict
from typing import NamedTuple, Set, List, Tuple, Dict, Iterable

# Message class
class Message(NamedTuple):
    """A message's text paired with its spam/ham label."""
    text: str  # Raw message body.
    is_spam: bool  # True when the message is labeled spam, False otherwise.

In [None]:
# Naive Bayes
class naive_bayes_classifier:
    """Naive Bayes spam classifier over sets of word tokens.

    Each message is reduced to the set of distinct lowercase alphanumeric
    words it contains. Training counts, per token, how many spam and how
    many ham messages contain it. Prediction combines the smoothed
    per-token likelihoods of every known token (present or absent in the
    text) under the naive independence assumption.

    Attributes:
        k: Additive smoothing factor; should be positive so that no
            smoothed probability is exactly 0 or 1.
        tokens: Every token seen during training.
        token_spam_counts: Number of spam messages containing each token.
        token_ham_counts: Number of ham messages containing each token.
        spam_messages: Number of spam messages seen during training.
        ham_messages: Number of ham messages seen during training.
    """

    def __init__(self, k: float = 0.5) -> None:
        """Create an untrained model with smoothing factor ``k``."""
        self.k = k  # Smoothing factor to avoid P(X) = 0.

        self.tokens: Set[str] = set()
        self.token_spam_counts: Dict[str, int] = defaultdict(int)
        self.token_ham_counts: Dict[str, int] = defaultdict(int)
        self.spam_messages = 0
        self.ham_messages = 0

    # 'Private' helper method to compute probabilities.
    def _probabilities(self, token: str) -> Tuple[float, float]:
        """Return the smoothed (P(token | spam), P(token | ham)) pair."""
        # .get() rather than indexing: indexing a defaultdict would insert
        # a zero entry, so predicting would silently mutate the model.
        spam = self.token_spam_counts.get(token, 0)
        ham = self.token_ham_counts.get(token, 0)

        p_token_spam = (spam + self.k) / (self.spam_messages + 2 * self.k)
        p_token_ham = (ham + self.k) / (self.ham_messages + 2 * self.k)

        return p_token_spam, p_token_ham

    # Training method.
    def train(self, messages: Iterable["Message"]) -> None:
        """Update the counts from labeled messages.

        Args:
            messages: Objects exposing ``text`` (str) and ``is_spam``
                (bool). May be called repeatedly; counts accumulate.
        """
        for message in messages:
            if message.is_spam:
                self.spam_messages += 1
                counts = self.token_spam_counts
            else:
                self.ham_messages += 1
                counts = self.token_ham_counts

            # tokenize() returns a set, so each token is counted at most
            # once per message.
            for token in self.tokenize(message.text):
                self.tokens.add(token)
                counts[token] += 1

    # Prediction method.
    def predict(self, text: str) -> float:
        """Return the estimated probability that ``text`` is spam.

        Assumes equal priors for spam and ham. An untrained model (no
        known tokens) returns 0.5.

        Args:
            text: The message to classify.

        Returns:
            A value in [0, 1]: P(text | spam) / (P(text | spam) +
            P(text | ham)).
        """
        text_tokens = self.tokenize(text)
        log_prob_if_spam = log_prob_if_ham = 0.0

        # Iterate over the whole vocabulary: the absence of a known token
        # is evidence too.
        for token in self.tokens:
            prob_if_spam, prob_if_ham = self._probabilities(token)

            if token in text_tokens:
                log_prob_if_spam += math.log(prob_if_spam)
                log_prob_if_ham += math.log(prob_if_ham)
            else:
                log_prob_if_spam += math.log(1.0 - prob_if_spam)
                log_prob_if_ham += math.log(1.0 - prob_if_ham)

        # Subtract the larger log-likelihood before exponentiating so the
        # larger term becomes exactly 1.0. The ratio is unchanged, and a
        # large vocabulary can no longer underflow both terms to 0.0.
        shift = max(log_prob_if_spam, log_prob_if_ham)
        prob_if_spam = math.exp(log_prob_if_spam - shift)
        prob_if_ham = math.exp(log_prob_if_ham - shift)

        return prob_if_spam / (prob_if_spam + prob_if_ham)

    # Tokenize a str.
    @staticmethod
    def tokenize(text: str) -> Set[str]:
        """Return the set of distinct lowercase alphanumeric words in ``text``."""
        return set(re.findall(r'[a-z0-9]+', text.lower()))
    
