<a href="https://colab.research.google.com/github/jamestheengineer/data-science-from-scratch-Python/blob/master/Chapter_13.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Naive Bayes for spam filter
from typing import Set
import re

def tokenize(text: str) -> Set[str]:
  """Return the distinct lowercase word tokens found in `text`.

  A token is a maximal run of ASCII letters, digits, or apostrophes;
  the text is lowercased first, and repeated tokens collapse into a
  single set entry.
  """
  lowered = text.lower()
  # Building a set directly from the matches removes duplicates.
  return {match.group() for match in re.finditer("[a-z0-9']+", lowered)}

# Sanity check: tokens are lowercased and the repeated word "science" appears once.
assert tokenize("Data Science is science") == {"data", "science", "is"}

In [0]:
from typing import NamedTuple

class Message(NamedTuple):
  """A labeled message: its text plus a flag saying whether it is spam."""
  text: str  # the message content
  is_spam: bool  # True for spam, False for non-spam ("ham")
  

In [0]:
from typing import List, Tuple, Dict, Iterable
import math
from collections import defaultdict

class NaiveBayesClassifier:
  """Naive Bayes spam classifier driven by per-token message counts.

  `train` accumulates how many spam and ham messages have been seen and,
  for each token, how many messages of each kind contained it.
  `_probabilities` turns those counts into smoothed conditional
  probabilities using the smoothing factor `k`.
  """

  def __init__(self, k: float = 0.5) -> None:
    """Create an untrained classifier with smoothing factor `k`."""
    self.k = k  # smoothing factor

    self.tokens: Set[str] = set()  # every token seen during training
    # Number of spam / ham messages containing each token; unseen tokens read as 0.
    self.token_spam_counts: Dict[str, int] = defaultdict(int)
    self.token_ham_counts: Dict[str, int] = defaultdict(int)
    self.spam_messages = self.ham_messages = 0

  def train(self, messages: Iterable[Message]) -> None:
    """Update the message and token counts from labeled `messages`."""
    for message in messages:
      # Bump the message count and select the matching token table once
      # per message, rather than re-testing the label for every token.
      if message.is_spam:
        self.spam_messages += 1
        token_counts = self.token_spam_counts
      else:
        self.ham_messages += 1
        token_counts = self.token_ham_counts

      # tokenize() returns a set, so each token counts once per message.
      for token in tokenize(message.text):
        self.tokens.add(token)
        token_counts[token] += 1

  def _probabilities(self, token: str) -> Tuple[float, float]:
    """Return the smoothed pair (P(token | spam), P(token | ham)).

    Each probability is (count + k) / (messages + 2 * k), so a token
    that never appeared in one class still gets a nonzero probability
    whenever k > 0.
    """
    spam = self.token_spam_counts[token]
    ham = self.token_ham_counts[token]

    # Both denominators must wrap the whole sum `messages + 2 * k`.
    p_token_spam = (spam + self.k) / (self.spam_messages + 2 * self.k)
    p_token_ham = (ham + self.k) / (self.ham_messages + 2 * self.k)

    return p_token_spam, p_token_ham
    