# Chapter 13 - Naive Bayes

### A Really Dumb Spam Filter

#### Import Libraries

In [12]:
import glob
import math
import random
import re
import tarfile
from collections import Counter, defaultdict
from io import BytesIO
from typing import Dict, Iterable, List, NamedTuple, Set, Tuple, TypeVar

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests

### A Really Dumb Spam Filter

In [13]:
# P(S|B) = [P(B|S)P(S)] / [P(B|S)P(S) + P(B|-S)P(-S)]

### A more Sophisticated Spam Filter


In [14]:
# P(S|X = x) = P(X = x|S)/[P(X = x|S) + P(X = x|-S)]

# Implementation

In [15]:
def tokenize(text: str) -> Set[str]:
    """Return the distinct lowercase alphanumeric words found in ``text``.

    The text is lowercased first, then every maximal run of letters and
    digits is taken as one token; duplicates collapse because the result
    is a set.
    """
    lowered = text.lower()
    return {match.group() for match in re.finditer("[a-z0-9]+", lowered)}


assert tokenize("Data Science is science") == {"data", "science", "is"}

In [16]:
class Message(NamedTuple):
    """A piece of message text paired with its spam/ham label."""

    # The text that will be tokenized.
    text: str
    # True when the message is spam, False when it is ham.
    is_spam: bool

In [17]:
class NaiveBayesClassifier:
    """Naive Bayes spam classifier over sets of word tokens.

    Training records, for every token, how many spam messages and how many
    ham messages contain it.  Prediction combines the smoothed per-token
    probabilities under the "naive" assumption that tokens occur
    independently of each other given the class.
    """

    def __init__(self, k: float = 0.5) -> None:
        """Create an untrained classifier.

        Args:
            k: Smoothing pseudocount added to every token count.  It should
                be positive: with ``k = 0`` a token never seen in a class
                gets probability 0, which ``predict`` cannot take the log of.
        """
        self.k = k

        self.tokens: Set[str] = set()
        self.token_spam_counts: Dict[str, int] = defaultdict(int)
        self.token_ham_counts: Dict[str, int] = defaultdict(int)
        self.spam_messages = self.ham_messages = 0

    def train(self, messages: Iterable["Message"]) -> None:
        """Update the message and token counts from labelled messages.

        Can be called repeatedly; counts accumulate across calls.
        """
        for message in messages:
            # Increment message counts
            if message.is_spam:
                self.spam_messages += 1
            else:
                self.ham_messages += 1

            # Increment word counts (each token counts once per message,
            # because tokenize returns a set)
            for token in tokenize(message.text):
                self.tokens.add(token)
                if message.is_spam:
                    self.token_spam_counts[token] += 1
                else:
                    self.token_ham_counts[token] += 1

    # "Private" Helper Function
    def _probabilities(self, token: str) -> Tuple[float, float]:
        """Return the smoothed ``(P(token | spam), P(token | ham))``."""
        # .get() instead of indexing so that asking about an unseen token
        # does not insert a zero entry into the defaultdicts.
        spam = self.token_spam_counts.get(token, 0)
        ham = self.token_ham_counts.get(token, 0)

        p_token_spam = (spam + self.k) / (self.spam_messages + 2 * self.k)
        p_token_ham = (ham + self.k) / (self.ham_messages + 2 * self.k)

        return p_token_spam, p_token_ham

    def predict(self, text: str) -> float:
        """Return the estimated probability that ``text`` is spam.

        Computes P(X|S) / (P(X|S) + P(X|-S)); no class prior is applied.
        The likelihoods are accumulated as sums of logs to avoid multiplying
        many small numbers together.
        """
        text_tokens = tokenize(text)
        log_prob_if_spam = log_prob_if_ham = 0.0

        # Iterate through each word in our vocabulary
        for token in self.tokens:
            prob_if_spam, prob_if_ham = self._probabilities(token)

            if token in text_tokens:
                # Token appears: add the log probability of seeing it
                log_prob_if_spam += math.log(prob_if_spam)
                log_prob_if_ham += math.log(prob_if_ham)
            else:
                # Token absent: add the log probability of NOT seeing it
                log_prob_if_spam += math.log(1.0 - prob_if_spam)
                log_prob_if_ham += math.log(1.0 - prob_if_ham)

        # Only after the whole vocabulary has been processed do we convert
        # back from log space and normalise.
        prob_if_spam = math.exp(log_prob_if_spam)
        prob_if_ham = math.exp(log_prob_if_ham)

        if prob_if_spam + prob_if_ham == 0.0:
            # Both likelihoods underflowed to 0.0.  Rescaling both by the
            # larger log leaves the ratio unchanged and makes one term 1.0.
            shift = max(log_prob_if_spam, log_prob_if_ham)
            prob_if_spam = math.exp(log_prob_if_spam - shift)
            prob_if_ham = math.exp(log_prob_if_ham - shift)

        return prob_if_spam / (prob_if_spam + prob_if_ham)

### Testing Model

In [31]:
# Tiny hand-made training set: one spam message and two ham messages.
message = [Message("spam rules", is_spam=True),
           Message("ham rules", is_spam=False),
           Message("hello ham", is_spam=False)]

# Classifier with smoothing pseudocount k = 0.5.
model = NaiveBayesClassifier(k=0.5)

In [24]:
# Fit the classifier on the three example messages.
model.train(message)

In [26]:
# After training, the vocabulary and the per-class counts should reflect
# exactly the three example messages.
assert model.tokens == {"spam", "ham", "rules", "hello"}
assert model.spam_messages == 1
assert model.ham_messages == 2
# Token counts are "number of messages of that class containing the token".
assert model.token_spam_counts == {"spam": 1, "rules": 1}
assert model.token_ham_counts == {"ham": 2, "rules": 1, "hello": 1}

In [28]:
text = "hello spam"

# Hand-computed likelihood factors for every vocabulary token, using k = 0.5,
# 1 spam message and 2 ham messages.  Tokens present in `text` contribute
# p, absent tokens contribute (1 - p).
probs_if_spam = [
    (1 + 0.5) / (1 + 2 * 0.5),      # "spam"  (present)
    1 - (0 + 0.5) / (1 + 2 * 0.5),  # "ham"   (not present)
    1 - (1 + 0.5) / (1 + 2 * 0.5),  # "rules" (not present)
    (0 + 0.5) / (1 + 2 * 0.5),      # "hello" (present)
]

probs_if_ham = [
    (0 + 0.5) / (2 + 2 * 0.5),      # "spam"  (present)
    1 - (2 + 0.5) / (2 + 2 * 0.5),  # "ham"   (not present)
    1 - (1 + 0.5) / (2 + 2 * 0.5),  # "rules" (not present)
    (1 + 0.5) / (2 + 2 * 0.5),      # "hello" (present)
]

p_if_spam = math.exp(sum(math.log(p) for p in probs_if_spam))
p_if_ham = math.exp(sum(math.log(p) for p in probs_if_ham))

# The model sums its logs in set-iteration order, which need not match the
# order used above, so compare with a tolerance instead of exact equality.
assert math.isclose(model.predict(text), p_if_spam / (p_if_spam + p_if_ham))

### Using Our Model

In [32]:
# Download the SpamAssassin public-corpus archives and unpack them locally.
BASE_URL = "https://spamassassin.apache.org/old/publiccorpus"
# The archives are bzip2-compressed tarballs, i.e. "<name>.tar.bz2".
FILES = ["20021010_easy_ham.tar.bz2",
         "20021010_hard_ham.tar.bz2",
         "20021010_spam.tar.bz2"]
OUTPUT_DIR = 'spam_data'

for filename in FILES:
    # Fetch the archive; fail loudly on an HTTP error instead of handing an
    # error page to tarfile, and do not hang forever on a stalled connection.
    response = requests.get(f"{BASE_URL}/{filename}", timeout=60)
    response.raise_for_status()
    content = response.content

    # Wrap the in-memory bytes so tarfile can read them like a file.
    fin = BytesIO(content)

    with tarfile.open(fileobj=fin, mode='r:bz2') as tf:
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive.  On Python versions that support extraction filters,
        # consider passing filter="data".
        tf.extractall(OUTPUT_DIR)

In [33]:
import glob, re

In [34]:
# Glob pattern matching every file one directory below spam_data/.
path = 'spam_data/*/*'

# Labelled messages collected from the corpus.  This must be an annotation
# (`data: ...`), not a second assignment target (`data = List[...] = []`).
data: List["Message"] = []

In [35]:
# Read every matched file and keep only its subject line as a labelled Message.
for filename in glob.glob(path):
    # Any path that does not contain "ham" is treated as spam.
    is_spam = "ham" not in filename

    # errors='ignore' drops bytes that cannot be decoded instead of raising.
    with open(filename, errors='ignore') as email_file:
        for line in email_file:
            if line.startswith("Subject:"):
                # Slice the header name off.  str.lstrip("Subject: ") would
                # strip any leading run of those *characters* and so eat the
                # start of subjects such as "test" (-> "st").
                subject = line[len("Subject:"):].lstrip()
                data.append(Message(subject, is_spam))
                break  # only the first Subject header of each file is used

In [36]:
X = TypeVar('X')  # generic type to represent a data point


def split_data(data: List[X], prob: float) -> Tuple[List[X], List[X]]:
    """Split data into fractions [prob, 1 - prob].

    The input list is left untouched: a shallow copy is shuffled with the
    module-level ``random`` generator and cut at ``int(len(data) * prob)``.

    Returns:
        A ``(first, second)`` pair holding roughly ``prob`` and ``1 - prob``
        of the elements respectively.
    """
    data = data[:]                    # Make a shallow copy
    random.shuffle(data)              # because shuffle modifies the list.
    cut = int(len(data) * prob)       # Use prob to find a cutoff
    return data[:cut], data[cut:]     # and split the shuffled list there.


# Self-check on throwaway integers.  Dedicated names are used so that the
# module-level `data` list of messages, which is split later in this file,
# is not overwritten.
numbers = list(range(1000))
train_numbers, test_numbers = split_data(numbers, 0.75)

# The proportions should be correct
assert len(train_numbers) == 750
assert len(test_numbers) == 250

# And the original data should be preserved (in some order)
assert sorted(train_numbers + test_numbers) == numbers

Y = TypeVar('Y')  # generic type to represent output variables

In [37]:
random.seed(0)  # fix the seed so the shuffle inside split_data is repeatable
train_message, test_messages = split_data(data, 0.75)  # 75% train, 25% test

model = NaiveBayesClassifier()  # default smoothing, k = 0.5
model.train(train_message)

In [38]:
# Pair every held-out message with the spam probability the model assigns it.
predictions = [(message, model.predict(message.text))
               for message in test_messages]

In [39]:
# Tally (actual is_spam, predicted is_spam) pairs; a message is predicted
# spam when its probability exceeds 0.5.
confusion_matrix = Counter((message.is_spam, spam_probability > 0.5)
                           for message, spam_probability in predictions)

In [40]:
# Keys are (actual is_spam, predicted is_spam); values are message counts.
print(confusion_matrix)

In [41]:
def p_spam_given_token(token: str, model: "NaiveBayesClassifier") -> float:
    """Return the spam probability implied by ``token`` on its own.

    Uses the model's smoothed per-token probabilities and normalises them:
    P(token|spam) / (P(token|spam) + P(token|ham)).

    NOTE: this deliberately calls the model's "private" ``_probabilities``
    helper.
    """
    prob_if_spam, prob_if_ham = model._probabilities(token)

    return prob_if_spam / (prob_if_spam + prob_if_ham)

In [42]:
# Order the vocabulary from least to most spam-indicative.
words = sorted(model.tokens, key=lambda t: p_spam_given_token(t, model))

# The ten highest-scoring tokens sit at the end of the list and the ten
# lowest at the front; both ends need a slice, not a single index.
print("spammiest_words", words[-10:])
print("hammiest_words", words[:10])