### Navie Bayes
<b>Spam and Ham Classifier</b>

In [1]:
from typing import Set
import re
def tokenize(text: str) -> Set[str]:
    text = text.lower()
    all_words = re.findall("[a-z0-9]+",text)
    return set(all_words)

In [2]:
from typing import NamedTuple
class Message(NamedTuple):
    text: str
    is_spam: bool

In [29]:
#Building the spam classifier

from typing import List, Tuple, Dict,Iterable
import math
from collections import defaultdict

class NavieBayesClassifer:
    def __init__(self, k:float=0.5)-> None:
        self.k = k
        self.tokens: Set[str] = set()
        self.token_spam_counts: Dict[str, int] = defaultdict(int)
        self.token_ham_counts: Dict[str, int] = defaultdict(int)
        self.spam_messages = self.ham_messages = 0
        
    def train(self, message: Iterable[Message]) -> None:
        for message in messages:
            if message.is_spam:
                self.spam_messages += 1
            else:
                self.ham_messages += 1

            for token in tokenize(message.text):
                self.tokens.add(token)
                if message.is_spam:
                    self.token_spam_counts[token] += 1
                else:
                    self.token_ham_counts[token] += 1
                    
    def _probabilities(self,token: str) -> Tuple[float,float]:
        spam = self.token_spam_counts[token]
        ham = self.token_ham_counts[token]
        
        p_token_spam = (spam + self.k) / (self.spam_messages + 2*self.k)
        p_token_ham = (ham + self.k) / (self.ham_messages + 2*self.k)
        
        return p_token_spam,p_token_ham
    
    def predict(self,text:str) -> float:
        text_tokens = tokenize(text)
        log_prob_if_spam = log_prob_if_ham = 0.0
        
        for token in self.tokens:
            prob_if_spam, prob_if_ham = self._probabilities(token)
            if token in text_tokens:
                log_prob_if_spam += math.log(prob_if_spam)
                log_prob_if_ham += math.log(prob_if_ham)
                # Otherwise add the log probability of _not_ seeing it,
                # which is log(1 - probability of seeing it)
            else:
                log_prob_if_spam += math.log(1.0 - prob_if_spam)
                log_prob_if_ham += math.log(1.0 - prob_if_ham)
            
        prob_if_spam = math.exp(log_prob_if_spam)
        prob_if_ham = math.exp(log_prob_if_ham)
        return prob_if_spam / (prob_if_spam + prob_if_ham)

In [30]:
# Creating a small dataset and training the dataset
messages = [Message("spam rules", is_spam=True),
                Message("ham rules", is_spam=False),
                Message("hello ham", is_spam=False)]

model = NavieBayesClassifer(k=0.5)
model.train(messages)

In [31]:
#Checking the outputs and the training result.

assert model.tokens == {"spam", "ham", "rules", "hello"}
assert model.spam_messages == 1
assert model.ham_messages == 2
assert model.token_spam_counts == {"spam": 1, "rules": 1}
assert model.token_ham_counts == {"ham": 2, "rules": 1, "hello": 1}

In [32]:
#Making a prediction

text = "hello spam"
probs_if_spam = [
                (1 + 0.5) / (1 + 2 * 0.5), # "spam" (present)
                1 - (0 + 0.5) / (1 + 2 * 0.5), # "ham" (not present)
                1 - (1 + 0.5) / (1 + 2 * 0.5), # "rules" (not present)
                (0 + 0.5) / (1 + 2 * 0.5) # "hello" (present)
            ]
probs_if_ham = [
                (0 + 0.5) / (2 + 2 * 0.5), # "spam" (present)
                1 - (2 + 0.5) / (2 + 2 * 0.5), # "ham" (not present)
                1 - (1 + 0.5) / (2 + 2 * 0.5), # "rules" (not present)
                (1 + 0.5) / (2 + 2 * 0.5), # "hello" (present)
            ]

p_if_spam = math.exp(sum(math.log(p) for p in probs_if_spam))
p_if_ham = math.exp(sum(math.log(p) for p in probs_if_ham))
# Should be about 0.83
assert model.predict(text) == p_if_spam / (p_if_spam + p_if_ham)

In [33]:
# the Navie Classifier works well...

#### Using the model on a dataset
<b>Dataset - <a href="https://spamassassin.apache.org/old/publiccorpus/">SpamAssassin Public Corpus</a></b>

In [35]:
from io import BytesIO
import requests
import tarfile

BASE_URL = "https://spamassassin.apache.org/old/publiccorpus"
FILES = ["20021010_easy_ham.tar.bz2",
        "20021010_hard_ham.tar.bz2",
        "20021010_spam.tar.bz2"]

OUTPUT_DIR = 'spam_data'

for filename in FILES:
    content = requests.get(f"{BASE_URL}/{filename}").content
    fin = BytesIO(content)
    with tarfile.open(fileobj = fin, mode='r:bz2') as tf:
        tf.extractall(OUTPUT_DIR)

In [36]:
import glob, re
path = "spam_data/*/*"
data : List[Message] = []
for filename in glob.glob(path):
    is_spam = "ham" not in filename
    
    with open (filename, errors='ignore') as email_file:
        for line in email_file:
            if line.startswith("Subject:"):
                subject = line.lstrip("Subject: ")
                data.append(Message(subject, is_spam))
                break

In [38]:
import random
from scratch.machine_learning import split_data
random.seed(0)
train_messages, test_messages = split_data(data,0.75)
model = NavieBayesClassifer()
model.train(train_messages)

In [41]:
from collections import Counter
predictions = [(message, model.predict(message.text))
              for message in test_messages]

confusion_martix = Counter((message.is_spam, spam_probability >0.5)
                          for message, spam_probability in predictions)
print(confusion_martix)

Counter({(False, True): 699, (True, True): 126})


In [43]:
def p_spam_given_token(token: str, model:NavieBayesClassifer) -> float:
    prob_if_spam, prob_if_ham = model._probabilities(token)
    return prob_if_spam / (prob_if_spam + prob_if_ham)

word = sorted(model.tokens, key=lambda t: p_spam_given_token(t,model))
print("spammies_words",word[-10:])
print("hammiest_word", word[:10])

spammies_words ['ham', 'hello', 'rules', 'spam']
hammiest_word ['ham', 'hello', 'rules', 'spam']
