# Simple Naive Bayes Classifier


## T1. Load a dataset

The following code loads a dataset consisting of text messages and spam-ham labels.



In [None]:
from typing import List, Tuple, Dict, Iterable, Set
from collections import defaultdict
import re
import math
import pandas as pd

# Read the labelled text-message dataset straight from its CSV URL.
url = 'https://raw.githubusercontent.com/mlee-pnu/IDS/main/spam_dataset.csv'
df = pd.read_csv(url)

# Tally the messages per category once and reuse the result for both the
# per-class totals and the printed summary.
category_counts = df['Category'].value_counts()
hams = category_counts["ham"]
spams = category_counts["spam"]
print(category_counts)

ham     4825
spam     747
Name: Category, dtype: int64


## T2. Spam filter for individual words

We first define a function ***tokenize()*** that converts a given text into a set of lowercase words. 

Using this function, we now count, for each class (spam and ham), the number of messages in which each word appears.

Complete the following code and answer the following questions:
 



In [None]:
def tokenize(text: str) -> Set[str]:
    """Return the distinct word tokens of *text*.

    The text is lower-cased first; a token is any maximal run of the
    characters ``a-z``, ``0-9`` and the apostrophe.  Duplicates collapse
    because the result is a set.
    """
    lowered = text.lower()
    return {match.group(0) for match in re.finditer("[a-z0-9']+", lowered)}

In [None]:
from collections import Counter

# Vocabulary of every token seen, plus per-class counts.  tokenize() returns
# a set, so each count is the number of messages of that class containing
# the token (a token counts at most once per message).
tokens: Set[str] = set()
token_spam_counts: Dict[str, int] = defaultdict(int)
token_ham_counts: Dict[str, int] = defaultdict(int)

spam = df[df.Category == 'spam']
ham = df[df.Category == 'ham']

# Flat list of the tokens of every spam message (one entry per message that
# contains the token); summarised into spam_dict below.
spam_word_list = []

for msg in spam['Message']:
    msg_tokens = tokenize(msg)
    tokens.update(msg_tokens)
    spam_word_list.extend(msg_tokens)
    for token in msg_tokens:
        token_spam_counts[token] += 1

for msg in ham['Message']:
    msg_tokens = tokenize(msg)
    tokens.update(msg_tokens)
    for token in msg_tokens:
        token_ham_counts[token] += 1

spam_dict = dict(Counter(spam_word_list))

# TODOs
# Word under inspection; every lookup below goes through this variable so
# that changing it actually changes the analysis.
word = "free"
n_word_spam = token_spam_counts[word]  # number of spam messages containing the word
n_word_ham = token_ham_counts[word]    # number of ham messages containing the word

n_messages = df['Message'].count()
p_spam = spam['Message'].count() / n_messages  # P(spam)
p_ham = ham['Message'].count() / n_messages    # P(ham)

# P(word|class) = P(word, class) / P(class)
p_word_given_spam = (n_word_spam / n_messages) / p_spam  # P(word|spam)
p_word_given_ham = (n_word_ham / n_messages) / p_ham     # P(word|ham)

# P(class|word) = n_word_class / n_word: the 1/n_messages factors of Bayes'
# rule cancel, so p_word holds the number of messages containing the word
# rather than a probability.
p_word = (n_word_ham + n_word_spam)
if p_word:
    p_spam_given_word = n_word_spam / p_word  # P(spam|word)
    p_ham_given_word = n_word_ham / p_word    # P(ham|word)
else:
    # The word occurs in no message, so the posterior is undefined.
    p_spam_given_word = p_ham_given_word = float("nan")
print(p_spam_given_word, p_ham_given_word)


0.74235807860262 0.2576419213973799


## T3. Spam filter that combines words: Naive Bayes

You received a text message "just do it" from an unknown sender.

Complete the function ***predict()*** that outputs the probability of the message being spam and the predicted label of the message. 


In [None]:
# Sample message to classify.
text = "just do it"

# TODOs
# solution 1.  
def predict(text: str, k: float = 0.0):
    """Estimate the probability that *text* is spam with Naive Bayes.

    Every vocabulary token contributes to both class scores: a token that
    occurs in ``text`` adds ``log P(token|class)``, a token that does not
    adds ``log(1 - P(token|class))``.  Words of ``text`` that are not in
    the vocabulary are ignored.

    Reads the module-level ``tokens``, ``token_spam_counts``,
    ``token_ham_counts``, ``spams`` and ``hams`` and calls ``tokenize``.

    Args:
        text: The message to classify.
        k: Additive smoothing factor.  The default ``0.0`` applies no
            smoothing.

    Returns:
        A ``(prob, label)`` tuple: ``prob`` is the estimated P(spam|text)
        and ``label`` is ``"spam"`` when ``prob > 0.5``, otherwise ``"ham"``.

    Raises:
        ValueError: If the message has zero likelihood under both classes
            (only possible without smoothing), which leaves the posterior
            undefined.
    """
    def log_or_neg_inf(p):
        # log(0) is treated as -inf instead of raising "math domain error".
        return math.log(p) if p > 0.0 else -math.inf

    # Loop-invariant work is done once, not once per vocabulary token.
    text_tokens = tokenize(text)
    p_spam = spams / (hams + spams)  # P(spam)
    p_ham = hams / (hams + spams)    # P(ham)

    # Accumulate log P(class) + sum of log-likelihoods for each class.
    log_spam = log_or_neg_inf(p_spam)
    log_ham = log_or_neg_inf(p_ham)

    for token in tokens:
        p_word_given_spam = (token_spam_counts[token] + k) / (spams + 2 * k)
        p_word_given_ham = (token_ham_counts[token] + k) / (hams + 2 * k)

        if token in text_tokens:
            log_spam += log_or_neg_inf(p_word_given_spam)
            log_ham += log_or_neg_inf(p_word_given_ham)
        else:
            log_spam += log_or_neg_inf(1.0 - p_word_given_spam)
            log_ham += log_or_neg_inf(1.0 - p_word_given_ham)

    if log_spam == -math.inf and log_ham == -math.inf:
        raise ValueError(
            "message has zero likelihood under both classes; "
            "use a smoothing factor k > 0"
        )

    # P(spam|text) = 1 / (1 + exp(log_ham - log_spam)).  Working with the
    # difference of the log scores avoids the underflow (and 0/0) that
    # exponentiating each score separately causes for large vocabularies.
    diff = log_ham - log_spam
    if diff >= 0:
        ratio = math.exp(-diff)
        prob = ratio / (1.0 + ratio)
    else:
        prob = 1.0 / (1.0 + math.exp(diff))

    label = "spam" if prob > 0.5 else "ham"
    return prob, label


# Classify the sample message and print the returned (probability, label) pair.
print(predict(text))

(5.132694869879663e-07, 'ham')


## T4. Smoothing method

You again received two text messages from unknown senders.

Complete the function ***spamFilter()*** that classifies a given message. 

You may want to apply a smoothing method for this task.


In [None]:
########## OKAY BUT NOT CORRECT
# Two sample messages to classify.
textA = "reward! download your free ticket from our website www.pnu.edu"
textB = "call me and get your money back"

# TODOs
def spamFilter2(text: str, k: float = 1.0):
    """Classify *text* as spam or ham with a smoothed Naive Bayes model.

    Every vocabulary token contributes to both class scores: a token that
    occurs in ``text`` adds ``log P(token|class)``, a token that does not
    adds ``log(1 - P(token|class))``.  Words of ``text`` that are not in
    the vocabulary are ignored.  Token probabilities are smoothed as
    ``(count + k) / (class_size + 2 * k)``.

    Reads the module-level ``tokens``, ``token_spam_counts``,
    ``token_ham_counts``, ``spams`` and ``hams`` and calls ``tokenize``.
    Also prints the two unnormalised class scores as a diagnostic.

    Args:
        text: The message to classify.
        k: Additive smoothing factor (default ``1.0``).

    Returns:
        A ``(label, prob)`` tuple: ``prob`` is the estimated P(spam|text)
        and ``label`` is ``"spam"`` when ``prob > 0.5``, otherwise ``"ham"``.

    Raises:
        ValueError: If the message has zero likelihood under both classes
            (only possible when ``k`` is 0), which leaves the posterior
            undefined.
    """
    def log_or_neg_inf(p):
        # log(0) is treated as -inf instead of raising "math domain error".
        return math.log(p) if p > 0.0 else -math.inf

    # Loop-invariant work is done once, not once per vocabulary token.
    text_tokens = tokenize(text)
    p_spam = spams / (hams + spams)  # P(spam)
    p_ham = hams / (hams + spams)    # P(ham)

    # Accumulate log P(class) + sum of log-likelihoods for each class.
    log_spam = log_or_neg_inf(p_spam)
    log_ham = log_or_neg_inf(p_ham)

    for token in tokens:
        p_word_given_spam = (token_spam_counts[token] + k) / (spams + 2 * k)
        p_word_given_ham = (token_ham_counts[token] + k) / (hams + 2 * k)

        if token in text_tokens:
            log_spam += log_or_neg_inf(p_word_given_spam)
            log_ham += log_or_neg_inf(p_word_given_ham)
        else:
            log_spam += log_or_neg_inf(1.0 - p_word_given_spam)
            log_ham += log_or_neg_inf(1.0 - p_word_given_ham)

    # Diagnostic output of the unnormalised class scores.  These may
    # underflow to 0.0 for long vocabularies; the posterior below does not
    # depend on them.
    print(math.exp(log_spam), math.exp(log_ham))

    if log_spam == -math.inf and log_ham == -math.inf:
        raise ValueError(
            "message has zero likelihood under both classes; "
            "use a smoothing factor k > 0"
        )

    # P(spam|text) = 1 / (1 + exp(log_ham - log_spam)).  Working with the
    # difference of the log scores avoids the underflow (and 0/0) that
    # exponentiating each score separately causes for large vocabularies.
    diff = log_ham - log_spam
    if diff >= 0:
        ratio = math.exp(-diff)
        prob = ratio / (1.0 + ratio)
    else:
        prob = 1.0 / (1.0 + math.exp(diff))

    label = "spam" if prob > 0.5 else "ham"
    return label, prob

# Classify both sample messages.  Each call prints its two class scores and
# returns a (label, probability) pair, which is printed as well.
print(spamFilter2(textA))
print(spamFilter2(textB))

5.5983428595874115e-33 1.53351339674579e-32
('ham', 0.267434927596669)
7.855596234887694e-25 3.766074044359178e-16
('ham', 2.085884697425939e-09)
