In [None]:
import math
import random
import re
import zipfile
from collections import Counter

import requests

In [None]:
# Download the SMS Spam Collection archive and save it as spam.zip.
# The timeout keeps the cell from hanging on an unresponsive server, and
# raise_for_status() stops an HTTP error page from being written to disk as
# if it were the zip archive.

r = requests.get(
    "http://www.dt.fee.unicamp.br/~tiago/smsspamcollection/smsspamcollection.zip",
    timeout=30,
)
r.raise_for_status()
with open("spam.zip", "wb") as f:
    f.write(r.content)

In [None]:
# Extract the archive, then parse the text file into [label, bag-of-words]
# pairs, where the bag of words is the set of lowercase [0-9a-z_]+ tokens.

# Context manager closes the archive handle once extraction is done.
with zipfile.ZipFile("spam.zip") as archive:
    archive.extractall("./")

data = []
# Explicit encoding so parsing does not depend on the platform default.
# NOTE(review): assumes the dataset file is UTF-8 -- confirm.
with open("SMSSpamCollection.txt", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Skip blank lines instead of failing on the unpack below.
            continue
        # Split on the first tab only, so a tab inside the message text
        # does not break the two-way unpack.
        cls, txt = line.split('\t', 1)
        bow = set(re.findall("[0-9a-z_]+", txt.lower()))
        data.append([cls, bow])

In [None]:
# Split the data into an 80/20 train/test partition.
# A copy is shuffled with a dedicated fixed-seed RNG so the split is
# reproducible and re-running this cell yields the same partition (`data`
# itself is left in file order).

SPLIT_SEED = 42
shuffled = list(data)
random.Random(SPLIT_SEED).shuffle(shuffled)

train_size = int(0.8 * len(shuffled))
test_size = len(shuffled) - train_size

train = shuffled[:train_size]
test = shuffled[train_size:]

print(len(train), len(test))
# Show a small sample only; printing all of `train` floods the output.
print(train[:3])

In [None]:
# Class priors estimated from the training split, with additive smoothing.

# Smoothing constant added to every count.
alpha = 0.0005

n_total = train_size
# Summing booleans counts the training messages carrying each label.
n_spam = sum(label == 'spam' for label, _ in train)
n_ham = sum(label == 'ham' for label, _ in train)

# Both priors share one denominator: total messages plus alpha per class.
smoothed_total = n_total + 2*alpha
prior_spam = (n_spam + alpha) / smoothed_total
prior_ham = (n_ham + alpha) / smoothed_total

print(prior_spam, prior_ham)

In [None]:
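# NOTE: superseded manual counting loop, kept commented out for reference;
# spam_words / ham_words are built with Counter in the next cell.
# spam_words = {}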
# ham_words = {}

# # print(train)

# for cls, bow in train:
#     for word in bow:
#         if cls == 'spam':
#             if word not in spam_words:
#                 spam_words[word] = 0
#             spam_words[word] += 1

#         else:
#             if word not in ham_words:
#                 ham_words[word] = 0
#             ham_words[word] += 1

# ham_words

In [None]:
# Per-class word tallies: for each word, the number of training messages of
# that class whose bag of words contains it.
spam_words = Counter(
    token
    for label, tokens in train
    if label == 'spam'
    for token in tokens
)
ham_words = Counter(
    token
    for label, tokens in train
    if label == 'ham'
    for token in tokens
)

In [None]:
def predict(bow):
    """Classify a bag of words as 'spam' or 'ham' with naive Bayes.

    Scores are accumulated as sums of log-probabilities rather than products
    of raw probabilities. Multiplying many small smoothed likelihoods
    underflows to 0.0 for both classes on long messages, and the resulting
    0 < 0 tie was always reported as 'spam'.

    Args:
        bow: iterable of word tokens present in the message.

    Returns:
        'ham' if the spam score is strictly lower than the ham score,
        otherwise 'spam' (ties go to 'spam').
    """
    def _log(p):
        # A zero probability (only possible when alpha is 0) maps to -inf
        # instead of making math.log raise ValueError.
        return math.log(p) if p > 0 else float('-inf')

    spam_score = _log(prior_spam)
    ham_score = _log(prior_ham)

    for word in bow:
        # Only words present in the message contribute. Counts come from the
        # spam_words / ham_words Counters, which yield 0 for unseen words.
        spam_score += _log((spam_words[word] + alpha) / (n_spam + alpha))
        ham_score += _log((ham_words[word] + alpha) / (n_ham + alpha))

    return 'ham' if spam_score < ham_score else 'spam'

In [None]:
# Confusion-matrix counts on the test split, treating 'spam' as the positive
# class. Each (prediction, true label) pair is tallied once.
outcomes = Counter((predict(bow), ans) for ans, bow in test)

tp = outcomes[('spam', 'spam')]
fp = outcomes[('spam', 'ham')]
tn = outcomes[('ham', 'ham')]
# Every remaining pair is counted as a false negative.
fn = sum(outcomes.values()) - (tp + fp + tn)

for tag, count in (("tp: ", tp), ("tn: ", tn), ("fp: ", fp), ("fn: ", fn)):
    print(tag, count)

In [None]:
# Evaluation metrics derived from the confusion-matrix counts.
# Each ratio falls back to 0.0 when its denominator is zero (for example when
# the classifier never predicted 'spam') instead of raising
# ZeroDivisionError.

n_evaluated = tp + fp + tn + fn
accuracy = (tp + tn) / n_evaluated if n_evaluated else 0.0
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

print("accuracy: ", accuracy)
print("precision: ", precision)
print("recall: ", recall)
print("f1: ", f1)