In [None]:
import zipfile
import requests
from collections import Counter
import re
import math

# Download the SMS Spam Collection archive, unpack it into the working
# directory, and load it as a list of [label, message] pairs.
x = requests.get(
    "http://www.dt.fee.unicamp.br/~tiago/smsspamcollection/smsspamcollection.zip",
    timeout=30,  # do not hang the kernel forever if the server is unreachable
)
# Fail loudly on 4xx/5xx instead of saving an error page to disk as "spam.zip".
x.raise_for_status()

with open("spam.zip", "wb") as f:
    f.write(x.content)

# Context manager so the archive handle is closed once extraction is done.
with zipfile.ZipFile("spam.zip") as archive:
    archive.extractall("./")

# Each line is split on its first tab into [label, message].
# - Explicit encoding: decoding no longer depends on the platform default.
#   NOTE(review): assumes the file is UTF-8 - confirm.
# - maxsplit=1: a tab inside the message does not produce a third field.
# - Rows that do not split into exactly two fields (e.g. blank lines) are
#   skipped, because later cells unpack every row as `label, message`.
with open("SMSSpamCollection.txt", "r", encoding="utf-8") as f:
    data = [
        fields
        for fields in (line.strip().split("\t", 1) for line in f)
        if len(fields) == 2
    ]

In [None]:
# Split the data: the first 80% of rows are used for training and the
# remaining rows for testing (no shuffling, order is as loaded).
train_size = int(0.8 * len(data))

train, test = data[:train_size], data[train_size:]

# Number of training messages whose label is "spam".
num_spams = len([label for label, _text in train if label == "spam"])

In [None]:
# Prior probabilities: P(spam) is the fraction of training rows labelled
# "spam"; every other row counts as ham, so P(ham) is the complement.
prior_spam = [label for label, _text in train].count("spam") / train_size
prior_ham = 1 - prior_spam

print(prior_spam, prior_ham)

0.13500784929356358 0.8649921507064364


In [None]:
# Likelihood statistics: per-class word counts gathered from the training set.
TOKEN_PATTERN = re.compile("[0-9a-z_]+")


def tokenize(txt):
    """Return the tokens of `txt`: lowercase runs of ASCII letters, digits and '_'."""
    return TOKEN_PATTERN.findall(txt.lower())


# Every row that is not labelled "spam" is treated as ham. This matches how the
# prior (1 - prior_spam) and the evaluation cell define the ham class, and keeps
# ham_words and num_ham_words describing the same set of messages.
spams = [c for s, c in train if s == "spam"]
hams = [c for s, c in train if s != "spam"]

# Word -> number of occurrences within each class.
spam_words = Counter(w for txt in spams for w in tokenize(txt))
ham_words = Counter(w for txt in hams for w in tokenize(txt))

# Total token count per class, derived from the counters so the numerators and
# denominators of the likelihoods can never disagree.
num_spam_words = sum(spam_words.values())
num_ham_words = sum(ham_words.values())

# Vocabulary size over the whole training set (spam and ham together).
num_unique_words = len(spam_words.keys() | ham_words.keys())

# P(w | S) = (number of times w occurs in S) / num_spam_words
# P(w | H) = (number of times w occurs in H) / num_ham_words

In [None]:
# predict
def _safe_log(numerator, denominator=1.0):
    """Return log(numerator / denominator), or -inf when that ratio is not positive.

    Lets a zero probability be represented as a log-score of -inf instead of
    raising ValueError (log of 0) or ZeroDivisionError (empty class).
    """
    if numerator <= 0 or denominator <= 0:
        return float("-inf")
    return math.log(numerator / denominator)


def predict(txt, k=0.5):
    """Classify a message with Naive Bayes; return True when it is scored as spam.

    Args:
        txt: Message text. It is lowercased and split into runs of ASCII
            letters, digits and '_'.
        k: Additive smoothing constant added to every word count. Must be
            non-negative. With k=0 a word never seen in a class gives that
            class a score of -inf rather than raising an error.

    Returns:
        True if the spam log-score is strictly greater than the ham log-score,
        otherwise False (ties, including an empty message with equal priors,
        are classified as ham).

    Raises:
        ValueError: If k is negative.
    """
    if k < 0:
        raise ValueError("k must be non-negative")

    words = re.findall("[0-9a-z_]+", txt.lower())

    # Start from the log-priors and add one log-likelihood term per token.
    spam_score = _safe_log(prior_spam)
    ham_score = _safe_log(prior_ham)

    # The smoothed denominators do not depend on the word, so compute them once.
    spam_total = num_spam_words + num_unique_words * k
    ham_total = num_ham_words + num_unique_words * k

    for w in words:
        spam_score += _safe_log(spam_words[w] + k, spam_total)
        ham_score += _safe_log(ham_words[w] + k, ham_total)

    return spam_score > ham_score

In [None]:
# Evaluate on the held-out test set: build the confusion matrix, treating
# "spam" as the positive class and every other label as negative.
tp, tn, fp, fn = 0, 0, 0, 0

for label, text in test:
    predicted_spam = predict(text)
    actual_spam = label == "spam"
    if predicted_spam and actual_spam:
        tp += 1
    elif predicted_spam:
        fp += 1
    elif actual_spam:
        fn += 1
    else:
        tn += 1

print("tp:", tp)
print("tn:", tn)
print("fp:", fp)
print("fn:", fn)

# Each metric falls back to 0.0 when its denominator is zero (empty test set,
# nothing predicted as spam, or no spam present) instead of raising
# ZeroDivisionError.
total = tp + tn + fp + fn
accuracy = (tp + tn) / total if total else 0.0
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

print("accuracy:", accuracy)
print("precision:", precision)
print("recall:", recall)
print("f1:", f1)

tp: 140
tn: 956
fp: 14
fn: 5
accuracy: 0.9829596412556054
precision: 0.9090909090909091
recall: 0.9655172413793104
f1: 0.9364548494983278
