# 手刻基本Naive Bayes模型

#### 學習重點：理解單純貝氏模型原理

---

In [1]:
import re
import numpy as np
import math
import os
import glob
import codecs

from collections import defaultdict
from collections import Counter

def tokenize(message):
    """Return the set of distinct lowercase alphanumeric tokens in ``message``.

    The text is lowercased first; each maximal run of the characters
    ``a-z`` / ``0-9`` is one token and every other character acts as a
    separator.  Repeated tokens collapse because the result is a set.
    """
    lowered = message.lower()
    return {match.group() for match in re.finditer("[a-z0-9]+", lowered)}

### 讀入資料並分割為 train/testset

In [7]:
import warnings

# X collects subject strings, Y the matching is_spam flags (same index).
X = []
Y = []
# NOTE(review): the third path lacks the 'data/' segment used by the first
# two -- confirm that this is really where the hard_ham folder lives.
paths = [r'../../data/spam_data/spam', r'../../data/spam_data/easy_ham', r'../../spam_data/hard_ham']
for path in paths:
    # The label belongs to the folder, so decide it once per folder from the
    # folder name instead of substring-matching every full file path.
    is_spam = "ham" not in os.path.basename(os.path.normpath(path))

    # glob gives no ordering guarantee; sort so that the sample order (and
    # therefore the seeded train/test split) is the same on every machine.
    filenames = sorted(glob.glob(os.path.join(path, "*")))
    if not filenames:
        # A wrong or empty folder would otherwise be skipped silently.
        warnings.warn("no files found under {!r}".format(path))
        continue

    for fn in filenames:
        # errors='ignore' drops undecodable bytes instead of raising.
        with codecs.open(fn, encoding='utf-8', errors='ignore') as file:
            for line in file:
                # Every line starting with "Subject:" becomes one sample;
                # the loop does not stop after the first match in a file.
                if line.startswith("Subject:"):
                    subject = re.sub(r"^Subject:", "", line).strip()
                    X.append(subject)
                    Y.append(is_spam)

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing.  random_state is pinned so that
# every student gets the same split; it can be dropped in normal use.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [10]:
# Pair every subject with its label as a two-element list [subject, is_spam].
train_data = [[subject, label] for subject, label in zip(X_train, y_train)]
test_data = [[subject, label] for subject, label in zip(X_test, y_test)]

In [11]:
# Show the first five [subject, is_spam] pairs of the training split.
train_data[:5]

[['Re: Goodbye Global Warming', False],
 ['Let us find the right mortgage lender for you      AFPE', True],
 ['[dgc.chat] First public release of NeuDist Distributed Transaction', False],
 ['Re: [VoID] a new low on the personals tip...', False],
 ['RE: Java is for kiddies', False]]

In [12]:
# Show the first five [subject, is_spam] pairs of the test split.
test_data[:5]

[['Re: From', False],
 ['DVD capture: Unbreaking the Mac', False],
 ['Re: Goodbye Global Warming', False],
 ['=?ISO-2022-JP?B?GyRCTCQ+NUJ6OS05cCIoPF5HLiEqPVAycSQkJE45LT5sGyhC?=', True],
 ['Re: My source: RE: A biblical digression', False]]

---

### defaultdict用法示範

In [13]:
from collections import defaultdict

mess = 'This is our first time in Taiwan,,,,, such a beautiful country'

# Any key that is looked up for the first time starts as a fresh [0, 0].
counts = defaultdict(lambda:[0,0])

# (key, slot, amount) updates, applied in this order so that the keys are
# inserted as 'you', 'hi', 'no'.
for key, slot, amount in (
    ('you', 0, 1),
    ('hi', 0, 1),
    ('hi', 1, 2),
    ('no', 1, 1),
    ('no', 0, 8),
):
    counts[key][slot] += amount

print('dic : {}'.format(counts))
print('you : {}'.format(counts['you']))

dic : defaultdict(<function <lambda> at 0x7ff67fb19a60>, {'you': [1, 0], 'hi': [1, 2], 'no': [8, 1]})
you : [1, 0]


### 創造一個字典，格式如 {'hi': [1, 0]}：第一個數字是該字出現在垃圾郵件中的次數，第二個數字是該字出現在非垃圾郵件中的次數

In [14]:
def count_words(training_set):
    """Count, per word, how many spam and non-spam messages contain it.

    Args:
        training_set: iterable of ``(message, is_spam)`` pairs.

    Returns:
        A ``defaultdict`` mapping each word to ``[spam_count, ham_count]``.
        ``tokenize`` yields a set, so a word is counted at most once per
        message regardless of how often it repeats inside that message.
    """
    counts = defaultdict(lambda: [0, 0])
    for message, is_spam in training_set:
        # Slot 0 accumulates spam occurrences, slot 1 ham (non-spam) ones.
        slot = 0 if is_spam else 1
        for word in tokenize(message):
            counts[word][slot] += 1
    return counts

---

## 計算 p(w|spam) / p(w|non_spam)
* 其中 k 為平滑用的超參數，用來確保分子與分母皆不為 0：p(w|spam) = (k + 含有 w 的垃圾郵件數) / (2k + 垃圾郵件總數)，p(w|non_spam) 同理。

In [15]:
def word_probabilities(counts, total_spams, total_non_spams, k=0.5):
    """Convert per-word counts into smoothed conditional probabilities.

    Args:
        counts: mapping ``word -> [spam_count, non_spam_count]``.
        total_spams: value added to ``2*k`` to form the spam denominator.
        total_non_spams: same, for the non-spam denominator.
        k: smoothing constant added to every count (and twice to each
            denominator).

    Returns:
        A list of ``(word, p(word|spam), p(word|non_spam))`` tuples, in the
        iteration order of ``counts``.
    """
    spam_denominator = total_spams + 2*k
    ham_denominator = total_non_spams + 2*k

    triplets = []
    for word in counts:
        pair = counts[word]
        p_given_spam = (pair[0] + k) / spam_denominator
        p_given_ham = (pair[1] + k) / ham_denominator
        triplets.append((word, p_given_spam, p_given_ham))
    return triplets

---

## 計算貝氏結果

In [16]:
def spam_probability(word_probs, message, spam_prob, ham_prob):
    """Return P(spam | message) under a Bernoulli naive Bayes model.

    Every vocabulary word contributes to both class scores: ``p(w|class)``
    when the word occurs in the message and ``1 - p(w|class)`` when it does
    not.  The scores are accumulated as logarithms.

    Args:
        word_probs: iterable of ``(word, p(word|spam), p(word|ham))``.
        message: raw text; it is split into words with ``tokenize``.
        spam_prob: prior probability of spam.
        ham_prob: prior probability of ham (non-spam).

    Returns:
        The posterior probability that the message is spam, in ``[0, 1]``.

    Raises:
        ValueError: from ``math.log`` when a prior, a word probability, or
            one minus a word probability is not strictly positive.
    """
    message_words = tokenize(message)

    log_prob_spam = log_prob_ham = 0.0

    for word, word_on_spam, word_on_ham in word_probs:
        if word in message_words:
            # Word present: use p(w|class).
            log_prob_spam += math.log(word_on_spam)
            log_prob_ham += math.log(word_on_ham)
        else:
            # Word absent: use the complement 1 - p(w|class).
            log_prob_spam += math.log(1 - word_on_spam)
            log_prob_ham += math.log(1 - word_on_ham)

    log_prob_spam += math.log(spam_prob)
    log_prob_ham += math.log(ham_prob)

    # With a large vocabulary both log scores can be so negative that
    # exponentiating them directly underflows to 0.0, which would make the
    # normalisation below divide by zero.  Subtracting the larger score first
    # leaves the ratio unchanged while forcing one term to be exactly 1.0.
    shift = max(log_prob_spam, log_prob_ham)
    prob_spam = math.exp(log_prob_spam - shift)
    prob_ham = math.exp(log_prob_ham - shift)

    # Bayes' rule: normalise over the two classes.
    return prob_spam / (prob_spam + prob_ham)

---

### 打包整個模型

In [34]:
class NaiveBayesClassifier:
    """Bundle training and scoring of the naive Bayes spam model.

    ``k`` (smoothing constant) and ``word_probs`` are set in ``__init__``;
    ``spam_probability`` and ``ham_probability`` (the class priors) exist
    only after ``train`` has been called.
    """

    def __init__(self, k=0.5):
        self.word_probs = []
        self.k = k

    def train(self, training_set):
        """Estimate the class priors and per-word probabilities.

        ``training_set`` is a sized collection of ``(message, is_spam)``
        pairs.
        """
        # Number of spam messages; everything else counts as ham.
        num_spams = sum(1 for _, is_spam in training_set if is_spam)
        total = len(training_set)
        num_hams = total - num_spams

        # Class priors.
        self.spam_probability = num_spams / total
        self.ham_probability = num_hams / total

        # Per-word counts -> smoothed (word, p(w|spam), p(w|ham)) triplets.
        self.word_probs = word_probabilities(
            count_words(training_set), num_spams, num_hams, self.k
        )

    def classify(self, message):
        """Return the spam probability of ``message`` from the fitted model."""
        return spam_probability(
            self.word_probs,
            message,
            self.spam_probability,
            self.ham_probability,
        )

---

### Fit 訓練集

In [35]:
# Build a classifier with the default smoothing constant (k=0.5).
classifier = NaiveBayesClassifier()

In [46]:
# Display the smoothing constant stored on the classifier.
classifier.k

0.5

In [36]:
# Fit the class priors and the per-word probabilities on the training pairs.
classifier.train(train_data)

In [43]:
# Display the learned priors as (P(spam), P(ham)).
classifier.spam_probability, classifier.ham_probability

(0.16250495441934204, 0.8374950455806579)

In [45]:
# Show the first ten (word, p(word|spam), p(word|ham)) triplets.
classifier.word_probs[:10]

[('re', 0.06204379562043796, 0.49692526017029326),
 ('global', 0.0012165450121654502, 0.011589403973509934),
 ('goodbye', 0.0012165450121654502, 0.011116367076631977),
 ('warming', 0.0012165450121654502, 0.01064333017975402),
 ('for', 0.10097323600973236, 0.11565752128666036),
 ('us', 0.00851581508515815, 0.007805108798486282),
 ('the', 0.11070559610705596, 0.14120151371807002),
 ('mortgage', 0.025547445255474453, 0.00023651844843897824),
 ('afpe', 0.0036496350364963502, 0.00023651844843897824),
 ('find', 0.010948905109489052, 0.004493850520340586)]

### 預測

In [37]:
# Score every test subject: (subject, true label, predicted P(spam)).
classified = [
    (text, truth, classifier.classify(text))
    for text, truth in test_data
]
# Tally (actual is_spam, predicted is_spam) pairs; a message is predicted
# spam when its probability is strictly above 0.5.
counts = Counter(
    (truth, score > 0.5)
    for _, truth, score in classified
)

In [38]:
# Display the tallies, keyed by (actual is_spam, predicted is_spam).
counts

Counter({(False, False): 527,
         (True, True): 57,
         (False, True): 11,
         (True, False): 36})

In [40]:
# Keys of `counts` are (actual is_spam, predicted is_spam).
true_pos = counts[(True, True)]
false_pos = counts[(False, True)]
false_neg = counts[(True, False)]
true_neg = counts[(False, False)]

# Precision: share of predicted-spam messages that really are spam.
precision = true_pos / (true_pos + false_pos)

# Recall: share of actual spam messages that were predicted as spam.
recall = true_pos / (true_pos + false_neg)

# Accuracy: share of all classified messages whose prediction is correct.
all_result_counts = false_pos + true_neg + true_pos + false_neg
binary_accuracy = (true_pos + true_neg) / all_result_counts

print('accuracy : {:.2f}%'.format(binary_accuracy * 100))
print('precision : {:.2f}%'.format(precision * 100))
print('recall : {:.2f}%'.format(recall * 100))

accuracy : 92.55%
precision : 83.82%
recall : 61.29%
