## 朴素贝叶斯实现

<img src="../img/bayes1.png" width=600 height=400 align='left'/>  
<img src="../img/bayes2.png" width=500 height=400 align='left'/>  

In [1]:
import numpy as np

In [2]:
class NB:
    """Naive Bayes classifier for categorical features with Laplace smoothing.

    After ``fit`` the estimates are stored in two dictionaries keyed by
    human-readable strings:

    * ``self.py['P(Y=c)']`` -- smoothed prior of class ``c``.
    * ``self.pxy['P(X(j)=a|Y=c)']`` -- smoothed probability that feature
      ``j`` (1-based) takes value ``a`` given class ``c``.
    """

    def __init__(self, lambda_):
        self.lambda_ = lambda_  # Laplace smoothing strength
        self.feat_val = [0, 1]  # default domain of a binary (bag-of-words) feature column
        self.py = {}
        self.pxy = {}

    def _column_domain(self, column):
        """Return the set of values one feature column may take.

        The domain is every value observed in the column over the whole
        training set (not per class), so each class gets a probability for
        every value and the smoothing denominator uses the true number of
        values. A column whose values all lie inside ``self.feat_val`` is
        treated as a binary feature and gets the full ``self.feat_val``
        domain, so a word never seen in training still has a probability.
        """
        observed = np.unique(column)
        if np.isin(observed, self.feat_val).all():
            # Cast the defaults to the column dtype so the formatted keys
            # match the ones built from the data at prediction time.
            defaults = np.asarray(self.feat_val).astype(observed.dtype)
            return np.union1d(observed, defaults)
        return observed

    def fit(self, X, y):
        """Estimate class priors and per-feature conditional probabilities.

        X is a 2-D array of shape (N, M) holding categorical feature values
        and y holds the N class labels. Any previous fit is discarded.
        """
        X = np.asarray(X)
        y = np.asarray(y).ravel()
        N, M = X.shape

        # Start from a clean slate so a refit never keeps stale entries.
        self.py = {}
        self.pxy = {}

        # Classes and their counts, i.e. \sum{I(y_i=c_k)}
        unique_y, counts_y = np.unique(y, return_counts=True)
        domains = [self._column_domain(X[:, col]) for col in range(M)]

        for ck, ck_count in zip(unique_y, counts_y):
            # P(Y=c_k)
            self.py['P(Y={})'.format(ck)] = (ck_count + self.lambda_) / (N + len(unique_y) * self.lambda_)

            # Rows labelled ck; X and y stay separate so neither dtype is
            # changed by stacking them together.
            class_rows = X[y == ck]

            for col, domain in enumerate(domains):
                # S_j * lambda in the denominator uses the column's domain size.
                denom = ck_count + len(domain) * self.lambda_
                for value in domain:
                    # \sum{I(x_ij=a_jl, y_i=c_k)}
                    count = np.count_nonzero(class_rows[:, col] == value)
                    # P(X^{j}=a_{j_l}|Y=c_k)
                    self.pxy['P(X({})={}|Y={})'.format(col + 1, value, ck)] = (count + self.lambda_) / denom

    def predict(self, x):
        """Classify one sample.

        Returns ``(label, log_score)`` where ``label`` is the class label as
        the string used in the ``self.py`` keys and ``log_score`` is
        log P(Y=c_k) + sum_j log P(X^{(j)}=x^{(j)}|Y=c_k) for that class.

        Raises KeyError if a feature value was not part of the fitted domain.
        """
        res = {}
        prefix_len = len('P(Y=')
        for k, prior in self.py.items():
            # Strip the fixed 'P(Y=' prefix and ')' suffix to recover the label.
            ck = k[prefix_len:-1]
            log_p = np.log(prior)
            for i, value in enumerate(x):
                key = 'P(X({})={}|Y={})'.format(i + 1, value, ck)
                try:
                    log_p = log_p + np.log(self.pxy[key])
                except KeyError:
                    raise KeyError(
                        'feature {} has value {!r} that was not seen during fit'.format(i + 1, value)
                    ) from None
            res[ck] = log_p

        max_p = float('-inf')
        max_cate = float('-inf')
        for cate, p in res.items():
            if p > max_p:
                max_p = p
                max_cate = cate

        return max_cate, max_p

    def score(self, Xtest, ytest):
        """Return the fraction of samples in Xtest whose prediction equals ytest.

        Labels are compared numerically, so they must be numeric.
        """
        c = 0
        for x, y in zip(Xtest, ytest):
            cate, _ = self.predict(x)
            # float() accepts both '1' and '1.0' style label strings.
            if float(cate) == float(y):
                c += 1
        return c / len(Xtest)

In [3]:
# Example 4.1 from "Statistical Learning Methods": two categorical features, labels in {-1, 1}
d = {'S': 0, 'M': 1, 'L': 2}
# First feature takes the values 1, 2, 3 (five samples each); the second is a size code.
first_feature = [1] * 5 + [2] * 5 + [3] * 5
second_feature = [d[size] for size in 'SMMSSSMMLLLMMLL']
X = np.array([[a, b] for a, b in zip(first_feature, second_feature)])
y = np.array([-1, -1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, -1])

# Fit the classifier and classify the query sample (2, 'S')
model = NB(lambda_=0.2)
model.fit(X, y)
cate, p = model.predict(np.array([2, 0]))
print("最可能的类别: {}".format(cate))

最可能的类别: -1


## 实例：垃圾邮件检测

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import random
import re

In [5]:
# Text cleaning: lowercase every token and drop tokens of 2 characters or fewer
def text_parse(big_str):
    """Split a raw text into lowercase word tokens longer than two characters.

    The text is split on runs of non-word characters (``\\W+``). Tokens of
    length 0, 1 or 2 are discarded, which also removes the empty strings that
    ``re.split`` produces at the edges of the input.
    """
    # re.split always returns at least one element, so no emptiness check is needed.
    tokens = re.split(r'\W+', big_str)
    return [tok.lower() for tok in tokens if len(tok) > 2]

In [6]:
# Load the corpus: files 1..25 from the ham and spam folders, each cleaned
# into one space-joined string; ham is labelled 1 and spam -1
doc_list = []
class_list = []
for filename, label in (('ham', 1), ('spam', -1)):
    for i in range(1, 26):
        path = "./email/" + filename + '/' + str(i) + '.txt'
        with open(path) as f:
            tokens = text_parse(f.read())
        doc_list.append(' '.join(tokens))
        class_list.append(label)

In [7]:
# Count word occurrences per document
vec = CountVectorizer()
words = vec.fit_transform(doc_list)
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(vec, "get_feature_names_out"):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()
words = pd.DataFrame(words.toarray(), columns=feature_names)
# Binarize: 1 if the word occurs in the document, 0 if it does not
words[words > 0] = 1

In [8]:
# Preview the first rows of the document-term table
words.head()

Unnamed: 0,0nline,100,100m,100mg,10mg,119,120,129,130,138,...,yay,yeah,year,yesterday,york,you,your,yourpenis,zach,zolpidem
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Build the dataset: 10 randomly sampled document indices out of 50 form the
# test set, the remaining 40 form the training set
X = words.values
y = np.array(class_list)
testing_set = list(map(int, random.sample(range(50), 10)))
training_set = list(set(range(50)) - set(testing_set))

# Training split
Xtrain = [X[doc_index] for doc_index in training_set]
ytrain = [y[doc_index] for doc_index in training_set]

# Test split
Xtest = [X[doc_index] for doc_index in testing_set]
ytest = [y[doc_index] for doc_index in testing_set]

In [10]:
# Train: convert the list-based split to arrays, then fit the classifier
train_features = np.array(Xtrain)
train_labels = np.array(ytrain)
model = NB(lambda_=0.2)
model.fit(train_features, train_labels)

In [11]:
# Evaluate: fraction of held-out documents classified correctly
score = model.score(Xtest, ytest)
print('正确率：{}'.format(score))

正确率：0.9


In [12]:
# sklearn 实现
from sklearn.naive_bayes import BernoulliNB

In [13]:
# Fit scikit-learn's BernoulliNB on the same split and print its test-set score for comparison
bnb = BernoulliNB().fit(Xtrain, ytrain)
score = bnb.score(Xtest, ytest)
print('正确率：{}'.format(score))

正确率：0.9


## 高斯朴素贝叶斯

$P\left(x_{i}|y_{k}\right)=\frac{1}{\sqrt{2 \pi \sigma_{y_{k}, i}^{2}}} e^{-\frac{(x_{i}-\mu_{y_k, i})^2}{2 \sigma_{y_k, i}^{2}}}$

In [14]:
class GNB:
    """Gaussian naive Bayes classifier.

    Each feature is modelled, per class, by an independent one-dimensional
    Gaussian. ``fit`` stores for every class ``c`` an entry
    ``self.parameters["class" + str(c)]`` holding the per-feature ``mean``
    and ``var`` (each of shape (1, n_features)) and the class ``prior``.
    """

    def __init__(self):
        self.parameters = {}
        self.classes = []

    def fit(self, X, y):
        """Estimate per-class feature means, variances and priors.

        X has shape (n_samples, n_features); y holds the n_samples labels.
        Any previous fit is discarded.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        self.classes = list(np.unique(y))
        # Reset so classes from an earlier fit do not linger.
        self.parameters = {}

        for c in self.classes:
            X_c = X[y == c]
            self.parameters["class" + str(c)] = {
                "mean": np.mean(X_c, axis=0, keepdims=True),
                "var": np.var(X_c, axis=0, keepdims=True),
                "prior": X_c.shape[0] / X.shape[0],
            }

    def predict(self, X):
        """Return the most probable class label for every row of X.

        X has shape (n_samples, n_features); a 1-D input is treated as a
        single sample. The result is an array of n_samples labels taken from
        the labels seen in ``fit``.
        """
        X = np.atleast_2d(np.asarray(X))
        eps = 1e-4  # keeps the denominators non-zero for constant features
        scores = []
        for c in self.classes:
            params = self.parameters["class" + str(c)]
            log_prior = np.log(params["prior"])
            mean = params["mean"]
            var = params["var"]

            # Log of the Gaussian density, evaluated directly in log space.
            # Exponentiating first and taking the log afterwards underflows
            # to log(0) = -inf for samples far from the class mean.
            log_pdf = -(X - mean) ** 2 / (2 * var + eps) - 0.5 * np.log(2 * np.pi * var + eps)

            # Naive Bayes: features are independent, so log-likelihoods add up.
            scores.append(log_prior + np.sum(log_pdf, axis=1))

        scores = np.vstack(scores)  # shape (n_classes, n_samples)
        # argmax yields positions in self.classes; map them back to labels.
        return np.asarray(self.classes)[np.argmax(scores, axis=0)]

    def score(self, X_test, y_test):
        """Return the fraction of rows in X_test predicted as y_test."""
        pred = self.predict(X_test)
        return float(np.mean(pred == np.asarray(y_test)))

## 实例：鸢尾花分类

In [15]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris = load_iris()  # Iris flower dataset
# Put the four measurements and the target into one frame with short column names
df = pd.DataFrame(
    iris.data,
    columns=['sepal length', 'sepal width', 'petal length', 'petal width'],
)
df['label'] = iris.target
data = np.array(df)
# Build the dataset: every column but the last is a feature, the last one is the label
X = data[:, :-1]
y = data[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [23]:
# Show the distinct labels present in the training split
np.unique(y_train)

array([0., 1., 2.])

In [22]:
# Show the (n_samples, n_features) shape of the training features
X_train.shape

(105, 4)

In [16]:
# Train the Gaussian naive Bayes model on the training split
model = GNB()
model.fit(X_train, y_train)

In [17]:
# Predict one sample; reshape(1, -1) turns the four values into a one-row 2-D batch
ck = model.predict(np.array([4.4, 3.2, 1.3, 0.2]).reshape(1, -1))
print("预测的类别是：{}".format(ck))

预测的类别是：[0]


In [18]:
# Accuracy of the hand-written GNB on the test split
score = model.score(X_test, y_test)
print('正确率：{}'.format(score))

正确率：0.9333333333333333


In [19]:
# sklearn实现
from sklearn.naive_bayes import GaussianNB

In [20]:
# Fit scikit-learn's GaussianNB on the same training split and predict the same sample
gnb = GaussianNB().fit(X_train, y_train)
ck = gnb.predict([[4.4, 3.2, 1.3, 0.2]])
print("预测的类别是：{}".format(ck))

预测的类别是：[0.]


In [21]:
# Test-split score of scikit-learn's GaussianNB, for comparison with the model above
score = gnb.score(X_test, y_test)
print('正确率：{}'.format(score))

正确率：0.9333333333333333
