# 第4章 朴素贝叶斯

基于贝叶斯定理与特征条件独立假设的分类方法。

模型：

- 高斯模型
- 多项式模型
- 伯努利模型

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

from collections import Counter
import math

In [0]:
# 例 4.1 
lambda_ = 0.2
x = [2, 'S']

X1 = [1,2,3]
X2 = ['S', 'M', 'L']
Y = [1, -1]

$P_\lambda(Y=1)=(9+lambda\_)/(15 + 2*lambda\_) = (9+0.2)/(15+2*0.2)=0.5974025974025974$
$P_\lambda(Y=-1)=(6+lambda\_)/(15 + 2*lambda\_) = (6+0.2)/(15+2*0.2)=0.40259740259740264$  
$P(X^{(1)}=1|Y=1) = (2+0.2)/(9+3*0.2)=0.22916666666666669 $  
$P(X^{(1)}=2|Y=1) = (3+0.2)/(9+3*0.2)=0.33333333333333337 $  
$P(X^{(1)}=3|Y=1) = (4+0.2)/(9+3*0.2)=0.43750000000000006 $  
$P(X^{(2)}=S|Y=1) = (1+0.2)/(9+3*0.2)=0.125 $   
$P(X^{(2)}=M|Y=1) = (4+0.2)/(9+3*0.2)=0.43750000000000006 $     
$P(X^{(2)}=L|Y=1) = (4+0.2)/(9+3*0.2)=0.43750000000000006 $  
$P(X^{(1)}=1|Y=-1) = (3+0.2)/(6+3*0.2)=0.4848484848484849 $  
$P(X^{(1)}=2|Y=-1) = (2+0.2)/(6+3*0.2)=0.33333333333333337 $   
$P(X^{(1)}=3|Y=-1) = (1+0.2)/(6+3*0.2)=0.18181818181818182 $   
$P(X^{(2)}=S|Y=-1) = (3+0.2)/(6+3*0.2)=0.4848484848484849 $  
$P(X^{(2)}=M|Y=-1) = (2+0.2)/(6+3*0.2)=0.33333333333333337 $   
$P(X^{(2)}=L|Y=-1) = (1+0.2)/(6+3*0.2)=0.18181818181818182 $   
so  
$P(Y=1)P(X^{(1)}=2|Y=1)P(X^{(2)}=S|Y=1) =0.5974025974025974* 0.33333333333333337*0.125=0.024891774891774892$  
$P(Y=-1)P(X^{(1)}=2|Y=-1)P(X^{(2)}=S|Y=-1) =0.40259740259740264* 0.33333333333333337*0.4848484848484849=0.06506624688442873$  

so, it should be -1.

In [0]:
class NB:
    def __init__(self, lambda_):
        self.lambda_ = lambda_
        
    def fit(self, X, y):
        N, M = X.shape
        data = np.hstack((X, y.reshape(N, 1)))
        
        py = {}
        pxy = {}
        uniquey, countsy = np.unique(y, return_counts=True)
        tmp = dict(zip(uniquey, countsy))
        for k,v in tmp.items():
            py[k] = (v + self.lambda_)/(N + len(uniquey) * self.lambda_)
            tmp_data = data[data[:, -1] == k]
            for col in range(M):
                uniquecol, countscol = np.unique(tmp_data[:,col], return_counts=True)
                tmp1 = dict(zip(uniquecol, countscol))
                for kk, vv in tmp1.items():
                    pxy['X({})={}|Y={}'.format(col+1, kk, k)] = (vv + self.lambda_)/(v + len(uniquecol) * self.lambda_)
                    
        self.py = py
        self.pxy = pxy

        #return self.py, self.pxy
    
    def predict(self, x):
        M = len(x)
        res = {}
        for k,v in self.py.items():
            p = v
            for i in range(len(x)):
                p = p * self.pxy['X({})={}|Y={}'.format(i+1, x[i], k)]
            res[k] = p
        print(res)
        maxp = -1
        maxk = -1
        for kk,vv in res.items():
            if vv > maxp:
                maxp = vv
                maxk = kk
                
        return maxk

In [0]:
lambda_ = 0.2
d = {'S':0, 'M':1, 'L':2}

X = np.array([[1, d['S']], [1, d['M']], [1, d['M']],
             [1, d['S']], [1, d['S']], [2, d['S']],
             [2, d['M']], [2, d['M']], [2, d['L']],
             [2, d['L']], [3, d['L']], [3, d['M']],
             [3, d['M']], [3, d['L']], [3, d['L']]])

y = np.array([-1, -1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, -1])

In [129]:
X

array([[1, 0],
       [1, 1],
       [1, 1],
       [1, 0],
       [1, 0],
       [2, 0],
       [2, 1],
       [2, 1],
       [2, 2],
       [2, 2],
       [3, 2],
       [3, 1],
       [3, 1],
       [3, 2],
       [3, 2]])

In [130]:
y

array([-1, -1,  1,  1, -1, -1, -1,  1,  1,  1,  1,  1,  1,  1, -1])

In [77]:
model = NB(lambda_)
model.fit(X,y)
model.predict(np.array([2, 0]))

{-1: 0.06506624688442873, 1: 0.024891774891774892}


-1

In [0]:
# data
def create_data():
    iris = load_iris()
    df = pd.DataFrame(iris.data, columns=iris.feature_names)
    df['label'] = iris.target
    df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']
    data = np.array(df.iloc[:100, :])
    # print(data)
    return data[:,:-1], data[:,-1]

In [0]:
X, y = create_data()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [80]:
X_test[0], y_test[0]

(array([5.6, 3. , 4.5, 1.5]), 1.0)

In [82]:
X_train.shape

(70, 4)

## GaussianNB 高斯朴素贝叶斯

特征的可能性被假设为高斯

概率密度函数：
$$P(x_i | y_k)=\frac{1}{\sqrt{2\pi\sigma^2_{yk}}}exp(-\frac{(x_i-\mu_{yk})^2}{2\sigma^2_{yk}})$$

数学期望(mean)：$\mu$，方差：$\sigma^2=\frac{\sum(X-\mu)^2}{N}$

In [0]:
class NaiveBayes:
    def fit(self, X, y):
        self.classes = list(np.unique(y))
        self.parameters = {}
        
        for c in self.classes:
            # 计算每个种类的平均值，方差，先验概率
            X_Index_c = X[np.where(y == c)]
            X_index_c_mean = np.mean(X_Index_c, axis=0, keepdims=True)
            X_index_c_var = np.var(X_Index_c, axis=0, keepdims=True)
            parameters = {"mean": X_index_c_mean, "var": X_index_c_var, "prior": X_Index_c.shape[0] / X.shape[0]}
            self.parameters["class" + str(c)] = parameters
            print(self.parameters)
            
    def _pdf(self, X, classes):
        # 一维高斯分布的概率密度函数
        eps = 1e-4
        mean = self.parameters["class" + str(classes)]["mean"]
        var = self.parameters["class" + str(classes)]["var"]
        
        numerator = np.exp(-(X - mean) ** 2 / (2 * var + eps))
        denominator = np.sqrt(2 * np.pi * var + eps)
        
        # 取对数防止数值溢出
        result = np.sum(np.log(numerator / denominator), axis=1, keepdims=True)
        
        return result.T
    
    def _predict(self, X):
        output = []
        for y in self.classes:
            prior = np.log(self.parameters["class" + str(y)]["prior"])
            posterior = self._pdf(X, y)
            prediction = prior + posterior
            output.append(prediction)
        return output
    
    def predict(self, X):
        # 取概率最大的类别返回预测值
        output = self._predict(X)
        output = np.reshape(output, (len(self.classes), X.shape[0]))
        prediction = np.argmax(output, axis=0)
        return prediction
    
    def score(self, X_test, y_test):
        right = 0
        pred = self.predict(X_test)
        right = (y_test - pred == 0).sum()

        return right / float(len(X_test))

In [0]:
model = NaiveBayes()

In [123]:
model.fit(X_train, y_train)

{'class0.0': {'mean': array([[5.02571429, 3.42857143, 1.49142857, 0.24857143]]), 'var': array([[0.10648163, 0.15918367, 0.02478367, 0.01278367]]), 'prior': 0.5}}
{'class0.0': {'mean': array([[5.02571429, 3.42857143, 1.49142857, 0.24857143]]), 'var': array([[0.10648163, 0.15918367, 0.02478367, 0.01278367]]), 'prior': 0.5}, 'class1.0': {'mean': array([[5.94285714, 2.77714286, 4.27428571, 1.34      ]]), 'var': array([[0.18816327, 0.09833469, 0.17505306, 0.03954286]]), 'prior': 0.5}}


In [124]:
print(model.predict(X_test))

[1 0 0 0 1 0 0 0 1 0 1 0 1 1 1 1 0 1 1 1 0 0 0 0 0 1 1 1 1 0]


In [125]:
model.score(X_test, y_test)

1.0

scikit-learn实例

# sklearn.naive_bayes

In [0]:
from sklearn.naive_bayes import GaussianNB

In [133]:
clf = GaussianNB()
clf.fit(X, y)

GaussianNB(priors=None, var_smoothing=1e-09)

In [134]:
clf.predict([[2, 0]])

array([-1])

In [0]:
from sklearn.naive_bayes import BernoulliNB, MultinomialNB # 伯努利模型和多项式模型

In [138]:
clf1 = BernoulliNB()
clf1.fit(X, y)
clf1.predict([[2, 0]])

array([-1])

In [139]:
clf2 = MultinomialNB()
clf2.fit(X, y)
clf2.predict([[2, 0]])

array([1])