In [1]:
import numpy as np
import pandas as pd

In [2]:
# Build the toy training set: 15 samples with two categorical features
# (x1, x2) and a binary label y. Rows are grouped five at a time, one group
# per x1 value.
x1 = [1] * 5 + [2] * 5 + [3] * 5
x2 = list('SMMSS' 'SMMLL' 'LMMLL')
y = [
    -1, -1, 1, 1, -1,  # rows with x1 == 1
    -1, -1, 1, 1, 1,   # rows with x1 == 2
    1, 1, 1, 1, -1,    # rows with x1 == 3
]

In [5]:
# Sanity check: the three raw lists must contain the same number of samples.
# These are plain Python lists, so the "shape" printed here is just len().
for label, values in (('x1.shape:', x1), ('x2.shape:', x2), ('y.shape:', y)):
    print(label, len(values))

x1.shape: 15
x2.shape: 15
y.shape: 15


In [7]:
# Assemble the three lists into a single table; columns appear as x1, x2, y.
df = pd.DataFrame(
    data={'x1': x1, 'x2': x2, 'y': y},
)

In [8]:
# Display the assembled training table (features x1, x2 and label y).
df

Unnamed: 0,x1,x2,y
0,1,S,-1
1,1,M,-1
2,1,M,1
3,1,S,1
4,1,S,-1
5,2,S,-1
6,2,M,-1
7,2,M,1
8,2,L,1
9,2,L,1


In [10]:
# Split the table into a feature frame and a label frame. Double brackets keep
# both as DataFrames; nb_fit reads the label column through y.columns[0].
# Note: this rebinds `y` from the raw Python list to a one-column DataFrame.
x = df[['x1', 'x2']]
y = df[['y']]

In [13]:
# Display the feature frame (columns x1 and x2).
x

Unnamed: 0,x1,x2
0,1,S
1,1,M
2,1,M
3,1,S
4,1,S
5,2,S
6,2,M
7,2,M
8,2,L
9,2,L


In [14]:
# Display the label frame (single column y).
y

Unnamed: 0,y
0,-1
1,-1
2,1
3,1
4,-1
5,-1
6,-1
7,1
8,1
9,1


In [19]:
# Distinct class labels found in the first (and only) label column.
y.iloc[:, 0].unique()

array([-1,  1], dtype=int64)

In [21]:
# Number of training samples per class label.
y.iloc[:, 0].value_counts()

 1    9
-1    6
Name: y, dtype: int64

In [22]:
# Total number of training samples (rows of the label frame).
y.shape[0]

15

In [39]:
# Exploratory check: how often each x1 value occurs among samples labelled -1.
# The comparison on the one-column label frame yields an (n, 1) boolean array,
# which is flattened into a 1-D row mask.
x.loc[(y == -1).values.ravel(), 'x1'].value_counts()

1    3
2    2
3    1
Name: x1, dtype: int64

In [42]:
# Naive Bayes training: estimate class priors and class-conditional probabilities
def nb_fit(x, y, alpha=0):
    """Fit a categorical Naive Bayes model by counting.

    Parameters
    ----------
    x : pandas.DataFrame
        One column per categorical feature, one row per sample.
    y : pandas.DataFrame or pandas.Series or list-like
        Class labels aligned positionally with the rows of ``x``. When a
        DataFrame is given, its first column is used.
    alpha : float, default 0
        Additive (Laplace) smoothing strength. ``0`` gives the plain
        maximum-likelihood estimates; ``1`` gives Laplace smoothing.

    Returns
    -------
    classes : numpy.ndarray
        Distinct class labels in order of first appearance.
    class_prior : pandas.Series
        Prior probability of each class, indexed by class label.
    class_condition_prob : dict
        Maps ``(feature_name, feature_value, class_label)`` to
        P(feature = value | class). Every value observed for a feature gets
        an entry for every class, so a value that never occurs together with
        a class is stored as ``0`` (or its smoothed estimate) rather than
        being absent.

    Raises
    ------
    ValueError
        If ``alpha`` is negative.
    """
    if alpha < 0:
        raise ValueError("alpha must be non-negative")

    # Work on a 1-D label Series regardless of how the labels were passed.
    labels = y[y.columns[0]] if isinstance(y, pd.DataFrame) else pd.Series(y)

    classes = labels.unique()
    class_count = labels.value_counts()
    # Prior: (N_c + alpha) / (N + alpha * K); reduces to N_c / N when alpha == 0.
    class_prior = (class_count + alpha) / (len(labels) + alpha * len(classes))

    class_condition_prob = dict()
    for col in x.columns:
        observed_values = x[col].unique()
        for j in classes:
            # Positional 1-D mask, so the indexes of x and y need not match.
            in_class = (labels == j).to_numpy()
            counts = x.loc[in_class, col].value_counts()
            denominator = class_count[j] + alpha * len(observed_values)
            # Values seen within this class come first (most frequent first),
            # followed by values that only occur under other classes.
            unseen = [v for v in observed_values if v not in counts.index]
            for value in list(counts.index) + unseen:
                class_condition_prob[(col, value, j)] = (
                    (counts.get(value, 0) + alpha) / denominator
                )
    return classes, class_prior, class_condition_prob

In [44]:
# Train on the toy data. The three results are kept as notebook-level
# variables because predict() reads them by these exact names.
classes, class_prior, class_condition_prob = nb_fit(x, y)

In [45]:
# Show the class labels found during training.
print('classes:', classes)

classes" [-1  1]


In [46]:
# Show the estimated prior probability of each class.
print('class_prior:', class_prior)

class_prior:  1    0.6
-1    0.4
Name: y, dtype: float64


In [47]:
# Show the conditional table keyed by (feature name, feature value, class label).
print('class_condition_prob:', class_condition_prob)

class_condition_prob: {('x1', 1, -1): 0.5, ('x1', 2, -1): 0.3333333333333333, ('x1', 3, -1): 0.16666666666666666, ('x1', 3, 1): 0.4444444444444444, ('x1', 2, 1): 0.3333333333333333, ('x1', 1, 1): 0.2222222222222222, ('x2', 'S', -1): 0.5, ('x2', 'M', -1): 0.3333333333333333, ('x2', 'L', -1): 0.16666666666666666, ('x2', 'M', 1): 0.4444444444444444, ('x2', 'L', 1): 0.4444444444444444, ('x2', 'S', 1): 0.1111111111111111}


In [56]:
# Naive Bayes prediction
def predict(X_test, model=None):
    """Return the most probable class label for a single sample.

    Parameters
    ----------
    X_test : dict
        Maps feature (column) name to the observed value,
        e.g. ``{'x1': 2, 'x2': 'S'}``.
    model : tuple, optional
        ``(classes, class_prior, class_condition_prob)`` in the form returned
        by ``nb_fit``. When omitted, the notebook-level variables with those
        names are used, so existing ``predict(X_test)`` calls keep working.

    Returns
    -------
    The element of ``classes`` whose prior times the product of the
    class-conditional probabilities is largest. On a tie (including the case
    where every class scores 0) the first such class in ``classes`` is
    returned, because ``np.argmax`` picks the first maximum.
    """
    if model is None:
        # Fall back to the fitted model stored in notebook globals.
        model = (classes, class_prior, class_condition_prob)
    labels, prior, cond_prob = model

    scores = []
    for c in labels:
        likelihood = 1
        for feature, value in X_test.items():
            # A (feature, value, class) combination missing from the table was
            # not counted during training; use probability 0 for it instead of
            # failing with a KeyError.
            likelihood *= cond_prob.get((feature, value, c), 0.0)
        scores.append(prior[c] * likelihood)
    return labels[np.argmax(scores)]

In [57]:
# A single query sample: feature name -> observed value, matching the
# (feature, value) part of the keys in class_condition_prob.
x_test = {'x1': 2, 'x2': 'S'}
x_test

{'x1': 2, 'x2': 'S'}

In [58]:
# Print the predicted class for the query sample (the message text is Chinese
# for "predicted class of the test data").
print('测试数据的预测类别为:', predict(x_test))

测试数据的预测类别为: -1
