### 半朴素贝叶斯分类器

### Code --- AODE

In [161]:
# 导包

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# 对中文的支持
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus'] = False

In [162]:
# 西瓜数据集 3.0

df = pd.read_csv("watermelon.data")
df

Unnamed: 0,编号,色泽,根蒂,敲声,纹理,脐部,触感,密度,含糖率,好瓜
0,1,青绿,蜷缩,浊响,清晰,凹陷,硬滑,0.697,0.46,是
1,2,乌黑,蜷缩,沉闷,清晰,凹陷,硬滑,0.774,0.376,是
2,3,乌黑,蜷缩,浊响,清晰,凹陷,硬滑,0.634,0.264,是
3,4,青绿,蜷缩,沉闷,清晰,凹陷,硬滑,0.608,0.318,是
4,5,浅白,蜷缩,浊响,清晰,凹陷,硬滑,0.556,0.215,是
5,6,青绿,稍蜷,浊响,清晰,稍凹,软粘,0.403,0.237,是
6,7,乌黑,稍蜷,浊响,稍糊,稍凹,软粘,0.481,0.149,是
7,8,乌黑,稍蜷,浊响,清晰,稍凹,硬滑,0.437,0.211,是
8,9,乌黑,稍蜷,沉闷,稍糊,稍凹,硬滑,0.666,0.091,否
9,10,青绿,硬挺,清脆,清晰,平坦,软粘,0.243,0.267,否


In [169]:
# 不考虑连续属性
dt = df.iloc[:,[1,2,3,4,5,6,-1]]
dt.head()

Unnamed: 0,色泽,根蒂,敲声,纹理,脐部,触感,好瓜
0,青绿,蜷缩,浊响,清晰,凹陷,硬滑,是
1,乌黑,蜷缩,沉闷,清晰,凹陷,硬滑,是
2,乌黑,蜷缩,浊响,清晰,凹陷,硬滑,是
3,青绿,蜷缩,沉闷,清晰,凹陷,硬滑,是
4,浅白,蜷缩,浊响,清晰,凹陷,硬滑,是


In [170]:
# 获取所有属性对应的可能取值：
def get_attr(dt):
    """
    return like this:
    {'色泽':{'青绿','浅白','乌黑']}...}
    """
    attr = {}
    col = dt.columns
    for c in col:
        attr[c]= set(dt[c])
    
    return attr

In [171]:
# 以编号1 为测试样本
test = dt.iloc[0:1,:]
test

Unnamed: 0,色泽,根蒂,敲声,纹理,脐部,触感,好瓜
0,青绿,蜷缩,浊响,清晰,凹陷,硬滑,是


In [180]:
len(dt)

17

In [198]:
# AODE

def AODE(D,test):
    
    col = D.columns
    a = col[:-1]
    Y = col[-1]
    
    attr = get_attr(D)
    
    d = len(a)
    
    Pc = {}
    for y in attr[Y]:
        Pc[y] = 0
        for i in range(d):
            xi = a[i]
            Ni = len(attr[xi])
            D_c_xi = D[(D[Y]==y)&(D[xi]==test[xi][0])]
            
            # 计算 P(c,xi)
            p_c_xi = (len(D_c_xi)+1)/(len(D)+Ni)
            
            p_xj_cxi = 1
            for j in range(d):
                xj = a[j]
                Nj = len(attr[xj])
                D_c_xi_xj = D[(D[Y]==y)&(D[xi]==test[xi][0])&(D[xj]==test[xj][0])]
                
                # 计算 P(xj | c,xi)
                p_xj_cxi *= (len(D_c_xi_xj)+1)/(len(D_c_xi)+Nj)
                
            # 计算 P(c|x)    
            Pc[y] += p_c_xi*p_xj_cxi 
    
    return Pc

In [199]:
AODE(dt,test)

{'否': 0.0032687163042435024, '是': 0.09695613977659627}

---

### Code --- TAN

In [353]:
# 求属性之间的条件互信息 I(xi,xj)
def Info(D,test,xi,xj):
    """
    D:离散数据集 
    test:测试数据 
    xi:属性 
    xj:属性
    return:返回 xi 与 xj 的条件互信息 
    """
    columns = D.columns
    y = columns[-1]
    attr_v1 = test[xi][0]
    attr_v2 = test[xj][0]
    
    tv1 = type(attr_v1)
    
    c = D[y].value_counts()
    
    # 计算 P(xi,xj | c)
    P_12_c = D[(D[xi]==attr_v1)&(dt[xj]==attr_v2)][y].value_counts()[c.index]/c
    
    # 计算 P(xi|c) 和 P(xj|c)
    p_1_c = D[D[xi]==attr_v1][y].value_counts()[c.index]/c
    p_2_c = D[D[xj]==attr_v2][y].value_counts()[c.index]/c
    
    # 计算 I(xi,xj | y)
    I_12_c = (P_12_c*np.log2(P_12_c/(p1*p2))).sum()
    
    return I_12_c

In [354]:
Info(dt,test,'色泽','根蒂')

0.023277351097870363

In [355]:
col = test.columns
a = col[:-1]
a

Index(['色泽', '根蒂', '敲声', '纹理', '脐部', '触感'], dtype='object')

In [356]:
d = len(a)
cg = np.zeros((d,d))
cg

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [357]:
# 构建带全完全图
for i in range(d):
    for j in range(i+1,d):
        xi = a[i]
        xj = a[j]
        cg[i][j] = Info(dt,test,xi,xj)
        cg[j][i] = cg[i][j]
cg

array([[0.        , 0.02327735, 0.02327735, 0.25427696, 0.02327735,
        0.24549957],
       [0.02327735, 0.        , 0.47649919, 0.88439844, 0.88439844,
        1.10662066],
       [0.02327735, 0.47649919, 0.        , 0.88439844, 0.25427696,
        0.76877692],
       [0.25427696, 0.88439844, 0.88439844, 0.        , 0.88439844,
        1.25855393],
       [0.02327735, 0.88439844, 0.25427696, 0.88439844, 0.        ,
        1.10662066],
       [0.24549957, 1.10662066, 0.76877692, 1.25855393, 1.10662066,
        0.        ]])