In [1]:
import numpy as np
import pandas as pd
import io
from scipy import stats

In [2]:
data_str = output = io.StringIO('''编号,色泽,根蒂,敲声,纹理,脐部,触感,密度,含糖率,好瓜
1,青绿,蜷缩,浊响,清晰,凹陷,硬滑,0.697,0.46,是  
2,乌黑,蜷缩,沉闷,清晰,凹陷,硬滑,0.774,0.376,是  
3,乌黑,蜷缩,浊响,清晰,凹陷,硬滑,0.634,0.264,是  
4,青绿,蜷缩,沉闷,清晰,凹陷,硬滑,0.608,0.318,是  
5,浅白,蜷缩,浊响,清晰,凹陷,硬滑,0.556,0.215,是  
6,青绿,稍蜷,浊响,清晰,稍凹,软粘,0.403,0.237,是  
7,乌黑,稍蜷,浊响,稍糊,稍凹,软粘,0.481,0.149,是  
8,乌黑,稍蜷,浊响,清晰,稍凹,硬滑,0.437,0.211,是  
9,乌黑,稍蜷,沉闷,稍糊,稍凹,硬滑,0.666,0.091,否  
10,青绿,硬挺,清脆,清晰,平坦,软粘,0.243,0.267,否  
11,浅白,硬挺,清脆,模糊,平坦,硬滑,0.245,0.057,否  
12,浅白,蜷缩,浊响,模糊,平坦,软粘,0.343,0.099,否  
13,青绿,稍蜷,浊响,稍糊,凹陷,硬滑,0.639,0.161,否  
14,浅白,稍蜷,沉闷,稍糊,凹陷,硬滑,0.657,0.198,否  
15,乌黑,稍蜷,浊响,清晰,稍凹,软粘,0.36,0.37,否  
16,浅白,蜷缩,浊响,模糊,平坦,硬滑,0.593,0.042,否  
17,青绿,蜷缩,沉闷,稍糊,稍凹,硬滑,0.719,0.103,否''')

In [3]:
data = pd.read_csv(data_str)

In [4]:
data.set_index('编号',inplace=True)

In [5]:
data

Unnamed: 0_level_0,色泽,根蒂,敲声,纹理,脐部,触感,密度,含糖率,好瓜
编号,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,青绿,蜷缩,浊响,清晰,凹陷,硬滑,0.697,0.46,是
2,乌黑,蜷缩,沉闷,清晰,凹陷,硬滑,0.774,0.376,是
3,乌黑,蜷缩,浊响,清晰,凹陷,硬滑,0.634,0.264,是
4,青绿,蜷缩,沉闷,清晰,凹陷,硬滑,0.608,0.318,是
5,浅白,蜷缩,浊响,清晰,凹陷,硬滑,0.556,0.215,是
6,青绿,稍蜷,浊响,清晰,稍凹,软粘,0.403,0.237,是
7,乌黑,稍蜷,浊响,稍糊,稍凹,软粘,0.481,0.149,是
8,乌黑,稍蜷,浊响,清晰,稍凹,硬滑,0.437,0.211,是
9,乌黑,稍蜷,沉闷,稍糊,稍凹,硬滑,0.666,0.091,否
10,青绿,硬挺,清脆,清晰,平坦,软粘,0.243,0.267,否


In [10]:
# 信息熵
def entropy(data,att_name):
    levels = data[att_name].unique()
    # 信息熵
    ent = 0
    for lv in levels:
        pi = sum(data[att_name] == lv) /data.shape[0]
        ent += pi * np.log(pi)
    return -ent
print('好瓜的信息熵')
entropy(data,'好瓜')

好瓜的信息熵


0.8760918930634294

In [11]:
# 使用scipy内置的stats.entropy验证我们的函数
def entropy_scipy(data,att_name):
    n = data.shape[0]
    values = data[att_name].value_counts()
    return stats.entropy(values/n)
assert entropy(data,'好瓜') == entropy_scipy(data,'好瓜')

In [12]:
# 条件信息熵
def conditional_entropy(data, xname, yname):
    xs = data[xname].unique()
    ys = data[yname].unique()
    p_x = data[xname].value_counts() / data.shape[0]
    ce = 0
    for x in xs:
        ce += p_x[x]*entropy(data[data[xname]==x], yname)
    return ce

print('触感条件下, 好瓜的信息熵:')
conditional_entropy(data, '触感', '好瓜')

触感条件下, 好瓜的信息熵:


0.8462465738213797

In [13]:
# 信息增益
def gain(data, xname, yname):
    en = entropy(data, yname)
    ce = conditional_entropy(data, xname, yname)
    return en - ce

print('触感的引入导致的信息增益:')
gain(data, '触感', '好瓜')

触感的引入导致的信息增益:


0.029845319242049695

In [14]:
# 属性的信息增益与属性的类别数的关系
# 属性的类别越多, 根据该属性就可以把数据切分的更细, 这样往往导致信息的混乱程度降低, 所以类别多的属性的信息增益较大, 我们可以用代码实验一下:
data['testCol'] = 0
data.iloc[10:, -1] = 1
print(gain(data, 'testCol', '触感'))
data.iloc[14:, -1] = 2
print(gain(data, 'testCol', '触感'))
print(gain(data, '密度', '触感'))

0.00011925580490779186
0.0018253756129741339
0.605797499372304


In [15]:
# 信息增益率
def gain_ratio(data, xname, yname):
    g = gain(data, xname, yname)
    si = entropy(data, xname)
    return g / si
print('信息增益率:')
gain_ratio(data, '触感', '好瓜')

信息增益率:


0.04926616447405919