In [23]:
import numpy as np
import pandas as pd
from collections import Counter

def loadIris(address, label_col=4):
    """Read a header-less, comma-separated data file and split off the label column.

    Parameters
    ----------
    address : str or file-like
        Path (or buffer) handed straight to ``pandas.read_csv``.
    label_col : int, optional
        Positional index of the column holding the class labels.  Defaults to
        4, the label position in the five-column Iris file.

    Returns
    -------
    tuple
        ``(features, labels)`` where ``features`` is the NumPy array of all
        remaining columns and ``labels`` is the pandas Series of the label column.
    """
    # header=None makes pandas name the columns 0..n-1, so label_col is positional.
    spf = pd.read_csv(address, sep=',', index_col=False, header=None)
    strs = spf[label_col]
    # Non-mutating drop: the frame that was read is left intact.
    feature_frame = spf.drop([label_col], axis=1)
    return feature_frame.values, strs

def featureSelection(features, label, bins=10):
    """Compute the information gain of every feature column with respect to the labels.

    Each column is discretised into ``bins`` equal-width intervals spanning its
    observed range; the gain is the entropy of the labels minus the weighted
    entropy of the labels inside each interval (as returned by ``getEnergy``).

    Parameters
    ----------
    features : array-like, 2-D
        One row per sample, one column per feature.
    label : array-like, 1-D
        Class label of each sample, in the same order as the rows of ``features``.
    bins : int, optional
        Number of equal-width intervals per feature.  Defaults to 10.

    Returns
    -------
    list of float
        One information-gain value per feature column, in column order.

    Raises
    ------
    ValueError
        If ``bins`` is smaller than 1.
    """
    if bins < 1:
        raise ValueError("bins must be a positive integer")

    # Coerce to arrays so column slicing and the boolean-mask indexing done in
    # getEnergy also work when lists or pandas objects are passed in.
    features = np.asarray(features)
    label = np.asarray(label)

    featureLen = features.shape[1]
    data_len = len(label)

    # Entropy (base 2) of the label distribution over the whole sample.
    samples_energy = 0.0
    for count in Counter(label).values():
        proportion = count / float(data_len)
        samples_energy -= proportion * np.log2(proportion)

    informationGain = []
    for f in range(featureLen):
        af = features[:, f]
        minf = np.min(af)
        # Pad the maximum so width is never 0 for a constant column and the
        # largest value lands in the last bin instead of one past it.
        maxf = np.max(af) + 1e-4
        width = (maxf - minf) / float(bins)

        # Bin index of each sample: offset from the minimum in units of width,
        # rounded down.
        dd = np.floor((af - minf) / width)
        c = Counter(dd)  # number of samples per bin

        sub_energy = getEnergy(c, dd, label)
        informationGain.append(samples_energy - sub_energy)

    return informationGain

# Weighted entropy of the labels within each bin
def getEnergy(c, data, label):
    """Return the bin-weighted entropy (base 2) of ``label`` given the binned ``data``.

    Parameters
    ----------
    c : collections.Counter or dict
        Maps each distinct value in ``data`` to the number of samples having it.
        It is read only; the caller's counts are not modified.
    data : array-like, 1-D
        Bin value of every sample.
    label : array-like, 1-D
        Class label of every sample, aligned with ``data``.

    Returns
    -------
    float
        Sum over bins of (fraction of samples in the bin) * (entropy of the
        labels inside the bin).
    """
    # Arrays are required for the element-wise comparison and mask indexing below.
    label = np.asarray(label)
    data = np.asarray(data)
    dataLen = len(label)
    energy = 0.0
    for key, value in c.items():
        # Fraction of all samples that fall in this bin, kept in a local so the
        # Counter passed by the caller keeps its raw counts.
        weight = value / float(dataLen)
        label_picked = label[data == key]  # labels of the samples in this bin
        picked_counts = Counter(label_picked)
        e = 0.0
        for v in picked_counts.values():
            r = v / float(value)  # share of this label inside the bin
            e -= r * np.log2(r)
        energy += weight * e
    return energy

if __name__ == '__main__':
    # Read the data file from the current working directory and split it into
    # the feature matrix and the label Series.
    filepath = 'iris.txt'
    data_matrix, str_name = loadIris(filepath)
    # Pass the labels as a NumPy array (.values of the Series).
    informationGain = featureSelection(data_matrix, str_name.values)
    # A parenthesised single-argument print gives the same output whether it is
    # parsed as a Python 2 statement or a Python 3 function call.
    print(informationGain)

[0.7299038051886471, 0.4304914571350462, 1.3510281744340518, 1.427355764201518]


In [13]:
# Inspect which class labels were counted. label_count is not defined at top
# level by the function cell, so this relies on the scratch cell that replays
# the featureSelection body having been run first.
label_count.keys()

['Iris-virginica', 'Iris-setosa', 'Iris-versicolor']

In [16]:
# Count how many samples carry each class label (label must already exist in the kernel).
Counter(label)

Counter({'Iris-setosa': 50, 'Iris-versicolor': 50, 'Iris-virginica': 50})

In [5]:
# Scratch cell: replays the body of featureSelection at top level, which leaves
# the intermediate values (label_count, width, ...) in the kernel namespace.
# Requires data_matrix, str_name and getEnergy to be defined already.
features, label = data_matrix, str_name.values

featureLen = len(features[0,:])
label_count = Counter(label)  # count the occurrences of each class label
samples_energy = 0.0  
data_len = len(label)
for i in label_count.keys():  
    label_count[i] /= float(data_len)  # turn the count into a proportion (label_count no longer holds counts afterwards)
    samples_energy -= label_count[i] * np.log2(label_count[i])  # accumulate the entropy of the labels

informationGain = []

for f in range(featureLen):
    af = features[:,f]
    minf = np.min(af)
    maxf = np.max(af) + 1e-4  # pad the max so width is never 0 for a constant column and d stays below 10
    width = (maxf - minf)/10.0   # split the feature's range into 10 equal-width intervals

    d = (af - minf) / width  # offset of each value from the minimum, in units of width
    dd = np.floor(d)   # round down to get the bin index
    c = Counter(dd)    # number of samples per bin

    sub_energy = getEnergy(c, dd ,label)
    informationGain.append(samples_energy - sub_energy)

In [22]:
# Show the bin width left in the kernel by the top-level loop, i.e. the value
# computed for the last feature column it processed.
width

0.24001