In [2]:
def createDataSet():
    """Return the small sample data set used by the examples in this notebook.

    Every row is a list of three categorical attribute values followed by
    the class label in the last position. A fresh list of fresh row lists is
    built on each call, so callers may mutate the result freely.
    """
    samples = (
        ('sunny',  'busy',     'male',   'no'),
        ('rainy',  'not busy', 'female', 'no'),
        ('cloudy', 'relax',    'male',   'maybe'),
        ('sunny',  'relax',    'male',   'yes'),
        ('cloudy', 'not busy', 'male',   'maybe'),
        ('sunny',  'not busy', 'female', 'yes'),
    )
    return [list(sample) for sample in samples]

In [3]:
from math import log

def calShannonEnt(dataSet):
    """Compute the Shannon entropy (base 2) of the class labels in dataSet.

    The class label of each sample is read from its last element. An empty
    dataSet yields 0.0 because there are no labels to sum over.
    """
    total = len(dataSet)  # total number of samples

    # Tally how many times each class label occurs.
    tally = {}
    for sample in dataSet:
        label = sample[-1]
        tally[label] = tally.get(label, 0) + 1

    # H = -sum(p * log2(p)) over the observed labels.
    entropy = 0.0
    for count in tally.values():
        p = float(count) / total
        entropy -= p * log(p, 2)
    return entropy

# Build the sample data set and print the Shannon entropy of its class labels.
# `dataSet` is kept at module level because later cells pass it to other functions.
dataSet = createDataSet()
shannonEnt = calShannonEnt(dataSet)
print(shannonEnt)

1.584962500721156


In [4]:
#对样本集进行划分
def splitDataSet(dataSet, axis, value):
    """Select the samples whose attribute at `axis` equals `value`.

    Args:
        dataSet: list of samples, each sample being a list.
        axis: index of the attribute (column) to test.
        value: attribute value a sample must have at `axis` to be kept.

    Returns:
        A new list containing a copy of every matching sample with the
        element at `axis` removed. The input samples are not modified.
    """
    matching = (sample for sample in dataSet if sample[axis] == value)
    subset = []
    for sample in matching:
        # Copy everything before the tested column, then append what follows it.
        trimmed = sample[:axis]
        trimmed.extend(sample[axis + 1:])
        subset.append(trimmed)
    return subset

# Show the samples whose attribute at index 0 is 'sunny', with that column removed.
print(splitDataSet(dataSet,0,'sunny'))

[['busy', 'male', 'no'], ['relax', 'male', 'yes'], ['not busy', 'female', 'yes']]


In [6]:
# 基于信息增益率选择最优划分属性
def chooseBestFeatureToSplit_GainRatio(dataSet):
    """Choose the attribute with the highest information gain ratio.

    For every attribute column the data set is partitioned by attribute
    value; the information gain of that partition is divided by the
    attribute's intrinsic value (the entropy of the partition sizes).

    Args:
        dataSet: list of samples; each sample is a list whose last element
            is the class label and whose other elements are attribute values.

    Returns:
        The column index of the attribute with the largest gain ratio, or -1
        when dataSet is empty or no attribute has a strictly positive gain
        ratio.
    """
    if len(dataSet) == 0:
        # Nothing to split; avoid indexing dataSet[0] below.
        return -1

    numFeatures = len(dataSet[0]) - 1  # last column is the class label
    numEntries = float(len(dataSet))
    baseEntropy = calShannonEnt(dataSet)
    bestGainRatio = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        uniqueVals = set(example[i] for example in dataSet)
        newEntropy = 0.0
        iv = 0.0  # intrinsic value of attribute i
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / numEntries
            iv -= prob * log(prob, 2)
            newEntropy += prob * calShannonEnt(subDataSet)
        if iv == 0:
            # The attribute takes a single value across all samples, so it
            # cannot split the data and the ratio would divide by zero.
            continue
        gainRatio = (baseEntropy - newEntropy) / iv
        if gainRatio > bestGainRatio:
            bestGainRatio = gainRatio
            bestFeature = i
    return bestFeature

# Print the index of the attribute selected by the gain-ratio criterion.
print(chooseBestFeatureToSplit_GainRatio(dataSet))

0


In [7]:
# 计算基尼指数
def calGini(dataSet):
    """Compute the Gini index of the class labels in dataSet.

    The class label of each sample is read from its last element and the
    result is 1 - sum(p_k ** 2) over the observed labels.

    Args:
        dataSet: list of samples, each a list ending with its class label.

    Returns:
        The Gini index as a float. An empty dataSet returns 0.0, mirroring
        calShannonEnt, which also reports 0.0 for an empty data set.
    """
    numEntries = len(dataSet)
    if numEntries == 0:
        # No samples means no labels to mix.
        return 0.0

    # Count how many samples carry each class label.
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1

    # Evaluate the index once, after all labels have been counted.
    gini = 1.0
    for count in labelCounts.values():
        prob = float(count) / numEntries
        gini -= prob * prob
    return gini

In [9]:
# 基于基尼指数选择最优划分属性(只能对离散型特征进行处理)
def chooseBestFeatureToSplit_Gini(dataSet):
    """Choose the attribute whose partition has the lowest weighted Gini index.

    Each attribute column is split by its distinct values (so attributes are
    treated as discrete); the Gini index of every subset is weighted by the
    share of samples it holds and the weighted values are added up.

    Returns:
        The column index of the attribute with the smallest weighted Gini
        index, or -1 when the samples have no attribute columns.
    """
    feature_count = len(dataSet[0]) - 1  # last column is the class label
    sample_count = float(len(dataSet))
    best_index, lowest_gini = -1, 100000.0
    for column in range(feature_count):
        weighted_gini = 0.0
        for val in set(row[column] for row in dataSet):
            subset = splitDataSet(dataSet, column, val)
            weight = len(subset) / sample_count
            weighted_gini += weight * calGini(subset)
        if weighted_gini < lowest_gini:
            best_index, lowest_gini = column, weighted_gini
    return best_index

# Print the index of the attribute selected by the Gini-index criterion.
print(chooseBestFeatureToSplit_Gini(dataSet))

0
