In [5]:
import numpy as np
# 程序清单3-1 计算给定数据集的香农熵
from math import log

def calcShannonEnt(dataSet):
    numEntries = len(dataSet)#计算行数，即数据集的总数
    labelCounts = {}#定义一个字典
    for featVec in dataSet:
        currentLabel = featVec[-1]#最后一列是标签即结果是鱼或者不是鱼
        #print(currentLabel)#
        if currentLabel not in labelCounts.keys():#如果当前字典的值不在我们定义的字典中，则加入
            labelCounts[currentLabel] = 0#标签字典值为0，即当前键值不存在
        labelCounts[currentLabel] += 1#否则，标签字典值加1，加入字典中
    #print(labelCounts.keys())
    #print(labelCounts[currentLabel])
    shannonEnt = 0.0
    for key in labelCounts:#字典的值的遍历方式
        prob = float(labelCounts[key]) / numEntries#某分类的概率
        shannonEnt -= prob * log(prob, 2)#计算熵的公式
    return shannonEnt

In [6]:
def createDataSet():
    dataSet = [
              [1, 1, 'yes'],
              [1, 1, 'yes'],
              [1, 0, 'no'],
              [0, 1, 'no'],
              [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels

In [7]:
myDat, labels = createDataSet()
#print(myDat)
print(calcShannonEnt(myDat))#计算信息熵

#myDat[0][-1] = 'maybe'  # 熵越高，则混合的数据也越多
#print(myDat)
#print(calcShannonEnt(myDat))

0.9709505944546686


In [8]:
#程序清单 3-2 按照给定特征划分数据集
def splitDataSet(dataSet, axis, value):
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis] 
            #print(reducedFeatVec)
            reducedFeatVec.extend(featVec[axis + 1 : ])#增加一个元素
            #print(featVec[axis + 1 : ])
            retDataSet.append(reducedFeatVec)#添加一个列表
            #print(reducedFeatVec)
    return retDataSet 

In [9]:
myDat, labels = createDataSet()
print(myDat)

#print(splitDataSet(myDat, 0, 1))#如果第一列为value，则输出一行中的值(除去value)
#print(splitDataSet(myDat, 0, 0))
print(splitDataSet(myDat, 1, 1))
#print(splitDataSet(myDat, 1, 0))
#print(myDat[0])

[[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]
[[1, 'yes'], [1, 'yes'], [0, 'no'], [0, 'no']]


In [10]:
# 程序清单 3-3 选择最好的数据集划分方式
def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1#dataSet[0]第一行[1, 1, 'yes']
#     print(dataSet[0])
#     print(numFeatures)
    baseEntropy = calcShannonEnt(dataSet)#计算香农熵
    bestInfoGain = 0.0; bestFeature = -1
    for i in np.arange(numFeatures):
        featList = [example[i] for example in dataSet]#按列输出数据
        print(featList)
        uniqueVals = set(featList)#set集合{}去掉重复元素
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet) 
        infoGain = baseEntropy - newEntropy#信息增益
        if infoGain > bestInfoGain:#找到信息增益最大的列并返回
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature

In [11]:
myDat, labels = createDataSet()
print(chooseBestFeatureToSplit(myDat))

[1, 1, 1, 0, 0]
[1, 1, 0, 1, 1]
0


In [12]:
# 如果数据集已经处理了所有属性，但是类标签依然不是唯一的，此时我们需要决定如何定义该叶子节点，
# 在这种情况下，我们通常会采用 多数表决的方法决定该叶子节点的分类
def majorityCnt(classList):
    classCount = {}#定义一个字典
    for vote in classList:
        if vote not in classCount.keys():#存储字典中的不同值并计算不同值的个数
            classCount[vote] = 0
        classCount[vote] += 1
    
    newvalue = -1
    for key in classCount:#找出最大值
        if newvalue < classCount[key]:
            newkey = key
            newvalue = classCount[key]
    return newkey

In [13]:
# 程序清单 3-4 创建树的函数代码
def createTree(dataSet, labels): # 两个输入参数-- 数据集， 标签列表
    classList = [example[-1] for example in dataSet]#把标签保存到classList中
    if classList.count(classList[0]) == len(classList):
        return classList[0]   # 如果类别完全相同则停止继续划分
    
    if len(dataSet[0]) == 1:  # 遍历完所有特征时返回出现次数最多的类别
        return majorityCnt(classList) 
    
    bestFeat = chooseBestFeatureToSplit(dataSet)#找出最好的用于划分的特征
    bestFeatLabel = labels[bestFeat]#'no surfacing
    myTree = {bestFeatLabel:{}}
    del(labels[bestFeat])
    
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]  # 这行代码复制了类标签
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels) # 字典的嵌套
        #print('1  ',bestFeatLabel,' ',value,' ',myTree[bestFeatLabel][value])
    #print('2 ',myTree, '\n')
    return myTree

In [14]:
myDat, labels = createDataSet()
myTree = createTree(myDat, labels)
print(myTree)
# {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}

[1, 1, 1, 0, 0]
[1, 1, 0, 1, 1]
[1, 1, 0]
{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}


In [19]:
# 程序清单3-8 使用决策树的分类函数
def classify(inputTree, featLabels, testVec):
    firstStr = list(inputTree.keys())[0]#字典中的键
    #print(inputTree.keys())
    #print(firstStr)
    secondDict = inputTree[firstStr]#no surfacing对应的值
    featIndex = featLabels.index(firstStr)#在featLabels中索引到no surfacing的位置
    #print('1 ',featIndex)
    for key in secondDict.keys():#
        if testVec[featIndex] == key:#若我们测试的值和树的值一样，说明找到对的位置
            if isinstance(secondDict[key], dict):#若该值是字典类型，继续找
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:#不然就找到了
                classLabel = secondDict[key]
    return classLabel

In [20]:
myDat, labels = createDataSet()
print(labels)
#myTree = retrieveTree(0)
print(myTree)
#print(inputTree)
#print(classify(myTree, labels, [1, 0]))
print(classify(myTree, labels, [1, 1]))

['no surfacing', 'flippers']
{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
yes


In [24]:
# 程序清单 3-9 使用pickle模块存储决策树
import pickle
def storeTree(inputTree, filename):
    fw = open(filename, 'wb')
    pickle.dump(inputTree, fw)#存储
    fw.close()
    
def grabTree(filename):
    fr = open(filename, 'rb')
    return pickle.load(fr)#读取

In [27]:
myDat, labels = createDataSet()
#print(labels)
myTree = createTree(myDat, labels)

storeTree(myTree, r'E:\Program Files\Machine Learning\机器学习实战及配套代码\machinelearninginaction\Ch03\a.txt')#先存储

print(grabTree(r'E:\Program Files\Machine Learning\机器学习实战及配套代码\machinelearninginaction\Ch03\a.txt'))#后读取

{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}


In [28]:
fr = open(r'E:\Program Files\Machine Learning\机器学习实战及配套代码\machinelearninginaction\Ch03\lenses.txt')
lenses = [inst.strip().split('\t') for inst in fr.readlines()]
# print(lenses)
lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate']
lensesTree = createTree(lenses, lensesLabels)
print(lensesTree)
#createPlot(lensesTree)

{'tearRate': {'reduced': 'no lenses', 'normal': {'astigmatic': {'yes': {'prescript': {'hyper': {'age': {'pre': 'no lenses', 'young': 'hard', 'presbyopic': 'no lenses'}}, 'myope': 'hard'}}, 'no': {'age': {'pre': 'soft', 'young': 'soft', 'presbyopic': {'prescript': {'hyper': 'soft', 'myope': 'no lenses'}}}}}}}}


In [14]:
list = [1, 1, 'yes']
list.extend([3])
print(list)
list.append([5,6,7])
print(list)

[1, 1, 'yes', 3]
[1, 1, 'yes', 3, [5, 6, 7]]
