# 决策树实践

## 计算信息熵

In [1]:
"""计算香农信息熵的python程序代码
"""
from math import log
import pandas as pd

def calcShannonEnt(dataFrame):
    """
    Compute the Shannon entropy (base 2) of a labelled data set.

    Args:
        dataFrame: pandas.DataFrame whose LAST column holds the class label
            of each row.

    Returns:
        float: entropy of the label distribution. 0.0 is returned when the
        frame has no rows, because the accumulation loop never runs.
    """
    row_total = dataFrame.shape[0]  # number of samples in the data set

    # One count per distinct label, taken from the last column.
    # pandas' value_counts() skips missing labels by default.
    class_sizes = dataFrame.iloc[:, -1].value_counts().tolist()

    entropy = 0.0
    for size in class_sizes:
        # H = -sum(p * log2(p)) over every label that occurs.
        share = float(size) / row_total
        entropy -= share * log(share, 2)
    return entropy

def calcDatingDataSetEnt():
    """
    Load the dating data set and print the Shannon entropy of its labels.

    Reads the tab-separated, header-less file
    ``data/dating/datingTestSet.txt`` (path relative to the working
    directory), names its four columns, and passes the frame to
    ``calcShannonEnt``, which treats the last column as the label.

    Returns:
        None. The entropy is written to stdout.
    """
    # Forward slashes are used on purpose: the previous literal
    # "data\dating\..." contained the invalid escape sequence "\d" and only
    # resolved on Windows. "/" is accepted on every platform.
    dataframe = pd.read_csv(
        "data/dating/datingTestSet.txt",
        header=None,
        sep='\t',
        names=['年飞机里程','周冰淇淋升数','游戏耗时比','心仪程度'],
    )
    entropy = calcShannonEnt(dataframe)
    print("当前数据集的香农信息熵：%f"  % entropy)

# Run the demo: prints the entropy of the dating data set's label column.
calcDatingDataSetEnt()

当前数据集的香农信息熵：1.584702


## 计算信息增益

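属性 a 对样本集 D 进行划分所获得的“信息增益”（information gain）定义为：

$$\mathrm{Gain}(D,a)=\mathrm{Ent}(D)-\sum_{v=1}^{V}\frac{|D^{v}|}{|D|}\,\mathrm{Ent}(D^{v})$$

其中 $D^{v}$ 表示 D 中在属性 a 上取第 v 个值的样本子集，$\mathrm{Ent}(\cdot)$ 为信息熵。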

In [34]:
def calcInforGain(df,aFeature):
    """
    Compute the information gain obtained by splitting ``df`` on one feature.

    Gain(D, a) = Ent(D) - sum_v (|D_v| / |D|) * Ent(D_v), where D_v is the
    subset of rows whose value of feature ``a`` equals v.

    Args:
        df: data set as a pandas.DataFrame; ``calcShannonEnt`` reads the
            label from its last column.
        aFeature: name of the column to split on.

    Returns:
        The information gain of ``aFeature`` as a single number.
    """
    row_total = df.shape[0]
    feature_column = df[aFeature]

    # Start from the entropy of the whole set, then subtract the weighted
    # entropy of every partition induced by a distinct feature value.
    gain = calcShannonEnt(df)
    for feature_value in feature_column.value_counts().index:
        partition = df[feature_column == feature_value]
        weight = partition.shape[0] / row_total
        gain -= weight * calcShannonEnt(partition)
    return gain

def getBestDivideFeature(df):
    """
    Pick the feature with the highest information gain.

    Every column except the last one (the label) is scored with
    ``calcInforGain``. The raw scores and the ranking sorted by descending
    gain are both printed.

    Args:
        df: data set as a pandas.DataFrame with the label in the last column.

    Returns:
        tuple: ``(feature_name, information_gain)`` of the best feature.
    """
    gains_by_feature = {
        column: calcInforGain(df, column) for column in df.columns[:-1]
    }
    print(gains_by_feature)

    # Rank once, highest gain first; the head of the ranking is the winner.
    ranking = sorted(
        gains_by_feature.items(), key=lambda pair: pair[1], reverse=True
    )
    winner = ranking[0]
    print(ranking)
    return winner

# Demo: rank the features of a data set by information gain.
# Alternative data set (voting records), currently disabled:
#dataframe = pd.read_csv("data/vote/VoteTraining-cn.csv",header=0,sep=',')        
#getBestDivideFeature(dataframe)    
# Load the comma-separated file, taking column names from its first row.
dataframe = pd.read_csv("data/maloon/maloon2.txt",header=0,sep=',')        
# iloc[:, 1:] drops the first column so it is not scored as a feature
# (presumably a sample-id column -- confirm against the data file).
getBestDivideFeature(dataframe.iloc[:,1:]) 

{'色泽': 0.10812516526536525, '根蒂': 0.14267495956679288, '敲声': 0.1407814336149959, '纹理': 0.3805918973682686, '脐部': 0.289158782841679, '触感': 0.006046489176565639}
[('纹理', 0.3805918973682686), ('脐部', 0.289158782841679), ('根蒂', 0.14267495956679288), ('敲声', 0.1407814336149959), ('色泽', 0.10812516526536525), ('触感', 0.006046489176565639)]


('纹理', 0.3805918973682686)

In [31]:
# Bare expression: only displayed by the notebook when it is the cell's last
# statement, so this line has no visible effect here.
dataframe
# Keep only the rows whose '色泽' column equals '青绿'.
dv1 = dataframe[dataframe['色泽'] == '青绿']
# Last expression of the cell: the notebook renders this subset.
dv1

Unnamed: 0,编号,色泽,根蒂,敲声,纹理,脐部,触感,标记
0,1,青绿,蜷缩,浊响,清晰,凹陷,硬滑,好瓜
3,4,青绿,蜷缩,沉闷,清晰,凹陷,硬滑,好瓜
5,6,青绿,稍蜷,浊响,清晰,稍凹,软粘,好瓜
9,10,青绿,硬挺,清脆,清晰,平坦,软粘,坏瓜
12,13,青绿,稍蜷,浊响,稍糊,凹陷,硬滑,坏瓜
16,17,青绿,蜷缩,沉闷,稍糊,稍凹,硬滑,坏瓜


In [38]:
import IPython, graphviz, re
from io import StringIO
from IPython.display import Image
import numpy as np
import pandas as pd
import math
from sklearn import tree
from sklearn.datasets import load_boston, load_iris
from collections import defaultdict

from dtreeviz.trees import *

# Demo: fit a depth-limited regression tree and render it with dtreeviz.
# random_state is fixed so the fitted tree is reproducible.
regr = tree.DecisionTreeRegressor(max_depth=5, random_state=666)
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this call (and its import) fails on current releases -- confirm the
# installed version or switch to another regression data set.
boston = load_boston()

# Wrap the feature matrix in a DataFrame so the columns carry feature names.
X_train = pd.DataFrame(boston.data, columns=boston.feature_names)
y_train = boston.target

regr = regr.fit(X_train, y_train)

# Rendering shells out to the Graphviz `dot` executable, which must be on
# PATH; the recorded output of this cell shows ExecutableNotFound because it
# was missing.
viz = dtreeviz(regr, boston.data, boston.target, target_name="price", feature_names=boston.feature_names, orientation="TD")
# Last expression of the cell: the notebook displays the visualization object.
viz

ExecutableNotFound: failed to execute ['dot', '-Tsvg', '-o', 'C:\\Users\\leo\\AppData\\Local\\Temp/DTreeViz_10700.svg', 'C:/Users/leo/AppData/Local/Temp\\DTreeViz_10700'], make sure the Graphviz executables are on your systems' PATH

<dtreeviz.trees.DTreeViz at 0x1e9096844e0>