# 张三1000-决策树作业



## 计算信息熵和信息增益

In [7]:
"""计算香农信息熵的python程序代码
"""
from math import log
from sklearn import neighbors,datasets
import pandas as pd

def calcShannonEnt(dataFrame):
    """
    Compute the Shannon entropy (base 2) of a labelled data set.

    Input:  dataFrame, a pandas DataFrame whose last column holds the
            class label of each sample.
    Output: the entropy of the label distribution as a float; 0.0 when the
            label column has no counted values.
    """
    total = dataFrame.shape[0]  # number of samples in the data set

    # Size of every class, taken from the last (label) column.
    classSizes = dataFrame.iloc[:, -1].value_counts()

    entropy = 0.0
    for size in classSizes.tolist():
        # H = -sum(p * log2(p)) accumulated one class at a time.
        p = float(size) / total
        entropy -= p * log(p, 2)
    return entropy



def calcInforGain(df,aFeature):
    """
    Compute the information gain obtained by splitting df on one attribute.

    Input:  df, a pandas DataFrame whose last column is the class label;
            aFeature, the name of the column to split on.
    Output: the information gain as a float, i.e. the entropy of df minus
            the size-weighted entropies of the subsets that share one value
            of aFeature.
    """
    nSamples = df.shape[0]  # size of the whole data set

    # Start from the entropy of the unsplit data set ...
    gain = calcShannonEnt(df)

    # ... and subtract the weighted entropy of each value's subset.
    for featureValue in df[aFeature].value_counts().index:
        part = df[df[aFeature] == featureValue]
        gain -= part.shape[0] / nSamples * calcShannonEnt(part)
    return gain

def getBestDivideFeature(df):
    """
    Pick the attribute with the largest information gain.

    Input:  df, a pandas DataFrame whose last column is the class label;
            every other column is treated as a candidate attribute.
    Output: a tuple (attribute name, information gain) for the best
            attribute. Raises IndexError when df has no attribute columns.
    """
    # Information gain of every non-label column, keyed by column name.
    gains = {name: calcInforGain(df, name) for name in df.columns[:-1]}

    # Stable descending sort: among equal gains the earlier column wins.
    ranked = list(gains.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[0]

# Alternative data set, kept for reference:
#dataframe = pd.read_csv("data/vote/VoteTraining-cn.csv",header=0,sep=',')        
#getBestDivideFeature(dataframe)    
# Load the comma-separated data set; its first row holds the column names.
dataframe = pd.read_csv("data/maloon/maloon2.txt",header=0,sep=',')        
# Rank the attributes with the first column left out
# (presumably a sample id - TODO confirm against the data file).
getBestDivideFeature(dataframe.iloc[:,1:])  

('纹理', 0.3805918973682686)

## 构建决策树结点类

In [2]:
import pandas as pd
class Node:
    """A node of the decision tree.

    State is exposed through four properties:
        type     -- free-form node kind (for example 'Leaf' or a split
                    description); None until set.
        label    -- class label attached to the node; None until set.
        samples  -- pandas DataFrame with the samples that reached the node.
        children -- dict mapping a branch key to the child Node.

    ``type``, ``label`` and ``samples`` are readable and writable. The
    names ``setType``, ``setLabel`` and ``setSamples`` are kept as aliases
    of those properties so existing code written as ``node.setType = x``
    keeps working.
    """

    def __init__(self):
        self._type = None
        self._label = None
        self._samples = pd.DataFrame()
        self._children = {}

    @property
    def type(self):
        """Kind of the node."""
        return self._type

    @type.setter
    def type(self, value):
        self._type = value

    # Backward-compatible alias: ``node.setType = value``.
    setType = type

    @property
    def label(self):
        """Class label attached to the node."""
        return self._label

    @label.setter
    def label(self, value):
        self._label = value

    # Backward-compatible alias: ``node.setLabel = value``.
    setLabel = label

    @property
    def samples(self):
        """DataFrame with the samples that belong to the node."""
        return self._samples

    @samples.setter
    def samples(self, value):
        # Anything that is not a DataFrame is ignored on purpose, so
        # ``samples`` always stays a DataFrame and __str__ can rely on it.
        if isinstance(value, pd.DataFrame):
            self._samples = value

    # Backward-compatible alias: ``node.setSamples = value``.
    setSamples = samples

    @property
    def children(self):
        """Dict of child nodes keyed by branch name."""
        return self._children

    def addChildren(self,key,children):
        """Attach the node ``children`` under branch ``key`` (replacing any previous one)."""
        self._children[key] = children

    def getChildrenTypeList(self):
        """Return one 'type[key]' string per child, in insertion order."""
        # format() instead of '+' so an unset type (None) or a non-string
        # branch key does not raise TypeError.
        return ['{}[{}]'.format(child.type, key)
                for key, child in self._children.items()]

    def __str__(self):
        return "Type:{},Label:{},Samples:{},Children:{} ".format(
                self._type,self._label,
                self._samples.index.tolist(),
                self.getChildrenTypeList())

def bfs(rootNode,depth = 0):
    """
    Print the tree rooted at rootNode, one node per line.

    Each node is printed before its children and indented with one tab per
    level, so despite the name this is a depth-first, pre-order walk.

    Input:  rootNode, an object with a ``children`` dict of child nodes;
            depth, the indentation level used for rootNode.
    Output: the ``depth`` argument, unchanged.
    """
    indent = '\t' * depth
    print("{}{}".format(indent, rootNode))
    for childNode in rootNode.children.values():
        bfs(childNode, depth + 1)
    return depth
                
# Small hand-built tree to exercise Node and bfs:
# a root with two 'middle' children, the first of which has two leaves.
node = Node()
node.setType = 'root' 
child = Node()
node.addChildren('a=1',child)
child.setType = 'middle'
child2 = Node()
node.addChildren('a=2',child2)
child2.setType = 'middle'
ss = Node()
ss.setType = 'leaf'
ss1 = Node()
ss1.setType = 'leaf'
child.addChildren('b=1',ss)
child.addChildren('b=2',ss1 )

# Print the tree, one node per line, indented by level.
bfs(node)

Type:root,Label:None,Samples:[],Children:['middle[a=1]', 'middle[a=2]'] 
	Type:middle,Label:None,Samples:[],Children:['leaf[b=1]', 'leaf[b=2]'] 
		Type:leaf,Label:None,Samples:[],Children:[] 
		Type:leaf,Label:None,Samples:[],Children:[] 
	Type:middle,Label:None,Samples:[],Children:[] 


0

## 决策树构建

In [8]:
import pandas as pd

def _majorityLabel(dataFrame):
    """Return the most frequent value of the last (label) column, or '' if it has none."""
    counts = dataFrame.iloc[:, -1].value_counts()
    if counts.empty:
        return ''
    # idxmax returns the first label that reaches the highest count.
    return counts.idxmax()


def _makeLeaf(samples, label):
    """Build a 'Leaf' node carrying the given samples and class label."""
    leaf = Node()
    leaf.setType = 'Leaf'
    leaf.setLabel = label
    leaf.setSamples = samples
    return leaf


def createDT(dataFrame,depth=3):
    """
    Build a decision tree from a labelled data set by recursively splitting
    on the attribute chosen by getBestDivideFeature.

    Input:  dataFrame, the training set D as a pandas DataFrame; the last
            column is the class label and every other column an attribute.
            depth, how many more levels of splits may follow below this
            node's own split. With depth == 0 every branch of this node
            becomes a leaf. A negative depth never reaches 0, so the tree
            is not limited. Defaults to 3.
    Output: the root Node of the tree. Branch keys have the form
            '<attribute>=<value>'.
    """
    labelCounts = dataFrame.iloc[:, -1].value_counts()
    if len(labelCounts) == 1:
        # All samples in D belong to one class: leaf of that class.
        return _makeLeaf(dataFrame, labelCounts.index[0])

    features = dataFrame.columns[:-1]
    if dataFrame.shape[0] == 0 or len(features) == 0:
        # No samples, or the attribute set A is empty (only the label
        # column is left): leaf labelled with the majority class of D.
        return _makeLeaf(dataFrame, _majorityLabel(dataFrame))

    if all(dataFrame[feature].nunique() <= 1 for feature in features):
        # Every attribute has a single value over D, so no split can
        # separate the samples even though their labels differ:
        # leaf labelled with the majority class of D.
        return _makeLeaf(dataFrame, _majorityLabel(dataFrame))

    # Choose the best attribute a* from A and make this node a split on it.
    aFeature = getBestDivideFeature(dataFrame)[0]
    node = Node()
    node.setType = "据\"{}\"划分".format(aFeature)
    node.setLabel = '未定'
    node.setSamples = dataFrame

    # One branch per value of a* that value_counts reports for D.
    for colValue in dataFrame[aFeature].value_counts().index:
        # format() so non-string attribute names or values also work.
        keyname = '{}={}'.format(aFeature, colValue)
        # Dv: the samples of D whose a* equals this value.
        dv = dataFrame[dataFrame[aFeature] == colValue]
        if dv.empty:
            # Empty Dv (e.g. an unused category): leaf labelled with the
            # majority class of the parent set D.
            node.addChildren(keyname, _makeLeaf(dv, _majorityLabel(dataFrame)))
            continue

        # Dv without the column that was just used for the split.
        subDv = dv.drop([aFeature], axis=1)
        if depth == 0:
            # Depth budget used up: close the branch with the majority
            # class of Dv.
            node.addChildren(keyname, _makeLeaf(subDv, _majorityLabel(subDv)))
        else:
            node.addChildren(keyname, createDT(subDv, depth - 1))
    return node
    
# Alternative data set, kept for reference:
#dataframe = pd.read_csv("data/vote/votesimple.txt",header=0,sep=',')    
# Load the comma-separated data set; its first row holds the column names.
df = pd.read_csv("data/maloon/maloon2.txt",header=0,sep=',')
# Leave out the first column
# (presumably a sample id - TODO confirm against the data file).
d = df.iloc[:,1:]
dtree = createDT(d,depth = 5)
print('决策树：')
# Print the resulting tree, one node per line, indented by level.
bfs(dtree)

决策树：
Type:据"纹理"划分,Label:未定,Samples:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],Children:['据"根蒂"划分[纹理=清晰]', '据"触感"划分[纹理=稍糊]', 'Leaf[纹理=模糊]'] 
	Type:据"根蒂"划分,Label:未定,Samples:[0, 1, 2, 3, 4, 5, 7, 9, 14],Children:['Leaf[根蒂=蜷缩]', '据"色泽"划分[根蒂=稍蜷]', 'Leaf[根蒂=硬挺]'] 
		Type:Leaf,Label:好瓜,Samples:[0, 1, 2, 3, 4],Children:[] 
		Type:据"色泽"划分,Label:未定,Samples:[5, 7, 14],Children:['Leaf[色泽=乌黑]', 'Leaf[色泽=青绿]'] 
			Type:Leaf,Label:好瓜,Samples:[7, 14],Children:[] 
			Type:Leaf,Label:好瓜,Samples:[5],Children:[] 
		Type:Leaf,Label:坏瓜,Samples:[9],Children:[] 
	Type:据"触感"划分,Label:未定,Samples:[6, 8, 12, 13, 16],Children:['Leaf[触感=硬滑]', 'Leaf[触感=软粘]'] 
		Type:Leaf,Label:坏瓜,Samples:[8, 12, 13, 16],Children:[] 
		Type:Leaf,Label:好瓜,Samples:[6],Children:[] 
	Type:Leaf,Label:坏瓜,Samples:[10, 11, 15],Children:[] 


0

In [10]:
import sklearn
from sklearn.tree import DecisionTreeClassifier
# Show which file the imported sklearn package was loaded from.
print(sklearn.__file__)

D:\pythonspace\anaconda3\lib\site-packages\sklearn\__init__.py


## 使用sklearn中的决策树分类器