In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
from functools import reduce
import pdir as pr

In [80]:
def loadDataSet(filePath):
    '''Read a comma-separated dataset whose last column is the label.

    Every field except the last one is parsed as a float feature.  The last
    field is parsed as a float label unless it is the literal '?', which is
    kept as a string.  Blank lines are skipped.

    A short summary (feature count, sample count, label frequencies) is
    printed before returning.

    Args:
        filePath: path of the text file to read.

    Returns:
        (data, label): two numpy arrays built from the parsed rows.

    Raises:
        ValueError: if a feature field, or a label other than '?', cannot
            be converted to float.
    '''
    data, label = [], []
    with open(filePath) as f:
        # Iterate the file lazily; readlines() would load it all at once.
        for line in f:
            line = line.strip()
            if not line:
                # A blank (e.g. trailing) line carries no sample and would
                # otherwise fail on float('').
                continue
            temp = line.split(",")
            data.append([float(i) for i in temp[:-1]])
            if temp[-1] != '?':
                temp[-1] = float(temp[-1])
            label.append(temp[-1])
    ##### dataset summary #####
    # An empty file has no first row, so report a dimension of 0.
    print("data dimension of dataset：", len(data[0]) if data else 0)
    print("number of sample in data :", len(data))
    print("label frequency:", dict(Counter(label)))
    return np.array(data), np.array(label)

def calcInfoGain_or_InfoGainRate(dataSet, label, calcInfoGainRate=False):
    '''Compute the information gain (or gain ratio) of every feature column.

    Args:
        dataSet: 2-D array-like, one row per sample and one column per
            feature.  Feature values are treated as discrete categories.
        label: 1-D array-like holding one label per row of dataSet.
        calcInfoGainRate: when True, each gain is divided by the entropy of
            the feature column itself (gain ratio); columns whose entropy is
            zero keep their plain gain, which avoids a division by zero.

    Returns:
        1-D numpy array with one value per feature column.
    '''
    def calcEntropy(data):
        '''Entropy (base 2) of the value distribution of a 1-D array.'''
        probs_ = np.array(list(Counter(data).values())) / data.shape[0]
        return -1 * (probs_ * np.log2(probs_)).sum()

    # Accept plain lists as well as ndarrays.
    dataSet = np.asarray(dataSet)
    label = np.asarray(label)

    # Entropy of the labels before any split.
    dataSetEntropy = calcEntropy(label)
    sampleNum, featureNum = dataSet.shape
    infoGains = np.zeros(featureNum)
    for featureId in range(featureNum):
        curFeature = dataSet[:, featureId]
        # Occurrence count of each distinct value of this feature.
        counter = Counter(curFeature)
        probs = np.array(list(counter.values())) / sampleNum
        # Label entropy inside each branch; iterating the Counter yields the
        # values in the same order as counter.values() above.
        entropys = np.array([calcEntropy(label[curFeature == val])
                             for val in counter])
        # Conditional entropy is the branch entropies weighted by branch size.
        condEntropy = (probs * entropys).sum()
        infoGains[featureId] = dataSetEntropy - condEntropy
        if calcInfoGainRate:
            denominator = calcEntropy(curFeature)
            # A single-valued feature has zero entropy; skip the division.
            if denominator != 0:
                # Reuse the entropy computed above instead of recomputing it.
                infoGains[featureId] /= denominator
    return infoGains

def calcGiniIndex(dataSet, label):
    '''Compute the Gini index obtained by splitting on each feature column.

    For one column, the samples are grouped by the column's value; the Gini
    impurity of the labels in each group is weighted by the group's share of
    the samples and the weighted impurities are summed.

    Args:
        dataSet: 2-D numpy array, one row per sample, one column per feature.
        label: 1-D numpy array with one label per row of dataSet.

    Returns:
        1-D numpy array with one Gini index per feature column.
    '''
    sampleNum, featureNum = dataSet.shape
    giniIndexs = np.zeros(featureNum)
    for col in range(featureNum):
        column = dataSet[:, col]
        # How often each distinct value appears in this column.
        valueCounts = Counter(column)
        weights = np.array(list(valueCounts.values())) / sampleNum
        # One impurity slot per distinct value, filled in Counter order.
        impurities = np.zeros(len(valueCounts))
        for pos, value in enumerate(valueCounts):
            # Labels of the samples that take this value.
            branchLabel = label[column == value]
            classCounts = np.array(list(Counter(branchLabel).values()))
            classProbs = classCounts / branchLabel.shape[0]
            impurities[pos] = 1 - (classProbs ** 2).sum()
        giniIndexs[col] = (weights * impurities).sum()
    return giniIndexs

def calcInfoGain(dataSet, label):
    '''Return the information gain of every feature column of dataSet.'''
    # Third positional argument is calcInfoGainRate; False selects plain gain.
    gains = calcInfoGain_or_InfoGainRate(dataSet, label, False)
    return gains

def calcInfoGainRate(dataSet, label):
    '''Return the information gain ratio of every feature column of dataSet.'''
    # Third positional argument is calcInfoGainRate; True selects the ratio.
    gainRates = calcInfoGain_or_InfoGainRate(dataSet, label, True)
    return gainRates


class featureSelection:
    '''Choose the best column to split on according to a named criterion.

    Supported methods:
        'ID3'  -- largest information gain (calcInfoGain)
        'C4.5' -- largest information gain ratio (calcInfoGainRate)
        'CART' -- smallest Gini index (calcGiniIndex)
    '''

    def __init__(self, method):
        '''Store the criterion name; it is checked when a split is requested.'''
        self.method = method
        # True until the first ID3 call has printed its information gains,
        # so the gains are reported once only.
        self.flag = True

    def getFeatureIndex(self, dataSet, label):
        '''Return the 0-based column index of the best feature to split on.

        Args:
            dataSet: 2-D array of samples, passed on to the scoring function.
            label: 1-D array with one label per sample.

        Raises:
            ValueError: if self.method is not 'ID3', 'C4.5' or 'CART'.
        '''
        if self.method == 'ID3':
            # Score once and reuse the result for both reporting and argmax.
            infoGains = calcInfoGain(dataSet, label)
            bestIndex = np.argmax(infoGains)
            if self.flag:
                print("info gain", infoGains)
                print("best one(index)", bestIndex)
                self.flag = False
            return bestIndex
        if self.method == 'C4.5':
            return np.argmax(calcInfoGainRate(dataSet, label))
        if self.method == 'CART':
            return np.argmin(calcGiniIndex(dataSet, label))
        # Returning None here would only surface later as an obscure
        # list-index TypeError in the tree builder, so fail loudly instead.
        raise ValueError(
            "ERROR: method not define! got %r, expected 'ID3', 'C4.5' or 'CART'"
            % (self.method,))

class decisionTree:
    '''Decision tree classifier stored as nested dicts.

    An internal node has the form
        {featureName: {'tree': {value: child, ...}, 'defaultLabel': label}}
    where each child is either another node or a leaf label.  'defaultLabel'
    is the most common training label at that node and is returned when a
    sample carries a value that has no branch in the node.
    '''

    def __init__(self, method):
        '''Create a tree whose splits are chosen by featureSelection(method).'''
        self.featureSelectionMethod = featureSelection(method=method)

    def __getBestSplitFeature(self, dataSet, label):
        '''Return the column index of the best feature to split dataSet on.'''
        return self.featureSelectionMethod.getFeatureIndex(dataSet, label)

    def __getSubSet(self, dataSet, label, splitIndex, splitValue):
        '''Return the samples whose split column equals splitValue.

        The split column is removed from the returned sub dataset; the
        matching labels are returned alongside it.
        '''
        rows = dataSet[:, splitIndex] == splitValue
        subDataSet = np.delete(dataSet[rows], splitIndex, axis=1)
        return subDataSet, label[rows]

    def __getMostCommonLabel(self, label):
        '''Return the label that occurs most often.'''
        return Counter(label).most_common(1)[0][0]

    def __buildTree(self, dataSet, label, featuresName):
        '''Recursively build the (sub)tree for dataSet; returns a node or a leaf.

        featuresName[i] is the name of column i of dataSet.  The list is not
        modified.
        '''
        # Stop 1: every sample has the same label -> leaf with that label.
        if len(Counter(label)) == 1:
            return label[0]
        # Stop 2: no samples, or every feature has already been used for a
        # split (zero columns left) -> majority label.  Checking only the
        # number of samples would let a zero-column dataset fall through to
        # feature selection, which cannot pick a column from nothing.
        if len(dataSet) == 0 or dataSet.shape[1] == 0:
            return self.__getMostCommonLabel(label)
        # Stop 3: all samples agree on every remaining feature, so no split
        # can separate them -> majority label.
        if all(len(Counter(dataSet[:, featureIndex])) == 1
               for featureIndex in range(dataSet.shape[1])):
            return self.__getMostCommonLabel(label)

        splitIndex = self.__getBestSplitFeature(dataSet, label)
        splitFeature = featuresName[splitIndex]
        # Names of the columns that remain after the split column is removed;
        # built as a new list so the caller's list is left untouched.
        remainingNames = (list(featuresName[:splitIndex])
                          + list(featuresName[splitIndex + 1:]))
        # 'defaultLabel' is used at prediction time for unseen values.
        node = {'tree': {}, 'defaultLabel': self.__getMostCommonLabel(label)}
        for val in set(dataSet[:, splitIndex]):
            subDataSet, subLabel = self.__getSubSet(dataSet, label, splitIndex, val)
            node['tree'][val] = self.__buildTree(subDataSet, subLabel, remainingNames)
        return {splitFeature: node}

    def buildTree(self, dataSet, label, featuresName):
        '''Train the tree and store it in self.tree.

        Args:
            dataSet: 2-D numpy array, one row per sample.
            label: 1-D numpy array with one label per row.
            featuresName: sequence with one name per column of dataSet.
                It is copied, so the caller's sequence is not modified.
        '''
        self.tree = self.__buildTree(dataSet, label, list(featuresName))

    def __apply(self, tree, sample, featuresName):
        '''Classify one sample by walking the tree from the given node.'''
        node = tree
        # A non-dict node is a leaf label; this also covers a tree that
        # consists of a single leaf (all training labels identical).
        while isinstance(node, dict):
            rootFeature = next(iter(node))
            branches = node[rootFeature]['tree']
            # Map the node's feature name back to the sample's column.
            value = sample[featuresName.index(rootFeature)]
            if value not in branches:
                # Value without a branch in this node: use the fallback label.
                return node[rootFeature]['defaultLabel']
            node = branches[value]
        return node

    def apply(self, dataSet, featuresName):
        '''Classify every row of dataSet and return the labels.

        Args:
            dataSet: 2-D numpy array, one row per sample.
            featuresName: sequence with one name per column of dataSet; the
                names must match those used when the tree was built.

        Returns:
            1-D float numpy array with one predicted label per row.
        '''
        ansLabel = np.zeros(dataSet.shape[0])
        for index, sample in enumerate(dataSet):
            ansLabel[index] = self.__apply(self.tree, sample, featuresName)
        return ansLabel

    def getTree(self):
        '''Return the trained tree (nested dicts, or a bare label for a leaf).'''
        return self.tree
    
    
# Demo: train an ID3 tree and classify the test samples.
# NOTE: trainSet, trainSet_label and testSet come from the data-loading cell
# (the loadDataSet calls); that cell has to be executed before this code.
featuresName_ = ['F{}'.format(col) for col in range(trainSet.shape[1])]
testTree = decisionTree('ID3')
# Copies of the name list are handed over so featuresName_ stays intact.
testTree.buildTree(trainSet, trainSet_label, list(featuresName_))
ansTree = testTree.getTree()
resLabel = testTree.apply(testSet, list(featuresName_))
resLabel
ansTree

info gain [ 0.2473004  0.0133154  0.0784958  0.0784958]
best one(index) 0


array([-1.,  1.,  1., -1.,  1., -1.])

{'F0': {'defaultLabel': 1.0,
  'tree': {1.0: {'F2': {'defaultLabel': -1.0, 'tree': {1.0: -1.0, 2.0: 1.0}}},
   2.0: 1.0,
   3.0: {'F3': {'defaultLabel': 1.0, 'tree': {1.0: 1.0, 2.0: -1.0}}}}}}

In [83]:
def runDecisionTree(trainSet_, trainSet_label_, validateSet_, validateSet_label_,
                    methods=('ID3',)):
    '''Train a decision tree per method and print its validation accuracy.

    Args:
        trainSet_: 2-D numpy array of training samples.
        trainSet_label_: 1-D numpy array of training labels.
        validateSet_: 2-D numpy array of samples to classify.
        validateSet_label_: 1-D numpy array with the expected labels of
            validateSet_.
        methods: name, or iterable of names, of the feature selection
            methods to evaluate; each name is passed to decisionTree.
            Defaults to ID3 only.

    For every method one line "<method>: <accuracy>%" is printed.  Nothing
    is returned.
    '''
    def run(method):
        '''Build a tree with the given method and report its accuracy.'''
        test = decisionTree(method)
        # Columns are simply named by their index.
        featuresName_ = list(range(trainSet_.shape[1]))
        test.buildTree(trainSet_, trainSet_label_, featuresName_[:])
        ans = test.apply(validateSet_, featuresName_[:])
        # Number of predictions that equal the expected label.
        hits = np.argwhere(ans == validateSet_label_).shape[0]
        accur = 100 * float(hits / validateSet_label_.shape[0])
        print(method+":", "%.3f%%" % accur)

    # A bare string would otherwise be iterated character by character.
    if isinstance(methods, str):
        methods = (methods,)
    for method_ in methods:
        run(method_)
        
# The training set is passed as the validation set as well, so the printed
# figure is the accuracy on the training data itself, not on held-out data.
runDecisionTree(trainSet, trainSet_label, trainSet, trainSet_label)

info gain [ 0.2473004  0.0133154  0.0784958  0.0784958]
best one(index) 0
ID3: 100.000%


In [45]:
# Load the labelled training samples.  Forward slashes are used because they
# are accepted on Windows as well as on POSIX systems, whereas a backslash
# path only resolves on Windows.
trainSet, trainSet_label = loadDataSet('./data/YStrain3.csv')
trainSet

# Load the samples to classify.  The labels get a real name instead of `_`,
# which IPython uses for the most recent cell output.
testSet, testSet_label = loadDataSet('./data/YStest3.csv')
testSet

data dimension of dataset： 4
number of sample in data : 15
label frequency: {1.0: 9, -1.0: 6}


array([[ 1.,  1.,  2.,  1.],
       [ 3.,  1.,  2.,  2.],
       [ 1.,  3.,  1.,  1.],
       [ 2.,  3.,  2.,  1.],
       [ 3.,  2.,  2.,  1.],
       [ 3.,  2.,  1.,  2.],
       [ 1.,  2.,  2.,  2.],
       [ 3.,  1.,  2.,  1.],
       [ 1.,  3.,  1.,  2.],
       [ 2.,  3.,  1.,  1.],
       [ 1.,  2.,  1.,  1.],
       [ 3.,  2.,  1.,  1.],
       [ 3.,  1.,  2.,  2.],
       [ 2.,  2.,  1.,  2.],
       [ 2.,  1.,  2.,  2.]])

data dimension of dataset： 4
number of sample in data : 6
label frequency: {'?': 6}


array([[ 3.,  3.,  2.,  2.],
       [ 2.,  1.,  2.,  2.],
       [ 2.,  2.,  2.,  2.],
       [ 1.,  1.,  1.,  1.],
       [ 3.,  2.,  1.,  1.],
       [ 1.,  3.,  1.,  1.]])