## Decision tree

### Code

In [None]:
class decision_tree:
    """Binary decision tree for categorical features, split by Gini index.

    Every internal node tests ``sample[feature] == value``: samples that
    match go to the ``"left"`` child, all others to the ``"right"`` child.
    A fitted tree is a nested dict of the form
    ``{(feature, value): {"left": subtree, "right": subtree}}`` whose leaves
    are class labels.

    ``dataset`` arguments are expected to be 2-D NumPy arrays (one row per
    sample) and ``label`` arguments 1-D NumPy arrays of the same length,
    because both are indexed with boolean masks.
    """

    def calc_sub_Gini(self, y_sub):
        """Return the Gini index ``1 - sum(p_k ** 2)`` of the labels ``y_sub``.

        ``p_k`` is the fraction of entries in ``y_sub`` that belong to class
        ``k`` (Eq. 5.24 in the textbook this code follows).
        """
        _, counts = np.unique(y_sub, return_counts=True)
        proportions = counts / len(y_sub)
        return 1 - np.sum(proportions ** 2)

    def calc_best_feature(self, dataset, label):
        """Pick the ``(feature, value)`` split with the lowest weighted Gini.

        Returns:
            A tuple ``(feature_index, split_value)``, or ``None`` when every
            feature holds a single distinct value (nothing left to split on).
        """
        feature_gini = {}
        n_samples = dataset.shape[0]
        for feature in range(dataset.shape[1]):
            column = dataset[:, feature]
            # np.unique returns sorted values, so candidate order (and hence
            # tie-breaking between equally good splits) is deterministic.
            values = np.unique(column)
            if len(values) < 2:
                # Constant feature: no cut point (the feature is used up).
                continue
            if len(values) == 2:
                # With two values both candidate splits give the same
                # partition, so evaluating the first one is enough.
                values = values[:1]
            for value in values:
                mask = column == value
                n_left = np.count_nonzero(mask)
                n_right = n_samples - n_left
                gini = (
                    n_left / n_samples * self.calc_sub_Gini(label[mask])
                    + n_right / n_samples * self.calc_sub_Gini(label[~mask])
                )
                feature_gini[(feature, value)] = gini
        if not feature_gini:
            return None
        # The split with the smallest Gini index wins; on ties the first
        # inserted candidate is kept.
        return min(feature_gini, key=feature_gini.get)

    def split_dat(self, dataset, label, best_feature):
        """Partition samples and labels on ``best_feature``.

        Args:
            dataset: 2-D array of samples.
            label: 1-D array of labels aligned with ``dataset`` rows.
            best_feature: ``(feature_index, split_value)`` tuple.

        Returns:
            ``(left_label, right_label, left_dat, right_dat)`` where "left"
            holds the rows whose feature equals the split value and "right"
            holds the remaining rows.
        """
        feature, feature_value = best_feature
        is_left = dataset[:, feature] == feature_value
        is_right = ~is_left
        return label[is_left], label[is_right], dataset[is_left, :], dataset[is_right, :]

    def create_Tree(self, dataset, label):
        """Recursively build the tree for ``dataset`` / ``label``.

        Returns:
            A class label when the node is a leaf, otherwise a dict
            ``{(feature, value): {"left": subtree, "right": subtree}}``.

        Raises:
            ValueError: if ``label`` is empty.
        """
        if len(label) == 0:
            raise ValueError("cannot build a decision tree from an empty dataset")

        # All samples share one class: this node is a leaf.
        if len(np.unique(label)) == 1:
            return label[0]

        best_feature = self.calc_best_feature(dataset, label)

        # No feature can split the data any further: majority vote.
        if best_feature is None:
            from collections import Counter

            label_number = Counter(label)
            return max(label_number, key=label_number.get)

        left_label, right_label, left_dat, right_dat = self.split_dat(
            dataset, label, best_feature
        )

        # The tree is a nested dict; recurse into both children.
        return {
            best_feature: {
                "left": self.create_Tree(left_dat, left_label),
                "right": self.create_Tree(right_dat, right_label),
            }
        }

    def predict(self, test, tree):
        """Classify one sample by walking a tree built by ``create_Tree``.

        Args:
            test: a single sample, indexable by feature index.
            tree: the nested dict returned by ``create_Tree``, or a bare
                label (``create_Tree`` returns one when all training labels
                are identical).

        Returns:
            The label stored at the leaf that ``test`` reaches.
        """
        node = tree
        while isinstance(node, dict):
            if not node:
                # An empty dict holds no split; nothing can be predicted.
                return None
            (feature, value), children = next(iter(node.items()))
            node = children["left"] if test[feature] == value else children["right"]
        return node

### 训练集

In [None]:
先用书中表5.1的训练数据集测试一下：

In [None]:
import pandas as pd
import numpy as np

# Convert the training table to a NumPy array so it can be sliced by column.
# NOTE(review): `dataset` is not defined in the visible cells — presumably
# the Table 5.1 data loaded earlier; confirm before running this cell.
dataset = np.array(dataset)
# Every column except the last is used as a feature; the last is the label.
X = dataset[:,:-1]
y = dataset[:,-1]
# Build the tree from the whole table; `res` holds the nested-dict tree.
dt = decision_tree()
res = dt.create_Tree(X,y)

In [None]:
再用MNIST（手写数字识别）数据集测试上述决策树代码，实现如下：

In [None]:
# Load the MNIST training CSV; column 0 is used as the label and the
# remaining columns as features.
dataset = pd.read_csv("train.csv")
dataset = np.array(dataset)
# Binarize the features in place: every non-zero value becomes 1, so each
# feature takes at most two distinct values.
dataset[:,1:][dataset[:,1:] != 0] = 1
label = dataset[:,0]

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible.
# NOTE(review): `train_test_split` is not imported in the visible cells —
# presumably sklearn.model_selection.train_test_split; confirm.
train_dat, test_dat, train_label, test_label = train_test_split(
    dataset[:,1:], label, test_size=0.2, random_state=123456
)

In [None]:
构建决策树，并计算测试误差

In [None]:
# Fit the tree on the training split.
dtree = decision_tree()
dt = dtree.create_Tree(train_dat, train_label)
# Count the held-out samples whose predicted label differs from the true one.
error = sum(
    1
    for sample, expected in zip(test_dat, test_label)
    if dtree.predict(sample, dt) != expected
)
# Test error rate as a percentage.
print(error / len(test_label) * 100)

In [None]:
测试误差为：13.5%