In [1]:
# 使用ID3算法做分类
# 使用信息增益准则分裂子节点。信息熵：sum(-plogp)

In [2]:
# 使用Mnist数据。训练数据60000个样本，测试数据10000个样本

In [3]:
import pandas as pd
import numpy as np
from math import log
from collections import Counter

In [4]:
def loadData(fileName):
    """
    Read a header-less CSV file and return binarized features with labels.

    param fileName: path to the CSV file; column 0 holds the class label and
                    the remaining columns hold the feature values
    return: (data_X, label) where data_X is a 2-D array of 0/1 values and
            label is the 1-D array taken from column 0
    """
    raw = pd.read_csv(fileName, header=None).values
    # The first COLUMN (not row) of every record is the class label.
    label = raw[:, 0]
    pixels = raw[:, 1:]

    # Feature values range over 0-255, which would give far too many branches
    # per split, so threshold them to two values: < 128 -> 0, >= 128 -> 1.
    # A fresh array is built instead of assigning into `pixels`: the array
    # handed out by `.values` can be a read-only view of the DataFrame (pandas
    # Copy-on-Write), in which case in-place assignment raises ValueError.
    data_X = (pixels >= 128).astype(pixels.dtype)

    return data_X, label

In [5]:
# Load the training and test sets; loadData binarizes the features and splits
# off the label column. Paths are relative to the notebook's working directory.
X_train, y_train = loadData('../jupyter_files/Mnist_data/train.csv')
X_test, y_test = loadData('../jupyter_files/Mnist_data/test.csv')

In [6]:
# Sanity check: print the shape of each feature matrix / label vector, one per line.
for loaded in (X_train, y_train, X_test, y_test):
    print(loaded.shape)

(60000, 784)
(60000,)
(10000, 784)
(10000,)


In [7]:
def entropy(label):
    """
    Compute the Shannon entropy (base 2) of a label sequence.

    param label: 1-D indexable sequence of hashable labels
    return: entropy value, i.e. -sum(p * log2(p)) over the distinct labels
    """
    total = len(label)
    # Tally occurrences of each distinct label, visiting positions in order.
    tally = Counter(label[idx] for idx in range(total))
    weighted_logs = [(cnt / total) * log(cnt / total, 2) for cnt in tally.values()]
    return -sum(weighted_logs)

In [8]:
# Entropy (base-2 logarithm) of the training labels before any split.
entropy(y_train)

3.319870926755188

In [9]:
def con_entropy(data, label, col):
    """
    Compute the conditional entropy of the labels given one feature column.

    param data: 2-D numpy array of feature data (rows = samples)
    param label: 1-D numpy array of labels aligned with the rows of data
    param col: index of the feature column used to partition the data
    return: sum over the distinct values v of column `col` of
            (share of rows with value v) * entropy(labels of those rows);
            0 when data has no rows
    """
    total = len(data)
    if total == 0:
        # Nothing to partition; an empty sum is 0.
        return 0

    column = data[:, col]
    # Distinct feature values in order of first appearance. Only the values are
    # needed, so the rows themselves are not collected.
    distinct_values = list(dict.fromkeys(column.tolist()))

    con_ent = 0
    for value in distinct_values:
        # Build the row mask once and reuse it for both the weight and the entropy.
        subset = label[column == value]
        con_ent += (len(subset) / total) * entropy(subset)
    return con_ent

In [10]:
# Conditional entropy of the training labels given a few sample feature columns.
for sample_col in (0, 100, 200):
    print(con_entropy(X_train, y_train, sample_col))

3.319870926755188
3.21934834487652
3.3091512861967516


In [11]:
def find_best_feature(data, label):
    """
    Find the feature column with the largest information gain.

    param data: 2-D numpy array of feature data
    param label: 1-D label data aligned with the rows of data
    return: (index of the column with the largest information gain, that gain);
            on ties the lowest column index wins
    """
    base_entropy = entropy(label)
    # Information gain of a column = entropy before the split - conditional entropy.
    gains = [
        base_entropy - con_entropy(data, label, column)
        for column in range(data.shape[1])
    ]
    top_gain = max(gains)
    return gains.index(top_gain), top_gain

In [12]:
# Best first split on the full training set: (column index, information gain).
find_best_feature(X_train, y_train)

(378, 0.3090682397722091)

In [13]:
def cut_data(data, label, Ag, ai):
    """
    Select the samples whose feature Ag equals ai and drop that feature column.

    param data: 2-D feature data to split (rows = samples)
    param label: 1-D label data aligned with the rows of data
    param Ag: index of the feature column chosen for the split
    param ai: one of the values taken by feature Ag
    return: (features, labels) of the rows where column Ag == ai, with column
            Ag removed from the features. When no row matches, the features
            keep their 2-D shape (0, n_features - 1).
    """
    data = np.asarray(data)
    label = np.asarray(label)

    if data.shape[0] == 0:
        # No rows at all: nothing to select and no column to index.
        return data, label[:0]

    # One vectorized mask replaces the per-row Python loop and list copies.
    keep = data[:, Ag] == ai
    part_data = np.delete(data[keep], Ag, axis=1)
    part_label = label[keep]
    return part_data, part_label

In [14]:
def find_class(label):
    """
    Pick the class assigned to a tree node: the most frequent label.

    param label: 1-D label data
    return: the label that occurs most often
    """
    ranked = Counter(label).most_common(1)
    majority, _ = ranked[0]
    return majority

In [15]:
def dt_train(data, label, epsilon=0.1):
    """
    Recursively train an ID3 decision tree.

    param data: 2-D numpy array of training features (rows = samples)
    param label: 1-D label data aligned with the rows of data
    param epsilon: information-gain threshold; a node whose best gain is below
                   it becomes a leaf. The same threshold is used at every depth.
    return: either a leaf (a class label) or a nested dict of the form
            {feature_col: {feature_value: subtree_or_leaf, ...}}.
            The chosen column is removed before recursing, so column indices in
            a subtree refer to the reduced feature matrix.
    """
    print("Create node, %d label data to split..." % len(label))

    clusters = set(label)          # distinct classes still present

    # All samples belong to one class: single-node tree labelled with that class.
    if len(clusters) == 1:
        return label[0]

    # No samples, or no feature columns left to split on: single-node tree
    # labelled with the majority class. The column count must be checked too,
    # otherwise find_best_feature would be asked for the max of an empty list.
    if len(data) == 0 or np.ndim(data) < 2 or np.shape(data)[1] == 0:
        return find_class(label)

    # Feature with the largest information gain, and that gain.
    Ag, max_info_gain = find_best_feature(data, label)

    # Best gain below the threshold: single-node tree labelled with the majority class.
    if max_info_gain < epsilon:
        return find_class(label)

    # The tree is described with nested dicts, e.g. tree = {378: {0: {...}, 1: {...}}}
    # means: split on column 378; feature values 0 and 1 each lead to a subtree,
    # which may itself be split further.
    tree = {Ag: {}}

    # Partition the data by each value of the chosen feature and recurse,
    # passing epsilon down so the caller's threshold applies to every node.
    for ai in set(data[:, Ag]):
        part_data, part_label = cut_data(data, label, Ag, ai)
        tree[Ag][ai] = dt_train(part_data, part_label, epsilon)

    return tree

In [16]:
# Train on the full training set with the default epsilon (0.1).
# dt_train prints one "Create node" line per node it visits.
tree = dt_train(X_train, y_train)

Create node, 60000 label data to split...
Create node, 33390 label data to split...
Create node, 23703 label data to split...
Create node, 18501 label data to split...
Create node, 11141 label data to split...
Create node, 8582 label data to split...
Create node, 7560 label data to split...
Create node, 6752 label data to split...
Create node, 6171 label data to split...
Create node, 5792 label data to split...
Create node, 379 label data to split...
Create node, 215 label data to split...
Create node, 169 label data to split...
Create node, 140 label data to split...
Create node, 131 label data to split...
Create node, 126 label data to split...
Create node, 123 label data to split...
Create node, 121 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 1 label data to split...
Create node, 3 label data to split...
Create node, 1 label data to split...
Create node, 2 label data to split...
Create node, 5 label data to split...

Create node, 6 label data to split...
Create node, 3 label data to split...
Create node, 1 label data to split...
Create node, 2 label data to split...
Create node, 2 label data to split...
Create node, 6 label data to split...
Create node, 61 label data to split...
Create node, 56 label data to split...
Create node, 53 label data to split...
Create node, 51 label data to split...
Create node, 2 label data to split...
Create node, 3 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 5 label data to split...
Create node, 3 label data to split...
Create node, 2 label data to split...
Create node, 218 label data to split...
Create node, 197 label data to split...
Create node, 189 label data to split...
Create node, 178 label data to split...
Create node, 171 label data to split...
Create node, 66 label data to split...
Create node, 47 label data to split...
Create node, 28 label data to split...
Create node, 9 label data to spli

Create node, 3 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 36 label data to split...
Create node, 35 label data to split...
Create node, 34 label data to split...
Create node, 1 label data to split...
Create node, 1 label data to split...
Create node, 174 label data to split...
Create node, 46 label data to split...
Create node, 13 label data to split...
Create node, 6 label data to split...
Create node, 4 label data to split...
Create node, 2 label data to split...
Create node, 7 label data to split...
Create node, 33 label data to split...
Create node, 30 label data to split...
Create node, 28 label data to split...
Create node, 27 label data to split...
Create node, 25 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 1 label data to split...
Create node, 1 label data to split...
Create node, 2 label data to split...
Create node, 3 label data to split...


Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 1 label data to split...
Create node, 2 label data to split...
Create node, 12 label data to split...
Create node, 11 label data to split...
Create node, 1 label data to split...
Create node, 242 label data to split...
Create node, 132 label data to split...
Create node, 45 label data to split...
Create node, 32 label data to split...
Create node, 5 label data to split...
Create node, 3 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 2 label data to split...
Create node, 27 label data to split...
Create node, 23 label data to split...
Create node, 4 label data to split...
Create node, 2 label data to split...
Create node, 2 label data to split...
Create node, 13 label data to split...
Create node, 10 label data to split...
Create node, 5 label data to split...
Create node, 4 label data to split...
Create node, 1 label data to split...


Create node, 682 label data to split...
Create node, 588 label data to split...
Create node, 514 label data to split...
Create node, 496 label data to split...
Create node, 18 label data to split...
Create node, 12 label data to split...
Create node, 5 label data to split...
Create node, 4 label data to split...
Create node, 1 label data to split...
Create node, 7 label data to split...
Create node, 6 label data to split...
Create node, 4 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 1 label data to split...
Create node, 74 label data to split...
Create node, 40 label data to split...
Create node, 15 label data to split...
Create node, 11 label data to split...
Create node, 10 label data to split...
Create node, 1 label data to split...
Create node, 4 label data to split...
Create node, 2 label data to split...
Create node, 2 label data to split...
Create node, 25 label data to split...
Create node, 16 label data to spli

Create node, 15 label data to split...
Create node, 14 label data to split...
Create node, 1 label data to split...
Create node, 3 label data to split...
Create node, 1 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 1 label data to split...
Create node, 8 label data to split...
Create node, 7 label data to split...
Create node, 1 label data to split...
Create node, 22 label data to split...
Create node, 18 label data to split...
Create node, 14 label data to split...
Create node, 13 label data to split...
Create node, 1 label data to split...
Create node, 4 label data to split...
Create node, 3 label data to split...
Create node, 1 label data to split...
Create node, 4 label data to split...
Create node, 3 label data to split...
Create node, 1 label data to split...
Create node, 8 label data to split...
Create node, 4 label data to split...
Create node, 1 label data to split...
Create node, 3 label data to split...
Create

Create node, 3 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 1 label data to split...
Create node, 1 label data to split...
Create node, 17 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 1 label data to split...
Create node, 16 label data to split...
Create node, 6 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 1 label data to split...
Create node, 4 label data to split...
Create node, 10 label data to split...
Create node, 22 label data to split...
Create node, 7 label data to split...
Create node, 15 label data to split...
Create node, 6 label data to split...
Create node, 9 label data to split...
Create node, 6 label data to split...
Create node, 3 label data to split...
Create node, 1 label data to split...
Create node, 2 label data to split...
Create node, 74 label data to split...
Create

Create node, 1 label data to split...
Create node, 1 label data to split...
Create node, 3 label data to split...
Create node, 3 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 7 label data to split...
Create node, 3 label data to split...
Create node, 4 label data to split...
Create node, 36 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 1 label data to split...
Create node, 34 label data to split...
Create node, 32 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 1 label data to split...
Create node, 82 label data to split...
Create node, 55 label data to split...
Create node, 44 label data to split...
Create node, 8 label data to split...
Create node, 6 label data to split...
Create node, 2 label data to split...
Create node, 36 label data to split...
Create node, 3 label data to split...
Creat

Create node, 26 label data to split...
Create node, 2 label data to split...
Create node, 3 label data to split...
Create node, 1 label data to split...
Create node, 2 label data to split...
Create node, 34 label data to split...
Create node, 33 label data to split...
Create node, 1 label data to split...
Create node, 129 label data to split...
Create node, 125 label data to split...
Create node, 120 label data to split...
Create node, 115 label data to split...
Create node, 5 label data to split...
Create node, 2 label data to split...
Create node, 3 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 5 label data to split...
Create node, 3 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 2 label data to split...
Create node, 4 label data to split...
Create node, 205 label data to split...
Create node, 171 label data to split...
Create node, 20 label data to split

Create node, 98 label data to split...
Create node, 86 label data to split...
Create node, 12 label data to split...
Create node, 3 label data to split...
Create node, 9 label data to split...
Create node, 3 label data to split...
Create node, 6 label data to split...
Create node, 14 label data to split...
Create node, 7 label data to split...
Create node, 6 label data to split...
Create node, 1 label data to split...
Create node, 7 label data to split...
Create node, 3 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 4 label data to split...
Create node, 3 label data to split...
Create node, 1 label data to split...
Create node, 18 label data to split...
Create node, 13 label data to split...
Create node, 5 label data to split...
Create node, 2 label data to split...
Create node, 3 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 74 label data to split...
Creat

Create node, 8 label data to split...
Create node, 4 label data to split...
Create node, 3 label data to split...
Create node, 1 label data to split...
Create node, 4 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 1 label data to split...
Create node, 2 label data to split...
Create node, 12 label data to split...
Create node, 10 label data to split...
Create node, 9 label data to split...
Create node, 1 label data to split...
Create node, 2 label data to split...
Create node, 48 label data to split...
Create node, 24 label data to split...
Create node, 12 label data to split...
Create node, 9 label data to split...
Create node, 3 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 1 label data to split...
Create node, 1 label data to split...
Create node, 12 label data to split...
Create node, 7 label data to split...
Create node, 3 label data to split...
Create

Create node, 166 label data to split...
Create node, 56 label data to split...
Create node, 26 label data to split...
Create node, 15 label data to split...
Create node, 9 label data to split...
Create node, 8 label data to split...
Create node, 1 label data to split...
Create node, 6 label data to split...
Create node, 4 label data to split...
Create node, 2 label data to split...
Create node, 11 label data to split...
Create node, 5 label data to split...
Create node, 6 label data to split...
Create node, 2 label data to split...
Create node, 4 label data to split...
Create node, 30 label data to split...
Create node, 23 label data to split...
Create node, 17 label data to split...
Create node, 16 label data to split...
Create node, 1 label data to split...
Create node, 6 label data to split...
Create node, 3 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 3 label data to split...
Create node, 2 label data to split...
Cr

Create node, 143 label data to split...
Create node, 5 label data to split...
Create node, 4 label data to split...
Create node, 1 label data to split...
Create node, 9 label data to split...
Create node, 3 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 6 label data to split...
Create node, 26 label data to split...
Create node, 16 label data to split...
Create node, 4 label data to split...
Create node, 2 label data to split...
Create node, 2 label data to split...
Create node, 12 label data to split...
Create node, 4 label data to split...
Create node, 3 label data to split...
Create node, 1 label data to split...
Create node, 8 label data to split...
Create node, 10 label data to split...
Create node, 8 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 1 label data to split...
Create node, 258 label data to split...
Create node, 209 label data to split...
Cr

Create node, 51 label data to split...
Create node, 44 label data to split...
Create node, 42 label data to split...
Create node, 39 label data to split...
Create node, 3 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 2 label data to split...
Create node, 7 label data to split...
Create node, 1 label data to split...
Create node, 6 label data to split...
Create node, 10 label data to split...
Create node, 5 label data to split...
Create node, 2 label data to split...
Create node, 3 label data to split...
Create node, 5 label data to split...
Create node, 3 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 2 label data to split...
Create node, 52 label data to split...
Create node, 49 label data to split...
Create node, 44 label data to split...
Create node, 42 label data to split...
Create node, 2 label data to split...
Create node, 5 label data to split...
Cre

Create node, 3139 label data to split...
Create node, 2946 label data to split...
Create node, 193 label data to split...
Create node, 74 label data to split...
Create node, 59 label data to split...
Create node, 48 label data to split...
Create node, 45 label data to split...
Create node, 43 label data to split...
Create node, 42 label data to split...
Create node, 1 label data to split...
Create node, 2 label data to split...
Create node, 3 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 11 label data to split...
Create node, 6 label data to split...
Create node, 3 label data to split...
Create node, 3 label data to split...
Create node, 5 label data to split...
Create node, 4 label data to split...
Create node, 1 label data to split...
Create node, 15 label data to split...
Create node, 11 label data to split...
Create node, 4 label data to split...
Create node, 3 label data to split...
Create node, 1 label data to spli

Create node, 15 label data to split...
Create node, 1 label data to split...
Create node, 4 label data to split...
Create node, 3 label data to split...
Create node, 1 label data to split...
Create node, 366 label data to split...
Create node, 116 label data to split...
Create node, 81 label data to split...
Create node, 17 label data to split...
Create node, 13 label data to split...
Create node, 7 label data to split...
Create node, 6 label data to split...
Create node, 1 label data to split...
Create node, 6 label data to split...
Create node, 4 label data to split...
Create node, 64 label data to split...
Create node, 14 label data to split...
Create node, 6 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 1 label data to split...
Create node, 4 label data to split...
Create node, 8 label data to split...
Create node, 1 label data to split...
Create node, 7 label data to split...
Create node, 50 label data to split...
C

Create node, 42 label data to split...
Create node, 36 label data to split...
Create node, 32 label data to split...
Create node, 28 label data to split...
Create node, 4 label data to split...
Create node, 2 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 1 label data to split...
Create node, 4 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 1 label data to split...
Create node, 2 label data to split...
Create node, 6 label data to split...
Create node, 7 label data to split...
Create node, 5 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 1 label data to split...
Create node, 12 label data to split...
Create node, 1 label data to split...
Create node, 11 label data to split...
Create node, 104 label data to split...
Create node, 96 label data to split...
Create node, 8 label data to split...
Cre

Create node, 17 label data to split...
Create node, 3 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 1 label data to split...
Create node, 1 label data to split...
Create node, 5 label data to split...
Create node, 2 label data to split...
Create node, 3 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 1 label data to split...
Create node, 1 label data to split...
Create node, 91 label data to split...
Create node, 48 label data to split...
Create node, 19 label data to split...
Create node, 14 label data to split...
Create node, 9 label data to split...
Create node, 6 label data to split...
Create node, 1 label data to split...
Create node, 5 label data to split...
Create node, 3 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 5 label data to split...
Create node, 5 label data to split...
Create 

Create node, 7 label data to split...
Create node, 3 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 4 label data to split...
Create node, 7 label data to split...
Create node, 5 label data to split...
Create node, 2 label data to split...
Create node, 53 label data to split...
Create node, 9 label data to split...
Create node, 5 label data to split...
Create node, 4 label data to split...
Create node, 1 label data to split...
Create node, 4 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 1 label data to split...
Create node, 2 label data to split...
Create node, 44 label data to split...
Create node, 25 label data to split...
Create node, 19 label data to split...
Create node, 11 label data to split...
Create node, 8 label data to split...
Create node, 3 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create 

Create node, 13 label data to split...
Create node, 12 label data to split...
Create node, 1 label data to split...
Create node, 25 label data to split...
Create node, 11 label data to split...
Create node, 14 label data to split...
Create node, 4 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 1 label data to split...
Create node, 2 label data to split...
Create node, 10 label data to split...
Create node, 20 label data to split...
Create node, 15 label data to split...
Create node, 5 label data to split...
Create node, 3 label data to split...
Create node, 2 label data to split...
Create node, 81 label data to split...
Create node, 42 label data to split...
Create node, 19 label data to split...
Create node, 13 label data to split...
Create node, 4 label data to split...
Create node, 3 label data to split...
Create node, 1 label data to split...
Create node, 9 label data to split...
Create node, 6 label data to split...


Create node, 1111 label data to split...
Create node, 1018 label data to split...
Create node, 93 label data to split...
Create node, 68 label data to split...
Create node, 50 label data to split...
Create node, 46 label data to split...
Create node, 42 label data to split...
Create node, 4 label data to split...
Create node, 2 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 1 label data to split...
Create node, 4 label data to split...
Create node, 18 label data to split...
Create node, 6 label data to split...
Create node, 12 label data to split...
Create node, 4 label data to split...
Create node, 8 label data to split...
Create node, 7 label data to split...
Create node, 1 label data to split...
Create node, 25 label data to split...
Create node, 7 label data to split...
Create node, 2 label data to split...
Create node, 5 label data to split...
Create node, 18 label data to split...
Create node, 4 label data to split.

Create node, 80 label data to split...
Create node, 11 label data to split...
Create node, 6 label data to split...
Create node, 2 label data to split...
Create node, 4 label data to split...
Create node, 5 label data to split...
Create node, 69 label data to split...
Create node, 65 label data to split...
Create node, 12 label data to split...
Create node, 7 label data to split...
Create node, 5 label data to split...
Create node, 4 label data to split...
Create node, 1 label data to split...
Create node, 53 label data to split...
Create node, 4 label data to split...
Create node, 272 label data to split...
Create node, 85 label data to split...
Create node, 58 label data to split...
Create node, 20 label data to split...
Create node, 14 label data to split...
Create node, 11 label data to split...
Create node, 8 label data to split...
Create node, 4 label data to split...
Create node, 3 label data to split...
Create node, 1 label data to split...
Create node, 4 label data to split...

Create node, 1 label data to split...
Create node, 1 label data to split...
Create node, 2 label data to split...
Create node, 4 label data to split...
Create node, 1 label data to split...
Create node, 3 label data to split...
Create node, 7 label data to split...
Create node, 3 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 4 label data to split...
Create node, 3 label data to split...
Create node, 1 label data to split...
Create node, 22 label data to split...
Create node, 9 label data to split...
Create node, 7 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 1 label data to split...
Create node, 13 label data to split...
Create node, 11 label data to split...
Create node, 2 label data to split...
Create node, 37 label data to split...
Create node, 19 label data to split...
Create node, 10 label data to split...
Create node, 7 label data to split...
Create

Create node, 1 label data to split...
Create node, 10 label data to split...
Create node, 3 label data to split...
Create node, 10 label data to split...
Create node, 16 label data to split...
Create node, 16 label data to split...
Create node, 8 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 1 label data to split...
Create node, 6 label data to split...
Create node, 8 label data to split...
Create node, 46 label data to split...
Create node, 13 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 1 label data to split...
Create node, 11 label data to split...
Create node, 33 label data to split...
Create node, 22 label data to split...
Create node, 4 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 1 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Cre

Create node, 40 label data to split...
Create node, 37 label data to split...
Create node, 29 label data to split...
Create node, 11 label data to split...
Create node, 5 label data to split...
Create node, 6 label data to split...
Create node, 18 label data to split...
Create node, 8 label data to split...
Create node, 7 label data to split...
Create node, 1 label data to split...
Create node, 6 label data to split...
Create node, 1 label data to split...
Create node, 3 label data to split...
Create node, 1 label data to split...
Create node, 2 label data to split...
Create node, 12 label data to split...
Create node, 6 label data to split...
Create node, 4 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 1 label data to split...
Create node, 6 label data to split...
Create node, 3 label data to split...
Create node, 3 label data to split...
Create node, 128 label data to split...
Create node, 114 label data to split...
Cr

Create node, 268 label data to split...
Create node, 197 label data to split...
Create node, 153 label data to split...
Create node, 101 label data to split...
Create node, 28 label data to split...
Create node, 25 label data to split...
Create node, 12 label data to split...
Create node, 7 label data to split...
Create node, 5 label data to split...
Create node, 2 label data to split...
Create node, 5 label data to split...
Create node, 1 label data to split...
Create node, 4 label data to split...
Create node, 13 label data to split...
Create node, 4 label data to split...
Create node, 9 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 1 label data to split...
Create node, 7 label data to split...
Create node, 3 label data to split...
Create node, 73 label data to split...
Create node, 38 label data to split...
Create node, 3 label data to split...
Create node, 1 label data to split...
Create node, 2 label data to split..

Create node, 39 label data to split...
Create node, 1 label data to split...
Create node, 1 label data to split...
Create node, 104 label data to split...
Create node, 88 label data to split...
Create node, 81 label data to split...
Create node, 71 label data to split...
Create node, 10 label data to split...
Create node, 6 label data to split...
Create node, 4 label data to split...
Create node, 1 label data to split...
Create node, 3 label data to split...
Create node, 7 label data to split...
Create node, 16 label data to split...
Create node, 13 label data to split...
Create node, 3 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 1 label data to split...
Create node, 1 label data to split...
Create node, 86 label data to split...
Create node, 42 label data to split...
Create node, 15 label data to split...
Create node, 9 label data to split...
Create node, 5 label data to split...
Create node, 3 label data to split...


Create node, 114 label data to split...
Create node, 9 label data to split...
Create node, 5 label data to split...
Create node, 4 label data to split...
Create node, 1 label data to split...
Create node, 4 label data to split...
Create node, 3 label data to split...
Create node, 1 label data to split...
Create node, 303 label data to split...
Create node, 102 label data to split...
Create node, 89 label data to split...
Create node, 75 label data to split...
Create node, 72 label data to split...
Create node, 3 label data to split...
Create node, 1 label data to split...
Create node, 2 label data to split...
Create node, 14 label data to split...
Create node, 2 label data to split...
Create node, 12 label data to split...
Create node, 11 label data to split...
Create node, 1 label data to split...
Create node, 13 label data to split...
Create node, 8 label data to split...
Create node, 6 label data to split...
Create node, 2 label data to split...
Create node, 5 label data to split...

Create node, 1 label data to split...
Create node, 4 label data to split...
Create node, 33 label data to split...
Create node, 23 label data to split...
Create node, 3 label data to split...
Create node, 2 label data to split...
Create node, 1 label data to split...
Create node, 1 label data to split...
Create node, 1 label data to split...
Create node, 20 label data to split...
Create node, 19 label data to split...
Create node, 1 label data to split...
Create node, 10 label data to split...
Create node, 1 label data to split...
Create node, 9 label data to split...
Create node, 733 label data to split...
Create node, 597 label data to split...
Create node, 103 label data to split...
Create node, 50 label data to split...
Create node, 30 label data to split...
Create node, 23 label data to split...
Create node, 17 label data to split...
Create node, 9 label data to split...
Create node, 8 label data to split...
Create node, 5 label data to split...
Create node, 3 label data to split.

Create node, 721 label data to split...


In [17]:
def dt_predict(row, tree):
    """
    Predict the class of one sample with a trained decision tree.

    param row: feature values of the sample (list or 1-D array); not modified
    param tree: trained tree, either a nested dict
                {feature_col: {feature_value: subtree_or_leaf}} or a bare leaf
    return: the predicted class
    raises KeyError: if the sample has a feature value with no matching branch
    """
    # Work on a private copy: consumed features are deleted while descending,
    # and the caller's data must stay untouched.
    remaining = list(row)
    node = tree

    # Any non-dict node is a leaf, whatever the label type (int64, int, str, ...).
    while isinstance(node, dict):
        (key, branches), = node.items()
        if not isinstance(branches, dict):
            return branches
        feature = remaining[key]          # sample's value for the split feature
        del remaining[key]                # drop it so later column indices line up
        node = branches[feature]          # descend into the matching subtree

    return node

In [18]:
def accuracy(y, y_pred):
    """
    Fraction of positions where the prediction equals the true label.

    param y: true labels (indexable sequence)
    param y_pred: predicted labels, indexable at every position of y
    return: number of matching positions divided by len(y)
    """
    total = len(y)
    hits = sum(1 for idx in range(total) if y[idx] == y_pred[idx])
    return hits / total

In [19]:
# Predict every test sample; each row is passed as a fresh list.
pred_y = [dt_predict(list(sample), tree) for sample in X_test]

In [20]:
# Share of test samples whose predicted label equals the true label.
score = accuracy(y_test, pred_y)

In [21]:
# Display the test-set accuracy.
score

0.8636