# 决策树
常用的机器学习方法之一，对归一化不敏感，而且几乎没有超参数。
## 离散型决策树
普通的决策树，使用算法ID3或C4.5。这种决策树多用于离散型决策树，对于一个特征有s个离散值，则在该结点上产生s个分支。两种算法的不同主要时选择最优特征时，采用信息增益还是信息增益比。

而CART算法则不同，它生成的是一个二叉决策树，不仅能用于离散数据，还能用于连续数据。在选择最优特征时，还要选择最优的划分点，同时其评判准则为基尼指数（离散数据）或平方误差（连续数据）。

### 离散型数据

|不能上陆|有蹼|是否是鱼类|
|--|--|--|
|1|1|是|
|1|1|是|
|1|0|否|
|0|1|否|
|0|1|否|

In [1]:
from math import log

In [2]:
def create_dataset():
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    names = ['no surfacing','flippers']  # 不能上陆，有无蹼
    return dataSet, names

In [3]:
dataset, names = create_dataset()
dataset

[[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]

In [4]:
names

['no surfacing', 'flippers']

### 递归构建决策树
换回一个嵌套的字典，形如：
{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}

代码共分为部分：

1. 递归终止条件
    1. 只有一类
    2. 没有用于分类的特征
    3. \*信息增益低于划分阈值
2. 根绝最大增益选择特征
3. 根据该特征划分数据集
4. 左右子树进入递归

In [9]:
def create_tree(dataset, names, features, tree):
    """return a tree dictionary
    dataset: 数据集或数据集子集
    names: 特征名称
    features: 特征集合或特征子集 features = {0, 1} 表示第一列、第二列
    tree: 决策树字典或决策子树字典
    """
    classes = set(sample[-1] for sample in dataset)  # classes = {'yes', 'no'}
    if len(classes) <=1:  # Only one class
        tree = classes.pop()
        return tree
    if len(features) == 0:  # no feature
        tree = majority_count(dataset, classes)
        return tree

    # 最佳特征及其下的左子树和右子树
    (best_feature, best_value, another, 
     best_left, best_right) = choose_feature(dataset, features, classes)
    best_feature_name = names[best_feature]
    tree = {best_feature_name: {best_value: {}, another: {}}}
    features.remove(best_feature)
    print('current feature => ', best_feature_name)
    
    if best_left:
        sub_tree = create_tree(best_left, names, features, tree[best_feature_name][best_feature])
        tree[best_feature_name][best_value] = sub_tree
    if best_right:
        sub_tree = create_tree(best_right, names, features, tree[best_feature_name][another])
        tree[best_feature_name][1 - best_value] = sub_tree
    return tree

In [5]:
def majority_count(dataset, classes):
    """
    返回叶子结点中样本最多的样本
    """
    max_cnt = 0
    max_c = None
    labels = [sample[-1] for sample in dataset]
    for c in classes:
        cnt = labels.count(c)
        if cnt > max_cnt:
            max_cnt = cnt
            max_c = c
    return c

### 划分数据集
根据信息增益划分数据集：
    
    按照给定特征划分数据集（按上述特征划分数据集=> `split_dataset`）
    选择最好的数据集划分方式（选择信息增益最大的特征=> `choose_best_feature`）
    按照给定特征划分数据集（按上述特征划分数据集=> `split_dataset`）
    ……

In [6]:
def split_dataset(dataset, feature, value):
    """
    Left is true. Right is false
    """
    left, right = list(), list()
    for sample in dataset:
        if sample[feature] == value:
            left.append(sample)
        else:
            right.append(sample)
    return left, right

In [7]:
def calc_entropy(classes, *datasets):
    entropy = 0.0
    for d in datasets:
        if d:
            n_samples = len(d)
            for c in classes:
                proportion = [sample[-1] for sample in d].count(c) / float(n_samples)
                if proportion != 0:
                    entropy -= proportion * log(proportion, 2)
    return entropy

In [8]:
def choose_feature(dataset, features, classes):
    best_gain = 0.0
    best_feature = 0
    best_value = None
    another = None
    best_left, best_right = None, None
    for f in features:
        values = set(sample[f] for sample in dataset)
        for value in values:
            # split the dataset
            left, right = split_dataset(dataset, f, value)
            # calculate information gain
            org_entropy = calc_entropy(classes, dataset)
            new_entropy = calc_entropy(classes, left, right)
            gain = org_entropy - new_entropy
            if gain > best_gain:
                best_gain = gain
                best_feature = f
                best_value = value
                best_left, best_right = left, right
        for value in values:
            if value != best_value:
                another = value
                break
    return best_feature, best_value, another, best_left, best_right

### main

In [10]:
tree = dict()
dataset, names = create_dataset()
features = set(range(len(dataset[0]) - 1))  # features = {0, 1}
create_tree(dataset, names, features, tree)

current feature =>  no surfacing
current feature =>  flippers


{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}

## 连续型决策树
连续性决策树连续值的分类问题
### 加载数据

In [1]:
def is_float(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

In [2]:
def load_dataset():
    file_path = '../ensemble/sonar-all-data.txt'
    dataset = []
    with open(file_path, 'r') as f:
        for line in f:
            if line:
                sample = []
                for item in line.split(','):
                    item = item.strip()
                    if is_float(item):
                        sample.append(float(item))
                    else:  # 字符=>类标签
                        sample.append(item)
                dataset.append(sample)
    return dataset

In [3]:
dataset = load_dataset()
print(dataset[0])

[0.02, 0.0371, 0.0428, 0.0207, 0.0954, 0.0986, 0.1539, 0.1601, 0.3109, 0.2111, 0.1609, 0.1582, 0.2238, 0.0645, 0.066, 0.2273, 0.31, 0.2999, 0.5078, 0.4797, 0.5783, 0.5071, 0.4328, 0.555, 0.6711, 0.6415, 0.7104, 0.808, 0.6791, 0.3857, 0.1307, 0.2604, 0.5121, 0.7547, 0.8537, 0.8507, 0.6692, 0.6097, 0.4943, 0.2744, 0.051, 0.2834, 0.2825, 0.4256, 0.2641, 0.1386, 0.1051, 0.1343, 0.0383, 0.0324, 0.0232, 0.0027, 0.0065, 0.0159, 0.0072, 0.0167, 0.018, 0.0084, 0.009, 0.0032, 'R']


### 构建决策树
选择基尼系数最大的特征和特征值，划分数据集

In [4]:
def split_dataset(dataset, feature, value):
    left, right = [], []
    for sample in dataset:
        if sample[feature] < value:
            left.append(sample)
        else:
            right.append(sample)
    return left, right

In [5]:
def calc_gini(classes, *datasets):
    gini = 0.0
    for d in datasets:
        n_samples = len(d)
        if n_samples == 0:
            continue
        for c in classes:
            proportion = [sample[-1] for sample in d].count(c) / float(n_samples)
            gini += proportion * (1 - proportion)
    return gini

In [6]:
def choose_and_split(dataset, features, classes):
    """
    choose the feature with largest Gini.
    traverse all features=>traverse all values under current feature=>calculate Ginni
    """
    min_gini = np.inf
    best_f, best_v = 0, 0.0
    best_left, best_right = None, None
    for f in features:
        values = set(sample[f] for sample in dataset)
        for v in values:
            left, right = split_dataset(dataset, f, v)
            gini = calc_gini(classes, left, right)
            if gini < min_gini:
                min_gini = gini
                best_f, best_v = f, v
                best_left, best_right = left, right
    return {'feature': best_f, 'value': best_v, 'left': best_left, 'right': best_right}

**可以选择特征数量**

In [7]:
from random import randrange
import numpy as np

In [8]:
def sub_features(dataset, n_features):
    features = set()
    while len(features) < n_features:
        f = randrange(len(dataset[0]) - 1)
        features.add(f)
    return features

### 迭代生成决策树
生成决策树的迭代停止的条件：

- 节点中只有一类
- 特征用尽
- 达到最大深度限制
- 达到最小叶子结点限制

In [9]:
def create_tree(dataset, max_depth, min_size, n_features):
    features = sub_features(dataset, n_features)
    classes = set(sample[-1] for sample in dataset)
    root = choose_and_split(dataset, features, classes)
    features.remove(root['feature'])
    create_node(root, features, max_depth, min_size, 1)
    return root

In [10]:
def majority_class(dataset):
    labels = [sample[-1] for sample in dataset]
    lb = max(set(labels), key=labels.count)
    return lb

In [11]:
def create_node(node, features, max_depth, min_size, depth):
    print('create node>>>')
    left, right = node['left'], node['right']
    if left:
        classes = set(sample[-1] for sample in left)
        # Only one class or no feauture remained
        if (len(classes) == 1 or len(features) == 0 or 
            not features or len(left) < min_size or depth >= max_depth):
            # return classification result
            node['left'] = majority_class(left)
        else:
            node['left'] = choose_and_split(left, features, classes)
            features.remove(node['left']['feature'])
            create_node(node['left'], features, 
                        max_depth, min_size, depth+1)
    if right:
        classes = set(sample[-1] for sample in right)
        if (len(classes) == 1 or len(features) == 0 or 
           not features or len(left) < min_size or depth >= max_depth):
            node['right'] = majority_class(right)
        else:
            node['right'] = choose_and_split(right, features, classes)
            features.remove(node['right']['feature'])
            create_node(node['right'], features, 
                        max_depth, min_size, depth+1)

In [12]:
create_tree(dataset, 15, 1, 20)

create node>>>
create node>>>
create node>>>
create node>>>
create node>>>
create node>>>
create node>>>
create node>>>
create node>>>
create node>>>
create node>>>
create node>>>
create node>>>


{'feature': 48,
 'left': 'R',
 'right': {'feature': 10,
  'left': 'R',
  'right': {'feature': 12,
   'left': 'R',
   'right': {'feature': 31,
    'left': 'R',
    'right': {'feature': 35,
     'left': {'feature': 38,
      'left': {'feature': 39,
       'left': 'R',
       'right': {'feature': 11,
        'left': {'feature': 15,
         'left': {'feature': 56,
          'left': 'R',
          'right': {'feature': 49,
           'left': 'R',
           'right': {'feature': 51,
            'left': 'R',
            'right': {'feature': 32,
             'left': [],
             'right': 'M',
             'value': 0.0477},
            'value': 0.0013},
           'value': 0.0044},
          'value': 0.0009},
         'right': 'R',
         'value': 0.9751},
        'right': 'R',
        'value': 0.6552},
       'value': 0.0227},
      'right': 'R',
      'value': 0.8849},
     'right': 'R',
     'value': 0.9922},
    'value': 0.0877},
   'value': 0.0616},
  'value': 0.0523},
 'value': 0.00

### 测试决策树

In [13]:
test_sample = dataset[5]
print(test_sample)

[0.0286, 0.0453, 0.0277, 0.0174, 0.0384, 0.099, 0.1201, 0.1833, 0.2105, 0.3039, 0.2988, 0.425, 0.6343, 0.8198, 1.0, 0.9988, 0.9508, 0.9025, 0.7234, 0.5122, 0.2074, 0.3985, 0.589, 0.2872, 0.2043, 0.5782, 0.5389, 0.375, 0.3411, 0.5067, 0.558, 0.4778, 0.3299, 0.2198, 0.1407, 0.2856, 0.3807, 0.4158, 0.4054, 0.3296, 0.2707, 0.265, 0.0723, 0.1238, 0.1192, 0.1089, 0.0623, 0.0494, 0.0264, 0.0081, 0.0104, 0.0045, 0.0014, 0.0038, 0.0013, 0.0089, 0.0057, 0.0027, 0.0051, 0.0062, 'R']


In [14]:
def predict(sample, tree):
    if sample[tree['feature']] < tree['value']:
        if len(tree['left']) == 1:
            return tree['left']
        else:
            return predict(sample, tree['left'])
    else:
        if len(tree['right']) == 1:
            return tree['right']
        else:
            return predict(sample, tree['right'])

In [15]:
tree = create_tree(dataset, 15, 1, 20)

create node>>>
create node>>>
create node>>>
create node>>>
create node>>>
create node>>>
create node>>>
create node>>>
create node>>>
create node>>>
create node>>>
create node>>>


In [16]:
predict(test_sample, tree)

'R'