# 分类回归树
## 递归算法构建树
伪代码：
```
找到最佳的待切分特征
    如果该节点不能再分，将该节点存为叶节点
    执行二元切分
    对右子树调用create_tree()方法
    对左子树调用create_tree()方法
```

In [1]:
import numpy as np

In [2]:
def load_dataset(file_name):
    """Parse a tab-delimited text file of floats into a list of rows.

    Every non-blank line becomes one list of floats, in file order. The
    tree-building code in this file treats the last column of each row as
    the target value.

    Args:
        file_name: Path of the tab-delimited file to read.

    Returns:
        A list of rows, each row being a list of floats.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be converted to float.
    """
    dataset = []
    # The context manager guarantees the handle is closed even if parsing fails.
    with open(file_name) as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no data; float('')
                # would otherwise raise ValueError.
                continue
            dataset.append([float(field) for field in line.split('\t')])
    return dataset

In [3]:
def split(dataset, feature, value):
    """Partition the rows of ``dataset`` on one column.

    Returns a ``(left, right)`` pair: ``left`` holds the rows whose
    ``feature`` column is strictly greater than ``value``; ``right`` holds
    the rows where it is less than or equal to ``value``.
    """
    column = dataset[:, feature]
    above_rows = np.nonzero(column > value)[0]
    other_rows = np.nonzero(column <= value)[0]
    return dataset[above_rows, :], dataset[other_rows, :]

In [7]:
def create_tree(dataset, leaf_type=get_value, error_type=get_error, ops=(1, 4)):
    """Recursively build a binary tree over ``dataset``.

    ``choose_best_split`` picks the split; when it reports no feature
    (``None``), the value it returned is used directly as a leaf. Otherwise
    the rows are divided with ``split`` and each part is built recursively
    with the same ``leaf_type``, ``error_type`` and ``ops``.

    Returns either a leaf value or a dict with the keys ``'feature'``,
    ``'value'``, ``'left'`` and ``'right'``.
    """
    split_feature, split_value = choose_best_split(dataset, leaf_type, error_type, ops)
    if split_feature is None:
        # No acceptable split: split_value already holds the leaf.
        return split_value

    upper_rows, lower_rows = split(dataset, split_feature, split_value)
    return {
        'feature': split_feature,
        'value': split_value,
        'left': create_tree(upper_rows, leaf_type, error_type, ops),
        'right': create_tree(lower_rows, leaf_type, error_type, ops),
    }

## 将CART算法用于回归
### 寻找最佳切分点
```
对每个特征：
    对每个特征值：
        将数据集切分成两份
        计算切分的误差
        如果当前误差小于最小误差，那么将当前切分设定为最佳切分并更新最小误差
返回最佳切分的特征和特征值
```

In [5]:
def get_value(dataset):
    """Return the leaf value for ``dataset``: the mean of its last column."""
    targets = dataset[:, -1]
    return np.mean(targets)


def get_error(dataset):
    """Return the total squared deviation of the last column.

    Computed as the variance of the last column multiplied by the number
    of rows.
    """
    n_rows = dataset.shape[0]
    targets = dataset[:, -1]
    return n_rows * np.var(targets)

In [6]:
def choose_best_split(dataset, leaf_type=get_value, error_type=get_error, ops=(1, 4)):
    """Search every feature/threshold pair for the split with the lowest error.

    ``ops[0]`` is the minimum error reduction a split must achieve and
    ``ops[1]`` is the minimum number of rows allowed on each side.

    Returns ``(feature_index, threshold)`` for the best split, or
    ``(None, leaf_type(dataset))`` when the node should become a leaf.
    """
    min_error_drop = ops[0]
    min_leaf_size = ops[1]

    # Stop 1: every target value is identical, so there is nothing to split.
    targets = dataset[:, -1].T.tolist()[0]
    if len(set(targets)) == 1:
        return None, leaf_type(dataset)

    _, n_columns = dataset.shape
    base_error = error_type(dataset)
    lowest_error = np.inf
    chosen_feature, chosen_value = 0, 0

    # The last column is the target, so only the columns before it are features.
    for column in range(n_columns - 1):
        candidates = set(dataset[:, column].T.tolist()[0])
        for threshold in candidates:
            upper, lower = split(dataset, column, threshold)
            if min(upper.shape[0], lower.shape[0]) < min_leaf_size:
                continue
            split_error = error_type(upper) + error_type(lower)
            if split_error < lowest_error:
                chosen_feature, chosen_value, lowest_error = column, threshold, split_error

    # Stop 2: the best split does not reduce the error by enough.
    if (base_error - lowest_error) < min_error_drop:
        return None, leaf_type(dataset)

    # Stop 3: the chosen split would leave one side too small.
    upper, lower = split(dataset, chosen_feature, chosen_value)
    if min(upper.shape[0], lower.shape[0]) < min_leaf_size:
        return None, leaf_type(dataset)
    return chosen_feature, chosen_value

In [8]:
# Read the tab-delimited sample file into a list of float rows.
dataset = load_dataset('cart-dataset2.txt')

In [9]:
dataset

[[1.0, 0.409175, 1.88318],
 [1.0, 0.182603, 0.063908],
 [1.0, 0.663687, 3.042257],
 [1.0, 0.517395, 2.305004],
 [1.0, 0.013643, -0.067698],
 [1.0, 0.469643, 1.662809],
 [1.0, 0.725426, 3.275749],
 [1.0, 0.39435, 1.118077],
 [1.0, 0.50776, 2.095059],
 [1.0, 0.237395, 1.181912],
 [1.0, 0.057534, 0.221663],
 [1.0, 0.36982, 0.938453],
 [1.0, 0.976819, 4.149409],
 [1.0, 0.616051, 3.105444],
 [1.0, 0.4137, 1.896278],
 [1.0, 0.105279, -0.121345],
 [1.0, 0.670273, 3.161652],
 [1.0, 0.952758, 4.135358],
 [1.0, 0.272316, 0.859063],
 [1.0, 0.303697, 1.170272],
 [1.0, 0.486698, 1.68796],
 [1.0, 0.51181, 1.979745],
 [1.0, 0.195865, 0.06869],
 [1.0, 0.986769, 4.052137],
 [1.0, 0.785623, 3.156316],
 [1.0, 0.797583, 2.95063],
 [1.0, 0.081306, 0.068935],
 [1.0, 0.659753, 2.85402],
 [1.0, 0.37527, 0.999743],
 [1.0, 0.819136, 4.048082],
 [1.0, 0.142432, 0.230923],
 [1.0, 0.215112, 0.816693],
 [1.0, 0.04127, 0.130713],
 [1.0, 0.044136, -0.537706],
 [1.0, 0.131337, -0.339109],
 [1.0, 0.463444, 2.124538],
 

In [10]:
len(dataset)

200

In [11]:
# Wrap the row list in a NumPy matrix, which the tree functions index with
# [:, col]. np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
dataset = np.asmatrix(dataset)

In [12]:
# Build a tree with the default leaf/error functions and ops=(1, 4).
create_tree(dataset)

{'feature': 1,
 'left': {'feature': 1,
  'left': {'feature': 1,
   'left': 3.9871631999999999,
   'right': 2.9836209534883724,
   'value': 0.797583},
  'right': 1.980035071428571,
  'value': 0.582002},
 'right': {'feature': 1,
  'left': 1.0289583666666666,
  'right': -0.023838155555555553,
  'value': 0.197834},
 'value': 0.39435}

## 树剪枝
有两种剪枝方式：
- 预剪枝：通过超参数来控制决策树过拟合，对参数选择十分敏感
- 后剪枝：利用测试集来对树进行剪枝，不需要用户指定参数

### 后剪枝
伪代码：
```
基于已有的树切分测试数据：
    如果存在任一子集是一棵树，则在该子集上递归执行剪枝过程
    计算将当前两个叶节点合并后的误差
    计算不合并的误差
    如果合并会降低误差的话，就将叶节点合并
```

In [13]:
def is_tree(obj):
    """Return True when ``obj`` is an internal tree node rather than a leaf.

    Internal nodes are stored as dicts; anything else is treated as a leaf.
    ``isinstance`` is used instead of comparing the type's name so that
    dict subclasses (for example ``collections.OrderedDict``) are
    recognised as well.
    """
    return isinstance(obj, dict)


def get_mean(tree):
    """Collapse ``tree`` into one value by averaging its two branches.

    Any branch that is itself a tree is first replaced, in place, by its
    own collapsed value (right branch first, then left). The result is the
    unweighted mean of the left and right values.
    """
    for side in ('right', 'left'):
        if is_tree(tree[side]):
            tree[side] = get_mean(tree[side])
    return (tree['left'] + tree['right']) / 2.0

In [14]:
def prune(tree, test_data):
    """Post-prune ``tree`` using the rows of ``test_data``.

    Subtrees are pruned recursively on the matching part of the test data.
    Once both branches of a node are leaves, the node is replaced by the
    average of the two leaves if that gives a lower squared error on
    ``test_data`` than keeping them separate. ``tree`` is modified in place.

    Returns the (possibly modified) tree, or a single value when the node
    was collapsed.
    """
    # No test rows reach this node, so collapse the whole subtree.
    if test_data.shape[0] == 0:
        return get_mean(tree)

    # If either branch is still a subtree, prune it on its share of the rows.
    left_is_subtree = is_tree(tree['left'])
    right_is_subtree = is_tree(tree['right'])
    if right_is_subtree or left_is_subtree:
        upper, lower = split(test_data, tree['feature'], tree['value'])
        if left_is_subtree:
            tree['left'] = prune(tree['left'], upper)
        if right_is_subtree:
            tree['right'] = prune(tree['right'], lower)

    # Merging is only considered once both branches are leaves.
    if is_tree(tree['left']) or is_tree(tree['right']):
        return tree

    upper, lower = split(test_data, tree['feature'], tree['value'])
    left_squares = np.power(upper[:, -1] - tree['left'], 2)
    right_squares = np.power(lower[:, -1] - tree['right'], 2)
    error_no_merge = sum(left_squares) + sum(right_squares)
    tree_mean = (tree['left'] + tree['right']) / 2.0
    error_merge = sum(np.power(test_data[:, -1] - tree_mean, 2))
    if error_merge < error_no_merge:
        print("merging")
        return tree_mean
    return tree

In [15]:
# Grow a tree with ops=(0, 1): no minimum error reduction is required and
# each side of a split may hold a single row.
dataset = load_dataset(r'D:\PythonWork\data-modeling\regression\cart-dataset3.txt')
# np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
dataset = np.asmatrix(dataset)
tree = create_tree(dataset, ops=(0, 1))
print(tree)

{'feature': 0, 'value': 0.499171, 'left': {'feature': 0, 'value': 0.729397, 'left': {'feature': 0, 'value': 0.952833, 'left': {'feature': 0, 'value': 0.965969, 'left': {'feature': 0, 'value': 0.968621, 'left': 86.399636999999998, 'right': 98.648346000000004}, 'right': {'feature': 0, 'value': 0.956951, 'left': {'feature': 0, 'value': 0.958512, 'left': {'feature': 0, 'value': 0.960398, 'left': 112.386764, 'right': 123.559747}, 'right': 135.83701300000001}, 'right': {'feature': 0, 'value': 0.953902, 'left': {'feature': 0, 'value': 0.954711, 'left': 82.016541000000004, 'right': 100.935789}, 'right': 130.92648}}}, 'right': {'feature': 0, 'value': 0.759504, 'left': {'feature': 0, 'value': 0.763328, 'left': {'feature': 0, 'value': 0.769043, 'left': {'feature': 0, 'value': 0.790312, 'left': {'feature': 0, 'value': 0.806158, 'left': {'feature': 0, 'value': 0.815215, 'left': {'feature': 0, 'value': 0.833026, 'left': {'feature': 0, 'value': 0.841547, 'left': {'feature': 0, 'value': 0.841625, 'lef

In [16]:
# Load a second data file and use it as test data to post-prune the tree.
dataset = load_dataset(r'D:\PythonWork\data-modeling\regression\cart-dataset4.txt')
# np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
dataset = np.asmatrix(dataset)
tree = prune(tree, dataset)

merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging


In [17]:
print(tree)

{'feature': 0, 'value': 0.499171, 'left': {'feature': 0, 'value': 0.729397, 'left': {'feature': 0, 'value': 0.952833, 'left': {'feature': 0, 'value': 0.965969, 'left': 92.523991499999994, 'right': {'feature': 0, 'value': 0.956951, 'left': {'feature': 0, 'value': 0.958512, 'left': {'feature': 0, 'value': 0.960398, 'left': 112.386764, 'right': 123.559747}, 'right': 135.83701300000001}, 'right': 111.2013225}}, 'right': {'feature': 0, 'value': 0.759504, 'left': {'feature': 0, 'value': 0.763328, 'left': {'feature': 0, 'value': 0.769043, 'left': {'feature': 0, 'value': 0.790312, 'left': {'feature': 0, 'value': 0.806158, 'left': {'feature': 0, 'value': 0.815215, 'left': {'feature': 0, 'value': 0.833026, 'left': {'feature': 0, 'value': 0.841547, 'left': {'feature': 0, 'value': 0.841625, 'left': {'feature': 0, 'value': 0.944221, 'left': {'feature': 0, 'value': 0.948822, 'left': 96.41885225, 'right': 69.318648999999994}, 'right': {'feature': 0, 'value': 0.85497, 'left': {'feature': 0, 'value': 0

## 模型树
用树来对数据建模，除了把叶节点简单地设定为常数之外，还有一种方法是把叶节点设定为分段线性函数。

In [18]:
def linear_solve(dataset):
    """Fit an ordinary least-squares line to ``dataset``.

    The last column is the target; every other column is a feature. A
    column of ones is prepended to the features as the intercept term.

    Args:
        dataset: 2-D data (NumPy matrix, ndarray or nested list) whose last
            column is the target.

    Returns:
        A tuple ``(ws, xs, ys)``: the coefficient column vector (intercept
        first), the design matrix with the leading ones column, and the
        target column.

    Raises:
        NameError: If ``xs.T * xs`` has a determinant of exactly zero and
            therefore cannot be inverted.
    """
    # Coerce to a matrix so that ``*`` below is matrix multiplication and the
    # target stays a column. np.asmatrix does not copy an existing matrix and
    # replaces np.mat, which was removed in NumPy 2.0.
    dataset = np.asmatrix(dataset)
    n_samples, n_features = np.shape(dataset)
    # Column 0 stays all ones (intercept); the features fill the rest.
    xs = np.asmatrix(np.ones((n_samples, n_features)))
    xs[:, 1:n_features] = dataset[:, 0:n_features - 1]
    ys = dataset[:, -1]
    xTx = xs.T * xs
    if np.linalg.det(xTx) == 0.0:
        raise NameError('This matrix is singular, cannot do inverse,\n'
                        '        try increasing the second value of ops')
    ws = xTx.I * (xs.T * ys)
    return ws, xs, ys


def model_leaf(dataset):
    """Fit a linear model to ``dataset``, print its coefficients, and return them."""
    coefficients, _, _ = linear_solve(dataset)
    print(coefficients)
    return coefficients


def model_err(dataset):
    """Return the summed squared residuals of a linear fit to ``dataset``."""
    weights, design, targets = linear_solve(dataset)
    residuals = targets - design * weights
    return sum(np.power(residuals, 2))

In [19]:
# Build a model tree: leaves hold linear-model coefficients (model_leaf) and
# splits are scored by the squared residuals of a linear fit (model_err).
dataset = load_dataset(r'D:\PythonWork\data-modeling\regression\cart-dataset5.txt')
# np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
dataset = np.asmatrix(dataset)
create_tree(dataset, model_leaf, model_err, (1, 10))

[[  1.69855694e-03]
 [  1.19647739e+01]]
[[ 3.46877936]
 [ 1.18521743]]


{'feature': 0, 'left': matrix([[  1.69855694e-03],
         [  1.19647739e+01]]), 'right': matrix([[ 3.46877936],
         [ 1.18521743]]), 'value': 0.285477}