# 随机森林
随机森林分类效果（错误率）与两个因素有关：
- 森林中任意两棵树的相关性：相关性越大，错误率越大；
- 森林中每棵树的分类能力：每棵树的分类能力越强，这个森林的错误率越低

减小特征选择个数m，树的相关性和分类能力也会相应的降低；增大m，两者也会随之增大。所以**关键问题是如何选择最优的m（或者是范围）**，这也是随机森林唯一的一个参数。

> bagging和boosting区别是什么？

1. bagging是一种与boosting很类似的技术，所使用的多个分类器的类型（数据量和特征量）都是一致的。
2. bagging是由不同的分类器（1.数据随机化2.特征随机化）经过训练，综合得出出现次数最多的分类结果；boosting是通过调整已有分类器错分的那些数据来获得新的分类器，得出目前最优的结果。
3. bagging中的分类器权重是相等的；而boosting中的分类器加权求和，所以权重并不相等，每个权重代表的是其对应分类器在上一轮迭代中的成功度。

## 伪代码
```
采取有放回的抽样方式构造子数据集，保证不同子集之间的数量级一样（不同子集/同一子集之间的元素可以重复）
利用子集数据来构建决策树
预测时，将这个数据放到每个子决策树中，每个子决策树输出一个结果。
统计子决策树的投票结果，得到最终的分类就是随机森林的输出结果。
```
![特征重抽样](feature-resample.jpg)

In [1]:
from random import randrange

In [28]:
import numpy as np

In [1]:
# Path to the data file, relative to the current working directory.
filename = r'sonar-all-data.txt'

In [4]:
def is_float(s):
    """Report whether ``s`` can be converted with ``float()``.

    Returns True when ``float(s)`` succeeds and False when it raises
    ``ValueError``. Any other exception raised by ``float`` (for example
    ``TypeError`` for ``None``) is not caught and propagates to the caller.
    """
    try:
        float(s)
    except ValueError:
        return False
    else:
        return True

In [23]:
def load_dataset(file_path):
    """Load a comma-separated text file into a list of rows.

    Each non-blank line becomes one row (a list). Every comma-separated
    field is stripped of surrounding whitespace; fields that ``float()``
    accepts are stored as floats, all others are kept as the stripped string.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of rows, one per non-blank line, in file order.
    """
    dataset = []
    with open(file_path, 'r') as f:
        for line in f:
            # Lines read from a file keep their trailing newline, so a blank
            # line is only falsy after stripping; without this, blank lines
            # would be stored as [''] rows.
            line = line.strip()
            if not line:
                continue
            row = []
            for item in line.split(','):
                value = item.strip()
                # Parse once: keep the float on success, the raw text otherwise.
                try:
                    row.append(float(value))
                except ValueError:
                    row.append(value)
            dataset.append(row)
    return dataset

In [24]:
# Load the data set from the path held in `filename` instead of repeating the
# literal, so the file location is defined in one place only.
dataset = load_dataset(filename)

In [25]:
dataset

[[0.02,
  0.0371,
  0.0428,
  0.0207,
  0.0954,
  0.0986,
  0.1539,
  0.1601,
  0.3109,
  0.2111,
  0.1609,
  0.1582,
  0.2238,
  0.0645,
  0.066,
  0.2273,
  0.31,
  0.2999,
  0.5078,
  0.4797,
  0.5783,
  0.5071,
  0.4328,
  0.555,
  0.6711,
  0.6415,
  0.7104,
  0.808,
  0.6791,
  0.3857,
  0.1307,
  0.2604,
  0.5121,
  0.7547,
  0.8537,
  0.8507,
  0.6692,
  0.6097,
  0.4943,
  0.2744,
  0.051,
  0.2834,
  0.2825,
  0.4256,
  0.2641,
  0.1386,
  0.1051,
  0.1343,
  0.0383,
  0.0324,
  0.0232,
  0.0027,
  0.0065,
  0.0159,
  0.0072,
  0.0167,
  0.018,
  0.0084,
  0.009,
  0.0032,
  'R'],
 [0.0453,
  0.0523,
  0.0843,
  0.0689,
  0.1183,
  0.2583,
  0.2156,
  0.3481,
  0.3337,
  0.2872,
  0.4918,
  0.6552,
  0.6919,
  0.7797,
  0.7464,
  0.9444,
  1.0,
  0.8874,
  0.8024,
  0.7818,
  0.5212,
  0.4052,
  0.3957,
  0.3914,
  0.325,
  0.32,
  0.3271,
  0.2767,
  0.4423,
  0.2028,
  0.3788,
  0.2947,
  0.1984,
  0.2341,
  0.1306,
  0.4182,
  0.3835,
  0.1057,
  0.184,
  0.197,
  0.1674,


In [10]:
def split(index, value, dataset):
    """Partition rows by comparing one column against a threshold.

    Args:
        index: Column position to compare in every row.
        value: Threshold; rows with ``row[index] < value`` go left.
        dataset: Iterable of indexable rows.

    Returns:
        A ``(left, right)`` tuple of lists. ``left`` holds the rows whose
        value at ``index`` is strictly below ``value``; ``right`` holds all
        remaining rows. Row order is preserved within each list.
    """
    left, right = [], []
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            # Append the row itself (the original appended the `right` list
            # to itself, producing a self-referencing list with no data).
            right.append(row)
    return left, right

In [11]:
def groups_gini(groups, class_values):
    """Calculate the Gini score of a split data set.

    For every class and every non-empty group, the proportion ``p`` of rows
    in that group whose last element equals the class is computed, and
    ``p * (1 - p)`` is added to the score. Groups are not weighted by size.

    Args:
        groups: Iterable of groups; each group is a list of rows whose last
            element is the class label.
        class_values: Iterable of the class labels to score.

    Returns:
        The accumulated score as a float; 0.0 when every non-empty group
        contains a single class.
    """
    gini = 0.0
    for c in class_values:
        for g in groups:
            size = len(g)
            if size == 0:
                # An empty group contributes nothing and would divide by zero.
                continue
            # p = rows of the current class / total rows in the group
            proportion = [row[-1] for row in g].count(c) / float(size)
            gini += proportion * (1 - proportion)
    return gini

In [None]:
def get_split(dataset, n_features):
    """Find the best split of ``dataset`` over a random subset of columns.

    ``n_features`` distinct column indices are drawn at random from every
    column except the last one (the class label). For each drawn column,
    every row's value in that column is tried as a threshold: rows whose
    value is strictly below the threshold go left, the rest go right. The
    candidate with the lowest Gini score (sum of ``p * (1 - p)`` over every
    class and non-empty group, unweighted) is kept; on ties the first
    candidate found wins.

    Args:
        dataset: Non-empty list of rows; the last element of each row is the
            class label.
        n_features: Number of columns to sample. Values larger than the
            number of available columns are clamped so sampling terminates.

    Returns:
        A dict with keys ``'index'`` (chosen column), ``'value'`` (chosen
        threshold) and ``'groups'`` (the ``(left, right)`` row lists). If no
        candidate was evaluated, index and value are ``np.inf`` and groups
        is ``None``.
    """
    class_values = list(set(row[-1] for row in dataset))  # e.g. ['M', 'R']
    best_index, best_value, best_score, best_groups = np.inf, np.inf, np.inf, None

    # Sample distinct feature columns; the last column is the label.
    n_columns = len(dataset[0]) - 1
    n_features = min(n_features, n_columns)
    features = []
    while len(features) < n_features:
        index = randrange(n_columns)
        if index not in features:
            features.append(index)

    for index in features:
        for row in dataset:
            threshold = row[index]
            left = [r for r in dataset if r[index] < threshold]
            right = [r for r in dataset if not r[index] < threshold]
            groups = (left, right)

            score = 0.0
            for c in class_values:
                for g in groups:
                    if not g:
                        continue
                    proportion = [r[-1] for r in g].count(c) / float(len(g))
                    score += proportion * (1 - proportion)

            if score < best_score:
                best_index, best_value = index, threshold
                best_score, best_groups = score, groups

    return {'index': best_index, 'value': best_value, 'groups': best_groups}

In [31]:
def to_terminal(group):
    """Return the most frequent class label in ``group``.

    The label of each row is its last element. The distinct labels are
    compared by how often they occur, and the one with the highest count is
    returned.
    """
    labels = []
    for row in group:
        labels.append(row[-1])
    distinct_labels = set(labels)
    return max(distinct_labels, key=labels.count)

In [None]:
def create_tree(dataset, names):
    """Recursively build a decision tree as nested dictionaries.

    Args:
        dataset: Non-empty list of rows; the last element of each row is the
            class label.
        names: List of feature names, indexed like the feature columns. It is
            no longer modified by this function.

    Returns:
        A class label when recursion stops (all rows share one class, or the
        rows contain only the label column), otherwise a dict of the form
        ``{feature_name: {feature_value: subtree, ...}}``.

    Raises:
        AssertionError: If more than two distinct class labels are present.
    """
    class_l = [sample[-1] for sample in dataset]
    assert len(set(class_l)) <= 2  # binary-classification decision tree only
    if class_l.count(class_l[0]) == len(class_l):
        return class_l[0]  # stop splitting when all of the classes are equal
    if len(dataset[0]) == 1:  # stop splitting when there are no more features in dataset
        return majority_count(class_l)

    best_feature = choose_best_feature(dataset)
    best_feature_name = names[best_feature]
    my_tree = {best_feature_name: {}}  # nested dict describing the tree structure
    # Drop the chosen feature from a copy, so the caller's list stays intact
    # (the original deleted the entry from `names` in place).
    remaining_names = list(names)
    del remaining_names[best_feature]
    feature_values_set = set(sample[best_feature] for sample in dataset)
    for value in feature_values_set:
        # NOTE(review): split_dataset is expected to return the rows matching
        # `value` for `best_feature` -- confirm against its implementation.
        subset = split_dataset(dataset, best_feature, value)
        my_tree[best_feature_name][value] = create_tree(subset, remaining_names[:])
    return my_tree

In [None]:
def split(groups, max_depth, min_size, n_features, depth):
    """
    Create child splits for a node or make terminal.

    Unfinished skeleton: the three stopping checks below only ``pass``, and
    the function has no ``return`` statement, so it always returns ``None``.

    groups: unpacked into exactly two values, ``left`` and ``right``
    max_depth: compared for equality with ``depth``
    min_size: compared with ``len(left)`` and ``len(right)``
    n_features: only forwarded to the recursive calls
    depth: current depth; the recursive calls pass ``depth+1``

    NOTE(review): this definition reuses the name ``split`` of the earlier
    ``split(index, value, dataset)`` in this file and replaces it once run.
    """
    left, right = groups
    # Recursion stop conditions planned by the author (English rendering of
    # the string below): 1. only one class; 2. no features left;
    # 3. tree depth limit reached; 4. leaf-node size limit reached.
    '''递归判断条件
    1. 只有一类
    2. 没有特征了
    3. 决策树深度达到限制
    4. 叶子结点大小达到限制
    '''
    if not left or not right:  # check for a no split
        pass
    if depth == max_depth:  # check for max depth
        pass
    if len(left) <= min_size:  # process left child
        pass
    else:
        # Planned steps (English rendering of the string below):
        # 1. choose the feature with the largest gain;
        # 2. split the data set on that feature.
        '''
        1. 根据最大增益选择特征
        2. 根据该特征划分数据集
        '''
        # NOTE(review): `left` is passed where a (left, right) pair is
        # unpacked, so this only works if `left` has exactly two elements --
        # presumably a best-split search is meant to run first; confirm.
        groups = split(left, max_depth, min_size, n_features, depth+1)
    if len(right) <= min_size:  # process right child
        pass
    else:
        # Same caveat as for the left child; the result is stored in the
        # local `groups` and never used afterwards.
        groups = split(right, max_depth, min_size, n_features, depth+1)

In [None]:
def choose_feature(dataset, features):
    """Choose the best feature based on Gini (unfinished).

    The loops below walk every position ``i`` of ``features`` and every
    distinct value found at column ``i`` of the rows in ``dataset``; the
    inner loop body has not been written yet, so this cell does not parse.
    """
    for i in range(len(features)):  # traverse all features
        feature = features[i]
        for value in set(row[i] for row in dataset):  # traverse all values of the feature
            # TODO: inner loop body is missing -- scoring of (feature, value) was never written.

In [None]:
def split_dataset():
    """Placeholder for splitting the current data set.

    Not implemented: takes no arguments and returns ``None``.

    NOTE(review): ``create_tree`` in this file calls
    ``split_dataset(dataset, best_feature, value)``, which this zero-argument
    signature would reject with ``TypeError`` -- confirm the intended
    parameters before implementing.
    """
    return None

In [None]:
def calc_gini():
    """Placeholder for the Gini calculation.

    Not implemented: takes no arguments and currently returns ``None``
    rather than a Gini value.
    """
    return None

In [None]:
def build_tree(dataset, max_depth, min_size, n_features):
    """
    Build a decision tree (unfinished skeleton).

    Every check below only ``pass``es and nothing is returned, so the
    function yields ``None`` if it runs to the end.

    dataset: rows whose last element is read as the class label
    max_depth: depth limit of the decision tree
    min_size: size of a leaf node
    n_features: number of features to select (not used in the body yet)

    NOTE(review): ``depth`` and ``left`` are read below but are neither
    parameters nor locals of this function; unless they exist as globals
    the checks raise ``NameError``.
    """
    classes = [row[-1] for row in dataset]  # class labels of the data set
    # Recursion stop conditions planned by the author (English rendering of
    # the string below): 1. only one class; 2. only one feature;
    # 3. tree depth limit reached; 4. leaf-node size limit reached.
    '''递归判断条件
    1. 只有一类
    2. 只有一个特征
    3. 决策树深度达到限制
    4. 叶子结点大小达到限制
    '''
    # 1. every row carries the same label as the first row
    if classes.count(classes[0]) == len(classes):
        pass
    # 2. rows have a single element
    if len(dataset[0]) == 1:
        pass
    # 3. depth limit -- `depth` is undefined here (see NOTE above)
    if depth == max_depth:
        pass
    # 4. leaf size limit -- `left` is undefined here (see NOTE above)
    if len(left) <= min_size:
        pass
    
    

In [26]:
# 找出分割数据集的最优特征，得到最优的特征index，
# 特征值row[index], 以及分割完的数据groups(left, right)

In [27]:
# Distinct class labels, taken from the last element of every row.
class_value = list({row[-1] for row in dataset})

In [28]:
class_value

['R', 'M']

In [29]:
# Initial "best split so far" state; 999 serves as a placeholder that the
# first Gini score compared against b_score is expected to undercut.
b_index = 999
b_value = 999
b_score = 999
b_groups = None

In [30]:
len(dataset)

208

In [31]:
len(dataset[0])

61

In [54]:
def test_split(index, value, dataset):
    # 根据特征和特征值分割数据集
    left, right = list(), list()
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
        break
    return left, right

In [44]:
def gini_index(groups, class_values):
    """Calculate the Gini score for a split data set.

    Iterates over the classes and, inside that, over the groups (for
    example ``(left, right)``). For each non-empty group the proportion
    ``p`` of rows whose last element equals the class is computed and
    ``p * (1 - p)`` is added to the running total.

    Returns:
        The accumulated total as a float.
    """
    total = 0.0
    for label in class_values:
        for members in groups:
            n_rows = len(members)
            if n_rows != 0:
                member_labels = [row[-1] for row in members]
                p = member_labels.count(label) / float(n_rows)
                # Gini term for this class within this group: p(1-p)
                total += (p * (1.0 - p))
    return total

In [35]:
# Number of distinct feature columns to draw at random for the split search.
n_features = 15

In [75]:
# Draw n_features distinct column indices at random. The last column is
# excluded from the draw (other cells read it as the class label via row[-1]).
features = list()
while len(features) < n_features:
    index = randrange(len(dataset[0]) - 1)
    if index not in features:
        features.append(index)
# Try every row's value in each drawn column as a threshold and keep the
# candidate with the lowest Gini score seen so far (strict '<', so the first
# candidate wins ties).
for index in features:
    for row in dataset:
        groups = test_split(index, row[index], dataset)
        gini = gini_index(groups, class_value)
        if gini < b_score:
            b_index, b_value, b_score, b_groups = index, row[index], gini, groups

In [76]:
b_index

44

In [78]:
# Number of iterations of the tree-building loop (one tree per iteration).
n_trees = 10

In [40]:
# Collects the trees of the forest, one per loop iteration.
trees = list()
for i in range(n_trees):
    # TODO: unfinished -- the right-hand side of `sample` was never written,
    # so this cell does not parse yet.
    sample = 

[35]

In [81]:
len(dataset_split)

5