# 随机森林
随机森林分类效果（错误率）与两个因素有关：
- 森林中任意两棵树的相关性：相关性越大，错误率越大；
- 森林中每棵树的分类能力：每棵树的分类能力越强，这个森林的错误率越低

减小特征选择个数m，树的相关性和分类能力也会相应的降低；增大m，两者也会随之增大。所以**关键问题是如何选择最优的m（或者是范围）**，这也是随机森林唯一的一个参数。

> bagging和boosting区别是什么？

1. bagging是一种与boosting很类似的技术，所使用的多个分类器的类型（数据量和特征量）都是一致的。
2. bagging是由不同的分类器（1.数据随机化2.特征随机化）经过训练，综合得出的出现最多分类结果；boosting是通过调整已有分类器错分的那些数据来获得新的分类器，得出目前最优的结果。
3. bagging中的分类器权重是相等的；而boosting中的分类器加权求和，所以权重并不相等，每个权重代表的是其对应分类器在上一轮迭代中的成功度。

## 伪代码
```
采取有放回的抽样方式构造子数据集，保证不同子集之间的数量级一样（不同子集/同一子集之间的元素可以重复）
利用子集数据来构建决策树
预测时，将这个数据放到每个字决策树中，每个子决策树输出一个结果。
统计子决策树的投票结果，得到最终的分类就是随机森林的输出结果。
```
![特征重抽样](feature-resample.jpg)

### 构建一颗决策树

In [1]:
def is_float(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

In [2]:
def load_dataset():
    file_path = '../ensemble/sonar-all-data.txt'
    dataset = []
    with open(file_path, 'r') as f:
        for line in f:
            if line:
                sample = []
                for item in line.split(','):
                    item = item.strip()
                    if is_float(item):
                        sample.append(float(item))
                    else:  # 字符=>类标签
                        sample.append(item)
                dataset.append(sample)
    return dataset

In [3]:
dataset = load_dataset()
print(dataset[0])

[0.02, 0.0371, 0.0428, 0.0207, 0.0954, 0.0986, 0.1539, 0.1601, 0.3109, 0.2111, 0.1609, 0.1582, 0.2238, 0.0645, 0.066, 0.2273, 0.31, 0.2999, 0.5078, 0.4797, 0.5783, 0.5071, 0.4328, 0.555, 0.6711, 0.6415, 0.7104, 0.808, 0.6791, 0.3857, 0.1307, 0.2604, 0.5121, 0.7547, 0.8537, 0.8507, 0.6692, 0.6097, 0.4943, 0.2744, 0.051, 0.2834, 0.2825, 0.4256, 0.2641, 0.1386, 0.1051, 0.1343, 0.0383, 0.0324, 0.0232, 0.0027, 0.0065, 0.0159, 0.0072, 0.0167, 0.018, 0.0084, 0.009, 0.0032, 'R']


In [4]:
def split_dataset(dataset, feature, value):
    left, right = [], []
    for sample in dataset:
        if sample[feature] < value:
            left.append(sample)
        else:
            right.append(sample)
    return left, right

In [5]:
def calc_gini(classes, *datasets):
    gini = 0.0
    for d in datasets:
        n_samples = len(d)
        if n_samples == 0:
            continue
        for c in classes:
            proportion = [sample[-1] for sample in d].count(c) / float(n_samples)
            gini += proportion * (1 - proportion)
    return gini

In [6]:
def choose_and_split(dataset, features, classes):
    min_gini = np.inf
    best_f, best_v = 0, 0.0
    best_left, best_right = None, None
    for f in features:
        values = set(sample[f] for sample in dataset)
        for v in values:
            left, right = split_dataset(dataset, f, v)
            gini = calc_gini(classes, left, right)
            if gini < min_gini:
                min_gini = gini
                best_f, best_v = f, v
                best_left, best_right = left, right
    return {'feature': best_f, 'value': best_v, 'left': best_left, 'right': best_right}

**随机森林的要素之一 => 随机选择特征**

In [7]:
from random import randrange
import numpy as np
def sub_features(dataset, n_features):
    features = set()
    while len(features) < n_features:
        f = randrange(len(dataset[0]) - 1)
        features.add(f)
    return features

In [8]:
def create_tree(dataset, max_depth, min_size, n_features):
    features = sub_features(dataset, n_features)
    classes = set(sample[-1] for sample in dataset)
    root = choose_and_split(dataset, features, classes)
    features.remove(root['feature'])
    create_node(root, features, max_depth, min_size, 1)
    return root

In [9]:
def majority_class(dataset):
    labels = [sample[-1] for sample in dataset]
    lb = max(set(labels), key=labels.count)
    return lb

In [10]:
def create_node(node, features, max_depth, min_size, depth):
    left, right = node['left'], node['right']
    # 这里的意义是，left或right可能出现为空列表[]的情况
    # 这样就会造成预测产生TypeError
    # 故排除这种情况
    if not left or not right:
        node['left'] = node['right'] = majority_class(left + right)
        return
    if left:
        classes = set(sample[-1] for sample in left)
        # Only one class or no feauture remained
        if (len(classes) == 1 or len(features) == 0 or 
            not features or len(left) < min_size or depth >= max_depth):
            # return classification result
            node['left'] = majority_class(left)
        else:
            node['left'] = choose_and_split(left, features, classes)
            features.remove(node['left']['feature'])
            create_node(node['left'], features, 
                        max_depth, min_size, depth+1)
    if right:
        classes = set(sample[-1] for sample in right)
        if (len(classes) == 1 or len(features) == 0 or 
           not features or len(left) < min_size or depth >= max_depth):
            node['right'] = majority_class(right)
        else:
            node['right'] = choose_and_split(right, features, classes)
            features.remove(node['right']['feature'])
            create_node(node['right'], features, 
                        max_depth, min_size, depth+1)

### 生成随机森林

**随机森林的要素之二 => 为每棵决策树随机选择训练集**

In [11]:
def sub_samples(dataset, ratio):
    samples = []
    n_sample = round(len(dataset) * ratio)
    while len(samples) < n_sample:
        index = randrange(len(dataset))
        samples.append(dataset[index])
    return list(samples)

In [12]:
def predict(node, sample):
    if sample[node['feature']] < node['value']:
        if isinstance(node['left'], dict):
            return predict(node['left'], sample)
        else:
            return node['left']
    else:
        if isinstance(node['right'], dict):
            return predict(node['right'], sample)
        else:
            return node['right']

In [13]:
def bagging_predict(trees, sample):
    predictions = [predict(tree, sample) for tree in trees]
    return max(set(predictions), key=predictions.count)

In [14]:
def random_forest(train, test, max_depth=20, min_size=1, sample_size=0.75, n_trees=10, n_features=30):
    """
    Use random forest and return a prediction.
    train: 训练数据集
    test: 测试数据集
    max_depth: 决策树深度限制，太深容易过拟合
    min_size: 叶子结点大小限制
    sample_size: 单个树的训练集随机采样的比例
    n_trees: 决策树个数
    n_features: 随机选择的特征的个数
    """
    trees = []
    for i in range(n_trees):
        sub_train = sub_samples(train, sample_size)
        tree = create_tree(sub_train, max_depth, min_size, n_features)
        trees.append(tree)
    predictions = [bagging_predict(trees, sample) for sample in test]
    return predictions

In [15]:
len(dataset[0])

61

### 测试随机森林

In [16]:
def accuracy_metric(actual, predictions):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predictions[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0

In [17]:
train_set = dataset[:-10]
test_set = []
for sample in dataset[-10:]:
    sample_copy = list(sample)
    sample_copy[-1] = None
    test_set.append(sample_copy)
predictions = random_forest(train_set, test_set)
actual_labels = [sample[-1] for sample in dataset[-10:]]
print('predictions=> ', predictions)
print('labels=> ', actual_labels)
accuracy = accuracy_metric(actual_labels, predictions)
print('使用随机森林检测声呐信号的准确率是=>', accuracy)

predictions=>  ['M', 'R', 'R', 'M', 'M', 'M', 'M', 'R', 'R', 'R']
labels=>  ['M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M']
使用随机森林检测声呐信号的准确率是=> 50.0


<font color=blue>准确率这么低，可能是代码中有bug。但是我确实找不出来了，希望有人看到问题所在后可以告诉我=>**983910368@qq.com**</font>