## Random Forest

In [1]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score

In [2]:
iris = load_iris()

In [3]:
x = iris.data
y = iris.target

In [4]:
print(x, y)

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.4 3.7 1.5 0.2]
 [4.8 3.4 1.6 0.2]
 [4.8 3.  1.4 0.1]
 [4.3 3.  1.1 0.1]
 [5.8 4.  1.2 0.2]
 [5.7 4.4 1.5 0.4]
 [5.4 3.9 1.3 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]
 [5.4 3.4 1.7 0.2]
 [5.1 3.7 1.5 0.4]
 [4.6 3.6 1.  0.2]
 [5.1 3.3 1.7 0.5]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]
 [5.  3.4 1.6 0.4]
 [5.2 3.5 1.5 0.2]
 [5.2 3.4 1.4 0.2]
 [4.7 3.2 1.6 0.2]
 [4.8 3.1 1.6 0.2]
 [5.4 3.4 1.5 0.4]
 [5.2 4.1 1.5 0.1]
 [5.5 4.2 1.4 0.2]
 [4.9 3.1 1.5 0.2]
 [5.  3.2 1.2 0.2]
 [5.5 3.5 1.3 0.2]
 [4.9 3.6 1.4 0.1]
 [4.4 3.  1.3 0.2]
 [5.1 3.4 1.5 0.2]
 [5.  3.5 1.3 0.3]
 [4.5 2.3 1.3 0.3]
 [4.4 3.2 1.3 0.2]
 [5.  3.5 1.6 0.6]
 [5.1 3.8 1.9 0.4]
 [4.8 3.  1.4 0.3]
 [5.1 3.8 1.6 0.2]
 [4.6 3.2 1.4 0.2]
 [5.3 3.7 1.5 0.2]
 [5.  3.3 1.4 0.2]
 [7.  3.2 4.7 1.4]
 [6.4 3.2 4.5 1.5]
 [6.9 3.1 4.

In [5]:
tree_clf = DecisionTreeClassifier()
tree_clf.fit(x, y)

tree_clf.feature_importances_

array([0.02666667, 0.        , 0.55072262, 0.42261071])

In [6]:
train_x, test_x,train_y, test_y = train_test_split(x, y, test_size=0.3, random_state=0)

tree_clf = DecisionTreeClassifier()
tree_clf.fit(train_x, train_y)

print(tree_clf.score(train_x, train_y))
print(tree_clf.score(test_x, test_y))

1.0
0.9777777777777777


In [7]:
dataset = fetch_openml(name='boston', version=1, as_frame=True, return_X_y=False, parser='pandas')

data = dataset['data']
target = dataset['target']

In [8]:
x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=0.3, random_state=0)

tree_reg = DecisionTreeRegressor()
tree_reg.fit(x_train, y_train)

print('whole dataset train acc: {}'.format(tree_reg.score(x_train, y_train)))
print('whole dataset test acc: {}'.format(tree_reg.score(x_test, y_test)))

whole dataset train acc: 1.0
whole dataset test acc: 0.6383026907922255


In [9]:
def random_forest(x_train, y_train, x_test, y_test, drop_n=4):
    features_random = np.random.choice(list(x_train.columns), size=len(x_train.columns)-drop_n)

    x_sample = x_train[features_random]
    y_sample = y_train

    reg = DecisionTreeRegressor()
    reg.fit(x_sample, y_sample)

    score_train = reg.score(x_sample, y_sample)
    score_test = reg.score(x_test[features_random], y_test)

    print('sub sample :: train score: {}, test score: {}'.format(score_train, score_test))

    y_predicated = reg.predict(x_test[features_random])

    return y_predicated, score_test

In [10]:
random_forest(x_train, y_train, x_test, y_test, 4)

sub sample :: train score: 0.9999193511776068, test score: 0.48040258427460114


(array([42.3, 50. , 28.7, 12.5, 20.8, 23.1, 21.2, 18.5, 20.2, 14.1,  9.5,
        17.9, 10.8, 11.3, 50. , 31.1, 19.6, 34.6, 31.5, 22.2, 23.1, 24.3,
        19.4, 32.4, 23.4, 50. , 22. , 13.8, 33.2, 18.8, 21.4, 19.5, 17.6,
        22.2, 24.6, 25. ,  6.1, 10.4, 14.1, 20.8, 24.4, 23.4, 19.3, 12. ,
        19.3, 20.6, 23.1, 13.1, 14.6, 22.8, 13.1, 21.7, 41.7, 24.4, 15.7,
        21.7, 19.4, 17.5, 13.4, 19.3, 18.2, 22.1, 25. , 46.7, 19.9, 37. ,
        17.8, 18.9, 16.3, 24.4, 17.6, 20.8, 19.4, 19.3, 21.8, 13.8, 41.7,
        22.9, 22. , 26.5, 22.9, 18.1, 11.9, 41.7, 37.6, 24.4, 33.2, 17.9,
        19.6, 10.2, 24.1, 11.8, 20.9, 24.1, 20.3, 28.6, 13.8, 34.6, 12.7,
        22.2, 24.8, 19.5, 27.5, 22.8, 22.4, 19.3,  8.3, 16.4, 20.3, 25. ,
        50. , 19.1, 18.9, 18.3, 18.9, 22.4, 11.8, 18.4, 11.5, 34.9, 36.4,
        11.3, 18.9, 19.3, 16.2, 20.5, 28.2, 22.5, 22.2, 32. ,  8.5,  8.3,
        13.3, 23.8, 12.7, 35.1, 29. , 15.6, 24.8,  9.5, 15. , 21.8, 36.2,
        23.4, 22. , 14.5, 34.9, 30.8, 

In [11]:
with_feature_names = pd.DataFrame(data)
with_feature_names.columns = dataset['feature_names']

In [12]:
x_train, x_test, y_train, y_test = train_test_split(with_feature_names, target, test_size=0.3, random_state=0)

In [13]:
random_forest(x_train, y_train, x_test, y_test, 4)

sub sample :: train score: 1.0, test score: 0.6789447199702522


(array([26.5, 33.8, 22.5, 14.1, 20.8, 20.4, 22.7, 21.2, 23.8, 17.6,  5. ,
        17.9, 13.4,  8.8, 50. , 34.9, 21.2, 29.1, 31.7, 22.2, 23.1, 16.1,
        20.1, 26.6, 18.5, 21.4, 17.5, 15.6, 43.8, 19.8, 11.7, 19.1, 24.5,
        22.2, 22.2, 17.7,  8.5, 33.8, 11.7, 17.9, 22.2, 20.7, 19.3, 11.7,
        19.3, 20.5, 22.5, 15.6, 15.4, 20.3, 15.6, 18.8, 18.2, 24.8, 14.8,
        18.9, 22.2, 17.5, 27.5, 22.4, 22.3, 22.2, 24.6, 37.6, 19.1, 24.6,
        16.1, 18.9, 14.1, 22.5, 21.7, 20.3, 34.9, 24.6, 25. ,  8.8, 46.7,
        22.5, 23.4, 16.8, 24.1, 16.8, 22.6, 46.7, 31.5, 25. , 22.5, 13.8,
        24.3, 16.1, 20.8, 12.8, 34.6, 34.6, 23.9, 24.5, 10.2, 26.6, 16.4,
        16. , 25. , 20.1, 25.1, 25. , 22.2, 18.9,  8.3, 19.5, 22.4, 22. ,
        24. , 17.9, 18.9, 18.3, 18.9, 22.5, 10.2, 18.2, 10.2, 44. , 24.6,
        11.3, 16.6, 19.3, 24.7, 20.5, 36.1, 22.5, 25. , 35.2, 12.7,  8.3,
        15.2, 16.8, 10.8, 20.7, 19.4, 15.4, 28.4,  6.3, 10.2, 19.3, 37.9,
        27.9, 32. , 18.4, 42.3, 37.9, 

In [14]:
tree_num = 4
predicates = []
for _ in range(tree_num):
    predicated, score = random_forest(x_train, y_train, x_test, y_test)
    predicates.append((predicated, score))

predicates_value = [v for v, s in predicates]
forest_scores = [s for v, s in predicates]

print('the mean result is: {}'.format(np.mean(predicates_value), axis=0))
print('the score of forest is: {}'.format(r2_score(y_test, np.mean(predicates_value, axis=0))))


sub sample :: train score: 1.0, test score: 0.6721378635339852
sub sample :: train score: 1.0, test score: 0.4177313079346989
sub sample :: train score: 1.0, test score: 0.5816639564013537
sub sample :: train score: 1.0, test score: 0.5947546119365912
the mean result is: 21.735526315789475
the score of forest is: 0.7534092800208592


In [15]:
weights = np.array(forest_scores) / np.sum(forest_scores)
score_weights = np.zeros_like(np.mean(predicates_value, axis=0))

for i, v in enumerate(predicates_value):
    score_weights += v * weights[i]

print('the score of weighted forest is: {}'.format(r2_score(y_test, score_weights)))

the score of weighted forest is: 0.7579273037104974


![欢迎订阅：坍缩的奇点](../assets/Capture-2023-11-02-164446.png)