## Random Forest

In [1]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score

In [2]:
iris = load_iris()

In [3]:
x = iris.data
y = iris.target

In [4]:
print(x, y)

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.4 3.7 1.5 0.2]
 [4.8 3.4 1.6 0.2]
 [4.8 3.  1.4 0.1]
 [4.3 3.  1.1 0.1]
 [5.8 4.  1.2 0.2]
 [5.7 4.4 1.5 0.4]
 [5.4 3.9 1.3 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]
 [5.4 3.4 1.7 0.2]
 [5.1 3.7 1.5 0.4]
 [4.6 3.6 1.  0.2]
 [5.1 3.3 1.7 0.5]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]
 [5.  3.4 1.6 0.4]
 [5.2 3.5 1.5 0.2]
 [5.2 3.4 1.4 0.2]
 [4.7 3.2 1.6 0.2]
 [4.8 3.1 1.6 0.2]
 [5.4 3.4 1.5 0.4]
 [5.2 4.1 1.5 0.1]
 [5.5 4.2 1.4 0.2]
 [4.9 3.1 1.5 0.2]
 [5.  3.2 1.2 0.2]
 [5.5 3.5 1.3 0.2]
 [4.9 3.6 1.4 0.1]
 [4.4 3.  1.3 0.2]
 [5.1 3.4 1.5 0.2]
 [5.  3.5 1.3 0.3]
 [4.5 2.3 1.3 0.3]
 [4.4 3.2 1.3 0.2]
 [5.  3.5 1.6 0.6]
 [5.1 3.8 1.9 0.4]
 [4.8 3.  1.4 0.3]
 [5.1 3.8 1.6 0.2]
 [4.6 3.2 1.4 0.2]
 [5.3 3.7 1.5 0.2]
 [5.  3.3 1.4 0.2]
 [7.  3.2 4.7 1.4]
 [6.4 3.2 4.5 1.5]
 [6.9 3.1 4.

In [5]:
tree_clf = DecisionTreeClassifier()
tree_clf.fit(x, y)

tree_clf.feature_importances_

array([0.02666667, 0.        , 0.05072262, 0.92261071])

In [6]:
train_x, test_x,train_y, test_y = train_test_split(x, y, test_size=0.3, random_state=0)

tree_clf = DecisionTreeClassifier()
tree_clf.fit(train_x, train_y)

print(tree_clf.score(train_x, train_y))
print(tree_clf.score(test_x, test_y))

1.0
0.9777777777777777


In [7]:
dataset = fetch_openml(name='boston', version=1, as_frame=True, return_X_y=False, parser='pandas')

data = dataset['data']
target = dataset['target']

In [8]:
x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=0.3, random_state=0)

tree_reg = DecisionTreeRegressor()
tree_reg.fit(x_train, y_train)

print('whole dataset train acc: {}'.format(tree_reg.score(x_train, y_train)))
print('whole dataset test acc: {}'.format(tree_reg.score(x_test, y_test)))

whole dataset train acc: 1.0
whole dataset test acc: 0.6661053215849546


In [9]:
def random_forest(x_train, y_train, x_test, y_test, drop_n=4):
    features_random = np.random.choice(list(x_train.columns), size=len(x_train.columns)-drop_n)

    x_sample = x_train[features_random]
    y_sample = y_train

    reg = DecisionTreeRegressor()
    reg.fit(x_sample, y_sample)

    score_train = reg.score(x_sample, y_sample)
    score_test = reg.score(x_test[features_random], y_test)

    print('sub sample :: train score: {}, test score: {}'.format(score_train, score_test))

    y_predicated = reg.predict(x_test[features_random])

    return y_predicated, score_test

In [10]:
random_forest(x_train, y_train, x_test, y_test, 4)

sub sample :: train score: 0.8331785920058569, test score: 0.43429398341543435


(array([19.3       , 50.        , 22.4       ,  8.96363636, 20.55      ,
        16.4       , 20.3       , 20.22857143, 20.475     , 20.85714286,
         9.56666667, 17.1       , 12.77      , 11.38      , 38.5       ,
        22.5       , 22.13333333, 32.45      , 24.68      , 21.2       ,
        22.3       , 30.2375    , 20.85714286, 32.        , 21.46666667,
        50.        , 16.4       , 16.33333333, 35.13333333, 19.72857143,
        19.01666667, 15.34666667, 24.18      , 23.2       , 24.68      ,
        20.73333333,  8.96363636, 13.8       , 15.25      , 15.25      ,
        25.8       , 21.46666667, 19.3       , 15.25      , 24.76666667,
        34.7       , 25.8       , 15.3       , 16.33333333, 20.85714286,
        15.3       , 18.95      , 36.9       , 24.18      , 16.725     ,
        18.9       , 19.4       , 17.5       , 17.1       , 20.56666667,
        24.1       , 20.475     , 22.        , 36.9       , 20.73333333,
        35.075     , 15.34666667, 18.9       , 17.1

In [11]:
with_feature_names = pd.DataFrame(data)
with_feature_names.columns = dataset['feature_names']

In [12]:
x_train, x_test, y_train, y_test = train_test_split(with_feature_names, target, test_size=0.3, random_state=0)

In [13]:
random_forest(x_train, y_train, x_test, y_test, 4)

sub sample :: train score: 1.0, test score: 0.5828499217537433


(array([37. , 22.8, 23.8, 12.5, 22.5, 20.4, 21.4, 19. , 22. , 19.3, 10.2,
        17.9, 17.1,  8.7, 50. , 22.5, 23.1, 42.3, 19.4, 25. , 23.1, 22.7,
        20.1, 24.7, 21.7, 15. , 21. , 13.1, 46.7, 18.9, 12.6, 17.7, 23.8,
        22.2, 25.1, 10.9,  7.2, 22.8,  9.6, 12. , 20.9, 21.1, 20.3, 13.1,
        20.9, 23.8, 16.6, 13.1, 14.6, 22.8, 13.1, 19.8, 18.2, 29.6, 18.1,
        19.8, 19.7, 17.5, 22.8, 20. , 25. , 20.8, 23.7, 46.7, 19.9, 37. ,
        14.1, 18.9, 16.3, 20.2, 17.6, 20.9, 23.7, 19.4, 25. ,  8.8, 46.7,
        22.2, 22. , 19.9, 23.7, 10.9, 15. , 50. , 31.5, 23.1, 29.6, 11.7,
        24.3, 14.9, 19.7, 12. , 23.1, 22.3, 21.6, 25. , 13.8, 23.1, 14.8,
        16. , 25. , 19.6, 25. , 24.7, 22.2, 18.9,  7. , 10.9, 22.8, 20.9,
        24.3, 17.9, 18.9, 19.8, 18.9, 22.2,  8.8, 19.6,  8.7, 42.3, 34.9,
        11.3, 16.6, 21.8, 22.8, 14.3, 28.2, 16.6, 25. , 28.7, 12.5,  8.1,
        15.1, 22.7, 10.5, 26.6, 20.7, 15.6, 24.4,  8.1,  7. , 21.8, 32.5,
        26.6, 24.4, 18.4, 30.8, 35.1, 

In [14]:
tree_num = 4
predicates = []
for _ in range(tree_num):
    predicated, score = random_forest(x_train, y_train, x_test, y_test)
    predicates.append((predicated, score))

predicates_value = [v for v, s in predicates]
forest_scores = [s for v, s in predicates]

print('the mean result is: {}'.format(np.mean(predicates_value), axis=0))
print('the score of forest is: {}'.format(r2_score(y_test, np.mean(predicates_value, axis=0))))


sub sample :: train score: 1.0, test score: 0.6303643870644049
sub sample :: train score: 1.0, test score: 0.22216870278126066
sub sample :: train score: 1.0, test score: 0.22545242830060186
sub sample :: train score: 1.0, test score: 0.5794990363043866
the mean result is: 21.927796052631578
the score of forest is: 0.7294247406318071


In [15]:
weights = np.array(forest_scores) / np.sum(forest_scores)
score_weights = np.zeros_like(np.mean(predicates_value, axis=0))

for i, v in enumerate(predicates_value):
    score_weights += v * weights[i]

print('the score of weighted forest is: {}'.format(r2_score(y_test, score_weights)))

the score of weighted forest is: 0.7394533612680413


![欢迎订阅：坍缩的奇点](../assets/Capture-2023-11-02-164446.png)