## Random Forest

In [1]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score

In [2]:
iris = load_iris()

In [3]:
x = iris.data
y = iris.target

In [4]:
print(x, y)

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.4 3.7 1.5 0.2]
 [4.8 3.4 1.6 0.2]
 [4.8 3.  1.4 0.1]
 [4.3 3.  1.1 0.1]
 [5.8 4.  1.2 0.2]
 [5.7 4.4 1.5 0.4]
 [5.4 3.9 1.3 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]
 [5.4 3.4 1.7 0.2]
 [5.1 3.7 1.5 0.4]
 [4.6 3.6 1.  0.2]
 [5.1 3.3 1.7 0.5]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]
 [5.  3.4 1.6 0.4]
 [5.2 3.5 1.5 0.2]
 [5.2 3.4 1.4 0.2]
 [4.7 3.2 1.6 0.2]
 [4.8 3.1 1.6 0.2]
 [5.4 3.4 1.5 0.4]
 [5.2 4.1 1.5 0.1]
 [5.5 4.2 1.4 0.2]
 [4.9 3.1 1.5 0.2]
 [5.  3.2 1.2 0.2]
 [5.5 3.5 1.3 0.2]
 [4.9 3.6 1.4 0.1]
 [4.4 3.  1.3 0.2]
 [5.1 3.4 1.5 0.2]
 [5.  3.5 1.3 0.3]
 [4.5 2.3 1.3 0.3]
 [4.4 3.2 1.3 0.2]
 [5.  3.5 1.6 0.6]
 [5.1 3.8 1.9 0.4]
 [4.8 3.  1.4 0.3]
 [5.1 3.8 1.6 0.2]
 [4.6 3.2 1.4 0.2]
 [5.3 3.7 1.5 0.2]
 [5.  3.3 1.4 0.2]
 [7.  3.2 4.7 1.4]
 [6.4 3.2 4.5 1.5]
 [6.9 3.1 4.

In [5]:
tree_clf = DecisionTreeClassifier()
tree_clf.fit(x, y)

tree_clf.feature_importances_

array([0.02666667, 0.        , 0.05072262, 0.92261071])

In [6]:
train_x, test_x,train_y, test_y = train_test_split(x, y, test_size=0.3, random_state=0)

tree_clf = DecisionTreeClassifier()
tree_clf.fit(train_x, train_y)

print(tree_clf.score(train_x, train_y))
print(tree_clf.score(test_x, test_y))

1.0
0.9777777777777777


In [7]:
dataset = fetch_openml(name='boston', version=1, as_frame=True, return_X_y=False, parser='pandas')

data = dataset['data']
target = dataset['target']

In [8]:
x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=0.3, random_state=0)

tree_reg = DecisionTreeRegressor()
tree_reg.fit(x_train, y_train)

print('whole dataset train acc: {}'.format(tree_reg.score(x_train, y_train)))
print('whole dataset test acc: {}'.format(tree_reg.score(x_test, y_test)))

whole dataset train acc: 1.0
whole dataset test acc: 0.6728845239323917


In [9]:
def random_forest(x_train, y_train, x_test, y_test, drop_n=4):
    features_random = np.random.choice(list(x_train.columns), size=len(x_train.columns)-drop_n)

    x_sample = x_train[features_random]
    y_sample = y_train

    reg = DecisionTreeRegressor()
    reg.fit(x_sample, y_sample)

    score_train = reg.score(x_sample, y_sample)
    score_test = reg.score(x_test[features_random], y_test)

    print('sub sample :: train score: {}, test score: {}'.format(score_train, score_test))

    y_predicated = reg.predict(x_test[features_random])

    return y_predicated, score_test

In [10]:
random_forest(x_train, y_train, x_test, y_test, 4)

sub sample :: train score: 1.0, test score: 0.6784967237312082


(array([24.1, 50. , 23.2, 16.3, 21.7, 23.1, 22.6, 22. , 23.8, 18.4, 10.8,
        17.9, 17.1,  8.8, 50. , 34.9, 16.1, 34.6, 30.1, 22.6, 24.5, 22.7,
        20.1, 24.8, 19.7, 20.8, 18.2, 13.1, 43.8, 18.4, 11.7, 13.4, 18.4,
        22.2, 23.9, 18.8,  8.3, 50. , 11.7, 17.9, 21.2, 19.7, 23.9, 11.7,
        23.9, 23.9, 16. , 18.4, 14.6, 23.8, 18.4, 17.5, 18.2, 48.8, 17.8,
        18.9, 17.6, 17.5, 50. , 22.4, 20.6, 18.2, 34.9, 32.4, 11.7, 32.4,
        16.4, 18.9, 14.1, 20. , 21.7, 23.9, 23.7, 33.1, 26.5,  8.8, 48.8,
        22.2, 22. , 16.8, 23.9, 14.3, 50. , 48.8, 48.8, 25. , 20.6, 11.7,
        25. , 12.7, 16. , 11.8, 23.4, 26.6, 22.2, 21.2, 11.9, 23.1, 14.3,
        18.5, 25. , 21.7, 29.9, 19.4, 28.5, 19.4,  8.5, 19.9, 20.5, 22.9,
        23.9, 17.9, 18.9, 18.3, 18.9, 22.4,  9.5, 19.6, 10.2, 50. , 24.1,
         7.2, 16.6, 20.1, 19.4, 20.5, 33.2, 16. , 20.6, 33.2, 12.5,  9.5,
        17.8, 18.4, 15. , 34.6, 19.4, 15.6, 23.9,  8.3, 10.2, 20.1, 23.6,
        23.7, 24.1, 15.6, 37.3, 37.3, 

In [11]:
with_feature_names = pd.DataFrame(data)
with_feature_names.columns = dataset['feature_names']

In [12]:
x_train, x_test, y_train, y_test = train_test_split(with_feature_names, target, test_size=0.3, random_state=0)

In [13]:
random_forest(x_train, y_train, x_test, y_test, 4)

sub sample :: train score: 1.0, test score: 0.21148237274057458


(array([21.2,  7.2, 24.6, 12.7, 25.2, 20.4, 12.7, 19. , 13.6, 22.3,  8.5,
        13.9, 16.1, 16.3, 50. , 30.3, 19.6, 34.6, 22. , 23.2, 24.6, 20.6,
        20.8, 20.1, 18.5, 11.9, 14.8, 15.6, 37.6, 22.6, 12.6, 13.4, 24.7,
        20.5, 22.4,  7. , 10.2,  8.5, 13. , 16.1, 20.5, 18.5, 21.2, 11.7,
        23.2, 23.9, 18.5, 24.3, 50. , 30.1, 27. , 17.6, 18.2, 41.7, 18.4,
        20.9, 19.4, 22. ,  5. , 22.4, 22. , 17.8, 29.6, 25. , 11.7, 22. ,
        14.3, 23.1, 14.1, 22.4, 23.4, 23.4, 24.8, 37. , 19.6, 17.9, 41.7,
        23.1, 22. , 18.4, 29.1, 19.2, 11.3, 41.7, 48.8, 23.1, 25.3, 11.8,
        27. , 12.7, 22. ,  9.6, 22.6, 26.6, 20.3, 22.6, 11.9, 22. , 16.7,
        16. , 23.1, 22.3, 29.8, 22.8, 31.1, 20.1, 12.7,  8.1, 20.3, 22. ,
        30.7, 11.3, 25.3, 20.3, 17.6, 22.5, 12.7, 16.1, 13.8, 50. , 28. ,
        11.3, 18.5, 16.8, 17.8, 23.3, 32.7, 17.6, 16. , 32.7, 16.7, 16.7,
        15.2, 23.8, 14.1, 34.6, 17.8, 50. , 25. , 16.7, 10.2, 18.4, 29.8,
        27.1, 23.4, 20.4, 37.3, 37.3, 

In [14]:
tree_num = 4
predicates = []
for _ in range(tree_num):
    predicated, score = random_forest(x_train, y_train, x_test, y_test)
    predicates.append((predicated, score))

predicates_value = [v for v, s in predicates]
forest_scores = [s for v, s in predicates]

print('the mean result is: {}'.format(np.mean(predicates_value), axis=0))
print('the score of forest is: {}'.format(r2_score(y_test, np.mean(predicates_value, axis=0))))


sub sample :: train score: 1.0, test score: 0.6895030510854971
sub sample :: train score: 1.0, test score: 0.6774458683556731
sub sample :: train score: 1.0, test score: 0.6970225928967511
sub sample :: train score: 1.0, test score: 0.5963759316588455
the mean result is: 21.736842105263158
the score of forest is: 0.763965043256427


In [15]:
weights = np.array(forest_scores) / np.sum(forest_scores)
score_weights = np.zeros_like(np.mean(predicates_value, axis=0))

for i, v in enumerate(predicates_value):
    score_weights += v * weights[i]

print('the score of weighted forest is: {}'.format(r2_score(y_test, score_weights)))

the score of weighted forest is: 0.7656685804209508


![欢迎订阅：坍缩的奇点](../assets/Capture-2023-11-02-164446.png)