# SECTION 17, Random Forest

In [1]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score

In [2]:
iris = load_iris()

In [3]:
x = iris.data
y = iris.target

In [4]:
print(x, y)

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.4 3.7 1.5 0.2]
 [4.8 3.4 1.6 0.2]
 [4.8 3.  1.4 0.1]
 [4.3 3.  1.1 0.1]
 [5.8 4.  1.2 0.2]
 [5.7 4.4 1.5 0.4]
 [5.4 3.9 1.3 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]
 [5.4 3.4 1.7 0.2]
 [5.1 3.7 1.5 0.4]
 [4.6 3.6 1.  0.2]
 [5.1 3.3 1.7 0.5]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]
 [5.  3.4 1.6 0.4]
 [5.2 3.5 1.5 0.2]
 [5.2 3.4 1.4 0.2]
 [4.7 3.2 1.6 0.2]
 [4.8 3.1 1.6 0.2]
 [5.4 3.4 1.5 0.4]
 [5.2 4.1 1.5 0.1]
 [5.5 4.2 1.4 0.2]
 [4.9 3.1 1.5 0.2]
 [5.  3.2 1.2 0.2]
 [5.5 3.5 1.3 0.2]
 [4.9 3.6 1.4 0.1]
 [4.4 3.  1.3 0.2]
 [5.1 3.4 1.5 0.2]
 [5.  3.5 1.3 0.3]
 [4.5 2.3 1.3 0.3]
 [4.4 3.2 1.3 0.2]
 [5.  3.5 1.6 0.6]
 [5.1 3.8 1.9 0.4]
 [4.8 3.  1.4 0.3]
 [5.1 3.8 1.6 0.2]
 [4.6 3.2 1.4 0.2]
 [5.3 3.7 1.5 0.2]
 [5.  3.3 1.4 0.2]
 [7.  3.2 4.7 1.4]
 [6.4 3.2 4.5 1.5]
 [6.9 3.1 4.

In [5]:
tree_clf = DecisionTreeClassifier()
tree_clf.fit(x, y)

tree_clf.feature_importances_

array([0.        , 0.01333333, 0.06405596, 0.92261071])

In [6]:
train_x, test_x,train_y, test_y = train_test_split(x, y, test_size=0.3, random_state=0)

tree_clf = DecisionTreeClassifier()
tree_clf.fit(train_x, train_y)

print(tree_clf.score(train_x, train_y))
print(tree_clf.score(test_x, test_y))

1.0
0.9777777777777777


In [7]:
dataset = fetch_openml(name='boston', version=1, as_frame=True, return_X_y=False, parser='pandas')

data = dataset['data']
target = dataset['target']

In [8]:
x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=0.3, random_state=0)

tree_reg = DecisionTreeRegressor()
tree_reg.fit(x_train, y_train)

print('whole dataset train acc: {}'.format(tree_reg.score(x_train, y_train)))
print('whole dataset test acc: {}'.format(tree_reg.score(x_test, y_test)))

whole dataset train acc: 1.0
whole dataset test acc: 0.660933216856945


In [9]:
def random_forest(x_train, y_train, x_test, y_test, drop_n=4):
    features_random = np.random.choice(list(x_train.columns), size=len(x_train.columns)-drop_n)

    x_sample = x_train[features_random]
    y_sample = y_train

    reg = DecisionTreeRegressor()
    reg.fit(x_sample, y_sample)

    score_train = reg.score(x_sample, y_sample)
    score_test = reg.score(x_test[features_random], y_test)

    print('sub sample :: train score: {}, test score: {}'.format(score_train, score_test))

    y_predicated = reg.predict(x_test[features_random])

    return y_predicated, score_test

In [10]:
random_forest(x_train, y_train, x_test, y_test, 4)

sub sample :: train score: 1.0, test score: 0.5515375920300917


(array([24.8, 50. , 23.2, 12.5, 22.5, 23.1, 19.4, 22. , 15.3, 16.1, 10.5,
        17.9, 17.1,  8.8, 50. , 34.9, 18.2, 34.6, 32.5, 22.1, 25. , 23.8,
        20.1, 25. , 17.6, 15.3, 18.4, 13.1, 43.8, 21.4, 10.9, 13.4, 22.2,
        22.2, 23.9, 18.4,  8.3, 50. , 10.9, 17.9, 23.4, 21.1, 23.9, 11.7,
        24.4, 23.9, 16.6, 17.4, 14.6, 21. , 17.4, 17.5, 20.6, 21.9, 17.8,
        18.9, 17.6, 18.5, 50. , 22.5, 21.2, 20.8, 35.1, 32.4, 11.7, 29.1,
        16.4, 18.9, 14.1, 21.7, 20.8, 23.9, 23.7, 23.6, 26.5,  8.8, 41.7,
        22.2, 22. , 19.9, 23.9, 14.3, 50. , 37.6, 48.8, 23.1, 20.6, 11.7,
        25. , 12.7, 16.6, 11.8, 23.4, 37. , 36.2, 20.6, 11.9, 23.1, 16.4,
        19.7, 24.5, 16.1, 30.7, 21.7, 37. , 21.1,  8.5, 15.3, 36.2, 22.9,
        23.6, 17.9, 18.9, 18.8, 18.9, 22.4,  9.5, 23.1, 10.2, 48.5, 28.6,
         8.8, 16.6, 19.4, 18.9, 18.8, 36.1, 16.6, 20.6, 36.1, 12.5,  9.5,
        20.5, 18.4, 15. , 36.1, 18.2, 15.6, 24.5,  8.3, 10.2, 19.4, 23.6,
        23.7, 25. , 14.8, 37.3, 37.9, 

In [11]:
with_feature_names = pd.DataFrame(data)
with_feature_names.columns = dataset['feature_names']

In [12]:
x_train, x_test, y_train, y_test = train_test_split(with_feature_names, target, test_size=0.3, random_state=0)

In [13]:
random_forest(x_train, y_train, x_test, y_test, 4)

sub sample :: train score: 1.0, test score: 0.6157424851248163


(array([22.2, 50. , 24.6, 12.7, 25.2, 23.1, 18.5, 19. , 19.7, 21.7, 10.9,
        14. , 17.1,  8.8, 48.5, 30.3, 19.6, 34.6, 32.5, 24.7, 24.2, 23.8,
        19.3, 23.1, 16.2, 14.4, 13.2, 19.6, 37.6, 22.1, 21.4, 14.1, 25.2,
        19.4, 22.3, 17.8,  7.2, 13.8, 16.1, 21.4, 20.5, 19.3, 36.2, 16.3,
        23.1, 23.1, 20.3, 21.5, 14.6, 21.7, 21.5, 23.7, 24.7, 35.2, 16.2,
        17.8, 17.4, 17.5, 20.8, 22.4, 26.4, 22.1, 34.9, 27.1, 20.8, 27.1,
        14.9, 23.1, 50. , 22.5, 23.4, 23.4, 22. , 23.6, 26.5, 10.2, 41.7,
        23.4, 22. , 19.8, 30.8, 18. , 13.8, 37.6, 37.6, 23.1, 25.3, 20.6,
        21.5, 13.8, 23.1, 15.4, 20.9, 30.3, 18.4, 20.5, 11.9, 22.5, 14.9,
        16. , 23.4, 20.1, 30.7, 20.9, 37. , 18.9, 10.2, 13.5, 18.4, 23.1,
        30.7, 13.6, 25.3, 18.3, 17.4, 22.4, 12.8, 19.2, 11.5, 50. , 28.6,
        11.3, 17.4, 21.8, 19.4, 21.4, 33.2, 17.6, 16. , 33.2, 12.5, 10.9,
        20.1, 23.8, 10.4, 34.6, 24.7, 15.6, 28. , 10.9, 10.9, 21.8, 30.7,
        23.1, 23.4, 23.1, 37.9, 37.3, 

In [14]:
tree_num = 4
predicates = []
for _ in range(tree_num):
    predicated, score = random_forest(x_train, y_train, x_test, y_test)
    predicates.append((predicated, score))

predicates_value = [v for v, s in predicates]
forest_scores = [s for v, s in predicates]

print('the mean result is: {}'.format(np.mean(predicates_value), axis=0))
print('the score of forest is: {}'.format(r2_score(y_test, np.mean(predicates_value, axis=0))))


sub sample :: train score: 1.0, test score: 0.3698589197028964
sub sample :: train score: 1.0, test score: 0.7534922422873487
sub sample :: train score: 1.0, test score: 0.65015128268059
sub sample :: train score: 1.0, test score: 0.6748400630710755
the mean result is: 21.730756578947368
the score of forest is: 0.747496983975937


In [15]:
weights = np.array(forest_scores) / np.sum(forest_scores)
score_weights = np.zeros_like(np.mean(predicates_value, axis=0))

for i, v in enumerate(predicates_value):
    score_weights += v * weights[i]

print('the score of weighted forest is: {}'.format(r2_score(y_test, score_weights)))

the score of weighted forest is: 0.7699765952132285
