## [作業重點]
確保你了解隨機森林模型中每個超參數的意義，並觀察調整超參數對結果的影響

## 作業

1. 試著調整 RandomForestClassifier(...) 中的參數，並觀察是否會改變結果？
2. 改用其他資料集 (boston, wine)，並與回歸模型與決策樹的結果進行比較

In [5]:
from sklearn import datasets, metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import numpy as np

In [7]:
# Load the iris dataset and print, one per line, its feature names,
# the shape of the feature matrix, and the distinct target labels.
data = datasets.load_iris()
print(data.feature_names, data.data.shape, np.unique(data.target), sep="\n")

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
(150, 4)
[0 1 2]


In [13]:
# Hold out 25% of the samples for testing; the seed keeps the split identical
# across runs.
x_train, x_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.25, random_state=20)

# Sweep the number of trees (10, 15, ..., 95) and report the test accuracy of
# each forest.
for n_trees in range(10, 100, 5):
    # Seed the forest too: without a fixed random_state every fit draws
    # different bootstrap samples and feature subsets, so run-to-run noise gets
    # mixed into the comparison between n_estimators values.
    clf = RandomForestClassifier(n_estimators=n_trees, random_state=20)
    clf.fit(x_train, y_train)

    y_pred = clf.predict(x_test)
    acc = metrics.accuracy_score(y_test, y_pred)

    print("Accuracy for {} trees:".format(n_trees), acc)

Accuracy for 10 trees: 0.8947368421052632
Accuracy for 15 trees: 0.8947368421052632
Accuracy for 20 trees: 0.8947368421052632
Accuracy for 25 trees: 0.9210526315789473
Accuracy for 30 trees: 0.8947368421052632
Accuracy for 35 trees: 0.8947368421052632
Accuracy for 40 trees: 0.8947368421052632
Accuracy for 45 trees: 0.9210526315789473
Accuracy for 50 trees: 0.8947368421052632
Accuracy for 55 trees: 0.9210526315789473
Accuracy for 60 trees: 0.9210526315789473
Accuracy for 65 trees: 0.9210526315789473
Accuracy for 70 trees: 0.8947368421052632
Accuracy for 75 trees: 0.8947368421052632
Accuracy for 80 trees: 0.8947368421052632
Accuracy for 85 trees: 0.8947368421052632
Accuracy for 90 trees: 0.8947368421052632
Accuracy for 95 trees: 0.9210526315789473


In [14]:
# Sweep the maximum tree depth (1..29) with a fixed forest size of 25 trees and
# report the test accuracy for each depth. Uses the train/test split created in
# the earlier cell.
for depth in range(1, 30):
    # A fixed random_state makes each fit reproducible, so differences between
    # rows come from max_depth rather than from the forest's random draws.
    clf = RandomForestClassifier(n_estimators=25, max_depth=depth, random_state=20)
    clf.fit(x_train, y_train)

    y_pred = clf.predict(x_test)
    acc = metrics.accuracy_score(y_test, y_pred)

    print("Accuracy for depth {}:".format(depth), acc)

Accuracy for depth 1: 0.9210526315789473
Accuracy for depth 2: 0.9473684210526315
Accuracy for depth 3: 0.8947368421052632
Accuracy for depth 4: 0.868421052631579
Accuracy for depth 5: 0.8947368421052632
Accuracy for depth 6: 0.9210526315789473
Accuracy for depth 7: 0.9210526315789473
Accuracy for depth 8: 0.8947368421052632
Accuracy for depth 9: 0.8947368421052632
Accuracy for depth 10: 0.8947368421052632
Accuracy for depth 11: 0.8947368421052632
Accuracy for depth 12: 0.8947368421052632
Accuracy for depth 13: 0.8947368421052632
Accuracy for depth 14: 0.9210526315789473
Accuracy for depth 15: 0.8947368421052632
Accuracy for depth 16: 0.9210526315789473
Accuracy for depth 17: 0.8947368421052632
Accuracy for depth 18: 0.8947368421052632
Accuracy for depth 19: 0.8947368421052632
Accuracy for depth 20: 0.8947368421052632
Accuracy for depth 21: 0.8947368421052632
Accuracy for depth 22: 0.8947368421052632
Accuracy for depth 23: 0.8947368421052632
Accuracy for depth 24: 0.9210526315789473
Ac

#### boston

In [15]:
# Load the Boston housing dataset and print, one per line, its feature names,
# the shape of the feature matrix, and the distinct target values.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs an older scikit-learn release to run - confirm the
# environment before re-executing.
data = datasets.load_boston()
print(data.feature_names, data.data.shape, np.unique(data.target), sep="\n")

['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']
(506, 13)
[ 5.   5.6  6.3  7.   7.2  7.4  7.5  8.1  8.3  8.4  8.5  8.7  8.8  9.5
  9.6  9.7 10.2 10.4 10.5 10.8 10.9 11.  11.3 11.5 11.7 11.8 11.9 12.
 12.1 12.3 12.5 12.6 12.7 12.8 13.  13.1 13.2 13.3 13.4 13.5 13.6 13.8
 13.9 14.  14.1 14.2 14.3 14.4 14.5 14.6 14.8 14.9 15.  15.1 15.2 15.3
 15.4 15.6 15.7 16.  16.1 16.2 16.3 16.4 16.5 16.6 16.7 16.8 17.  17.1
 17.2 17.3 17.4 17.5 17.6 17.7 17.8 17.9 18.  18.1 18.2 18.3 18.4 18.5
 18.6 18.7 18.8 18.9 19.  19.1 19.2 19.3 19.4 19.5 19.6 19.7 19.8 19.9
 20.  20.1 20.2 20.3 20.4 20.5 20.6 20.7 20.8 20.9 21.  21.1 21.2 21.4
 21.5 21.6 21.7 21.8 21.9 22.  22.1 22.2 22.3 22.4 22.5 22.6 22.7 22.8
 22.9 23.  23.1 23.2 23.3 23.4 23.5 23.6 23.7 23.8 23.9 24.  24.1 24.2
 24.3 24.4 24.5 24.6 24.7 24.8 25.  25.1 25.2 25.3 26.2 26.4 26.5 26.6
 26.7 27.  27.1 27.5 27.9 28.  28.1 28.2 28.4 28.5 28.6 28.7 29.  29.1
 29.4 29.6 29.8 29.9 30.1 30.3 30.5 30.7 30.8 31.  3

In [24]:
from sklearn.ensemble import RandomForestRegressor

# Hold out 25% of the samples for testing; the seed keeps the split identical
# across runs.
x_train, x_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.25, random_state=20)

# Sweep the number of trees (10, 15, ..., 100) and report the test-set mean
# squared error of each forest.
for n_trees in range(10, 105, 5):
    # Seed the forest as well: an unseeded regressor gives a different MSE on
    # every run, which hides any real effect of n_estimators.
    rg = RandomForestRegressor(n_estimators=n_trees, random_state=20)
    rg.fit(x_train, y_train)

    y_pred = rg.predict(x_test)
    mse = metrics.mean_squared_error(y_test, y_pred)

    print("MSE for {} trees:".format(n_trees), mse)

MSE for 10 trees: 15.697712598425198
MSE for 15 trees: 18.24456552930884
MSE for 20 trees: 14.910252362204728
MSE for 25 trees: 15.784180409448815
MSE for 30 trees: 16.322947331583553
MSE for 35 trees: 13.0299399003696
MSE for 40 trees: 16.475420521653547
MSE for 45 trees: 15.136493243900066
MSE for 50 trees: 15.965694078740158
MSE for 55 trees: 15.596512839200885
MSE for 60 trees: 14.625236876640418
MSE for 65 trees: 13.27236837347994
MSE for 70 trees: 15.368990454764576
MSE for 75 trees: 15.618276409448814
MSE for 80 trees: 14.861491190944877
MSE for 85 trees: 14.790104667193402
MSE for 90 trees: 15.366426421697279
MSE for 95 trees: 16.54617123912142
MSE for 100 trees: 14.714560677165347


In [23]:
# Sweep the maximum tree depth with a fixed forest size of 35 trees and report
# the test-set mean squared error for each depth. Uses the train/test split
# created in the earlier cell.
# NOTE(review): the depth grid (10, 15, ..., 100) matches the tree-count sweep;
# presumably depths this large no longer constrain the trees on this dataset -
# confirm, and consider a smaller range such as 1..20.
for depth in range(10, 105, 5):
    # A fixed random_state makes each fit reproducible, so any difference
    # between rows is attributable to max_depth and not to random draws.
    rg = RandomForestRegressor(n_estimators=35, max_depth=depth, random_state=20)
    rg.fit(x_train, y_train)

    y_pred = rg.predict(x_test)
    mse = metrics.mean_squared_error(y_test, y_pred)

    print("MSE for depth {}:".format(depth), mse)

MSE for depth 10: 14.28249524396384
MSE for depth 15: 17.23725312942744
MSE for depth 20: 13.444067684396598
MSE for depth 25: 15.766466141732284
MSE for depth 30: 15.350047244094494
MSE for depth 35: 16.087673597943116
MSE for depth 40: 15.973961497669938
MSE for depth 45: 16.23564692270609
MSE for depth 50: 17.761230403342434
MSE for depth 55: 16.586636413305484
MSE for depth 60: 14.602919363650976
MSE for depth 65: 16.479986887353366
MSE for depth 70: 15.803289795918364
MSE for depth 75: 15.725557448176124
MSE for depth 80: 17.169905447533353
MSE for depth 85: 14.957855760887036
MSE for depth 90: 14.131332412019928
MSE for depth 95: 15.702090824361244
MSE for depth 100: 17.54106328137555


In [40]:
from sklearn.model_selection import cross_val_score, cross_validate
import numpy as np

# Compare the 'sqrt' and 'log2' max_features settings for a 35-tree forest:
# report the mean 3-fold cross-validation score on the training split, then the
# mean squared error on the held-out test split.
for mf in ['sqrt', 'log2']:
    # Seeded so both settings are compared under reproducible conditions.
    rg = RandomForestRegressor(n_estimators=35, max_features=mf, random_state=20)

    # The format strings previously had no "{}" placeholder, so the setting
    # being evaluated never appeared in the output.
    cv_score = np.mean(cross_val_score(rg, x_train, y_train, cv=3))
    print("R^2 for max_features = {}:".format(mf), cv_score)

    rg.fit(x_train, y_train)
    y_pred = rg.predict(x_test)
    mse = metrics.mean_squared_error(y_test, y_pred)
    print("MSE for max_features = {}:".format(mf), mse)

R^2 for max_features = : 0.8548915576288162
MSE for max_features = : 14.677160726337782
R^2 for max_features = : 0.8652485436769513
MSE for max_features = : 11.501600642776804
