## Explore Random Forest and Gradient Boosting

In [None]:
import numpy as np

from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score as accuracy
from sklearn.model_selection import GridSearchCV, train_test_split


## Prepare Data

In [None]:
# Load scikit-learn's bundled breast cancer dataset. `load_breast_cancer`
# lives in `sklearn.datasets` and must be imported in the notebook's import
# cell, otherwise this cell raises NameError on a fresh kernel.
data = load_breast_cancer()
# Peek at the first three feature rows and their labels, then confirm the
# shapes of the feature matrix and the target vector.
print(data.data[:3], data.target[:3])
print(data.data.shape, data.target.shape)

[[1.799e+01 1.038e+01 1.228e+02 1.001e+03 1.184e-01 2.776e-01 3.001e-01
  1.471e-01 2.419e-01 7.871e-02 1.095e+00 9.053e-01 8.589e+00 1.534e+02
  6.399e-03 4.904e-02 5.373e-02 1.587e-02 3.003e-02 6.193e-03 2.538e+01
  1.733e+01 1.846e+02 2.019e+03 1.622e-01 6.656e-01 7.119e-01 2.654e-01
  4.601e-01 1.189e-01]
 [2.057e+01 1.777e+01 1.329e+02 1.326e+03 8.474e-02 7.864e-02 8.690e-02
  7.017e-02 1.812e-01 5.667e-02 5.435e-01 7.339e-01 3.398e+00 7.408e+01
  5.225e-03 1.308e-02 1.860e-02 1.340e-02 1.389e-02 3.532e-03 2.499e+01
  2.341e+01 1.588e+02 1.956e+03 1.238e-01 1.866e-01 2.416e-01 1.860e-01
  2.750e-01 8.902e-02]
 [1.969e+01 2.125e+01 1.300e+02 1.203e+03 1.096e-01 1.599e-01 1.974e-01
  1.279e-01 2.069e-01 5.999e-02 7.456e-01 7.869e-01 4.585e+00 9.403e+01
  6.150e-03 4.006e-02 3.832e-02 2.058e-02 2.250e-02 4.571e-03 2.357e+01
  2.553e+01 1.525e+02 1.709e+03 1.444e-01 4.245e-01 4.504e-01 2.430e-01
  3.613e-01 8.758e-02]] [0 0 0]
(569, 30) (569,)


In [None]:
# Hold out 10% of the samples as a test set; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.1,
    random_state=42,
)

## Random Forest

In [None]:
# Random forest baseline: 500 trees, bootstrap sampling requested explicitly,
# seeded for repeatability. `fit` returns the estimator itself, so the
# constructor and the training call can be chained.
rf = RandomForestClassifier(
    n_estimators=500,
    bootstrap=True,
    random_state=42,
).fit(X_train, y_train)

# Score the held-out split.
y_pred_rf = rf.predict(X_test)
print(accuracy(y_test, y_pred_rf))

0.9649122807017544


## Gradient Boosting

In [None]:
# Gradient boosting baseline: 500 boosting stages with a small learning rate,
# seeded for repeatability. `fit` returns the estimator itself, so the
# constructor and the training call can be chained.
gb = GradientBoostingClassifier(
    n_estimators=500,
    learning_rate=0.01,
    random_state=42,
).fit(X_train, y_train)

# Score the held-out split.
y_pred_gb = gb.predict(X_test)
print(accuracy(y_test, y_pred_gb))

0.9649122807017544


## Parameter Tuning

In [None]:
# Cross-validated grid search over the number of boosting stages.
# The base estimator is seeded like every other model in this notebook so
# that the selected parameters and scores are reproducible between runs.
# NOTE: any output recorded before the seed was added may differ slightly;
# re-run this cell to refresh it.
clf = GridSearchCV(
    GradientBoostingClassifier(random_state=42),
    [{'n_estimators': [10, 50, 100, 300, 500, 800]}],
    scoring='accuracy',
)
clf.fit(X_train, y_train)

# With GridSearchCV's default `refit=True`, `predict` uses the best estimator
# refit on the training data passed to `fit`.
y_pred_clf = clf.predict(X_test)
print(accuracy(y_test, y_pred_clf))
print(clf.best_params_)
print(clf.cv_results_)


0.9649122807017544
{'n_estimators': 100}
{'mean_fit_time': array([0.03936086, 0.18667769, 0.36910009, 0.68319449, 0.73318849,
       0.8280262 ]), 'std_fit_time': array([0.00415867, 0.00565705, 0.00152433, 0.03799116, 0.03797413,
       0.04034145]), 'mean_score_time': array([0.00060487, 0.00065856, 0.00077786, 0.00110888, 0.00115485,
       0.00123019]), 'std_score_time': array([4.26450802e-05, 2.16283369e-05, 6.83825051e-06, 2.86233475e-04,
       1.42202754e-04, 4.01904427e-05]), 'param_n_estimators': masked_array(data=[10, 50, 100, 300, 500, 800],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'n_estimators': 10}, {'n_estimators': 50}, {'n_estimators': 100}, {'n_estimators': 300}, {'n_estimators': 500}, {'n_estimators': 800}], 'split0_test_score': array([0.96116505, 0.97087379, 0.96116505, 0.96116505, 0.96116505,
       0.96116505]), 'split1_test_score': array([0.90291262, 0.95145631, 0.98058252, 0.9805825

## Exercise

- Run the same grid search with the random forest classifier.
- Tune additional hyperparameters besides `n_estimators` (for example `max_depth` or `learning_rate`).