In [87]:
#chapter 7

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
iris = load_iris()

import numpy as np
X = iris.data[:, 2:]
noise_1 = [np.random.random_sample() for i in range(len(X))]
noise_2 = [np.random.random_sample() for i in range(len(X))]
noise = np.array([noise_1, noise_2]).transpose()
X_noise = X + noise

y = iris.target

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_noise, y, test_size=0.3, random_state=42)

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard')

voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf', RandomForestClassifier()), ('svc', SVC())])

In [88]:
from sklearn.metrics import accuracy_score
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.9111111111111111
RandomForestClassifier 0.8888888888888888
SVC 0.9333333333333333
VotingClassifier 0.9111111111111111


In [90]:
# note that voting classifier did not always improve than existing models

In [91]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
DecisionTreeClassifier(), n_estimators=500,
    max_samples=100, bootstrap=True, n_jobs=-1)

bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

In [94]:
accuracy_score(y_test, y_pred)

0.8888888888888888

In [95]:
bag_clf = BaggingClassifier(
DecisionTreeClassifier(), n_estimators=500,
bootstrap=True, n_jobs=-1, oob_score=True)

In [96]:
bag_clf.fit(X_train, y_train)
bag_clf.oob_score_

0.9333333333333333

In [97]:
from sklearn.metrics import accuracy_score
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.8888888888888888

In [98]:
bag_clf.oob_decision_function_

array([[0.02173913, 0.97826087, 0.        ],
       [0.        , 0.21311475, 0.78688525],
       [0.        , 0.00555556, 0.99444444],
       [0.        , 0.96315789, 0.03684211],
       [0.        , 0.        , 1.        ],
       [0.        , 1.        , 0.        ],
       [0.        , 0.        , 1.        ],
       [0.        , 0.88717949, 0.11282051],
       [1.        , 0.        , 0.        ],
       [0.        , 0.        , 1.        ],
       [0.        , 1.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.1978022 , 0.8021978 , 0.        ],
       [0.        , 0.02197802, 0.97802198],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.8128655 , 0.1871345 , 0.        ],
       [0.        , 0.98342541, 0.01657459],
       [1.        , 0.        , 0.        ],
       [0.        , 0.75287356, 0.24712644],
       [0.

In [100]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16,
                                n_jobs=-1)
rnd_clf.fit(X_train, y_train)
y_pred_rf = rnd_clf.predict(X_test)

In [101]:
accuracy_score(y_test, y_pred_rf)

0.8888888888888888

In [102]:
for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
    print(name, score)

sepal length (cm) 0.49876479068108565
sepal width (cm) 0.5012352093189144


In [103]:
# Adaboost
# Focuses on training instances that the predecessor underfitted

from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200,
    algorithm="SAMME.R", learning_rate=0.5
)

ada_clf.fit(X_train, y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                   learning_rate=0.5, n_estimators=200)

In [104]:
accuracy_score(y_test,ada_clf.predict(X_test))

0.9111111111111111

In [105]:
from sklearn.tree import DecisionTreeRegressor
tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X, y)

y2 = y - tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X, y2)

DecisionTreeRegressor(max_depth=2)

In [106]:
y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X, y3)

DecisionTreeRegressor(max_depth=2)

In [108]:
# create a fake_sample
noise_1 = [np.random.random_sample() for i in range(len(X_test))]
noise_2 = [np.random.random_sample() for i in range(len(X_test))]
noise = np.array([noise_1, noise_2]).transpose()
X_test_noise = X_test + noise

y_pred = sum(tree.predict(X_test_noise) for tree in (tree_reg1, tree_reg2, tree_reg3))

In [120]:
# now using sklearn version

from sklearn.ensemble import GradientBoostingRegressor
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=10,
                                learning_rate=1.0)
gbrt.fit(X_train, y_train)
y_pred = gbrt.predict(X_test)

In [121]:
from sklearn.metrics import mean_squared_error
errors = [mean_squared_error(y_test,y_pred) for y_pred in
         gbrt.staged_predict(X_test)]

In [122]:
bst_n_estimators = np.argmin(errors)

gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=bst_n_estimators)
gbrt_best.fit(X_train, y_train)

GradientBoostingRegressor(max_depth=2, n_estimators=1)

In [123]:
errors

[0.10833663627956136,
 0.07253928098327377,
 0.08482723618998984,
 0.08431123733363896,
 0.08577538450397013,
 0.08709069058180413,
 0.08710712472427759,
 0.08705372700198392,
 0.08853218026166948,
 0.08721585218431135]

In [130]:
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True)

min_val_error = float("inf")
error_going_up = 0
for n_estimators in range(1,120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_test)
    val_error = mean_squared_error(y_test, y_pred)
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 5:
            break


In [131]:
min_val_error

0.06273226211752059

In [135]:
#!pip install xgboost - if not xgboost installed

In [140]:
import xgboost

xgb_reg = xgboost.XGBRegressor()
xgb_reg.fit(X_train, y_train)
y_pred = xgb_reg.predict(X_test)

mean_squared_error(y_test,y_pred)

0.07337405872278505

In [142]:
xgb_reg = xgboost.XGBRegressor()
xgb_reg.fit(X_train, y_train,
            eval_set=[(X_train, y_train)], early_stopping_rounds=2)
y_pred = xgb_reg.predict(X_test)

[0]	validation_0-rmse:0.70735
[1]	validation_0-rmse:0.51843
[2]	validation_0-rmse:0.38264
[3]	validation_0-rmse:0.28560
[4]	validation_0-rmse:0.21846
[5]	validation_0-rmse:0.17294
[6]	validation_0-rmse:0.13817
[7]	validation_0-rmse:0.11533
[8]	validation_0-rmse:0.09799
[9]	validation_0-rmse:0.08278
[10]	validation_0-rmse:0.07004
[11]	validation_0-rmse:0.06233
[12]	validation_0-rmse:0.05573
[13]	validation_0-rmse:0.05094
[14]	validation_0-rmse:0.04684
[15]	validation_0-rmse:0.04031
[16]	validation_0-rmse:0.03498
[17]	validation_0-rmse:0.03062
[18]	validation_0-rmse:0.02736
[19]	validation_0-rmse:0.02340
[20]	validation_0-rmse:0.01982
[21]	validation_0-rmse:0.01684
[22]	validation_0-rmse:0.01436
[23]	validation_0-rmse:0.01223
[24]	validation_0-rmse:0.01053
[25]	validation_0-rmse:0.00899
[26]	validation_0-rmse:0.00787
[27]	validation_0-rmse:0.00695
[28]	validation_0-rmse:0.00609
[29]	validation_0-rmse:0.00549
[30]	validation_0-rmse:0.00488
[31]	validation_0-rmse:0.00427
[32]	validation_0-