In [2]:
# ensemble

from sklearn.datasets import make_moons
import numpy as np
from sklearn.model_selection import train_test_split

# Seed NumPy's global RNG before sampling: make_moons is called without its
# own random_state, so this seed is what keeps the dataset repeatable.
np.random.seed(42)
X, y = make_moons(n_samples=5000, noise=0.15)

# Keep 20% of the 5000 samples aside for testing; the split has its own seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Three different kinds of base learner, each left at its library defaults.
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()

# Combine them under voting='hard' (per the scikit-learn docs: majority vote
# over the predicted class labels). The fit call is the last expression so
# the notebook displays the fitted ensemble.
voting_clf = VotingClassifier(
    estimators=[("lr", log_clf), ("rf", rnd_clf), ("svc", svm_clf)],
    voting="hard",
)
voting_clf.fit(X_train, y_train)



VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='warn', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     class_weight=None,
                                                     criterion='gini',...
                                        

In [4]:
# An ensemble does not always improve accuracy: score every base classifier
# and the voting ensemble on the same held-out test set to compare them.
from sklearn.metrics import accuracy_score

for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    # Each estimator is (re)fitted here on the training split before scoring.
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(type(clf).__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.881
RandomForestClassifier 0.987
SVC 0.986
VotingClassifier 0.985




In [12]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Bagging: 500 decision trees, each fitted on 100 training instances drawn
# with replacement (bootstrap=True); n_jobs=-1 requests parallel fitting.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

In [13]:
# Test-set accuracy of the bagging ensemble's predictions (y_pred from the previous cell).
accuracy_score(y_test, y_pred)

0.984

In [14]:
# Baseline for comparison with bagging: one decision tree with default
# settings, fitted on the full training split and scored on the test split.
dec_clf = DecisionTreeClassifier()
dec_clf.fit(X_train, y_train)
y_pred = dec_clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.984

In [15]:

# Same bagging configuration as the earlier bag_clf cell, plus oob_score=True
# so the fitted ensemble exposes an out-of-bag score (oob_score_), which is
# displayed as the cell's result.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
    oob_score=True,
)
bag_clf.fit(X_train, y_train)
bag_clf.oob_score_

0.978

In [16]:
# Score the OOB-enabled bagging ensemble on the held-out test set, to set
# next to the out-of-bag score shown in the previous cell.
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.983

In [17]:
# Out-of-bag decision function of the fitted ensemble; the displayed array has
# two columns per row (presumably one per class of the moons data — see the
# scikit-learn BaggingClassifier docs for the exact definition).
bag_clf.oob_decision_function_

array([[1.        , 0.        ],
       [0.00207469, 0.99792531],
       [0.11201629, 0.88798371],
       ...,
       [0.09465021, 0.90534979],
       [0.6902834 , 0.3097166 ],
       [0.10569106, 0.89430894]])

In [18]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 500 trees, each limited to 16 leaf nodes; rebinds rnd_clf
# (previously the default-configured forest) and keeps its test predictions
# in y_pred_rf.
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=-1,
)
rnd_clf.fit(X_train, y_train)
y_pred_rf = rnd_clf.predict(X_test)

In [19]:
# Test-set accuracy of the 500-tree random forest's predictions.
accuracy_score(y_test, y_pred_rf)

0.987

In [20]:
# Approximate a random forest with plain bagging: 500 trees capped at 16 leaf
# nodes (the same n_estimators / max_leaf_nodes as the RandomForestClassifier
# configured earlier in this notebook), each fitted on a bootstrap sample the
# size of the training set (max_samples=1.0).
# max_features is spelled 'sqrt' rather than the alias 'auto': for tree
# classifiers both mean sqrt(n_features) candidate features per split, but
# 'auto' is deprecated and rejected by current scikit-learn releases.
bag_clf_similar_rf = BaggingClassifier(
    DecisionTreeClassifier(max_features="sqrt", max_leaf_nodes=16),
    n_estimators=500,
    max_samples=1.0,
    bootstrap=True,
    n_jobs=-1,
)

In [21]:
# Fit the forest-like bagging ensemble and score it on the test set so it can
# be compared with the RandomForestClassifier accuracy shown earlier.
bag_clf_similar_rf.fit(X_train, y_train)
y_pred_bag_clf_similar_rf = bag_clf_similar_rf.predict(X_test)
accuracy_score(y_test, y_pred_bag_clf_similar_rf)

0.987

In [26]:
# Feature importance: fit a 500-tree random forest on the iris dataset and
# print the importance the fitted model assigns to each named feature.

from sklearn.datasets import load_iris

iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
rnd_clf.fit(iris.data, iris.target)
for name, score in zip(iris.feature_names, rnd_clf.feature_importances_):
    print(name, score)

sepal length (cm) 0.10306599910881815
sepal width (cm) 0.02271574113630239
petal length (cm) 0.41997607712309915
petal width (cm) 0.4542421826317802


In [28]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost over 200 depth-1 decision trees (stumps) with learning_rate=0.5,
# fitted on the current X_train / y_train.
# NOTE(review): algorithm='SAMME.R' is kept exactly as written; recent
# scikit-learn releases reportedly deprecate it — confirm against the
# installed version before upgrading.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=200,
    algorithm="SAMME.R",
    learning_rate=0.5,
)
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(class_weight=None,
                                                         criterion='gini',
                                                         max_depth=1,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort=False,
                                                         random_state=None,
                             

In [29]:
# Synthetic regression data: 100 points with x uniform in [-0.5, 0.5) and
# y = 3*x^2 plus Gaussian noise scaled by 0.05. This rebinds X and y, which
# previously held the moons classification data.
np.random.seed(42)
X = np.random.rand(100, 1) - 0.5
y = 3 * X[:, 0] ** 2 + np.random.randn(100) * 0.05

In [33]:
from sklearn.tree import DecisionTreeRegressor

# Gradient boosting by hand with three depth-2 regression trees.
# Stage 1 fits the raw targets.
tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X, y)

# Stage 2 fits what stage 1 left unexplained (its residuals).
y2 = y - tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X, y2)

# Stage 3 fits the residuals that remain after stage 2.
y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X, y3)

# The ensemble prediction for a new point is the sum of the three stages.
X_new = np.array([[0.8]])
y_pred = (
    tree_reg1.predict(X_new)
    + tree_reg2.predict(X_new)
    + tree_reg3.predict(X_new)
)

In [34]:
# Summed prediction of the three hand-stacked trees for X_new.
y_pred

array([0.75026781])

In [35]:
# Prediction of the first tree alone, for comparison with the three-tree sum.
tree_reg1.predict(X_new)

array([0.52856846])

In [36]:
from sklearn.ensemble import GradientBoostingRegressor

# Library counterpart of the manual three-stage fit: three depth-2 trees with
# learning_rate=1.0. The fit call is last so the fitted model is displayed.
gbrt = GradientBoostingRegressor(
    max_depth=2,
    n_estimators=3,
    learning_rate=1.0,
)
gbrt.fit(X, y)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=1.0, loss='ls', max_depth=2,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=3,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [37]:
# Prediction of the 3-tree GradientBoostingRegressor for the same X_new used
# with the hand-stacked trees.
gbrt.predict(X_new)

array([0.75026781])

In [39]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Split the quadratic data into training and validation parts. This rebinds
# X_train / y_train, which held the moons split until now.
# NOTE(review): no random_state is passed, so the split depends on the state
# of NumPy's global RNG when the cell runs.
X_train, X_val, y_train, y_val = train_test_split(X, y)

# Train a 120-tree ensemble once, then measure the validation MSE after each
# boosting stage via staged_predict.
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(X_train, y_train)

errors = [
    mean_squared_error(y_val, staged_pred)
    for staged_pred in gbrt.staged_predict(X_val)
]
# argmin is a 0-based stage index, hence the +1 to get a tree count.
bst_n_estimators = np.argmin(errors) + 1

# Retrain from scratch with exactly that many trees.
gbrt_best = GradientBoostingRegressor(
    max_depth=2,
    n_estimators=bst_n_estimators,
)
gbrt_best.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=2,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=110,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [40]:
# Tree count with the lowest validation MSE among the staged predictions
# (argmin index + 1, computed in the previous cell).
bst_n_estimators

110

In [41]:
# Early stopping by hand: with warm_start=True each fit() call reuses the
# model fitted so far, so raising n_estimators by one grows the ensemble
# incrementally.
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True)

min_val_error = float("inf")  # best validation MSE seen so far
error_going_up = 0            # consecutive rounds without an improvement

for n_estimators in range(1, 120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)

    if not val_error < min_val_error:
        # No improvement this round: give up after 5 such rounds in a row.
        error_going_up += 1
        if error_going_up == 5:
            break
        continue

    # New best validation error: record it and reset the patience counter.
    min_val_error = val_error
    error_going_up = 0

In [42]:
# Loop counter left over from the early-stopping cell: if the loop exited via
# `break`, this is 5 rounds past the round with the lowest validation error.
n_estimators

72

In [43]:
# xgboost is imported here, at its first use in the notebook.
import xgboost
# Regressor created with no arguments, i.e. the library's default settings.
xgb_reg = xgboost.XGBRegressor()

  return f(*args, **kwds)
  return f(*args, **kwds)


In [44]:
# Fit the default XGBoost regressor on the training part of the quadratic data
# and predict the validation part; y_pred is scored in a later cell.
xgb_reg.fit(X_train, y_train)
y_pred = xgb_reg.predict(X_val)



In [46]:
# Validation MSE of the XGBoost predictions (y_pred from the fit/predict cell).
mean_squared_error(y_val, y_pred)

0.002823648728673055

In [48]:
# Refit XGBoost while evaluating on the validation set after every round
# (the log below reports validation_0-rmse) and stop once that metric has not
# improved for 2 rounds.
# NOTE(review): early_stopping_rounds is passed to fit(), matching the xgboost
# version that produced the log below; newer releases may expect it elsewhere
# — confirm before upgrading.
xgb_reg = xgboost.XGBRegressor()
xgb_reg.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    early_stopping_rounds=2,
)
y_pred = xgb_reg.predict(X_val)
mean_squared_error(y_val, y_pred)

[0]	validation_0-rmse:0.252919
Will train until validation_0-rmse hasn't improved in 2 rounds.
[1]	validation_0-rmse:0.22967
[2]	validation_0-rmse:0.208932
[3]	validation_0-rmse:0.190537
[4]	validation_0-rmse:0.174254
[5]	validation_0-rmse:0.159642
[6]	validation_0-rmse:0.146745
[7]	validation_0-rmse:0.134787
[8]	validation_0-rmse:0.123794
[9]	validation_0-rmse:0.11347
[10]	validation_0-rmse:0.10435
[11]	validation_0-rmse:0.095989
[12]	validation_0-rmse:0.0889
[13]	validation_0-rmse:0.083319
[14]	validation_0-rmse:0.078312
[15]	validation_0-rmse:0.073715
[16]	validation_0-rmse:0.06965
[17]	validation_0-rmse:0.06641
[18]	validation_0-rmse:0.063814
[19]	validation_0-rmse:0.061651
[20]	validation_0-rmse:0.05997
[21]	validation_0-rmse:0.058577
[22]	validation_0-rmse:0.057521
[23]	validation_0-rmse:0.056667
[24]	validation_0-rmse:0.055901
[25]	validation_0-rmse:0.055247
[26]	validation_0-rmse:0.054666
[27]	validation_0-rmse:0.054397
[28]	validation_0-rmse:0.054156
[29]	validation_0-rmse:0.0

0.002734999923069037