**Voting classifier** (composed of three diverse classifiers)

In [9]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

In [13]:
X,y=make_moons(n_samples=100,noise=0.15)
X_train,X_val,y_train,y_val=train_test_split(X,y,test_size=0.2)

In [14]:
X_train.shape,X_val.shape

((80, 2), (20, 2))

In [2]:
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [4]:
log_clf=LogisticRegression()
rnd_clf=RandomForestClassifier()
svm_clf=SVC()

In [6]:
voting_clf=VotingClassifier(
    estimators=[('lr',log_clf),('rnd',rnd_clf),('svm',svm_clf)],
    voting='hard'
)

In [15]:
voting_clf.fit(X_train,y_train)

VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rnd', RandomForestClassifier()),
                             ('svm', SVC())])

In [19]:
from sklearn.metrics import accuracy_score
for clf in [log_clf,rnd_clf,svm_clf,voting_clf]:
    clf.fit(X_train,y_train)
    val_predict=clf.predict(X_val)
    print(clf.__class__.__name__,accuracy_score(y_val,val_predict))
    

LogisticRegression 0.9
RandomForestClassifier 0.95
SVC 0.95
VotingClassifier 0.95


**Bagging and Pasting**

In [21]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [28]:
bag_clf=BaggingClassifier(DecisionTreeClassifier(),n_estimators=500,max_samples=10,bootstrap=True)

In [29]:
bag_clf.fit(X_train,y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=10,
                  n_estimators=500)

**Out-of-Bag Evaluation**

In [31]:
bag_clf=BaggingClassifier(DecisionTreeClassifier(),n_estimators=500,max_samples=10,bootstrap=True,oob_score=True)

In [32]:
bag_clf.fit(X_train,y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=10,
                  n_estimators=500, oob_score=True)

In [34]:
bag_clf.oob_score_

0.8875

In [35]:
bag_val_predict=bag_clf.predict(X_val)
print(bag_clf.__class__.__name__,accuracy_score(y_val,bag_val_predict))

BaggingClassifier 0.95


In [38]:
bag_clf.oob_decision_function_[:8]

array([[0.77241379, 0.22758621],
       [0.11312217, 0.88687783],
       [0.87268519, 0.12731481],
       [0.51860465, 0.48139535],
       [0.35891648, 0.64108352],
       [0.36363636, 0.63636364],
       [0.33180778, 0.66819222],
       [0.14831461, 0.85168539]])

**Feature Importance**

In [41]:
from sklearn.datasets import load_iris

In [42]:
iris=load_iris()

In [43]:
rnd_clf=RandomForestClassifier(n_estimators=100,max_depth=3)

In [45]:
rnd_clf.fit(iris['data'],iris['target'])

RandomForestClassifier(max_depth=3)

In [47]:
iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [49]:
for name,score in zip(iris['feature_names'],rnd_clf.feature_importances_):
    print(name,score)

sepal length (cm) 0.06474540622175312
sepal width (cm) 0.004132709186909798
petal length (cm) 0.46542089046879587
petal width (cm) 0.4657009941225413


**Boosting**

In [58]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeRegressor

In [51]:
ada_clf=AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),n_estimators=200,learning_rate=0.1)

In [53]:
ada_clf.fit(X_train,y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                   learning_rate=0.1, n_estimators=200)

In [60]:
#Gradient Boosting Regressor 
#manually implement
tree_reg1=DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X,y)
y2=y-tree_reg1.predict(X)
tree_reg2=DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X,y2)
tree_reg3=DecisionTreeRegressor(max_depth=2)
y3=y2-tree_reg2.predict(X)
tree_reg3.fit(X,y3)

DecisionTreeRegressor(max_depth=2)

In [66]:
y_pred=sum(tree.predict([[2,3]]) for tree in (tree_reg1,tree_reg2,tree_reg3))
y_pred

array([0.90318096])

In [77]:
#use GBRT from sklearn
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

In [55]:
gbrt=GradientBoostingRegressor(max_depth=2,n_estimators=3,learning_rate=0.1)

In [67]:
#use early stopping to find the optimal number of trees
from sklearn.metrics import mean_squared_error

In [74]:
gbrt.fit(X_train,y_train)

GradientBoostingRegressor(max_depth=2, n_estimators=3)

In [79]:
error=[mean_squared_error(y_val,y_val_predict) for y_val_predict in gbrt.staged_predict(X_val)]

In [81]:
best_n_estimator=np.argmin(error)+1
best_n_estimator

3

In [82]:
gbrt_best=GradientBoostingRegressor(max_depth=2,n_estimators=best_n_estimator,learning_rate=0.1)

In [83]:
#actually stopping training early
gbrt=GradientBoostingRegressor(max_depth=2,warm_start=True)

In [86]:
min_val_error=float('inf')

In [89]:
error_going_up=0
for n_estimators in range(1,120):
    gbrt.n_estimators=n_estimators
    gbrt.fit(X_train,y_train)
    y_pred=gbrt.predict(X_val)
    val_error=mean_squared_error(y_val,y_pred)
    if val_error<min_val_error:
        min_val_error=val_error
        error_going_up=0
    else:
        error_going_up+=1
        if error_going_up==5:
            break #early stopping

**xgboost**

In [90]:
import xgboost

In [91]:
xgb_reg=xgboost.XGBRegressor()

In [92]:
xgb_reg.fit(X_train,y_train)

KeyError: 'base_score'

KeyError: 'base_score'

In [93]:
xgb_reg.fit(X_train,y_train,eval_set=[(X_val,y_val)],early_stopping_rounds=2)

[0]	validation_0-rmse:0.39439
[1]	validation_0-rmse:0.29700
[2]	validation_0-rmse:0.26013
[3]	validation_0-rmse:0.21615
[4]	validation_0-rmse:0.18688
[5]	validation_0-rmse:0.16763
[6]	validation_0-rmse:0.15503
[7]	validation_0-rmse:0.15207
[8]	validation_0-rmse:0.15011
[9]	validation_0-rmse:0.14899
[10]	validation_0-rmse:0.14798
[11]	validation_0-rmse:0.14732
[12]	validation_0-rmse:0.14706
[13]	validation_0-rmse:0.14671
[14]	validation_0-rmse:0.14595
[15]	validation_0-rmse:0.14573
[16]	validation_0-rmse:0.14511
[17]	validation_0-rmse:0.14463
[18]	validation_0-rmse:0.14456
[19]	validation_0-rmse:0.14421
[20]	validation_0-rmse:0.14420
[21]	validation_0-rmse:0.14393
[22]	validation_0-rmse:0.14386
[23]	validation_0-rmse:0.14362
[24]	validation_0-rmse:0.14351
[25]	validation_0-rmse:0.14341
[26]	validation_0-rmse:0.14336
[27]	validation_0-rmse:0.14336


KeyError: 'base_score'

KeyError: 'base_score'

In [94]:
y_pred=xgb_reg.predict(X_val)

In [95]:
y_pred

array([ 4.62433032e-04,  3.79808486e-01,  3.06107453e-04,  9.99628663e-01,
        9.99628663e-01,  3.06107453e-04,  4.83497411e-01, -1.85697910e-03,
        9.99628663e-01,  3.06107453e-04,  9.99628663e-01,  9.42210318e-04,
        8.89967079e-04,  9.99628663e-01,  4.62433032e-04,  9.42210318e-04,
       -1.10480025e-04,  3.06107453e-04, -1.10480025e-04,  9.99628663e-01],
      dtype=float32)

In [97]:
mean_squared_error(y_val,y_pred)

0.020551854014096902