# Ensemble Learning
Joey Ashcroft Hands on Machine Learning w/ Scikit-learn Chapter 7

### Voting Classifiers

In [10]:
#creates and trains a voting classifier, composed of 3 diverse classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import make_moons

In [11]:
moons = make_moons(100, shuffle=True, noise=None, random_state=None)

In [12]:
X = moons[0][:,(0,1)]

In [13]:
y = moons[1]

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=1)

In [6]:
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()

In [7]:
voting_clf = VotingClassifier(
    estimators=[('lr',log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard' #hard voting classifier
)
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), ('rf', RandomF...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))],
         flatten_transform=None, n_jobs=1, voting='hard', weights=None)

In [8]:
from sklearn.metrics import accuracy_score
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.7
RandomForestClassifier 0.95
SVC 0.95
VotingClassifier 0.95


  if diff:


### Bagging and Pasting

In [9]:
#trains an ensemble of 500 decision tree classifiers, each trained on 100 training instances randomly sampled w replacement
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

#if using later, figure out the parameter "max_samples": pg 188
#if you want to use pasting, change bootstrap to False
#n_jobs = -1 tells computer to use all CPU cores
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    bootstrap=True, n_jobs=-1)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

In [10]:
#Out-of-Bag Evaluation
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    bootstrap=True, n_jobs=-1, oob_score=True)
bag_clf.fit(X_train, y_train)
bag_clf.oob_score_

0.925

In [11]:
#let's verify the 91% accuracy the oob score gave us
from sklearn.metrics import accuracy_score
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.95

In [12]:
#oob decision function for each instance also available through the oob_decision_function_ variable
#function returns the class probabilities for each training instance
bag_clf.oob_decision_function_

array([[0.14213198, 0.85786802],
       [0.        , 1.        ],
       [0.96172249, 0.03827751],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.95061728, 0.04938272],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [0.36969697, 0.63030303],
       [1.        , 0.        ],
       [0.66120219, 0.33879781],
       [0.93442623, 0.06557377],
       [0.        , 1.        ],
       [0.97191011, 0.02808989],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.91017964, 0.08982036],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.88461538, 0.11538462],
       [0.01052632, 0.98947368],
       [0.        , 1.        ],
       [0.01020408, 0.98979592],
       [0.        , 1.        ],
       [0.99441341, 0.00558659],
       [0.        , 1.        ],
       [1.

### Random Forests

In [13]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
rnd_clf.fit(X_train, y_train)

y_pred_rf = rnd_clf.predict(X_test)

In [14]:
#this bagging classifier is equivalent to our random forest model above
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(splitter="random", max_leaf_nodes=16),
    n_estimators=500, max_samples=1, bootstrap=True, n_jobs=-1
)

### Feature Importance in Random Forests

In [15]:
from sklearn.datasets import load_iris
iris=load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
rnd_clf.fit(iris['data'], iris['target'])

for name, score in zip(iris['feature_names'], rnd_clf.feature_importances_):
    print(name, score)

sepal length (cm) 0.0935407623157406
sepal width (cm) 0.021030940325464195
petal length (cm) 0.45639027183192915
petal width (cm) 0.42903802552686626


### Boosting

In [16]:
#Adaboost
from sklearn.ensemble import AdaBoostClassifier

#adaboost classifier based on 200 decision stumps/instances
#if adaboost is overfitting, try reducing number of estimators
#estimators: number of models to iteratively train
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200,
    algorithm="SAMME.R", learning_rate=.5)
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=0.5, n_estimators=200, random_state=None)

In [17]:
#Gradient Boosting

#Gradient Boost Regression Tree
from sklearn.ensemble import GradientBoostingRegressor

#learning rate hyperparam scales the contribution of each tree
#low vals: will need more trees in the ensemble to fit the training set, but predictions will generalize better
#therefore finding the optimal number of trees is important
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
gbrt.fit(X, y)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=1.0, loss='ls', max_depth=2, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=3, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

In [20]:
#optimizing number of trees using early stopping

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X_train, X_val, y_train, y_val = train_test_split(X, y)

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(X_train, y_train)

#find error at each "stage" or iteration in the ensemble
errors = [mean_squared_error(y_val, y_pred) for y_pred in gbrt.staged_predict(X_val)]
#find the row with the lowest error. This is the number of trees/iterations you need
bst_n_estimators = np.argmin(errors)

#train new model with optimal number of trees
gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=bst_n_estimators)
gbrt_best.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=2, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=119, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

In [25]:
#early stopping can also be implemented through warm_start=True
#makes scikitlearn keep existing trees and stops when the validation error does not improve for 5 iterations in a row
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True)

min_val_error = float("inf")
error_going_up=0
for n_estimators in range(1,120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 5:
            break #early stopping
#gradientboostingregressor also supports subsample hyperparameter which specifies the fraction of training instances to
#be used for training each tree
#for example, if subsample=.25, each tree is trained on 25% of the training instances selected randomly
#this trades a higher bias for lower variance
#this is called stochastic gradient boosting