In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_openml
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.tree import DecisionTreeClassifier

# Fetch the 70,000 MNIST digit images (784 pixel columns) from https://www.openml.org/d/554.
# as_frame=True pins the return type to pandas objects, which the .assign() below relies on.
data = fetch_openml('mnist_784', version=1, as_frame=True, parser="auto")

# Attach the labels as a "target" column without going through np.c_: stacking the
# numeric pixels and the string-valued labels in a single ndarray forces every cell
# of the 70000 x 785 matrix to object dtype. Labels stay plain strings ('0'..'9').
dfData = data["data"].assign(target=data["target"].astype(str))

In [2]:
# One stratified 80/20 shuffle split, so every digit class keeps its share in both parts
stratSplit = StratifiedShuffleSplit(test_size=0.2, n_splits=1, random_state=0)

# n_splits=1 yields exactly one (train, test) pair of positional row indices
train_index, test_index = next(
    stratSplit.split(dfData[data["feature_names"]], dfData["target"])
)

# Slice features and labels with the same positional indices so they stay aligned
X_train, X_test = (
    dfData[data["feature_names"]].iloc[row_idx] for row_idx in (train_index, test_index)
)
y_train, y_test = (
    dfData["target"].iloc[row_idx] for row_idx in (train_index, test_index)
)

In [3]:
from sklearn.ensemble import GradientBoostingClassifier

# Even for classification, each boosting stage fits regression trees to the residuals
grad_boost_clf = GradientBoostingClassifier(
    # --- boosting ensemble settings ---
    loss='log_loss',            # the residuals fitted at each stage are the negative
                                # gradient of this loss (binomial/multinomial deviance)
    n_estimators=30,            # number of boosting stages
    learning_rate=0.1,          # shrinkage applied to each stage's contribution
    subsample=0.9,              # fraction of the training rows drawn to fit each stage
    random_state=0,             # seed for the ensemble's randomness
    # --- settings passed on to the individual decision trees ---
    criterion='friedman_mse',   # split-quality measure
    # further decision tree parameters can be added here
)

grad_boost_clf.fit(X_train, y_train)

In [4]:
# Shapes of the fitted ensemble's attributes, printed one per line
print(
    *(
        getattr(grad_boost_clf, attr_name).shape
        for attr_name in (
            "estimators_",           # (n_estimators, n_classes): one tree per stage and per
                                     # class (for a multiclass problem)
            "train_score_",          # loss at each stage on the in-bag data, or on the full
                                     # training data if subsample=1
            "oob_improvement_",      # only available if subsample<1: loss improvement on the
                                     # out-of-bag samples compared to the previous stage
            "feature_importances_",  # one importance value per input feature
        )
    ),
    sep="\n",
)

(30, 10)
(30,)
(30,)
(784,)


In [5]:
# Mean accuracy of the full 30-stage ensemble on the held-out test split
grad_boost_clf.score(X_test, y_test)

0.9011428571428571

In [6]:
from sklearn.metrics import accuracy_score
# Test-set predictions of the partially built ensemble after each boosting stage.
# enumerate(..., start=1) supplies the 1-based stage number, replacing the manual counter.
for stage, stage_pred in enumerate(grad_boost_clf.staged_predict(X_test), start=1):
    print('stage:', f"{stage:2d}",
          'stage predictions:', stage_pred,
          # GradientBoostingClassifier offers no staged_score, so the accuracy of
          # each stage's predictions is computed here
          'staged_score:', f"{accuracy_score(y_true=y_test, y_pred=stage_pred):.4f}",
          len(stage_pred))

stage:  1 stage predictions: ['0' '0' '9' ... '5' '2' '7'] staged_score: 0.6631 14000
stage:  2 stage predictions: ['0' '0' '9' ... '5' '2' '7'] staged_score: 0.7336 14000
stage:  3 stage predictions: ['0' '0' '9' ... '5' '2' '7'] staged_score: 0.7563 14000
stage:  4 stage predictions: ['0' '0' '4' ... '5' '2' '7'] staged_score: 0.7823 14000
stage:  5 stage predictions: ['0' '0' '4' ... '5' '2' '7'] staged_score: 0.7926 14000
stage:  6 stage predictions: ['0' '0' '4' ... '5' '2' '7'] staged_score: 0.8021 14000
stage:  7 stage predictions: ['0' '0' '4' ... '5' '2' '7'] staged_score: 0.8130 14000
stage:  8 stage predictions: ['0' '0' '4' ... '5' '2' '7'] staged_score: 0.8219 14000
stage:  9 stage predictions: ['0' '0' '4' ... '5' '2' '7'] staged_score: 0.8324 14000
stage: 10 stage predictions: ['0' '0' '4' ... '5' '2' '7'] staged_score: 0.8414 14000
stage: 11 stage predictions: ['0' '0' '4' ... '5' '2' '7'] staged_score: 0.8461 14000
stage: 12 stage predictions: ['0' '0' '4' ... '5' '2' 