In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
X = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
y = raw_df.values[1::2, 2]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

pipe = Pipeline([('std_scl',StandardScaler())])
X_train = pipe.fit_transform(X_train)
X_test = pipe.transform(X_test)

X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)
y_train = pd.DataFrame(y_train,columns=["target"])
y_test = pd.DataFrame(y_test,columns=["target"])

In [3]:
from sklearn.ensemble import GradientBoostingRegressor

grad_boost_reg = GradientBoostingRegressor(loss = 'squared_error',
                                           n_estimators = 30,
                                           learning_rate = 0.1,
                                           subsample = 0.9,  #bagging
                                           criterion = 'friedman_mse',  # splitting criterion decision trees
                                           random_state = 0)

grad_boost_reg.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [4]:
print(len(grad_boost_reg.estimators_)) # fitted estimators for each stage
print(len(grad_boost_reg.train_score_)) # loss at each estimator in the ensemble for in-bag data,
                                              # or training date if subsamples=1
print(len(grad_boost_reg.oob_improvement_)) # available if we have subsample<1, loss improvements of oob samples
                                                  # compared to previous stage
print(len(grad_boost_reg.feature_importances_))

30
30
30
13


In [5]:
grad_boost_reg.score(X_test, y_test)

0.7626850807121547

In [6]:
from sklearn.metrics import r2_score
stage = 1
for stage_pred in grad_boost_reg.staged_predict(X_test):
    print('stage:',"{0:2d}".format(stage),
          'staged_score: {0:.4f}'.format(r2_score(y_pred=stage_pred, y_true=y_test)))
    stage += 1

stage:  1 staged_score: 0.1158
stage:  2 staged_score: 0.2128
stage:  3 staged_score: 0.3253
stage:  4 staged_score: 0.3965
stage:  5 staged_score: 0.4687
stage:  6 staged_score: 0.5258
stage:  7 staged_score: 0.5590
stage:  8 staged_score: 0.5978
stage:  9 staged_score: 0.6300
stage: 10 staged_score: 0.6502
stage: 11 staged_score: 0.6703
stage: 12 staged_score: 0.6994
stage: 13 staged_score: 0.7147
stage: 14 staged_score: 0.7240
stage: 15 staged_score: 0.7305
stage: 16 staged_score: 0.7334
stage: 17 staged_score: 0.7375
stage: 18 staged_score: 0.7421
stage: 19 staged_score: 0.7465
stage: 20 staged_score: 0.7537
stage: 21 staged_score: 0.7551
stage: 22 staged_score: 0.7563
stage: 23 staged_score: 0.7574
stage: 24 staged_score: 0.7579
stage: 25 staged_score: 0.7599
stage: 26 staged_score: 0.7594
stage: 27 staged_score: 0.7623
stage: 28 staged_score: 0.7626
stage: 29 staged_score: 0.7620
stage: 30 staged_score: 0.7627
