In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Load the Boston housing data as whitespace-separated text, skipping the
# 22-line preamble of the file.
data_url = "http://lib.stat.cmu.edu/datasets/boston"
# Raw string: "\s" is not a valid Python escape sequence, so a plain literal
# triggers an invalid-escape warning on modern interpreters.
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)

# Each record is spread over two consecutive rows: the features are every
# column of the even row plus the first two columns of the odd row, and the
# target is the third column of the odd row.
X = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
y = raw_df.values[1::2, 2]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit the scaler on the training split only, then reuse its statistics on the
# test split so no test information leaks into preprocessing.
pipe = Pipeline([('std_scl',StandardScaler())])
X_train = pipe.fit_transform(X_train)
X_test = pipe.transform(X_test)

# Wrap the arrays in DataFrames (integer column labels for X, a single
# "target" column for y).
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)
y_train = pd.DataFrame(y_train,columns=["target"])
y_test = pd.DataFrame(y_test,columns=["target"])

In [2]:
from sklearn.ensemble import BaggingRegressor

# Bagging ensemble: each shallow, randomised decision tree is trained on its
# own random subset of rows/features and the predictions are averaged.
bag_reg = BaggingRegressor(estimator=DecisionTreeRegressor(max_depth=3,
                                                           max_features='sqrt',
                                                           splitter='random'),
                           n_estimators=30,
                           max_samples=0.8,  # int = absolute count, float = fraction of the
                                             # training rows drawn for each estimator
                           max_features=0.9, # int = absolute count, float = fraction of the
                                             # features drawn for each estimator
                           bootstrap=True,   # rows drawn with replacement (bagging);
                                             # False draws without replacement (pasting)
                           bootstrap_features=False, # features drawn without replacement
                           oob_score=False,  # no out-of-bag scoring
                           warm_start=False,
                           n_jobs=2,
                           random_state=0,
                           verbose=0)

# y_train is a single-column DataFrame; flatten it to 1-D so scikit-learn
# does not emit a DataConversionWarning about a column-vector target.
bag_reg.fit(X_train, np.ravel(y_train))

In [3]:
# Every fitted attribute below holds one entry per base estimator, so each
# printed length is the number of estimators in the ensemble.
for per_estimator in (
    bag_reg.estimators_,           # the trained base estimators
    bag_reg.estimators_samples_,   # sample subset drawn for each estimator
    bag_reg.estimators_features_,  # feature subset drawn for each estimator
):
    print(len(per_estimator))

# Available only when the ensemble is fitted with oob_score=True:
# bag_reg.oob_score_
# bag_reg.oob_prediction_

30
30
30


In [7]:
# Score every base estimator on the test split, restricted to the feature
# subset that estimator was trained on.
# X_test is a DataFrame here, and DataFrame[:, cols] is not positional
# indexing (it raises), so select columns on the underlying array instead.
X_test_values = np.asarray(X_test)
scores = [
    est.score(X_test_values[:, features], y_test)
    for est, features in zip(bag_reg.estimators_, bag_reg.estimators_features_)
]

In [8]:
# Summarise how the individual base estimators perform on their own:
# the mean score and its spread across the ensemble members.
mean_score = np.mean(scores)
score_spread = np.std(scores)
print('Avg. estimator performance:', mean_score)
print('Estimator performance std. dev.:', score_spread)

Avg. estimator performance: 0.1847313130270765
Estimator performance std. dev.: 0.19848630969677158


In [9]:
# Score of the full bagged ensemble on the held-out test split, for
# comparison with the per-estimator scores summarised above.
bag_reg.score(X_test, y_test)

0.39328148432267007