# Bootstrap Aggregating (Bagging)

Bagging is a homogeneous ensembling technique which uses samples of datasets for different independent estimators in a parrellel way and follows 'wisdom of crowd' principle. We have implemented bagging regressor with different base estimators along with famous RandomForest Regressor.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import seaborn as sns
import utils
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import datetime
import warnings
warnings.filterwarnings('ignore')

In [2]:
df_train = pd.read_csv('dataset/df_train.csv')
df_test = pd.read_csv('dataset/df_test.csv')
target = df_train['SalePrice']
df_train = df_train.drop(['SalePrice'], axis = 1)
X_train, X_test, y_train, y_test = train_test_split(df_train, target, test_size = 0.25, random_state = 42)

#### Random Forest Regressor

In [3]:
from sklearn.ensemble import RandomForestRegressor
print('####################################################\n{}\tRandom forest'
      .format(datetime.datetime.now().strftime('%H:%M:%S')))
parameters={
    'bootstrap':['auto'],
    'max_features':['auto'],
    'min_samples_leaf':[2],
    'min_samples_split':[4],
    'n_estimators': [1500],
    'n_jobs':[-1],
    'oob_score':[True]
}

r_forest = RandomForestRegressor()
clf = GridSearchCV(r_forest, parameters, verbose=0, iid = False)
clf.fit(X_train, y_train)
random_forest = RandomForestRegressor(**clf.best_params_)

print('\nRegressor: \n', random_forest, '\n')
print('{}\tDone!\n####################################################'
      .format(datetime.datetime.now().strftime('%H:%M:%S')))

####################################################
14:07:23	Random forest

Regressor: 
 RandomForestRegressor(bootstrap='auto', criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=2, min_samples_split=4,
                      min_weight_fraction_leaf=0.0, n_estimators=1500,
                      n_jobs=-1, oob_score=True, random_state=None, verbose=0,
                      warm_start=False) 

14:08:05	Done!
####################################################


#### Bagging Regressors with different base estimators

In [36]:
from sklearn.ensemble import BaggingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
max_features = int(X_train.shape[1]/3)

svr_bag = BaggingRegressor(base_estimator=SVR(), n_estimators=50, max_features=max_features ,oob_score=True)
knn_bag = BaggingRegressor(base_estimator=KNeighborsRegressor(), n_estimators=50, max_features=max_features ,oob_score=True)
#equivalent to Random Forest
dt_bag = BaggingRegressor(base_estimator=DecisionTreeRegressor(), n_estimators=50, max_features=max_features ,oob_score=True)

In [37]:
#fitting the random forest model
models = {
    'Random Forest':r_forest,
    'Bagging Decision Tree':dt_bag,
    'Bagging SVM':svr_bag,
    'Bagging KNN':knn_bag
}

df_metric = utils.fit_models(X_train, y_train, X_test, y_test, models)
df_metric = df_metric.set_index('model')

Fitting: 	Random Forest
Done!
Fitting: 	Bagging Decision Tree
Done!
Fitting: 	Bagging SVM
Done!
Fitting: 	Bagging KNN
Done!
=== Fitting Completed ! ====


In [38]:
df_metric

Unnamed: 0_level_0,rmse,r2,rmsle
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Random Forest,0.143376,0.873579,0.011283
Bagging Decision Tree,0.13965,0.880065,0.010968
Bagging SVM,0.191927,0.773464,0.014935
Bagging KNN,0.263972,0.57147,0.020407


In [41]:
print(svr_bag.oob_score_)
print(knn_bag.oob_score_)
print(dt_bag.oob_score_)

0.7693473954504522
0.571373690285562
0.8642768246230236
