
# Bagging Regression


In [52]:
# IMPORT LIBRARIES
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import GridSearchCV

from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

## E.3(a) - Listing data without Feature Selection

In [53]:
# Load the listing dataset (no feature selection) and preview the first rows
listings1 = pd.read_csv(filepath_or_buffer='a.csv')
listings1.head(5)

Unnamed: 0.1,Unnamed: 0,id,host_response_rate,host_is_superhost,host_total_listings_count,host_has_profile_pic,host_identity_verified,latitude,longitude,accommodates,...,price,minimum_nights,maximum_nights,has_availability,number_of_reviews,instant_bookable,ptype_code,rtype_code,ngtype_code,nbtype_code
0,0,49091,100,0,2.0,1.0,1.0,1.44255,103.7958,1,...,4.406719,180,360,1,1,0,-0.016218,0.849882,1.326759,2.181841
1,1,50646,0,0,1.0,1.0,1.0,1.33235,103.78521,2,...,4.382027,90,730,1,18,0,-0.016218,0.849882,-0.434121,-1.296388
2,2,56334,100,0,2.0,1.0,1.0,1.44246,103.79667,1,...,4.219508,6,14,1,20,0,-0.016218,0.849882,1.326759,2.181841
3,3,71609,100,0,8.0,1.0,1.0,1.34541,103.95712,6,...,5.187386,90,1125,1,20,1,1.242553,0.849882,0.446319,1.684951
4,4,71896,100,0,8.0,1.0,1.0,1.34567,103.95963,3,...,4.553877,90,1125,1,24,1,0.75841,0.849882,0.446319,1.684951


In [54]:
# Drop unnecessary columns: the saved index column and the listing id
listings1 = listings1.drop(columns=['Unnamed: 0', 'id'])
listings1.head(5)

Unnamed: 0,host_response_rate,host_is_superhost,host_total_listings_count,host_has_profile_pic,host_identity_verified,latitude,longitude,accommodates,bathrooms_text,bedrooms,...,price,minimum_nights,maximum_nights,has_availability,number_of_reviews,instant_bookable,ptype_code,rtype_code,ngtype_code,nbtype_code
0,100,0,2.0,1.0,1.0,1.44255,103.7958,1,1.0,1.0,...,4.406719,180,360,1,1,0,-0.016218,0.849882,1.326759,2.181841
1,0,0,1.0,1.0,1.0,1.33235,103.78521,2,1.0,1.0,...,4.382027,90,730,1,18,0,-0.016218,0.849882,-0.434121,-1.296388
2,100,0,2.0,1.0,1.0,1.44246,103.79667,1,1.0,1.0,...,4.219508,6,14,1,20,0,-0.016218,0.849882,1.326759,2.181841
3,100,0,8.0,1.0,1.0,1.34541,103.95712,6,1.0,2.0,...,5.187386,90,1125,1,20,1,1.242553,0.849882,0.446319,1.684951
4,100,0,8.0,1.0,1.0,1.34567,103.95963,3,0.5,1.0,...,4.553877,90,1125,1,24,1,0.75841,0.849882,0.446319,1.684951


In [55]:
# Target is the price column; every remaining column is a feature
y = listings1['price']
X = listings1.drop(columns=['price'])

In [56]:
# Hold out 30% of the rows as the test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [57]:
# Hyper-parameter candidates explored by GridSearchCV for the bagging model
br_param_grid = dict(
    n_estimators=[50, 100, 200],
    max_features=[1, 2, 4, 6, 8],
    max_samples=[0.5, 0.1],
    bootstrap=[True, False],
    bootstrap_features=[True, False],
)

In [58]:
# Instantiate the base Bagging Regressor (seeded so results are repeatable)
br_reg = BaggingRegressor(random_state=42)

In [59]:
# Configure a 5-fold grid search over br_param_grid, using all available cores
br_grid = GridSearchCV(
    estimator=br_reg,
    param_grid=br_param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [60]:
# Run the grid search on the training split only
br_grid.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 120 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   29.9s
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:  1.8min finished


GridSearchCV(cv=5, estimator=BaggingRegressor(random_state=42), n_jobs=-1,
             param_grid={'bootstrap': [True, False],
                         'bootstrap_features': [True, False],
                         'max_features': [1, 2, 4, 6, 8],
                         'max_samples': [0.5, 0.1],
                         'n_estimators': [50, 100, 200]},
             verbose=2)

In [61]:
# Take the best bagging estimator found by the grid search
# and use it to predict prices for the held-out test set
br_best = br_grid.best_estimator_
y_pred = br_best.predict(X=X_test)

In [62]:
# 5-fold cross-validation scores of the selected estimator on the training split
scores = cross_val_score(estimator=br_best, X=X_train, y=y_train, cv=5)

In [63]:
# Collect test-set performance metrics plus the mean cross-validation score
br_dict1 = dict([
    ('Model', 'Bagging Regression (a)'),
    ('R^2', metrics.r2_score(y_test, y_pred)),
    ('MAE', metrics.mean_absolute_error(y_test, y_pred)),
    ('MSE', metrics.mean_squared_error(y_test, y_pred)),
    ('RMSE', np.sqrt(metrics.mean_squared_error(y_test, y_pred))),
    ('CVS', scores.mean()),
])


In [103]:
# Display model performance metrics as a one-row table.
# Building the frame from a list holding the dict gives one column per metric
# with its own dtype (numeric metrics stay numeric); the previous
# from_dict(orient='index').T round-trip coerced every column to object.
br_reg_metrics1 = pd.DataFrame([br_dict1])
br_reg_metrics1

Unnamed: 0,Model,R^2,MAE,MSE,RMSE,CVS
0,Bagging Regression (a),0.74576,0.270054,0.173588,0.416638,0.716451


## E.3(b) - Listing data with Feature Selection

In [65]:
# Load the feature-selected listing dataset and preview the first rows
listings2 = pd.read_csv(filepath_or_buffer='b.csv')
listings2.head(5)

Unnamed: 0.1,Unnamed: 0,price,host_response_rate,host_is_superhost,latitude,accommodates,bathrooms_text,bedrooms,beds,minimum_nights,has_availability,number_of_reviews,instant_bookable,ptype_code,rtype_code,ngtype_code,nbtype_code
0,0,4.406719,100,0,1.44255,1,1.0,1.0,1.0,180,1,1,0,-0.016218,0.849882,1.326759,2.181841
1,1,4.382027,0,0,1.33235,2,1.0,1.0,1.0,90,1,18,0,-0.016218,0.849882,-0.434121,-1.296388
2,2,4.219508,100,0,1.44246,1,1.0,1.0,1.0,6,1,20,0,-0.016218,0.849882,1.326759,2.181841
3,3,5.187386,100,0,1.34541,6,1.0,2.0,3.0,90,1,20,1,1.242553,0.849882,0.446319,1.684951
4,4,4.553877,100,0,1.34567,3,0.5,1.0,1.0,90,1,24,1,0.75841,0.849882,0.446319,1.684951


In [66]:
# Drop the unnecessary saved index column
listings2 = listings2.drop(columns=['Unnamed: 0'])
listings2.head(5)

Unnamed: 0,price,host_response_rate,host_is_superhost,latitude,accommodates,bathrooms_text,bedrooms,beds,minimum_nights,has_availability,number_of_reviews,instant_bookable,ptype_code,rtype_code,ngtype_code,nbtype_code
0,4.406719,100,0,1.44255,1,1.0,1.0,1.0,180,1,1,0,-0.016218,0.849882,1.326759,2.181841
1,4.382027,0,0,1.33235,2,1.0,1.0,1.0,90,1,18,0,-0.016218,0.849882,-0.434121,-1.296388
2,4.219508,100,0,1.44246,1,1.0,1.0,1.0,6,1,20,0,-0.016218,0.849882,1.326759,2.181841
3,5.187386,100,0,1.34541,6,1.0,2.0,3.0,90,1,20,1,1.242553,0.849882,0.446319,1.684951
4,4.553877,100,0,1.34567,3,0.5,1.0,1.0,90,1,24,1,0.75841,0.849882,0.446319,1.684951


In [67]:
# Target is the price column; every remaining column is a feature
y = listings2['price']
X = listings2.drop(columns=['price'])

In [68]:
# Hold out 30% of the rows as the test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [69]:
# Hyper-parameter candidates explored by GridSearchCV for the bagging model
br_param_grid = dict(
    n_estimators=[50, 100, 200],
    max_features=[1, 2, 4, 6, 8],
    max_samples=[0.5, 0.1],
    bootstrap=[True, False],
    bootstrap_features=[True, False],
)

In [70]:
# Instantiate the base Bagging Regressor (seeded so results are repeatable)
br_reg = BaggingRegressor(random_state=42)

In [71]:
# Configure a 5-fold grid search over br_param_grid, using all available cores
br_grid = GridSearchCV(
    estimator=br_reg,
    param_grid=br_param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [72]:
# Run the grid search on the training split only
br_grid.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 120 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   23.4s
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:   54.2s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:  1.7min finished


GridSearchCV(cv=5, estimator=BaggingRegressor(random_state=42), n_jobs=-1,
             param_grid={'bootstrap': [True, False],
                         'bootstrap_features': [True, False],
                         'max_features': [1, 2, 4, 6, 8],
                         'max_samples': [0.5, 0.1],
                         'n_estimators': [50, 100, 200]},
             verbose=2)

In [73]:
# Take the best bagging estimator found by the grid search
# and use it to predict prices for the held-out test set
br_best = br_grid.best_estimator_
y_pred = br_best.predict(X=X_test)

In [74]:
# 5-fold cross-validation scores of the selected estimator on the training split
scores = cross_val_score(estimator=br_best, X=X_train, y=y_train, cv=5)

In [75]:
# Collect test-set performance metrics plus the mean cross-validation score
br_dict2 = dict([
    ('Model', 'Bagging Regression (b)'),
    ('R^2', metrics.r2_score(y_test, y_pred)),
    ('MAE', metrics.mean_absolute_error(y_test, y_pred)),
    ('MSE', metrics.mean_squared_error(y_test, y_pred)),
    ('RMSE', np.sqrt(metrics.mean_squared_error(y_test, y_pred))),
    ('CVS', scores.mean()),
])


In [106]:
# Display model performance metrics as a one-row table.
# Building the frame from a list holding the dict gives one column per metric
# with its own dtype (numeric metrics stay numeric); the previous
# from_dict(orient='index').T round-trip coerced every column to object.
br_reg_metrics2 = pd.DataFrame([br_dict2])
br_reg_metrics2

Unnamed: 0,Model,R^2,MAE,MSE,RMSE,CVS
0,Bagging Regression (b),0.743065,0.27037,0.175427,0.418841,0.722946



## E.3(c) - Listing & Review data without Feature Selection


In [77]:
# Load the listing & review dataset (no feature selection) and preview the first rows
listings3 = pd.read_csv(filepath_or_buffer='c.csv')
listings3.head(5)

Unnamed: 0.1,Unnamed: 0,host_response_rate,host_is_superhost,host_total_listings_count,host_has_profile_pic,host_identity_verified,latitude,longitude,accommodates,bathrooms_text,...,minimum_nights,maximum_nights,has_availability,number_of_reviews,instant_bookable,ptype_code,rtype_code,ngtype_code,nbtype_code,avgcomp
0,0,100.0,0.0,2.0,1.0,1.0,1.44255,103.7958,1.0,1.0,...,180.0,360.0,1.0,1.0,0.0,-0.016218,0.849882,1.326759,2.181841,0.9615
1,1,0.0,0.0,1.0,1.0,1.0,1.33235,103.78521,2.0,1.0,...,90.0,730.0,1.0,18.0,0.0,-0.016218,0.849882,-0.434121,-1.296388,0.825117
2,2,100.0,0.0,2.0,1.0,1.0,1.44246,103.79667,1.0,1.0,...,6.0,14.0,1.0,20.0,0.0,-0.016218,0.849882,1.326759,2.181841,0.89919
3,3,100.0,0.0,8.0,1.0,1.0,1.34541,103.95712,6.0,1.0,...,90.0,1125.0,1.0,20.0,1.0,1.242553,0.849882,0.446319,1.684951,0.577285
4,4,100.0,0.0,8.0,1.0,1.0,1.34567,103.95963,3.0,0.5,...,90.0,1125.0,1.0,24.0,1.0,0.75841,0.849882,0.446319,1.684951,0.7472


In [78]:
# Drop the unnecessary saved index column
listings3 = listings3.drop(columns=['Unnamed: 0'])
listings3.head(5)

Unnamed: 0,host_response_rate,host_is_superhost,host_total_listings_count,host_has_profile_pic,host_identity_verified,latitude,longitude,accommodates,bathrooms_text,bedrooms,...,minimum_nights,maximum_nights,has_availability,number_of_reviews,instant_bookable,ptype_code,rtype_code,ngtype_code,nbtype_code,avgcomp
0,100.0,0.0,2.0,1.0,1.0,1.44255,103.7958,1.0,1.0,1.0,...,180.0,360.0,1.0,1.0,0.0,-0.016218,0.849882,1.326759,2.181841,0.9615
1,0.0,0.0,1.0,1.0,1.0,1.33235,103.78521,2.0,1.0,1.0,...,90.0,730.0,1.0,18.0,0.0,-0.016218,0.849882,-0.434121,-1.296388,0.825117
2,100.0,0.0,2.0,1.0,1.0,1.44246,103.79667,1.0,1.0,1.0,...,6.0,14.0,1.0,20.0,0.0,-0.016218,0.849882,1.326759,2.181841,0.89919
3,100.0,0.0,8.0,1.0,1.0,1.34541,103.95712,6.0,1.0,2.0,...,90.0,1125.0,1.0,20.0,1.0,1.242553,0.849882,0.446319,1.684951,0.577285
4,100.0,0.0,8.0,1.0,1.0,1.34567,103.95963,3.0,0.5,1.0,...,90.0,1125.0,1.0,24.0,1.0,0.75841,0.849882,0.446319,1.684951,0.7472


In [79]:
# Target is the price column; every remaining column is a feature
y = listings3['price']
X = listings3.drop(columns=['price'])

In [80]:
# Hold out 30% of the rows as the test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [81]:
# Hyper-parameter candidates explored by GridSearchCV for the bagging model
br_param_grid = dict(
    n_estimators=[50, 100, 200],
    max_features=[1, 2, 4, 6, 8],
    max_samples=[0.5, 0.1],
    bootstrap=[True, False],
    bootstrap_features=[True, False],
)

In [82]:
# Instantiate the base Bagging Regressor (seeded so results are repeatable)
br_reg = BaggingRegressor(random_state=42)

In [83]:
# Configure a 5-fold grid search over br_param_grid, using all available cores
br_grid = GridSearchCV(
    estimator=br_reg,
    param_grid=br_param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [84]:
# Run the grid search on the training split only
br_grid.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 120 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   22.8s
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:   51.4s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:  1.5min finished


GridSearchCV(cv=5, estimator=BaggingRegressor(random_state=42), n_jobs=-1,
             param_grid={'bootstrap': [True, False],
                         'bootstrap_features': [True, False],
                         'max_features': [1, 2, 4, 6, 8],
                         'max_samples': [0.5, 0.1],
                         'n_estimators': [50, 100, 200]},
             verbose=2)

In [85]:
# Take the best bagging estimator found by the grid search
# and use it to predict prices for the held-out test set
br_best = br_grid.best_estimator_
y_pred = br_best.predict(X=X_test)

In [86]:
# 5-fold cross-validation scores of the selected estimator on the training split
scores = cross_val_score(estimator=br_best, X=X_train, y=y_train, cv=5)

In [87]:
# Collect test-set performance metrics plus the mean cross-validation score
br_dict3 = dict([
    ('Model', 'Bagging Regression (c)'),
    ('R^2', metrics.r2_score(y_test, y_pred)),
    ('MAE', metrics.mean_absolute_error(y_test, y_pred)),
    ('MSE', metrics.mean_squared_error(y_test, y_pred)),
    ('RMSE', np.sqrt(metrics.mean_squared_error(y_test, y_pred))),
    ('CVS', scores.mean()),
])


In [109]:
# Display model performance metrics as a one-row table.
# Building the frame from a list holding the dict gives one column per metric
# with its own dtype (numeric metrics stay numeric); the previous
# from_dict(orient='index').T round-trip coerced every column to object.
br_reg_metrics3 = pd.DataFrame([br_dict3])
br_reg_metrics3

Unnamed: 0,Model,R^2,MAE,MSE,RMSE,CVS
0,Bagging Regression (c),0.734623,0.273348,0.182522,0.427226,0.732074



## E.3(d) - Listing & Review data with Feature Selection


In [89]:
# Load the feature-selected listing & review dataset and preview the first rows
listings4 = pd.read_csv(filepath_or_buffer='d.csv')
listings4.head(5)

Unnamed: 0.1,Unnamed: 0,price,host_response_rate,host_is_superhost,host_total_listings_count,latitude,accommodates,bathrooms_text,bedrooms,beds,minimum_nights,number_of_reviews,instant_bookable,ptype_code,rtype_code,ngtype_code,nbtype_code,avgcomp
0,0,4.406719,100.0,0.0,2.0,1.44255,1.0,1.0,1.0,1.0,180.0,1.0,0.0,-0.016218,0.849882,1.326759,2.181841,0.9615
1,1,4.382027,0.0,0.0,1.0,1.33235,2.0,1.0,1.0,1.0,90.0,18.0,0.0,-0.016218,0.849882,-0.434121,-1.296388,0.825117
2,2,4.219508,100.0,0.0,2.0,1.44246,1.0,1.0,1.0,1.0,6.0,20.0,0.0,-0.016218,0.849882,1.326759,2.181841,0.89919
3,3,5.187386,100.0,0.0,8.0,1.34541,6.0,1.0,2.0,3.0,90.0,20.0,1.0,1.242553,0.849882,0.446319,1.684951,0.577285
4,4,4.553877,100.0,0.0,8.0,1.34567,3.0,0.5,1.0,1.0,90.0,24.0,1.0,0.75841,0.849882,0.446319,1.684951,0.7472


In [90]:
# Drop the unnecessary saved index column
listings4 = listings4.drop(columns=['Unnamed: 0'])
listings4.head(5)

Unnamed: 0,price,host_response_rate,host_is_superhost,host_total_listings_count,latitude,accommodates,bathrooms_text,bedrooms,beds,minimum_nights,number_of_reviews,instant_bookable,ptype_code,rtype_code,ngtype_code,nbtype_code,avgcomp
0,4.406719,100.0,0.0,2.0,1.44255,1.0,1.0,1.0,1.0,180.0,1.0,0.0,-0.016218,0.849882,1.326759,2.181841,0.9615
1,4.382027,0.0,0.0,1.0,1.33235,2.0,1.0,1.0,1.0,90.0,18.0,0.0,-0.016218,0.849882,-0.434121,-1.296388,0.825117
2,4.219508,100.0,0.0,2.0,1.44246,1.0,1.0,1.0,1.0,6.0,20.0,0.0,-0.016218,0.849882,1.326759,2.181841,0.89919
3,5.187386,100.0,0.0,8.0,1.34541,6.0,1.0,2.0,3.0,90.0,20.0,1.0,1.242553,0.849882,0.446319,1.684951,0.577285
4,4.553877,100.0,0.0,8.0,1.34567,3.0,0.5,1.0,1.0,90.0,24.0,1.0,0.75841,0.849882,0.446319,1.684951,0.7472


In [91]:
# Target is the price column; every remaining column is a feature
y = listings4['price']
X = listings4.drop(columns=['price'])

In [92]:
# Hold out 30% of the rows as the test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [93]:
# Hyper-parameter candidates explored by GridSearchCV for the bagging model
br_param_grid = dict(
    n_estimators=[50, 100, 200],
    max_features=[1, 2, 4, 6, 8],
    max_samples=[0.5, 0.1],
    bootstrap=[True, False],
    bootstrap_features=[True, False],
)

In [94]:
# Instantiate the base Bagging Regressor (seeded so results are repeatable)
br_reg = BaggingRegressor(random_state=42)

In [95]:
# Configure a 5-fold grid search over br_param_grid, using all available cores
br_grid = GridSearchCV(
    estimator=br_reg,
    param_grid=br_param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [96]:
# Run the grid search on the training split only
br_grid.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 120 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   25.4s
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:   55.3s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:  1.6min finished


GridSearchCV(cv=5, estimator=BaggingRegressor(random_state=42), n_jobs=-1,
             param_grid={'bootstrap': [True, False],
                         'bootstrap_features': [True, False],
                         'max_features': [1, 2, 4, 6, 8],
                         'max_samples': [0.5, 0.1],
                         'n_estimators': [50, 100, 200]},
             verbose=2)

In [97]:
# Take the best bagging estimator found by the grid search
# and use it to predict prices for the held-out test set
br_best = br_grid.best_estimator_
y_pred = br_best.predict(X=X_test)

In [98]:
# 5-fold cross-validation scores of the selected estimator on the training split
scores = cross_val_score(estimator=br_best, X=X_train, y=y_train, cv=5)

In [99]:
# Collect test-set performance metrics plus the mean cross-validation score
br_dict4 = dict([
    ('Model', 'Bagging Regression (d)'),
    ('R^2', metrics.r2_score(y_test, y_pred)),
    ('MAE', metrics.mean_absolute_error(y_test, y_pred)),
    ('MSE', metrics.mean_squared_error(y_test, y_pred)),
    ('RMSE', np.sqrt(metrics.mean_squared_error(y_test, y_pred))),
    ('CVS', scores.mean()),
])


In [112]:
# Display model performance metrics as a one-row table.
# Building the frame from a list holding the dict gives one column per metric
# with its own dtype (numeric metrics stay numeric); the previous
# from_dict(orient='index').T round-trip coerced every column to object.
br_reg_metrics4 = pd.DataFrame([br_dict4])
br_reg_metrics4

Unnamed: 0,Model,R^2,MAE,MSE,RMSE,CVS
0,Bagging Regression (d),0.746308,0.264306,0.174485,0.417714,0.741149
