In [1]:
# ======================================================================================
# Notebook setup
#
# Run this cell before all others to make sure that the Jupyter notebook works properly
# ======================================================================================

# Load IPython's autoreload extension and set it to mode 2, so that all imported
# modules are automatically reloaded before code is executed
%load_ext autoreload
%autoreload 2

# Shared figure size; presumably (width, height) for the plots -- it is not used
# by any cell visible here, TODO confirm against the plotting code
figsize = (14, 3.5)

# Exercise: Seoul Bike Sharing

## Loading and Preprocessing as Usual

**We will try using ensemble models on the Seoul Bike Sharing dataset**

First, let's load and preprocess the data as usual (for tree-based models)

In [2]:
import os
import pandas as pd

# Read the raw Seoul Bike Sharing table from the local data folder
fname = os.path.join('data', 'SeoulBikeData.csv')
data = pd.read_csv(fname, sep=',')

# Column roles: numeric inputs, categorical inputs and the regression target
num_input = [
    'Temperature(^C)',
    'Humidity(%)',
    'Wind speed (m/s)',
    'Visibility (10m)',
    'Dew point temperature(^C)',
    'Solar Radiation (MJ/m2)',
    'Rainfall(mm)',
    'Snowfall (cm)',
]
cat_input = [
    'Hour',
    'Seasons',
    'Holiday',
    'Functioning Day',
]
output = 'Rented Bike Count'

# Keep only the modelling columns and replace every categorical column with its
# integer category code; `assign` returns a new frame, so `data` stays untouched
encoded = {cname: data[cname].astype('category').cat.codes for cname in cat_input}
data_flt = data[num_input + cat_input + [output]].assign(**encoded)

data_flt.head()

Unnamed: 0,Temperature(^C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(^C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Hour,Seasons,Holiday,Functioning Day,Rented Bike Count
0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,0,3,1,1,254
1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,1,3,1,1,204
2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,2,3,1,1,173
3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,3,3,1,1,107
4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,4,3,1,1,78


In [5]:
from sklearn.model_selection import train_test_split

# Fraction of the rows held out for testing
test_set_fraction = .34

# Features: indexing with a list of column names yields a DataFrame
X = data_flt[num_input + cat_input]
# Target: indexing with a single column name (not a list) yields a Series
y = data_flt[output]

# The fixed random_state makes the split identical on every run
X_tr, X_ts, y_tr, y_ts = train_test_split(
    X, y,
    test_size=test_set_fraction,
    random_state=42,
)

n_tr, n_ts = len(X_tr), len(X_ts)
print(f"Number of training examples: {n_tr}")
print(f"Number of test examples: {n_ts}")

Number of training examples: 5781
Number of test examples: 2979


## Learning Ensemble Models

* Try learning a Random Forest for this problem
* If possible, try also calibrating the parameters using grid search and cross-validation

In [11]:
%%time
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

rfm = RandomForestRegressor()
param_grid = {'n_estimators': np.arange(50, 200, 50), 'max_depth': np.arange(2, 10, 2)}
rfm_cv = GridSearchCV(rfm, param_grid=param_grid)
rfm_cv.fit(X_tr, y_tr);
print(f'Best results with: {rfm_cv.best_params_}')

rfm_cv_pred_tr, rfm_cv_pred_ts = rfm_cv.predict(X_tr), rfm_cv.predict(X_ts)
print(f'R2: {r2_score(y_tr, rfm_cv_pred_tr):.3} (training), {r2_score(y_ts, rfm_cv_pred_ts):.3} (test)')

Best results with: {'max_depth': 8, 'n_estimators': 150}
R2: 0.874 (training), 0.828 (test)
CPU times: user 30.1 s, sys: 9.98 ms, total: 30.1 s
Wall time: 30.2 s


* Then, try learning a Gradient Boosted Tree model for this problem
* If possible, try also calibrating the parameters using grid search and cross-validation

In [12]:
%%time
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
import numpy as np

gbm = GradientBoostingRegressor()
param_grid = {'n_estimators': np.arange(50, 200, 50), 'max_depth': np.arange(2, 10, 2)}
gbm_cv = GridSearchCV(gbm, param_grid=param_grid)
gbm_cv.fit(X_tr, y_tr);
print(f'Best results with: {gbm_cv.best_params_}')

gbm_cv_pred_tr, gbm_cv_pred_ts = gbm_cv.predict(X_tr), gbm_cv.predict(X_ts)
print(f'R2: {r2_score(y_tr, gbm_cv_pred_tr):.3} (training), {r2_score(y_ts, gbm_cv_pred_ts):.3} (test)')

Best results with: {'max_depth': 6, 'n_estimators': 150}
R2: 0.955 (training), 0.867 (test)
CPU times: user 39.5 s, sys: 3.3 ms, total: 39.5 s
Wall time: 39.5 s
