# Introduction

In [1]:
"""
What? Bagging with a random forest applied to regression on the bike rental dataset

Reference: Corey Wade, “Hands-On Gradient Boosting with XGBoost and scikit-learn”
https://github.com/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn
"""

'\nWhat? Bagging with random forest\n\nRevision No: 1\nLaste revised: 08/02/21\nReference: Corey Wade. “Hands-On Gradient Boosting with XGBoost and scikit-learn\nGLM\n'

# Import modules

In [30]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from pylab import rcParams
from matplotlib import pyplot
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error as MSE

# Problem statement

In [None]:
"""
Imagine you work for a bike rental company and your goal is to predict the number of bike rentals per day 
depending upon the weather, the time of day, the time of year, and the growth of the company.
"""

# Load dataset

In [14]:
# Load the csv file 'bike_rentals_cleaned' into df_bikes
# (the path is relative, so it depends on the notebook's working directory)
df_bikes = pd.read_csv('../../DATASETS/bike_rentals_cleaned.csv')

In [15]:
# Show the first 5 rows to check the columns; the last column is later used as the target
df_bikes.head(5)

Unnamed: 0,instant,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,1,1.0,0.0,1,0.0,6.0,0.0,2,0.344167,0.363625,0.805833,0.160446,985
1,2,1.0,0.0,1,0.0,0.0,0.0,2,0.363478,0.353739,0.696087,0.248539,801
2,3,1.0,0.0,1,0.0,1.0,1.0,1,0.196364,0.189405,0.437273,0.248309,1349
3,4,1.0,0.0,1,0.0,2.0,1.0,1,0.2,0.212122,0.590435,0.160296,1562
4,5,1.0,0.0,1,0.0,3.0,1.0,1,0.226957,0.22927,0.436957,0.1869,1600


In [16]:
# Split data into X and y: every column except the last is a feature, the last column is the target
# NOTE(review): this keeps `instant` (1, 2, 3, ... in the preview of df_bikes) as a feature —
# confirm that a running row id is intended as a predictor
X_bikes = df_bikes.iloc[:,:-1]
y_bikes = df_bikes.iloc[:,-1]

# Split data into train and test sets (random_state is fixed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(X_bikes, y_bikes, random_state=2)

# Get a reasonable baseline performance

In [18]:
# Initialize a random forest regressor as rf with 50 estimators and warm_start=True
# (oob_score is not passed here, so it keeps its default)
rf = RandomForestRegressor(n_estimators=50, warm_start=True, n_jobs=-1, random_state=2)

# Obtain 10-fold cross-validation scores on the full dataset using negative mean squared error
scores = cross_val_score(rf, X_bikes, y_bikes, scoring='neg_mean_squared_error', cv=10)

# Scores are negated MSEs: flip the sign and take the square root to get the RMSE of each fold
rmse = np.sqrt(-scores)
print('RMSE:', np.round(rmse, 3))
print('RMSE mean: %0.3f' % (rmse.mean()))

RMSE: [ 836.482  541.898  533.086  812.782  894.877  881.117  794.103  828.968
  772.517 2128.148]
RMSE mean: 902.398


In [None]:
"""
This score is better than earlier in the chapter. Notice that the error in the last fold is much higher according 
to the last entry in the RMSE array. This could be due to errors within the data or outliers.
"""

# Create function for hyperparameter tuning

In [21]:
def randomized_search_reg(params, runs=16, reg=RandomForestRegressor(random_state=2, n_jobs=-1)):
    """Run a randomized hyperparameter search and print the resulting scores.

    The search is fitted on the notebook-level ``X_train`` / ``y_train`` and the
    best estimator is then evaluated on the notebook-level ``X_test`` / ``y_test``.

    Parameters
    ----------
    params : dict
        Candidate values for each hyperparameter, passed to RandomizedSearchCV.
    runs : int, default 16
        Number of parameter settings sampled (``n_iter``).
    reg : estimator
        Regressor to tune; defaults to a random forest with ``random_state=2``.

    Returns
    -------
    None
        Results are printed: the best parameters, the "Training score" (square
        root of the negated best cross-validation score, i.e. an RMSE from
        10-fold CV on the training split) and the test-set RMSE.
    """
    search = RandomizedSearchCV(
        estimator=reg,
        param_distributions=params,
        n_iter=runs,
        scoring='neg_mean_squared_error',
        cv=10,
        n_jobs=-1,
        random_state=2,
    )
    search.fit(X_train, y_train)

    # Report the winning parameter combination
    print("Best params:", search.best_params_)

    # best_score_ is a negated MSE, so flip the sign before taking the root
    cv_rmse = np.sqrt(-search.best_score_)
    print("Training score: {:.3f}".format(cv_rmse))

    # Evaluate the best estimator on the held-out test split
    test_predictions = search.best_estimator_.predict(X_test)
    test_rmse = MSE(y_test, test_predictions) ** 0.5
    print('Test set score: {:.3f}'.format(test_rmse))

### Trial No 1

In [22]:
# Trial 1: broad search over seven hyperparameters.
# max_features uses 1.0 (all features) instead of 'auto': for a random forest regressor
# 'auto' meant "all features", and newer scikit-learn releases no longer accept 'auto'.
randomized_search_reg(params={'min_weight_fraction_leaf':[0.0, 0.0025, 0.005, 0.0075, 0.01, 0.05],
                          'min_samples_split':[2, 0.01, 0.02, 0.03, 0.04, 0.06, 0.08, 0.1],
                          'min_samples_leaf':[1,2,4,6,8,10,20,30],
                          'min_impurity_decrease':[0.0, 0.01, 0.05, 0.10, 0.15, 0.2],
                          'max_leaf_nodes':[10, 15, 20, 25, 30, 35, 40, 45, 50, None],
                          'max_features':[1.0, 0.8, 0.7, 0.6, 0.5, 0.4],
                          'max_depth':[None,2,4,6,8,10,20]
                         })

Best params: {'min_weight_fraction_leaf': 0.0, 'min_samples_split': 0.03, 'min_samples_leaf': 6, 'min_impurity_decrease': 0.05, 'max_leaf_nodes': 25, 'max_features': 0.7, 'max_depth': None}
Training score: 759.076
Test set score: 701.802


### Trial No 2

In [23]:
# Trial 2: search over four hyperparameters only.
# max_features uses 1.0 (all features) instead of 'auto': for a random forest regressor
# 'auto' meant "all features", and newer scikit-learn releases no longer accept 'auto'.
randomized_search_reg(params={
                          'min_samples_leaf':[1,2,4,6,8,10,20,30],
                          'min_impurity_decrease':[0.0, 0.01, 0.05, 0.10, 0.15, 0.2],
                          'max_features':[1.0, 0.8, 0.7, 0.6, 0.5, 0.4],
                          'max_depth':[None,2,4,6,8,10,20],
                         })

Best params: {'min_samples_leaf': 1, 'min_impurity_decrease': 0.1, 'max_features': 0.6, 'max_depth': 10}
Training score: 679.052
Test set score: 626.541


### Trial No 3

In [24]:
# Trial 3: same four hyperparameters with a different max_depth list and 20 sampled settings.
# max_features uses 1.0 (all features) instead of 'auto': for a random forest regressor
# 'auto' meant "all features", and newer scikit-learn releases no longer accept 'auto'.
randomized_search_reg(params={
                          'min_samples_leaf':[1,2,4,6,8,10,20,30],
                          'min_impurity_decrease':[0.0, 0.01, 0.05, 0.10, 0.15, 0.2],
                          'max_features':[1.0, 0.8, 0.7, 0.6, 0.5, 0.4],
                          'max_depth':[None,4,6,8,10,12,15,20]
                         }, runs=20)

Best params: {'min_samples_leaf': 1, 'min_impurity_decrease': 0.1, 'max_features': 0.6, 'max_depth': 12}
Training score: 675.128
Test set score: 619.014


### Trial No 4

In [19]:
# Trial 4: smaller min_samples_leaf values, finer min_impurity_decrease steps, deeper trees.
# max_features uses 1.0 (all features) instead of 'auto': for a random forest regressor
# 'auto' meant "all features", and newer scikit-learn releases no longer accept 'auto'.
randomized_search_reg(params={
                          'min_samples_leaf':[1,2,3,4,5,6],
                          'min_impurity_decrease':[0.0, 0.01, 0.05, 0.08, 0.10, 0.12, 0.15],
                          'max_features':[1.0, 0.8, 0.7, 0.6, 0.5, 0.4],
                          'max_depth':[None,8,10,12,14,16,18,20]
                         })

Best params: {'min_samples_leaf': 1, 'min_impurity_decrease': 0.05, 'max_features': 0.7, 'max_depth': 18}
Training score: 679.595
Test set score: 630.954


### Trial No 5

In [25]:
# Trial 5: the Trial 3 grid with n_estimators pinned at 100.
# max_features uses 1.0 (all features) instead of 'auto': for a random forest regressor
# 'auto' meant "all features", and newer scikit-learn releases no longer accept 'auto'.
randomized_search_reg(params={
                          'min_samples_leaf':[1,2,4,6,8,10,20,30],
                          'min_impurity_decrease':[0.0, 0.01, 0.05, 0.10, 0.15, 0.2],
                          'max_features':[1.0, 0.8, 0.7, 0.6, 0.5, 0.4],
                          'max_depth':[None,4,6,8,10,12,15,20],
                        'n_estimators':[100]
                         }, runs=20)

Best params: {'n_estimators': 100, 'min_samples_leaf': 1, 'min_impurity_decrease': 0.1, 'max_features': 0.6, 'max_depth': 12}
Training score: 675.128
Test set score: 619.014


### Trial No 6

In [26]:
# Trial 6: narrowed search — max_features is fixed at 0.6 and n_estimators at 100, while
# min_samples_leaf, min_impurity_decrease and max_depth are restricted to shorter lists
randomized_search_reg(params={
                          'min_samples_leaf':[1,2,3],
                          'min_impurity_decrease':[0.0, 0.01, 0.05, 0.10],
                          'max_features':[0.6],
                          'max_depth':[10,12,14,16,18,20],
                        'n_estimators':[100]
                         }, runs=20)

Best params: {'n_estimators': 100, 'min_samples_leaf': 1, 'min_impurity_decrease': 0.0, 'max_features': 0.6, 'max_depth': 20}
Training score: 673.982
Test set score: 623.106


# Test best parameters against the whole dataset

In [27]:
# Initialize a random forest regressor as rf with 100 estimators and the
# min_impurity_decrease, max_features and max_depth values reported by Trials 3 and 5
# (warm_start and oob_score are not passed here)
rf = RandomForestRegressor(n_estimators=100,  min_impurity_decrease=0.1, max_features=0.6, max_depth=12, n_jobs=-1, random_state=2)

# Obtain 10-fold cross-validation scores on the full dataset using negative mean squared error
scores = cross_val_score(rf, X_bikes, y_bikes, scoring='neg_mean_squared_error', cv=10)

# Negate the scores and take the square root to get the RMSE of each fold
rmse = np.sqrt(-scores)

# Display the per-fold RMSE
print('RMSE:', np.round(rmse, 3))

# Display the mean RMSE across folds
print('RMSE mean: %0.3f' % (rmse.mean()))

RMSE: [ 818.354  514.173  547.392  814.059  769.54   730.025  831.376  794.634
  756.83  1595.237]
RMSE mean: 817.162


In [None]:
"""
The RMSE goes back up to 817. The score is much better than 902, but it's considerably worse than 619. What's 
going on here? There may be an issue with the last split in cross_val_score since its score is twice as bad as
the others. Let's see if shuffling the data does the trick.
"""

In [29]:
# Preview the rows before shuffling, for comparison with the shuffled frame
df_bikes.head()

Unnamed: 0,instant,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,1,1.0,0.0,1,0.0,6.0,0.0,2,0.344167,0.363625,0.805833,0.160446,985
1,2,1.0,0.0,1,0.0,0.0,0.0,2,0.363478,0.353739,0.696087,0.248539,801
2,3,1.0,0.0,1,0.0,1.0,1.0,1,0.196364,0.189405,0.437273,0.248309,1349
3,4,1.0,0.0,1,0.0,2.0,1.0,1,0.2,0.212122,0.590435,0.160296,1562
4,5,1.0,0.0,1,0.0,3.0,1.0,1,0.226957,0.22927,0.436957,0.1869,1600


In [32]:
# Shuffle the rows of df_bikes into a new frame (random_state is fixed so the order is repeatable)
df_shuffle_bikes = shuffle(df_bikes, random_state=2)
# Preview the shuffled rows
df_shuffle_bikes.head()

Unnamed: 0,instant,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
671,672,4.0,1.0,11,0.0,5.0,1.0,1,0.355,0.356042,0.522083,0.266175,5847
199,200,3.0,0.0,7,0.0,2.0,1.0,1,0.776667,0.747479,0.650417,0.1306,4541
77,78,1.0,0.0,3,0.0,6.0,0.0,1,0.4725,0.466525,0.379167,0.368167,3117
225,226,3.0,0.0,8,0.0,0.0,0.0,2,0.676667,0.624388,0.8175,0.222633,3820
37,38,1.0,0.0,2,0.0,1.0,1.0,1,0.271667,0.303658,0.738333,0.045408,1712


In [33]:
# Split the shuffled data into X and y: every column except the last is a feature,
# the last column is the target
X_shuffle_bikes = df_shuffle_bikes.iloc[:,:-1]
y_shuffle_bikes = df_shuffle_bikes.iloc[:,-1]

In [34]:
# Initialize a random forest regressor as rf with 100 estimators and the
# min_impurity_decrease, max_features and max_depth values reported by Trials 3 and 5
# (warm_start and oob_score are not passed here)
rf = RandomForestRegressor(n_estimators=100,  min_impurity_decrease=0.1, max_features=0.6, max_depth=12, n_jobs=-1, random_state=2)

# Obtain 10-fold cross-validation scores on the shuffled dataset using negative mean squared error
scores = cross_val_score(rf, X_shuffle_bikes, y_shuffle_bikes, scoring='neg_mean_squared_error', cv=10)

# Negate the scores and take the square root to get the RMSE of each fold
rmse = np.sqrt(-scores)

# Display the per-fold RMSE
print('RMSE:', np.round(rmse, 3))

# Display the mean RMSE across folds
print('RMSE mean: %0.3f' % (rmse.mean()))

RMSE: [630.093 686.673 468.159 526.676 593.033 724.575 774.402 672.63  760.253
 616.797]
RMSE mean: 645.329


In [None]:
"""
In the shuffled data, there is no issue with the last split, and the score is much better (the mean RMSE drops from about 817 to about 645), as expected.

At the end of the day, the random forest is limited by its individual trees. If all trees make the same 
mistake, the random forest makes this mistake. There are scenarios, as is revealed in this case study before
the data was shuffled, where random forests are unable to significantly improve upon errors due to challenges
within the data that individual trees are unable to address.

An ensemble method capable of improving upon initial shortcomings, one whose later rounds learn from the 
mistakes of earlier trees, could be advantageous. Boosting was designed to learn from the mistakes of
trees in early rounds. Boosting, in particular gradient boosting – the focus of the next chapter – addresses 
this topic.
"""