# Imports

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import BaggingRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
import numpy as np

# A. Data Preprocessing and Baseline

## 1. Data Loading and Feature Engineering

The dataset that we're using is a Bike Sharing Demand dataset, from the UCI ML repository. It contains 17000 hourly samples including time, weather, temperature, and the total count of rented bikes at that time. The goal is to accurately predict the number of rented bikes using the time and weather data. 

In [2]:
data = pd.read_csv('hour.csv')
data

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0000,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.80,0.0000,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.80,0.0000,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0000,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0000,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17374,17375,2012-12-31,1,1,12,19,0,1,1,2,0.26,0.2576,0.60,0.1642,11,108,119
17375,17376,2012-12-31,1,1,12,20,0,1,1,2,0.26,0.2576,0.60,0.1642,8,81,89
17376,17377,2012-12-31,1,1,12,21,0,1,1,1,0.26,0.2576,0.60,0.1642,7,83,90
17377,17378,2012-12-31,1,1,12,22,0,1,1,1,0.26,0.2727,0.56,0.1343,13,48,61


Let's drop the unnecessary columns, and split it into our features X and target y. 'instant' is just another index column, 'dteday' is the date, which can also be dropped. 'casual' and 'registered' add up to give the total count. Since we only care about the total count, we can drop those two columns as well. 

In [3]:
X = data.drop(columns = ["instant", "dteday", "casual", "registered", "cnt"], axis=1)
X

Unnamed: 0,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed
0,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0000
1,1,0,1,1,0,6,0,1,0.22,0.2727,0.80,0.0000
2,1,0,1,2,0,6,0,1,0.22,0.2727,0.80,0.0000
3,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0000
4,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...
17374,1,1,12,19,0,1,1,2,0.26,0.2576,0.60,0.1642
17375,1,1,12,20,0,1,1,2,0.26,0.2576,0.60,0.1642
17376,1,1,12,21,0,1,1,1,0.26,0.2576,0.60,0.1642
17377,1,1,12,22,0,1,1,1,0.26,0.2727,0.56,0.1343


In [4]:
y = data["cnt"]
y

0         16
1         40
2         32
3         13
4          1
        ... 
17374    119
17375     89
17376     90
17377     61
17378     49
Name: cnt, Length: 17379, dtype: int64

In [5]:
[(col, X[col].unique()) for col in X.columns if len(X[col].unique()) < 20]

[('season', array([1, 2, 3, 4])),
 ('yr', array([0, 1])),
 ('mnth', array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])),
 ('holiday', array([0, 1])),
 ('weekday', array([6, 0, 1, 2, 3, 4, 5])),
 ('workingday', array([0, 1])),
 ('weathersit', array([1, 2, 3, 4]))]

Next, we'll convert the categorical features like 'season', 'mnth', 'holiday', 'weekday', and 'weathersit' to one-hot features which is more suitable for a regression task

In [6]:
X_one_hot = pd.get_dummies(X, columns=["season", "mnth", "weekday", "weathersit"])
X_one_hot

Unnamed: 0,yr,hr,holiday,workingday,temp,atemp,hum,windspeed,season_1,season_2,...,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weathersit_1,weathersit_2,weathersit_3,weathersit_4
0,0,0,0,0,0.24,0.2879,0.81,0.0000,True,False,...,False,False,False,False,False,True,True,False,False,False
1,0,1,0,0,0.22,0.2727,0.80,0.0000,True,False,...,False,False,False,False,False,True,True,False,False,False
2,0,2,0,0,0.22,0.2727,0.80,0.0000,True,False,...,False,False,False,False,False,True,True,False,False,False
3,0,3,0,0,0.24,0.2879,0.75,0.0000,True,False,...,False,False,False,False,False,True,True,False,False,False
4,0,4,0,0,0.24,0.2879,0.75,0.0000,True,False,...,False,False,False,False,False,True,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17374,1,19,0,1,0.26,0.2576,0.60,0.1642,True,False,...,True,False,False,False,False,False,False,True,False,False
17375,1,20,0,1,0.26,0.2576,0.60,0.1642,True,False,...,True,False,False,False,False,False,False,True,False,False
17376,1,21,0,1,0.26,0.2576,0.60,0.1642,True,False,...,True,False,False,False,False,False,True,False,False,False
17377,1,22,0,1,0.26,0.2727,0.56,0.1343,True,False,...,True,False,False,False,False,False,True,False,False,False


## 2. Train/Test Split
We'll now split the data into train and test

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_one_hot, y, test_size = 0.2)
print(f"Shape of train features: {X_train.shape}")
print(f"Shape of test features: {X_test.shape}")
print(f"Shape of train target: {y_train.shape}")
print(f"Shape of test target: {y_test.shape}")

Shape of train features: (13903, 35)
Shape of test features: (3476, 35)
Shape of train target: (13903,)
Shape of test target: (3476,)


We'll also standardize it:

In [8]:
scaler_X = StandardScaler()
scaler_y = StandardScaler()
X_train = scaler_X.fit_transform(X_train)
X_test = scaler_X.transform(X_test)
y_train = scaler_y.fit_transform(pd.DataFrame(y_train))
y_test = scaler_y.transform(pd.DataFrame(y_test))

## 3. Baseline Model (Single Regressor)
Now we'll train a decision tree and linear regressor as a baseline

In [9]:
# Decision tree
tree = DecisionTreeRegressor(max_depth=6)
tree.fit(X_train, y_train)
y_pred_tree = tree.predict(X_test)
rmse_tree = np.sqrt(mean_squared_error(y_test, y_pred_tree))

# Linear regression
linreg = LinearRegression()
linreg.fit(X_train, y_train)
y_pred_lin = linreg.predict(X_test)
rmse_lin = np.sqrt(mean_squared_error(y_test, y_pred_lin))

if rmse_tree < rmse_lin:
    baseline_model = "Decision Tree Regressor"
    baseline_rmse = rmse_tree
else:
    baseline_model = "Linear Regression"
    baseline_rmse = rmse_lin

print(f"Decision Tree RMSE: {rmse_tree:.4f}")
print(f"Linear Regression RMSE: {rmse_lin:.4f}")
print(f"Baseline Model: {baseline_model} (RMSE = {baseline_rmse:.4f})")

Decision Tree RMSE: 0.5596
Linear Regression RMSE: 0.7775
Baseline Model: Decision Tree Regressor (RMSE = 0.5596)


# B. Ensemble Techniques for Bias and Variance Reduction

## 1. Bagging (Variance Reduction)

Bagging (Bootstrap Aggregating) is an ensemble learning technique. It helps by reducing variance and improving model stability. 
- Bootstrapping: First, we sample with replacement multiple times from the training data to create multiple training sets. Each one is slightly different from the original data.
- Model training: A separate model is trained on each one of these new datasets. Since each one is trained separately, they learn slightly different decision boundaries.
- Aggregation: We aggregate the results of all models during prediction. In the case of regression, we take the average. In classification, we take a majority vote. This reduces noise and variance in the model's predictions

In [10]:
bagging = BaggingRegressor(
    estimator=DecisionTreeRegressor(max_depth=6),
    n_estimators=50,
)
bagging.fit(X_train, y_train)
y_pred_bag = bagging.predict(X_test)
rmse_bag = np.sqrt(mean_squared_error(y_test, y_pred_bag))
print(f"Bagging RMSE: {rmse_bag:.4f}")

  return column_or_1d(y, warn=True)


Bagging RMSE: 0.5397


In this case, bagging didn't improve the results much. This is likely because the decision tree, being shallow, already has low variance and its predictions don't vary much across each model.

## 2. Boosting (Bias Reduction)

Boosting is an ensemble technique which helps by reducing bias. It builds a strong model by combining multiple weaker models sequentially. 
- First, we train a single weak learner on the training data
- Then, we evaluate the model and calculate the residuals (negative gradient) on the data
- We train a new model to predict those residuals
- Keep repeating this process, and then add up all of the results to get the final prediction

In [11]:
boost = GradientBoostingRegressor()
boost.fit(X_train, y_train)
y_pred_boost = boost.predict(X_test)
rmse_boost = np.sqrt(mean_squared_error(y_test, y_pred_boost))
print(f"Boosting RMSE: {rmse_boost:.4f}")

  y = column_or_1d(y, warn=True)  # TODO: Is this still required?


Boosting RMSE: 0.3946


Boosting performed much better compared to bagging, showing that bias reduction is effective in this case

# C. Stacking for Optimal Performance

Finally, we'll try stacking. Stacking is an ensemble technique that combines multiple base models by training a meta model that learns how to best combine their predictions. 
- A bunch of individual models are trained
- Their predictions are collected for all entries in the training data
- A new learner (in this case a ridge regressor) is trained on those predictions to predict the true value. It basically looks at all of the other model's predictions and then makes a final call on the output value.

Here, we'll use a KNN regressor, along with our bagging and boosting models from earlier

In [12]:
base_learners_no_knn = [
    # ('knn', KNeighborsRegressor()),
    ('bagging', bagging),         
    ('boosting', boost)            
]
base_learners = [
    ('knn', KNeighborsRegressor()),
    ('bagging', bagging),         
    ('boosting', boost)            
]

In [13]:
meta_learner_no_knn = Ridge(alpha=1.0)
meta_learner = Ridge(alpha=1.0)
stack_model_no_knn = StackingRegressor(
    estimators=base_learners_no_knn,
    final_estimator=meta_learner_no_knn,
    n_jobs=-1
)
stack_model = StackingRegressor(
    estimators=base_learners,
    final_estimator=meta_learner,
    n_jobs=-1
)


In [14]:
stack_model.fit(X_train, y_train)
stack_model_no_knn.fit(X_train, y_train)
y_pred_stack_no_knn = stack_model_no_knn.predict(X_test)
y_pred_stack = stack_model.predict(X_test)
rmse_stack = np.sqrt(mean_squared_error(y_test, y_pred_stack))
rmse_stack_no_knn = np.sqrt(mean_squared_error(y_test, y_pred_stack_no_knn))
print(f"Stacking RMSE: {rmse_stack:.4f}")
print(f"Stacking RMSE (no knn): {rmse_stack_no_knn}")

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Stacking RMSE: 0.3785
Stacking RMSE (no knn): 0.3804075291860996


As we can see, combining all 3 approaches with stacking gives us the best results. The KNN helps learning local features in the data.

# D. Final Analysis

## 1. Comparative Table

In [15]:
results = pd.DataFrame({
    'Model': [
        'Decision Tree Regressor',
        'Linear Regression',
        f'Baseline (Best: {baseline_model})',
        'Bagging Regressor',
        'Gradient Boosting Regressor',
        'Stacking Regressor'
    ],
    'RMSE': [
        rmse_tree,
        rmse_lin,
        baseline_rmse,
        rmse_bag,
        rmse_boost,
        rmse_stack
    ]
})

# Format neatly
results = results.sort_values(by='RMSE').reset_index(drop=True)
print(results.to_string(index=False))

                                   Model     RMSE
                      Stacking Regressor 0.378538
             Gradient Boosting Regressor 0.394606
                       Bagging Regressor 0.539672
                 Decision Tree Regressor 0.559561
Baseline (Best: Decision Tree Regressor) 0.559561
                       Linear Regression 0.777484


## 2. Conclusion

The best performing approach turned out to be the stacking regressor with an RMSE of 0.378, scoring much better compared to the baseline decision tree regressor (RMSE = 0.559). The stacking model combines both bagging and boosting approaches, and therefore helps us reduce both bias and variance. Typically, we experience high bias in simplistic models, and as the model complexity increases, we see bias drop and variance increase. This is the bias-variance tradeoff. The meta-learner was able to learn the optimal blend of all 3. 

Overall, having 3 different models helps here because of their diversity. Each model specializes/helps in a different aspect. The KNN helps learn local relationships, the bagging model helps reduce variance, and the boosting model helps reduce overall bias and focusing on hard to learn patterns. The ridge regression model finally is able to find the best way to combine all 3 for the final prediction