In [2]:
# ==========================================================
# Bike Demand Forecasting
# Bagging vs Subagging vs Boosting (Regression)
# Dataset Path: /content/hour.csv
# ==========================================================

# Install XGBoost if needed
!pip install -q xgboost

import pandas as pd
import numpy as np

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

# ----------------------------------------------------------
# 1. Load Dataset
# ----------------------------------------------------------
DATA_PATH = "/content/hour.csv"
TARGET = "cnt"

df = pd.read_csv(DATA_PATH)

print("Dataset Shape:", df.shape)

# Remove unnecessary columns
df = df.drop(columns=["instant", "dteday"], errors="ignore")

X = df.drop(columns=[TARGET])
y = df[TARGET]

# ----------------------------------------------------------
# 2. K-Fold Cross Validation
# ----------------------------------------------------------
kf = KFold(n_splits=5, shuffle=True, random_state=42)

results = []

# ----------------------------------------------------------
# 3. Ensemble Models
# ----------------------------------------------------------
models = {

    "RandomForest (Bagging)": RandomForestRegressor(
        n_estimators=200,
        max_depth=12,
        random_state=42,
        n_jobs=-1
    ),

    "Subagging (BaggingRegressor)": BaggingRegressor(
        estimator=DecisionTreeRegressor(),
        n_estimators=200,
        max_samples=0.7,
        random_state=42,
        n_jobs=-1
    ),

    "GradientBoosting (Boosting)": GradientBoostingRegressor(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=3,
        random_state=42
    ),

    "XGBoost (Boosting)": XGBRegressor(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1
    )
}

# ----------------------------------------------------------
# 4. Cross Validation Training
# ----------------------------------------------------------
for name, model in models.items():

    rmse_scores = []
    mae_scores = []

    print(f"\nTraining {name}")

    for train_idx, val_idx in kf.split(X):

        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model.fit(X_train, y_train)

        preds = model.predict(X_val)

        rmse = np.sqrt(mean_squared_error(y_val, preds))
        mae = mean_absolute_error(y_val, preds)

        rmse_scores.append(rmse)
        mae_scores.append(mae)

    results.append([
        name,
        np.mean(rmse_scores),
        np.std(rmse_scores),
        np.mean(mae_scores),
        np.std(mae_scores)
    ])

# ----------------------------------------------------------
# 5. Save CV Results
# ----------------------------------------------------------
results_df = pd.DataFrame(results, columns=[
    "Model",
    "RMSE_mean",
    "RMSE_std",
    "MAE_mean",
    "MAE_std"
])

results_df.to_csv("/content/cv_regression_results.csv", index=False)

print("\nCross Validation Results:")
print(results_df)

# ----------------------------------------------------------
# 6. Train Best Model
# ----------------------------------------------------------
best_model_name = results_df.sort_values("RMSE_mean").iloc[0]["Model"]
best_model = models[best_model_name]

print("\nBest Model:", best_model_name)

best_model.fit(X, y)

predictions = best_model.predict(X)

final_df = pd.DataFrame({
    "ActualCnt": y,
    "PredictedCnt": predictions
})

final_df.to_csv("/content/final_predictions.csv", index=False)

print("final_predictions.csv created!")

# ----------------------------------------------------------
# 7. Feature Importance (Top 8)
# ----------------------------------------------------------
if hasattr(best_model, "feature_importances_"):

    importance = pd.Series(
        best_model.feature_importances_,
        index=X.columns
    ).sort_values(ascending=False)

    print("\nTop 8 Important Features:")
    print(importance.head(8))


Dataset Shape: (17379, 17)

Training RandomForest (Bagging)

Training Subagging (BaggingRegressor)

Training GradientBoosting (Boosting)

Training XGBoost (Boosting)

Cross Validation Results:
                          Model  RMSE_mean  RMSE_std  MAE_mean   MAE_std
0        RandomForest (Bagging)   2.674723  0.557160  0.952002  0.038472
1  Subagging (BaggingRegressor)   2.750838  0.579156  0.938962  0.044020
2   GradientBoosting (Boosting)   4.756997  0.275157  2.835052  0.097847
3            XGBoost (Boosting)   5.961110  0.357773  3.347698  0.077641

Best Model: RandomForest (Bagging)
final_predictions.csv created!

Top 8 Important Features:
registered    0.947707
casual        0.052111
hum           0.000030
windspeed     0.000026
mnth          0.000023
hr            0.000021
temp          0.000020
atemp         0.000019
dtype: float64
