# Hourly Bike Demand Forecasting


In [1]:
import pandas as pd
from pathlib import Path

from src.data_aggregation import load_trip_data, aggregate_hourly_demand

# Raw trip files live one level above the notebook, under data/raw.
raw_dir = Path("..") / "data" / "raw"

# Read the raw trips, then hand them to the project's hourly aggregation helper.
# The preview below shows one row per (start_station_id, hour_ts) with a ride_count.
df = load_trip_data(raw_dir)
agg = aggregate_hourly_demand(df)

# Preview the first rows of the aggregated frame.
agg.head()


Unnamed: 0,start_station_id,hour_ts,ride_count
0,2733.03,2024-06-01 09:00:00,1
1,2733.03,2024-06-01 11:00:00,1
2,2733.03,2024-06-01 14:00:00,3
3,2733.03,2024-06-01 15:00:00,4
4,2733.03,2024-06-01 17:00:00,1


In [2]:
# Derive two calendar features from the hourly timestamp:
#   hour    - hour of day taken from hour_ts
#   weekday - day-of-week index taken from hour_ts (Saturday 2024-06-01 -> 5)
agg = agg.assign(
    hour=lambda frame: frame["hour_ts"].dt.hour,
    weekday=lambda frame: frame["hour_ts"].dt.weekday,
)

# Preview the frame with the new columns appended.
agg.head()


Unnamed: 0,start_station_id,hour_ts,ride_count,hour,weekday
0,2733.03,2024-06-01 09:00:00,1,9,5
1,2733.03,2024-06-01 11:00:00,1,11,5
2,2733.03,2024-06-01 14:00:00,3,14,5
3,2733.03,2024-06-01 15:00:00,4,15,5
4,2733.03,2024-06-01 17:00:00,1,17,5


In [3]:
# Baseline inputs are the two calendar columns; the quantity to predict is the
# hourly ride count.
features = ["hour", "weekday"]
target = "ride_count"

# X is a two-column frame, y the matching series of ride counts.
X, y = agg.loc[:, features], agg.loc[:, target]

# Sanity check: both must have the same number of rows.
X.shape, y.shape


((912650, 2), (912650,))

In [4]:
from sklearn.model_selection import train_test_split

# Random 80/20 hold-out split; the fixed seed keeps the partition reproducible.
# NOTE(review): rows are shuffled regardless of hour_ts, so test hours are
# interleaved with training hours - a time-based split is listed as the next step.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Row/column counts of the two feature partitions.
X_train.shape, X_test.shape


((730120, 2), (182530, 2))

In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Baseline regressor: a 50-tree random forest, seeded for reproducibility and
# trained on all available cores (n_jobs=-1). `fit` returns the estimator
# itself, so construction and training can be chained.
model = RandomForestRegressor(n_estimators=50, random_state=42, n_jobs=-1).fit(
    X_train, y_train
)

# Score the held-out rows; MAE is expressed in rides per hour.
y_pred = model.predict(X_test)
mean_absolute_error(y_true=y_test, y_pred=y_pred)


4.046435753415493

## Baseline Model (Quick)

- Model: RandomForestRegressor
- Features: hour, weekday
- Metric: MAE (rides/hour)

Result:
- MAE ≈ 4.05 rides/hour on a random 80/20 split
- Next: use a time-based split + add station-level effects and lag features
