In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from sklearn.preprocessing import  OneHotEncoder, SplineTransformer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import cross_validate
from sklearn.model_selection import TimeSeriesSplit

In [3]:
# Load the pre-processed bike-sharing table; the path is relative to the
# notebook's working directory.
parquet_path = "../../data/bike_sharing/processed/bike_sharing.parquet"
df = pd.read_parquet(parquet_path)
df

Unnamed: 0,season,year,month,hour,holiday,weekday,workingday,weather,temp,feel_temp,humidity,windspeed,count,count_mean_roll_7,count_mean_roll_30,count_lag_1
29,spring,0.0,1.0,6.0,False,0.0,False,rain,17.22,21.210,0.77,19.9995,2.0,13.285714,34.633333,3.0
30,spring,0.0,1.0,7.0,False,0.0,False,misty,16.40,20.455,0.76,12.9980,1.0,7.857143,34.133333,2.0
31,spring,0.0,1.0,8.0,False,0.0,False,rain,16.40,20.455,0.71,15.0013,8.0,6.571429,33.066667,1.0
32,spring,0.0,1.0,9.0,False,0.0,False,misty,15.58,19.695,0.76,15.0013,20.0,7.000000,32.666667,8.0
33,spring,0.0,1.0,10.0,False,0.0,False,misty,14.76,17.425,0.81,15.0013,53.0,13.285714,34.000000,20.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17374,spring,1.0,12.0,19.0,False,1.0,True,misty,10.66,12.880,0.60,11.0014,119.0,197.714286,114.166667,122.0
17375,spring,1.0,12.0,20.0,False,1.0,True,misty,10.66,12.880,0.60,11.0014,89.0,181.428571,111.800000,119.0
17376,spring,1.0,12.0,21.0,False,1.0,True,clear,10.66,12.880,0.60,11.0014,90.0,159.000000,110.200000,89.0
17377,spring,1.0,12.0,22.0,False,1.0,True,clear,10.66,13.635,0.56,8.9981,61.0,122.714286,107.800000,90.0


In [4]:
# Features: every column except the target.
X = df.drop("count", axis="columns")
# Target: demand divided by its maximum, so the largest value is 1 and the
# error metrics read as a fraction of peak demand.
y = df["count"] / df["count"].max()
# Fold "heavy_rain" into "rain" (the encoder categories used further down only
# list clear/misty/rain). The result is assigned back to the column instead of
# calling replace(inplace=True) on the `X["weather"]` selection: that form
# mutates an intermediate object, triggers pandas' chained-assignment warning,
# and leaves X unchanged when Copy-on-Write is enabled.
X["weather"] = X["weather"].replace(to_replace="heavy_rain", value="rain")
df

Unnamed: 0,season,year,month,hour,holiday,weekday,workingday,weather,temp,feel_temp,humidity,windspeed,count,count_mean_roll_7,count_mean_roll_30,count_lag_1
29,spring,0.0,1.0,6.0,False,0.0,False,rain,17.22,21.210,0.77,19.9995,2.0,13.285714,34.633333,3.0
30,spring,0.0,1.0,7.0,False,0.0,False,misty,16.40,20.455,0.76,12.9980,1.0,7.857143,34.133333,2.0
31,spring,0.0,1.0,8.0,False,0.0,False,rain,16.40,20.455,0.71,15.0013,8.0,6.571429,33.066667,1.0
32,spring,0.0,1.0,9.0,False,0.0,False,misty,15.58,19.695,0.76,15.0013,20.0,7.000000,32.666667,8.0
33,spring,0.0,1.0,10.0,False,0.0,False,misty,14.76,17.425,0.81,15.0013,53.0,13.285714,34.000000,20.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17374,spring,1.0,12.0,19.0,False,1.0,True,misty,10.66,12.880,0.60,11.0014,119.0,197.714286,114.166667,122.0
17375,spring,1.0,12.0,20.0,False,1.0,True,misty,10.66,12.880,0.60,11.0014,89.0,181.428571,111.800000,119.0
17376,spring,1.0,12.0,21.0,False,1.0,True,clear,10.66,12.880,0.60,11.0014,90.0,159.000000,110.200000,89.0
17377,spring,1.0,12.0,22.0,False,1.0,True,clear,10.66,13.635,0.56,8.9981,61.0,122.714286,107.800000,90.0


In [4]:
# Time-ordered cross-validation: 5 folds, each evaluated on 1,000 samples,
# with the training window capped at 10,000 samples.
ts_cv = TimeSeriesSplit(n_splits=5, max_train_size=10_000, test_size=1_000)

In [5]:
# Materialise the (train_indices, test_indices) pair of every fold so that
# individual splits can be inspected and reused below.
split_idx = [fold for fold in ts_cv.split(X, y)]
train_0, test_0 = split_idx[0]

In [6]:
# Levels of each categorical feature, in the order the ordinal encoder should
# assign codes. Column names and levels live in one mapping so the two lists
# derived from it cannot drift out of sync.
category_levels = {
    "weather": ["clear", "misty", "rain"],
    "season": ["spring", "summer", "fall", "winter"],
    "holiday": ["False", "True"],
    "workingday": ["False", "True"],
}
categorical_columns = list(category_levels)
categories = list(category_levels.values())
ordinal_encoder = OrdinalEncoder(categories=categories)


# Gradient-boosting baseline: ordinal-encode the categorical columns, pass
# every other column through unchanged.
gbrt_pipeline = make_pipeline(
    ColumnTransformer(
        transformers=[
            ("categorical", ordinal_encoder, categorical_columns),
        ],
        remainder="passthrough",
    ),
    HistGradientBoostingRegressor(
        # The ColumnTransformer emits the encoded categorical columns first and
        # the passthrough columns after them, so the categorical features are
        # the leading len(categorical_columns) columns of its output.
        categorical_features=range(len(categorical_columns)),
    ),
)

In [8]:
def evaluate(model, X, y, cv):
    """Cross-validate ``model`` and print its MAE and RMSE.

    Parameters
    ----------
    model : estimator handed to ``cross_validate``.
    X, y : features and target handed to ``cross_validate``.
    cv : cross-validation splitter (or any value ``cross_validate`` accepts).

    Returns
    -------
    None. The mean and standard deviation of both metrics across the folds are
    printed, formatted to three decimals.
    """
    scorers = ["neg_mean_absolute_error", "neg_root_mean_squared_error"]
    scores = cross_validate(model, X, y, cv=cv, scoring=scorers)

    # The scorers report negated errors; flip the sign to get the errors back.
    fold_mae = -scores["test_neg_mean_absolute_error"]
    fold_rmse = -scores["test_neg_root_mean_squared_error"]

    report_lines = [
        f"Mean Absolute Error:     {fold_mae.mean():.3f} +/- {fold_mae.std():.3f}",
        f"Root Mean Squared Error: {fold_rmse.mean():.3f} +/- {fold_rmse.std():.3f}",
    ]
    print("\n".join(report_lines))

In [9]:
# Score the gradient-boosting pipeline with the time-series splitter.
evaluate(gbrt_pipeline, X, y, ts_cv)

Mean Absolute Error:     0.044 +/- 0.004
Root Mean Squared Error: 0.067 +/- 0.007


In [10]:
# Refit the gradient-boosting pipeline on the last CV split and predict its
# held-out window.
train_last, test_last = split_idx[-1]
# TimeSeriesSplit yields *positional* indices, while this frame's index labels
# start at 29 rather than 0. Label-based `.loc` would therefore select rows
# shifted away from the fold (or fail on a missing label); `.iloc` selects the
# rows the splitter actually means.
X_train, y_train = X.iloc[train_last], y.iloc[train_last]
X_test, y_test = X.iloc[test_last], y.iloc[test_last]
gbrt_pipeline.fit(X_train, y_train)
y_pred = gbrt_pipeline.predict(X_test)

In [11]:
# Dense one-hot encoding that ignores categories unseen during fit.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the current spelling first and fall back for older releases.
try:
    one_hot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    # scikit-learn < 1.2 only accepts the older `sparse` keyword.
    one_hot_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

# Candidate ridge penalties: 25 values log-spaced between 1e-6 and 1e6.
alphas = np.logspace(-6, 6, 25)

# Linear baseline: one-hot categoricals, min-max scale every other column,
# then ridge regression with the penalty chosen by RidgeCV.
naive_linear_pipeline = make_pipeline(
    ColumnTransformer(
        transformers=[
            ("categorical", one_hot_encoder, categorical_columns),
        ],
        remainder=MinMaxScaler(),
    ),
    RidgeCV(alphas=alphas),
)

In [12]:
# Score the one-hot + ridge baseline with the same time-series splitter.
evaluate(naive_linear_pipeline, X, y, ts_cv)

Mean Absolute Error:     0.142 +/- 0.015
Root Mean Squared Error: 0.184 +/- 0.020


In [13]:
def periodic_spline_transformer(period, n_splines=None, degree=3):
    """Build a ``SplineTransformer`` with periodic extrapolation over ``[0, period]``.

    Parameters
    ----------
    period : length of one cycle; knots are spread evenly from 0 to ``period``.
    n_splines : number of splines to generate; defaults to ``period``.
    degree : polynomial degree of the splines (default 3).

    Returns
    -------
    An unfitted ``SplineTransformer`` with ``n_splines + 1`` evenly spaced
    knots, ``extrapolation="periodic"`` and ``include_bias=True``.
    """
    spline_count = period if n_splines is None else n_splines
    # periodic and include_bias is True -> one more knot than splines
    knot_count = spline_count + 1
    # SplineTransformer takes explicit knots as a column: (n_knots, 1).
    knot_positions = np.linspace(0, period, knot_count).reshape(-1, 1)
    return SplineTransformer(
        degree=degree,
        n_knots=knot_count,
        knots=knot_positions,
        extrapolation="periodic",
        include_bias=True,
    )

# (period, n_splines) for each calendar feature that gets a periodic spline
# expansion.
cyclic_feature_specs = {
    "month": (12, 6),
    "weekday": (7, 3),
    "hour": (24, 12),
}

# One-hot categoricals, periodic splines for the calendar features, min-max
# scaling for every remaining column.
cyclic_spline_transformer = ColumnTransformer(
    transformers=[("categorical", one_hot_encoder, categorical_columns)]
    + [
        (
            f"cyclic_{feature}",
            periodic_spline_transformer(period, n_splines=spline_count),
            [feature],
        )
        for feature, (period, spline_count) in cyclic_feature_specs.items()
    ],
    remainder=MinMaxScaler(),
)

# Same ridge model as the naive linear baseline, fed the spline features.
cyclic_spline_linear_pipeline = make_pipeline(
    cyclic_spline_transformer,
    RidgeCV(alphas=alphas),
)