In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
from src.paths import TRANSFORMED_DATA_DIR

df = pd.read_parquet(TRANSFORMED_DATA_DIR / 'tabular_data.parquet')

df

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id,target_rides_next_hour
0,19.0,28.0,43.0,33.0,12.0,3.0,2.0,1.0,1.0,1.0,...,5.0,5.0,7.0,5.0,8.0,15.0,26.0,2023-01-29,4,53.0
1,1.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,2.0,0.0,...,3.0,1.0,5.0,2.0,1.0,2.0,0.0,2023-01-30,4,2.0
2,2.0,0.0,0.0,0.0,0.0,0.0,3.0,5.0,2.0,3.0,...,2.0,1.0,1.0,1.0,0.0,0.0,2.0,2023-01-31,4,0.0
3,3.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0,6.0,2.0,...,3.0,2.0,2.0,1.0,4.0,7.0,7.0,2023-02-01,4,4.0
4,0.0,2.0,0.0,1.0,1.0,1.0,3.0,1.0,9.0,3.0,...,3.0,5.0,7.0,5.0,3.0,7.0,4.0,2023-02-02,4,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80167,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023-11-26,27,0.0
80168,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023-11-27,27,0.0
80169,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023-11-28,27,0.0
80170,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023-11-29,27,0.0


In [3]:
from datetime import datetime
from src.data_split import train_test_split

X_train, y_train, X_test, y_test = train_test_split(
    df,
    cutoff_date=datetime(2023,6,1,0,0,0),
    target_column_name='target_rides_next_hour'
)

print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{X_test.shape=}')

X_train.shape=(32226, 674)
y_train.shape=(32226,)
X_test.shape=(47946, 674)
X_test.shape=(47946, 674)


In [8]:
import numpy as np
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
import optuna

from src.model import get_pipeline

def objective(trial: optuna.trial.Trial) -> float:

    hyperparams = {
        "metric": 'mae',
        "verbose": -1,
        "num_leaves": trial.suggest_int("num_leaves",2,256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 3, 100),
    }

    tss = TimeSeriesSplit(n_splits=4)
    scores = []
    for train_index, val_index in tss.split(X_train):
        X_train_, X_val_ = X_train.iloc[train_index, :], X_train.iloc[val_index,:]
        y_train_, y_val_ = y_train.iloc[train_index], y_train.iloc[val_index]

        pipeline = get_pipeline(**hyperparams)
        pipeline.fit(X_train_, y_train_)

        y_pred = pipeline.predict(X_val_)
        mae = mean_absolute_error(y_val_, y_pred)

        scores.append(mae)

    return np.array(scores).mean()

In [9]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=5)

[I 2024-01-25 19:34:58,099] A new study created in memory with name: no-name-7411c68b-d96c-40e4-9cc7-5382bea532d3


[I 2024-01-25 19:35:13,246] Trial 0 finished with value: 1.647427571928798 and parameters: {'num_leaves': 128, 'feature_fraction': 0.6607250179413171, 'bagging_fraction': 0.2960390966420217, 'min_child_samples': 33}. Best is trial 0 with value: 1.647427571928798.
[I 2024-01-25 19:35:23,492] Trial 1 finished with value: 1.6538534992358331 and parameters: {'num_leaves': 66, 'feature_fraction': 0.7127154725695302, 'bagging_fraction': 0.5701705001944466, 'min_child_samples': 45}. Best is trial 0 with value: 1.647427571928798.
[I 2024-01-25 19:35:31,572] Trial 2 finished with value: 1.6390075388364562 and parameters: {'num_leaves': 53, 'feature_fraction': 0.5334617069046004, 'bagging_fraction': 0.5752941036329032, 'min_child_samples': 51}. Best is trial 2 with value: 1.6390075388364562.
[I 2024-01-25 19:35:51,826] Trial 3 finished with value: 1.6579592985210714 and parameters: {'num_leaves': 239, 'feature_fraction': 0.8960538126845972, 'bagging_fraction': 0.6983112682912245, 'min_child_samp

In [11]:
best_params = study.best_trial.params
print(f'{best_params=}')

best_params={'num_leaves': 53, 'feature_fraction': 0.5334617069046004, 'bagging_fraction': 0.5752941036329032, 'min_child_samples': 51}


In [12]:
pipeline = get_pipeline(**best_params)
pipeline.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.092471 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 156147
[LightGBM] [Info] Number of data points in the train set: 32226, number of used features: 675
[LightGBM] [Info] Start training from score 11.417923


In [14]:
predictions = pipeline.predict(X_test)
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.4f}')

test_mae=2.8253


In [17]:
from src.plot import plot_one_sample

plot_one_sample(
    features=X_test,
    targets = y_test,
    example_id = 2900,
    predictions=pd.Series(predictions)
)

In [18]:
plot_one_sample(
    features=X_test,
    targets = y_test,
    example_id = 3900,
    predictions=pd.Series(predictions)
)