In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
from src.paths import TRANSFORMED_DATA_DIR

df = pd.read_parquet(TRANSFORMED_DATA_DIR / 'tabular_data_step_size_23h.parquet')
df

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id,target_rides_next_hour
0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,...,2.0,0.0,1.0,0.0,0.0,0.0,0.0,2022-01-29 00:00:00,1,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2022-01-29 23:00:00,1,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,2.0,2.0,0.0,1.0,2.0,0.0,0.0,2022-01-30 22:00:00,1,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,2.0,1.0,0.0,1.0,2022-01-31 21:00:00,1,1.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,2022-02-01 20:00:00,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93275,12.0,11.0,19.0,31.0,30.0,14.0,6.0,4.0,4.0,7.0,...,0.0,0.0,2.0,6.0,2.0,1.0,1.0,2022-12-27 13:00:00,265,2.0
93276,8.0,16.0,22.0,26.0,21.0,39.0,20.0,4.0,5.0,7.0,...,0.0,0.0,1.0,0.0,0.0,3.0,3.0,2022-12-28 12:00:00,265,2.0
93277,10.0,8.0,4.0,15.0,23.0,27.0,19.0,13.0,7.0,6.0,...,2.0,1.0,1.0,1.0,0.0,2.0,5.0,2022-12-29 11:00:00,265,2.0
93278,9.0,15.0,9.0,17.0,16.0,24.0,19.0,22.0,9.0,8.0,...,1.0,0.0,4.0,1.0,1.0,0.0,0.0,2022-12-30 10:00:00,265,5.0


In [3]:
from datetime import datetime
from src.data_split import train_test_split

X_train, y_train, X_test, y_test = train_test_split(
    df,
    cutoff_date=datetime(2022, 6, 1, 0, 0, 0),
    target_column_name='target_rides_next_hour'
)

print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')

X_train.shape=(34185, 674)
y_train.shape=(34185,)
X_test.shape=(59095, 674)
y_test.shape=(59095,)


In [5]:
import numpy as np
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
import optuna

from src.model import get_pipeline

def objective(trial: optuna.trial.Trial) -> float:
    """
    Given a set of hyper-parameters, it trains a model and computes an average
    validation error based on a TimeSeriesSplit
    """
    # pick hyper-parameters
    hyperparams = {
        "metric": 'mae',
        "verbose": -1,
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 3, 100),   
    }
       
    tss = TimeSeriesSplit(n_splits=2)
    scores = []
    for train_index, val_index in tss.split(X_train):

        # split data for training and validation
        X_train_, X_val_ = X_train.iloc[train_index, :], X_train.iloc[val_index,:]
        y_train_, y_val_ = y_train.iloc[train_index], y_train.iloc[val_index]
        
        # train the model
        pipeline = get_pipeline(**hyperparams)
        pipeline.fit(X_train_, y_train_)
        
        # evaluate the model
        y_pred = pipeline.predict(X_val_)
        mae = mean_absolute_error(y_val_, y_pred)

        scores.append(mae)
   
    # Return the mean score
    return np.array(scores).mean()

In [6]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=5)

[32m[I 2023-05-16 23:03:23,305][0m A new study created in memory with name: no-name-9dd7d542-df70-4ab4-9804-b6d56bd90691[0m




[32m[I 2023-05-16 23:03:55,024][0m Trial 0 finished with value: 3.6649096308205373 and parameters: {'num_leaves': 130, 'feature_fraction': 0.9814356338840302, 'bagging_fraction': 0.7585007274078854, 'min_child_samples': 11}. Best is trial 0 with value: 3.6649096308205373.[0m




[32m[I 2023-05-16 23:04:45,656][0m Trial 1 finished with value: 3.658324281137176 and parameters: {'num_leaves': 209, 'feature_fraction': 0.60696724131277, 'bagging_fraction': 0.8052778200971784, 'min_child_samples': 13}. Best is trial 1 with value: 3.658324281137176.[0m




[32m[I 2023-05-16 23:05:05,442][0m Trial 2 finished with value: 3.720074467756576 and parameters: {'num_leaves': 85, 'feature_fraction': 0.6769987557458856, 'bagging_fraction': 0.7812954025286771, 'min_child_samples': 25}. Best is trial 1 with value: 3.658324281137176.[0m




[32m[I 2023-05-16 23:05:13,657][0m Trial 3 finished with value: 3.7806561347645884 and parameters: {'num_leaves': 24, 'feature_fraction': 0.8619956556704884, 'bagging_fraction': 0.7810460732167053, 'min_child_samples': 40}. Best is trial 1 with value: 3.658324281137176.[0m




[32m[I 2023-05-16 23:05:40,486][0m Trial 4 finished with value: 3.9911780778781347 and parameters: {'num_leaves': 189, 'feature_fraction': 0.42567367226435765, 'bagging_fraction': 0.22377528594409776, 'min_child_samples': 99}. Best is trial 1 with value: 3.658324281137176.[0m


In [7]:
best_params = study.best_trial.params
print(f'{best_params=}')

best_params={'num_leaves': 209, 'feature_fraction': 0.60696724131277, 'bagging_fraction': 0.8052778200971784, 'min_child_samples': 13}


In [8]:
pipeline = get_pipeline(**best_params)
pipeline.fit(X_train, y_train)



In [9]:
predictions = pipeline.predict(X_test)
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.4f}')

test_mae=2.9489


In [10]:
from src.plot import plot_one_sample

plot_one_sample(
    features=X_test,
    targets=y_test,
    example_id=2979,
    predictions=pd.Series(predictions)
)

In [18]:
plot_one_sample(
    features=X_test,
    targets=y_test,
    example_id=3979,
    predictions=pd.Series(predictions)
)