In [2]:
%reload_ext autoreload
%autoreload 2

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [11]:
import pandas as pd
import numpy as np
import lightgbm as lgb

from datetime import datetime
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold, TimeSeriesSplit
import optuna

from src.paths import PROCESSED_DATA_DIR
from src.data_split  import trainTestSplit
from src.model import getPipeline

df = pd.read_parquet(PROCESSED_DATA_DIR / "tabular_data.parquet")

In [19]:
X_train, y_train, X_test, y_test = trainTestSplit(
    df,
    cutoff_date=datetime(2022,8,1,0,0,0),
    tgt_col_name='tgt_rides_next_hr'
)

print(f"{X_train.shape=}")
print(f"{y_train.shape=}")
print(f"{X_test.shape=}")
print(f"{y_test.shape=}")

X_train.shape=(48208, 674)
y_train.shape=(48208,)
X_test.shape=(40086, 674)
y_test.shape=(40086,)


In [27]:
y_train

0        17.0
1         9.0
2         3.0
3         3.0
4         1.0
         ... 
48203     0.0
48204     0.0
48205     0.0
48206     0.0
48207     0.0
Name: tgt_rides_next_hr, Length: 48208, dtype: float32

In [20]:
def objective(trial: optuna.trial.Trial) ->float:
    """Given set of hyper-parameters, train a model and computes an avg validation error
    based on a TimeSeriesSplit

    Args:
        trial (optuna.trial.Trial): _description_

    Returns:
        float: _description_
    """
    # Pick hyper-params
    hyperparams = {
        "metric": "mae",
        "verbose": -1,
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 3, 100)
    }
    
    ts_split = TimeSeriesSplit(n_splits=4)
    
    scores = []
    
    for train_index, val_index in ts_split.split(X_train):
        
        # Split data for training and validation
        X_train_, X_val_ = X_train.iloc[train_index, :], X_train.iloc[val_index, :]
        y_train_, y_val_ = y_train.iloc[train_index], y_train.iloc[val_index]
        
        # train the model
        pipeline = getPipeline(**hyperparams)
        pipeline.fit(X_train_, y_train_)
        
        # evaluate the model
        y_pred = pipeline.predict(X_val_)
        mae = mean_absolute_error(y_val_, y_pred)
        
        scores.append(mae)
        
    
    return np.array(scores).mean()
    
        

In [21]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=5)

[32m[I 2023-05-07 20:03:27,314][0m A new study created in memory with name: no-name-d0cbf2ce-e98b-4b3a-883a-5ce152303839[0m




[32m[I 2023-05-07 20:04:18,120][0m Trial 0 finished with value: 1.4663495484134648 and parameters: {'num_leaves': 245, 'feature_fraction': 0.9331099739159932, 'bagging_fraction': 0.44677672704265015, 'min_child_samples': 52}. Best is trial 0 with value: 1.4663495484134648.[0m




[32m[I 2023-05-07 20:04:32,847][0m Trial 1 finished with value: 1.4440665890744784 and parameters: {'num_leaves': 118, 'feature_fraction': 0.4083333548909616, 'bagging_fraction': 0.8056991541527343, 'min_child_samples': 90}. Best is trial 1 with value: 1.4440665890744784.[0m




[32m[I 2023-05-07 20:04:55,659][0m Trial 2 finished with value: 1.7147454861057816 and parameters: {'num_leaves': 188, 'feature_fraction': 0.2697122009380797, 'bagging_fraction': 0.5333700381407815, 'min_child_samples': 10}. Best is trial 1 with value: 1.4440665890744784.[0m




[32m[I 2023-05-07 20:05:10,535][0m Trial 3 finished with value: 1.5801232705779393 and parameters: {'num_leaves': 215, 'feature_fraction': 0.22880581249207596, 'bagging_fraction': 0.5751002940727467, 'min_child_samples': 72}. Best is trial 1 with value: 1.4440665890744784.[0m




[32m[I 2023-05-07 20:05:42,542][0m Trial 4 finished with value: 1.628345840281481 and parameters: {'num_leaves': 224, 'feature_fraction': 0.28024497667395853, 'bagging_fraction': 0.4262314213710213, 'min_child_samples': 16}. Best is trial 1 with value: 1.4440665890744784.[0m


In [22]:
best_params = study.best_params
print(f"Best params: {best_params}")

Best params: {'num_leaves': 118, 'feature_fraction': 0.4083333548909616, 'bagging_fraction': 0.8056991541527343, 'min_child_samples': 90}


In [23]:
pipeline = getPipeline(**best_params)
pipeline.fit(X_train, y_train)



In [24]:
preds = pipeline.predict(X_test)
test_mae = mean_absolute_error(y_test, preds)
print(f"Test MAE: {test_mae}")


Test MAE: 2.553513604928686


In [28]:
from src.plot import plotSample

plotSample(
    features=X_test,
    eg_id=2979,
    targets=y_test,
    predictions=pd.Series(preds)
)