In [26]:
import warnings
warnings.filterwarnings("ignore")

In [35]:
import pandas as pd
from src.paths import TRANSFORMED_DATA_DIR

data = pd.read_parquet(TRANSFORMED_DATA_DIR / 'yellow_tripdata_features_target.parquet')

In [29]:
from lightgbm import LGBMRegressor

from src.training import train_test_split, get_cutoff_training_date

X_train, y_train, X_test, y_test = train_test_split(
    data,
    cutoff_date = get_cutoff_training_date(data),
    target_column_name='target_rides_next_hour'
    )

In [30]:
import numpy as np
from sklearn.model_selection import TimeSeriesSplit, KFold
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

import optuna

from src.model import get_pipeline
from typing import Callable

def lgbm_param_suggestion(trial: optuna.trial.Trial) -> dict:
    """ Define the hyperparameters search space """

    return {
        "metric": "rmse",
        "verbosity": -1,
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 3, 100),
    }

def objective(
        trial: optuna.trial.Trial, model: Callable, param_suggestion_func: Callable
        ) -> float:
    """
    Objective function for hyperparameter optimization, aiming to minimize the RMSE (Root Mean Squared Error) 
    of a specified model using cross-validated training.

    Parameters
    ----------
    trial : optuna.trial.Trial
        An Optuna Trial object used to suggest values for hyperparameters within the defined search space.
    model : Callable
        A callable that initializes the machine learning model (e.g., LGBMRegressor) with the suggested hyperparameters.
    param_suggestion_func : Callable
        A function that takes `trial` as input and returns a dictionary of hyperparameters for the model,
        defining the search space for each hyperparameter.
        
    Returns
    -------
    float
        The mean RMSE score across cross-validation splits, representing the model's performance 
        on unseen data for the current set of hyperparameters.
        
    Process
    -------
    1. Hyperparameter Definition:
       The function calls `param_suggestion_func(trial)`, which returns a dictionary of hyperparameters for the model,
       dynamically generated using `trial.suggest_*` methods to explore the search space.

    2. Cross-Validation:
       A `TimeSeriesSplit` object with 5 splits is used for cross-validation, ensuring that training data precedes
       validation data in time, which is crucial for time-series modeling.

       For each split:
        - `X_train_cv` and `y_train_cv`: Data used for training in this split.
        - `X_val_cv` and `y_val_cv`: Data used for validation in this split.
       
    3. Model Training and Evaluation:
       - The pipeline is defined using `get_pipeline(model, **hyperparameters)`, which integrates the model 
         with the specified set of hyperparameters.
       - The pipeline is trained on the training split.
       - Predictions are made on the validation split.
       - RMSE between the predicted and actual values is calculated and stored.

    4. Average RMSE Calculation:
       The function returns the mean RMSE score across all cross-validation folds, serving as the metric 
       for Optuna to minimize.

    Example
    -------
    study = optuna.create_study(direction="minimize")
    study.optimize(lambda trial: objective(trial, LGBMRegressor, lgbm_param_suggestions), n_trials=100)
    """
    # Define the hyperparameters
    hyperparameters = param_suggestion_func(trial)

    # Define the cross-validation strategy
    cv = TimeSeriesSplit(n_splits=5)

    # Define the list to store the RMSE
    scores = []

    # Loop over the cross-validation splits
    for train_index, test_index in cv.split(X_train):
        X_train_cv, X_val_cv = X_train.iloc[train_index, :], X_train.iloc[test_index, :]
        y_train_cv, y_val_cv = y_train.iloc[train_index], y_train.iloc[test_index]

        # Define the pipeline
        pipeline = get_pipeline(model,**hyperparameters)
        # Train the model
        pipeline.fit(X_train_cv, y_train_cv)
        # Make predictions
        y_pred = pipeline.predict(X_val_cv)
        # Compute and append the RMSE
        rmse = root_mean_squared_error(y_val_cv, y_pred)
        scores.append(rmse)
    
    return np.array(scores).mean()

    

In [31]:
optuna.logging.set_verbosity(optuna.logging.WARNING)
study = optuna.create_study(direction="minimize")

study.optimize(lambda trial: objective(trial, LGBMRegressor, lgbm_param_suggestion), n_trials=10)

In [32]:
best_params = study.best_trial.params
print(f'Best hyperparameters: {best_params}')

Best hyperparameters: {'num_leaves': 241, 'feature_fraction': 0.6509195804782792, 'bagging_fraction': 0.6888523806497837, 'min_child_samples': 10}


In [33]:
# Retrain the model with the best hyperparameters and
# entire training data

pipeline = get_pipeline(LGBMRegressor,**best_params)
pipeline.fit(X_train, y_train)

In [34]:
y_pred = pipeline.predict(X_test)
rmse = root_mean_squared_error(y_test, y_pred)
print(f'RMSE on test set: {rmse}')

RMSE on test set: 9.13483895705977


In [25]:
from src.plot import plot_one_sample

plot_one_sample(
    features = X_test,
    target = y_test,
    prediction = pd.Series(y_pred),
    sample_idx = 3000,
)