In [1]:
import warnings
import sys
import os

# NOTE: this silences every warning category for the rest of the session,
# including deprecation warnings raised by the libraries imported below.
warnings.filterwarnings("ignore")

# Make the parent of the current working directory importable (the notebook
# imports `src.*` from there). Guarded so that re-running this cell does not
# keep appending duplicate entries to sys.path.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [3]:
%load_ext autoreload
%autoreload 2
import pandas as pd
from src.config import TRANSFORMED_DATA_DIR
from datetime import datetime
from src.data_utils import split_time_series_data
from src.experiment_utils import set_mlflow_tracking
from dotenv import load_dotenv
import numpy as np
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb
import mlflow
import mlflow.lightgbm
from mlflow.models.signature import infer_signature
from sklearn.model_selection import ParameterGrid

# Read the transformed tabular dataset from the project's data directory
df = pd.read_parquet(TRANSFORMED_DATA_DIR / "tabular_data.parquet")

# Split into train/test features and targets at the 2023-09-01 cutoff
X_train, y_train, X_test, y_test = split_time_series_data(
    df, cutoff_date=datetime(2023, 9, 1), target_column="target"
)

# Report the shape of each split, one per line
print(
    *(split.shape for split in (X_train, y_train, X_test, y_test)),
    sep="\n",
)

# Keep only the numeric feature columns for model training and evaluation
X_train_only_numeric = X_train.select_dtypes(include=np.number)
X_test_only_numeric = X_test.select_dtypes(include=np.number)

# Step 1: Tune Learning Rate
def tune_learning_rate(X_train, y_train, X_test, y_test, learning_rates=None,
                       n_estimators=100, random_state=42):
    """Pick the learning rate with the lowest MAE on the evaluation data.

    One LGBMRegressor is fitted per candidate learning rate on
    (X_train, y_train) and scored with mean absolute error on
    (X_test, y_test). Ties keep the first candidate encountered.

    Args:
        X_train: Training features.
        y_train: Training targets.
        X_test: Features used to score each candidate.
        y_test: Targets used to score each candidate.
        learning_rates: Iterable of candidate learning rates. Defaults to
            [0.01, 0.05, 0.1, 0.2, 0.3].
        n_estimators: Number of boosting rounds, held fixed during this step.
        random_state: Seed passed to every model.

    Returns:
        The candidate learning rate that produced the lowest MAE.

    Raises:
        ValueError: If ``learning_rates`` is empty.
    """
    if learning_rates is None:
        learning_rates = [0.01, 0.05, 0.1, 0.2, 0.3]
    learning_rates = list(learning_rates)
    if not learning_rates:
        # Without this guard the function would silently return None.
        raise ValueError("learning_rates must contain at least one value")

    best_lr = None
    best_mae = float('inf')

    for lr in learning_rates:
        model = lgb.LGBMRegressor(
            learning_rate=lr,
            n_estimators=n_estimators,
            random_state=random_state
        )
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        mae = mean_absolute_error(y_test, predictions)

        # Strict comparison: the earliest candidate wins on ties.
        if mae < best_mae:
            best_mae = mae
            best_lr = lr

        print(f"Learning Rate: {lr}, MAE: {mae:.4f}")

    print(f"Best Learning Rate: {best_lr}, Best MAE: {best_mae:.4f}")
    return best_lr

# Step 2: Tune Other Hyperparameters
def tune_other_params(X_train, y_train, X_test, y_test, best_lr, param_grid=None,
                      random_state=42):
    """Grid-search the remaining LightGBM hyperparameters at a fixed learning rate.

    Every combination produced by ``ParameterGrid(param_grid)`` is fitted on
    (X_train, y_train) and scored with mean absolute error on
    (X_test, y_test). Ties keep the first combination encountered.

    Args:
        X_train: Training features.
        y_train: Training targets.
        X_test: Features used to score each combination.
        y_test: Targets used to score each combination.
        best_lr: Learning rate applied to every candidate model.
        param_grid: Mapping of LGBMRegressor keyword name to a list of values.
            Defaults to a grid over num_leaves, max_depth, min_child_samples
            and n_estimators. Keys in the grid take precedence over
            ``best_lr`` / ``random_state`` if they collide.
        random_state: Seed passed to every model.

    Returns:
        Tuple ``(best_model, best_params, best_mae)``: the fitted model with
        the lowest MAE, the grid combination that produced it, and that MAE.
    """
    if param_grid is None:
        param_grid = {
            'num_leaves': [20, 31, 50],
            'max_depth': [5, 10, -1],
            'min_child_samples': [10, 20, 30],
            'n_estimators': [100, 200, 300]
        }

    best_params = None
    best_mae = float('inf')
    best_model = None

    for params in ParameterGrid(param_grid):
        # Forward every grid key, so custom grids are not limited to the
        # four hyperparameters in the default grid.
        model_kwargs = {'learning_rate': best_lr, 'random_state': random_state, **params}
        model = lgb.LGBMRegressor(**model_kwargs)
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        mae = mean_absolute_error(y_test, predictions)

        # Strict comparison: the earliest combination wins on ties.
        if mae < best_mae:
            best_mae = mae
            best_params = params
            best_model = model

        print(f"Params: {params}, MAE: {mae:.4f}")

    print(f"Best Params: {best_params}, Best MAE: {best_mae:.4f}")
    return best_model, best_params, best_mae

# Set up MLflow
# Load variables from a local .env file into the process environment
# (presumably the tracking credentials the helper below reads — verify in
# src.experiment_utils).
load_dotenv()
# NOTE(review): this rebinds the name `mlflow`, imported above as the module, to
# whatever set_mlflow_tracking() returns. Later code calls mlflow.start_run()
# and mlflow.lightgbm.log_model() through this name, so the helper presumably
# returns the configured mlflow module — confirm against src.experiment_utils.
mlflow = set_mlflow_tracking()

# Custom function to log LightGBM model to MLflow
def log_model_to_mlflow(model, X_test, experiment_name, metric_name, score, params=None):
    """Log a fitted LightGBM model, its parameters and one metric to MLflow.

    Args:
        model: Fitted LightGBM estimator exposing ``predict`` and, when
            ``params`` is not given, the attributes learning_rate,
            n_estimators, num_leaves, max_depth and min_child_samples.
        X_test: Feature data used to produce example predictions for the
            model signature.
        experiment_name: MLflow experiment the run is recorded under. If
            falsy, the currently active experiment is left unchanged.
        metric_name: Name under which ``score`` is logged.
        score: Metric value to log.
        params: Optional mapping of parameters to log. When omitted or empty,
            the five attributes listed above are read from ``model`` instead.
    """
    # Previously experiment_name was accepted but ignored, so runs were
    # written to whichever experiment happened to be active.
    if experiment_name:
        mlflow.set_experiment(experiment_name)

    with mlflow.start_run():
        # Log parameters
        if params:
            mlflow.log_params(params)
        else:
            mlflow.log_param("learning_rate", model.learning_rate)
            mlflow.log_param("n_estimators", model.n_estimators)
            mlflow.log_param("num_leaves", model.num_leaves)
            mlflow.log_param("max_depth", model.max_depth)
            mlflow.log_param("min_child_samples", model.min_child_samples)

        # Log metric
        mlflow.log_metric(metric_name, score)

        # Infer the input/output signature from the data and its predictions
        predictions = model.predict(X_test)
        signature = infer_signature(X_test, predictions)

        # Log the model
        mlflow.lightgbm.log_model(model, "model", signature=signature)

# Main execution
# NOTE(review): both tuning steps score candidates on the test split, so the
# MAE printed at the end is the same number used for model selection and is
# likely optimistic — consider tuning on a separate validation split.
print("Step 1: Tuning Learning Rate...")
best_lr = tune_learning_rate(
    X_train_only_numeric,
    y_train,
    X_test_only_numeric,
    y_test,
)

print("\nStep 2: Tuning Other Parameters...")
best_model, best_params, test_mae = tune_other_params(
    X_train_only_numeric,
    y_train,
    X_test_only_numeric,
    y_test,
    best_lr,
)

# Record the winning grid combination together with the tuned learning rate
params_with_lr = {**best_params, 'learning_rate': best_lr}
log_model_to_mlflow(
    model=best_model,
    X_test=X_test_only_numeric,
    experiment_name="LightGBM",
    metric_name="mean_absolute_error",
    score=test_mae,
    params=params_with_lr,
)

print(f"Final Test MAE: {test_mae:.4f}")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


INFO:src.experiment_utils:MLflow tracking URI and credentials set.


(55900, 674)
(55900,)
(31720, 674)
(31720,)
Step 1: Tuning Learning Rate...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.108359 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 159906
[LightGBM] [Info] Number of data points in the train set: 55900, number of used features: 673
[LightGBM] [Info] Start training from score 11.110286
Learning Rate: 0.01, MAE: 7.9715
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.102534 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 159906
[LightGBM] [Info] Number of data points in the train set: 55900, number of used features: 673
[LightGBM] [Info] Start training from score 11.110286
Learning Rate: 0.05, MAE: 3.3919
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.106570 seconds.
You can set `force_col_wise=true` to remove the overhead.
[Lig