# LightGBM Model

In [None]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor # The ML model
from utilsforecast.losses import rmse, mae
from mlforecast import MLForecast

from hierarchicalforecast.core import HierarchicalReconciliation
from hierarchicalforecast.methods import MinTrace
from hierarchicalforecast.utils import aggregate

from valuation.infra.store.dataset import DatasetStore
from valuation.asset.identity.dataset import DatasetID
from valuation.core.stage import DatasetStage
from valuation.asset.identity.model import ModelPassport
from valuation.asset.model.mlforecast import MLForecastModel
from valuation.infra.store.model import ModelStore



## Model Parameters

In [None]:
# Number of boosting rounds (trees) given to the LightGBM regressor.
N_ESTIMATORS = 100

# Hard-coded core count (presumably that of the training machine -- confirm).
NUM_CORES = 24

# Leave two cores unused, but never ask for fewer than one worker.
SAFE_N_JOBS = NUM_CORES - 2 if NUM_CORES > 2 else 1

## Training Data

In [None]:
# Resolve the model-stage "train_val" dataset through the project store and
# expose its `.data` attribute as the training frame used by later cells.
store = DatasetStore()
dataset_id = DatasetID(
    name="train_val",
    stage=DatasetStage.MODEL,
)
passport = store.get_passport(dataset_id=dataset_id)
ds = store.get(passport=passport)
train_df = ds.data

## Define the Model
We instantiate a LightGBM model.

In [None]:

# Settings for the single LightGBM regressor: tree count and worker count come
# from the parameter cell, and the seed is pinned to 42.
lgbm_settings = {
    "n_estimators": N_ESTIMATORS,
    "n_jobs": SAFE_N_JOBS,
    "random_state": 42,
}
models = [LGBMRegressor(**lgbm_settings)]

## Feature Engineering

In [None]:
# Weekly offset anchored on Wednesday (pandas weekday index 2).
weekly_wednesday = pd.offsets.Week(weekday=2)

# Features are generated by MLForecast: the target lagged by 52 periods plus
# the 'week', 'month' and 'year' date features.
# A rolling transform on the 52-week lag (`lag_transforms`) was tried in an
# earlier draft and is currently disabled.
mf = MLForecast(
    models=models,
    freq=weekly_wednesday,
    lags=[52],
    date_features=["week", "month", "year"],
)

## Blocked Cross-Validation
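This generates the unreconciled (base) forecasts for each fold. We pass `fitted=True` so that the in-sample forecasts of every fold are stored as well; they can be retrieved with `mf.cross_validation_fitted_values()` when a reconciler needs them.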

In [None]:
# Cross-validation setup: 52-step horizon, 5 windows, and `fitted=True` so the
# in-sample predictions of each window are retained by MLForecast.
cv_settings = {"h": 52, "n_windows": 5, "fitted": True}

# Base (unreconciled) forecasts for every window.
cv_df_base = mf.cross_validation(df=train_df, **cv_settings)

## Create and Persist the Model Object

In [None]:
# Build a passport (name + description) for the model; both strings embed the
# configured number of trees. Note that this rebinds the notebook-level name
# `passport`, which earlier held the dataset passport.
passport = ModelPassport.create(
    name=f"lightgbm_model_{N_ESTIMATORS}_trees",
    description=f"LightGBM model with {N_ESTIMATORS} trees and basic feature engineering",
    )
# Wrap the MLForecast object together with its passport.
# NOTE(review): no explicit `mf.fit(...)` on the full training data is visible
# in this notebook before this point -- confirm what state of `mf` is meant to
# be persisted.
model = MLForecastModel(passport=passport,model=mf)
# Register the wrapped model with the model store, then call its own save().
# NOTE(review): the division of work between `add` and `save` is defined in
# the project code, not here -- confirm both calls are required.
model_store = ModelStore()
model_store.add(model=model)
model.save()

## Create Summing Matrix and Tags

In [None]:
# Build the frame that `aggregate` turns into the summing matrix and tags.
# The target column `y` is kept: `aggregate` sums it for every level of the
# hierarchy, so dropping it (as the previous version did) leaves nothing to
# aggregate. Rows are still de-duplicated on (unique_id, ds) only.
hierarchy_df = train_df[['unique_id', 'ds', 'y']].drop_duplicates(
    subset=['unique_id', 'ds']
)

# `unique_id` is split on '_' and the first two parts are used as
# store and category.
hierarchy_df = hierarchy_df.assign(
    store=hierarchy_df['unique_id'].apply(lambda s: s.split('_')[0]),
    category=hierarchy_df['unique_id'].apply(lambda s: s.split('_')[1]),
)

# Levels: per store, per category, and the store x category bottom level.
spec = [['store'], ['category'], ['store', 'category']]

# NOTE(review): `aggregate` generates its own ids for every level
# (HierarchicalForecast joins the level values with '/'); confirm these match
# the `unique_id` values present in the forecasts that get reconciled, and
# that forecasts exist for the aggregated levels too. The first return value
# (the aggregated series for all levels) is discarded here.
_, S_df, tags = aggregate(df=hierarchy_df, spec=spec)

## Reconciler for CV Forecasts

In [None]:
# MinTrace accepts 'ols', 'wls_struct', 'wls_var', 'mint_cov' or
# 'mint_shrink'; 'mint_ols' is not a valid method name. OLS needs no
# in-sample residuals, so no `Y_df` is passed.
# NOTE(review): if a residual-based method (e.g. 'mint_shrink') is wanted
# instead, `Y_df` must hold the in-sample fitted values of the matching fold
# (see `mf.cross_validation_fitted_values()`), not the out-of-sample CV frame.
reconcilers = [MinTrace(method='ols')]
hrec = HierarchicalReconciliation(reconcilers=reconcilers)

# Reconcile each CV window separately. Only id, timestamp and forecast columns
# are handed to the reconciler: `cutoff` and the actuals `y` are not forecasts
# and must not be treated as model columns.
# NOTE(review): every series in `S_df` needs a base forecast with a matching
# `unique_id` in `cv_df_base` -- confirm the forecasts cover all levels.
reconciled_folds = []
for cutoff, fold_df in cv_df_base.groupby('cutoff'):
    fold_reconciled = hrec.reconcile(
        Y_hat_df=fold_df.drop(columns=['cutoff', 'y']),
        S_df=S_df,
        tags=tags,
    )
    # Re-attach the fold identifier so the result can be joined to the actuals.
    reconciled_folds.append(fold_reconciled.assign(cutoff=cutoff))

cv_df_reconciled = pd.concat(reconciled_folds, ignore_index=True)

## Evaluate CV Performance

In [None]:
# Join the reconciled forecasts with the actuals of the same series, timestamp
# and CV window.
key_cols = ['unique_id', 'ds', 'cutoff']
actuals_df = cv_df_base[key_cols + ['y']]
# Drop any `y` already carried by the forecast frame so the merge yields a
# single, unambiguous target column.
forecasts_df = cv_df_reconciled.drop(columns=['y'], errors='ignore')
cv_df_eval = forecasts_df.merge(actuals_df, on=key_cols)

# Forecasts are in wide format: one column per (base or reconciled) model.
# There is no 'model' column to group by.
model_cols = [col for col in cv_df_eval.columns if col not in (*key_cols, 'y')]

# utilsforecast losses take the whole frame plus the list of model columns and
# return one row of scores per group; the scores are averaged here to get one
# figure per model (i.e. a mean of per-series errors, not a pooled error).
rmse_scores = rmse(cv_df_eval, models=model_cols)
mae_scores = mae(cv_df_eval, models=model_cols)
performance = pd.DataFrame({
    'RMSE': rmse_scores[model_cols].mean(),
    'MAE': mae_scores[model_cols].mean(),
})
performance.index.name = 'model'

print("--- Cross-Validation Performance ---")
print(performance)