# LightGBM Model

In [None]:
import pandas as pd

from lightgbm import LGBMRegressor # The ML model
from utilsforecast.losses import rmse, mae
from mlforecast import MLForecast
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

from hierarchicalforecast.core import HierarchicalReconciliation
from hierarchicalforecast.methods import MinTrace, BottomUp
from hierarchicalforecast.utils import aggregate
from hierarchicalforecast.core import HierarchicalReconciliation

from valuation.infra.store.dataset import DatasetStore
from valuation.asset.identity.dataset import DatasetID
from valuation.core.stage import DatasetStage
from valuation.asset.identity.model import ModelPassport
from valuation.asset.model.mlforecast import MLForecastModel
from valuation.infra.store.model import ModelStore



## Model Parameters

In [None]:
# Number of boosted trees in the LightGBM ensemble.
N_ESTIMATORS = 100
# Core count this notebook assumes for the machine it runs on.
NUM_CORES = 24
# Worker count passed to LightGBM: two fewer than NUM_CORES, floored at one.
SAFE_N_JOBS = NUM_CORES - 2 if NUM_CORES > 2 else 1

## Training Data

In [None]:
# Load the modelling dataset: look up the passport registered for the
# "train_val" dataset at the MODEL stage, then fetch the dataset it describes.
store = DatasetStore()
dataset_id = DatasetID(name="train_val", stage=DatasetStage.MODEL)
passport = store.get_passport(dataset_id=dataset_id)
ds = store.get(passport=passport)
# Later cells read the 'unique_id', 'ds' and 'y' columns of this frame.
train_df = ds.data

## Define the Model
We instantiate a LightGBM model.

In [None]:

# Single-model list handed to MLForecast below.
models = [LGBMRegressor(
    random_state=42,  # Fixed seed
    n_estimators=N_ESTIMATORS, # Tree count comes from the Model Parameters cell
    n_jobs=SAFE_N_JOBS # Use the 'safe' n_jobs we defined
)]

## Feature Engineering

In [None]:
# Wrap the model list in an MLForecast object configured with the lag and
# calendar features listed below.
mf = MLForecast(
    models=models,
    freq=pd.offsets.Week(weekday=2), # Weekly frequency anchored on Wednesday (pandas weekday 2)
    # --- This is the automated feature engineering ---
    lags=[52], # Use the value from 52 periods (weeks) ago as a feature
    date_features=['week', 'month', 'year'] # Calendar features taken from the timestamp
)

## Blocked Cross-Validation
This generates the unreconciled (base) forecasts for each fold. We pass `fitted=True` so that the in-sample forecasts of each fold are also computed and available to the reconciler.

In [None]:
# Cross-validate over 5 windows, each forecasting a 52-step horizon.
# fitted=True additionally requests the in-sample predictions of every window.
# NOTE(review): window spacing is left at the library default — confirm the
# windows do not overlap, since later cells join on ('unique_id', 'ds') only.
cv_df_base = mf.cross_validation(
    df=train_df,
    h=52,
    n_windows=5,
    fitted=True
)
# Refit on the full training frame; this fitted object is what gets persisted below.
mf.fit(df=train_df)

## Create and Persist the Model Object

In [None]:
# Build a passport for the model; the tree count is embedded in its name and description.
# NOTE: this rebinds `passport`, which until here held the dataset passport.
passport = ModelPassport.create(
    name=f"lightgbm_model_{N_ESTIMATORS}_trees",
    description=f"LightGBM model with {N_ESTIMATORS} trees and basic feature engineering",
    )
# Pair the fitted MLForecast object with its passport.
model = MLForecastModel(passport=passport,model=mf)
model_store = ModelStore()
# Remove whatever is stored under this passport before adding the new model.
# Presumably this lets re-runs replace the previous entry — TODO confirm how
# ModelStore.remove behaves when nothing is stored yet.
model_store.remove(passport=passport)
model_store.add(model=model)

## Create Summing Matrix and Tags

In [None]:
# Distinct (series, date, value) rows are the starting point for the hierarchy.
hierarchy_df = train_df[['unique_id', 'ds', 'y']].drop_duplicates()

# Each unique_id is split on '_': piece 0 is used as the store and piece 1 as
# the category.
id_pieces = hierarchy_df['unique_id'].apply(lambda uid: uid.split('_'))
hierarchy_df['store'] = id_pieces.apply(lambda pieces: pieces[0])
hierarchy_df['category'] = id_pieces.apply(lambda pieces: pieces[1])

# Levels to build: per store, per category, and per store/category pair.
spec = [['store'], ['category'], ['store', 'category']]

# aggregate() receives the frame without the original 'unique_id' column, so
# the series are identified through the columns named in `spec`. Only the
# summing matrix and the tags are kept from its result.
hierarchy_df_clean = hierarchy_df.drop(columns=['unique_id'])
_, S_df, tags = aggregate(df=hierarchy_df_clean, spec=spec)

## Aggregate Forecasts

In [None]:
# Keep only the id, date and forecast columns of the CV output; 'cutoff' and
# 'y' are dropped when present.
Y_hat_df_base = cv_df_base.drop(columns=['cutoff', 'y'], errors='ignore')

# Name of the forecast column (adjust if the model column is called differently).
forecast_col = 'LGBMRegressor'

# Attach the hierarchy columns: piece 0 of the '_'-split id is the store,
# piece 1 the category.
Y_hat_df_base_clean = Y_hat_df_base.copy()
forecast_id_pieces = Y_hat_df_base_clean['unique_id'].apply(lambda uid: uid.split('_'))
Y_hat_df_base_clean['store'] = forecast_id_pieces.apply(lambda pieces: pieces[0])
Y_hat_df_base_clean['category'] = forecast_id_pieces.apply(lambda pieces: pieces[1])

# The forecast column is temporarily renamed to 'y' for aggregate(), and the
# original id column is dropped before aggregating.
Y_hat_df_base_for_agg = (
    Y_hat_df_base_clean
    .rename(columns={forecast_col: 'y'})
    .drop(columns=['unique_id'])
)

# Sum the base forecasts up to every level in `spec`, then restore the
# forecast column's original name.
Y_hat_aggregated, _, _ = aggregate(df=Y_hat_df_base_for_agg, spec=spec)
Y_hat_aggregated = Y_hat_aggregated.rename(columns={'y': forecast_col})

n_aggregated_ids = Y_hat_aggregated['unique_id'].nunique()
print("Forecasts aggregated successfully across all hierarchy levels.")
print(f"Base forecasts shape: {Y_hat_df_base.shape}")
print(f"Aggregated forecasts shape: {Y_hat_aggregated.shape}")
print(f"Unique IDs in aggregated forecasts: {n_aggregated_ids}")


## Reconciler for CV Forecasts

In [None]:
# MinTrace with the 'ols' method is the only reconciler applied.
reconcilers = [MinTrace(method='ols')]
hrec = HierarchicalReconciliation(reconcilers=reconcilers)

# Build the actuals at every hierarchy level from the CV output, using the
# same '_'-split of unique_id into store (piece 0) and category (piece 1).
Y_df_base = cv_df_base[['unique_id', 'ds', 'y']].copy()
actual_id_pieces = Y_df_base['unique_id'].apply(lambda uid: uid.split('_'))
Y_df_base['store'] = actual_id_pieces.apply(lambda pieces: pieces[0])
Y_df_base['category'] = actual_id_pieces.apply(lambda pieces: pieces[1])
Y_df_base_for_agg = Y_df_base.drop(columns=['unique_id'])
Y_df_actuals, _, _ = aggregate(df=Y_df_base_for_agg, spec=spec)

# Reconcile the aggregated forecasts so they are coherent across levels.
reconcile_inputs = {
    'Y_hat_df': Y_hat_aggregated,  # forecasts at all hierarchy levels
    'Y_df': Y_df_actuals,
    'S_df': S_df,
    'tags': tags,
}
cv_df_reconciled = hrec.reconcile(**reconcile_inputs)

## Evaluate CV Performance

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
import numpy as np

# Actuals from the CV output, tagged with store (piece 0 of the '_'-split id)
# and category (piece 1) so they can be aggregated to every level in `spec`.
actuals_base = cv_df_base[['unique_id', 'ds', 'cutoff', 'y']].copy()
eval_id_pieces = actuals_base['unique_id'].apply(lambda uid: uid.split('_'))
actuals_base['store'] = eval_id_pieces.apply(lambda pieces: pieces[0])
actuals_base['category'] = eval_id_pieces.apply(lambda pieces: pieces[1])
actuals_base_for_agg = actuals_base.drop(columns=['unique_id'])

actuals_aggregated, _, _ = aggregate(df=actuals_base_for_agg, spec=spec)

# Left-join the aggregated actuals onto the reconciled forecasts by series id
# and date; forecast rows without a matching actual keep a missing 'y'.
actuals_for_merge = actuals_aggregated[['unique_id', 'ds', 'y']]
cv_df_eval = pd.merge(
    cv_df_reconciled,
    actuals_for_merge,
    how='left',
    on=['unique_id', 'ds'],
)

# 3. Classify hierarchy levels properly
def classify_level(uid: str) -> str:
    """Return the hierarchy level name for a series identifier.

    The checks run in this order, and the first match wins:

    * contains ``'_'``  -> ``'bottom'`` (e.g. ``"100_beer"``)
    * contains ``'/'``  -> ``'store_category'`` (e.g. ``"100/beer"``)
    * parses as an int  -> ``'store'`` (e.g. ``"100"``)
    * anything else     -> ``'category'`` (e.g. ``"beer"``)

    Args:
        uid: Series identifier as a string.

    Returns:
        One of ``'bottom'``, ``'store_category'``, ``'store'`` or ``'category'``.
    """
    if '_' in uid:
        return 'bottom'
    if '/' in uid:
        return 'store_category'
    try:
        int(uid)
    except ValueError:
        # Only a failed integer parse means "category"; any other exception
        # (e.g. KeyboardInterrupt) must propagate instead of being swallowed.
        return 'category'
    return 'store'

# Tag every evaluation row with its hierarchy level.
cv_df_eval['level'] = cv_df_eval['unique_id'].apply(classify_level)

# 4. Model columns: every reconciled-output column that is not an identifier.
identifier_cols = {'unique_id', 'ds', 'cutoff'}
model_cols = [name for name in cv_df_reconciled.columns if name not in identifier_cols]

series_ids = cv_df_eval['unique_id']
forecast_dates = cv_df_eval['ds']

print(f"Found model columns: {model_cols}")
print("\nDataset overview:")
print(f"  Total forecasts: {len(cv_df_eval):,}")
print(f"  Unique series: {series_ids.nunique():,}")
print(f"  Date range: {forecast_dates.min()} to {forecast_dates.max()}")
print("\nActual values summary:")
print(cv_df_eval['y'].describe())

# 5. Overall performance: one row of error metrics per model column, computed
#    over every hierarchy level at once.
print("\n" + "="*80)
print("OVERALL PERFORMANCE (All Hierarchy Levels)")
print("="*80)

performance_results = []
for model_col in model_cols:
    # Score only rows where both the forecast and the actual are present.
    scored = cv_df_eval.dropna(subset=[model_col, 'y'])
    y_true = scored['y']
    y_pred = scored[model_col]

    row = {}
    row['model'] = model_col.replace('LGBMRegressor/', '')  # shorter display name
    row['RMSE'] = np.sqrt(mean_squared_error(y_true, y_pred))
    row['MAE'] = mean_absolute_error(y_true, y_pred)
    row['MAPE'] = mean_absolute_percentage_error(y_true, y_pred) * 100  # as a percentage
    row['Mean_Actual'] = y_true.mean()
    row['n_forecasts'] = len(y_true)
    performance_results.append(row)

overall_perf = pd.DataFrame(performance_results).set_index('model')
print(overall_perf.to_string())

# 6. Performance by hierarchy level: the same metrics, computed separately for
#    each level label; levels with no rows are skipped.
print("\n" + "="*80)
print("PERFORMANCE BY HIERARCHY LEVEL")
print("="*80)

levels_order = ['bottom', 'store_category', 'store', 'category']
level_results = []

for level in levels_order:
    level_data = cv_df_eval[cv_df_eval['level'] == level]
    if len(level_data) == 0:
        continue

    level_actuals = level_data['y']
    print(f"\n{level.upper()} Level:")
    print(f"  Unique series: {level_data['unique_id'].nunique():,}")
    print(f"  Total forecasts: {len(level_data):,}")
    print(f"  Actual mean: {level_actuals.mean():.2f}, std: {level_actuals.std():.2f}")

    for model_col in model_cols:
        # Score only rows where both the forecast and the actual are present.
        scored = level_data.dropna(subset=[model_col, 'y'])
        y_true = scored['y']
        y_pred = scored[model_col]
        if len(y_true) == 0:
            continue

        short_name = model_col.replace('LGBMRegressor/', '')
        mean_actual = y_true.mean()
        rmse_val = np.sqrt(mean_squared_error(y_true, y_pred))
        mae_val = mean_absolute_error(y_true, y_pred)
        mape_val = mean_absolute_percentage_error(y_true, y_pred) * 100

        level_results.append({
            'Level': level,
            'Model': short_name,
            'RMSE': rmse_val,
            'MAE': mae_val,
            'MAPE%': mape_val,
            'Mean_Actual': mean_actual,
            'n': len(y_true)
        })

        # Normalized error: MAE as a percentage of the mean actual, reported
        # as 0 when that mean is not positive.
        if mean_actual > 0:
            normalized_mae = mae_val / mean_actual * 100
        else:
            normalized_mae = 0

        label = short_name[:30]
        print(f"    {label:30s} -> "
              f"RMSE: {rmse_val:>10.2f}, MAE: {mae_val:>10.2f}, "
              f"MAPE: {mape_val:>6.2f}%, Norm_MAE: {normalized_mae:>6.2f}%")

# 7. Comparison table: levels as rows, one column per (metric, model) pair.
print("\n" + "="*80)
print("RECONCILIATION IMPACT (Comparing Base vs Reconciled)")
print("="*80)

level_perf_df = pd.DataFrame(level_results)
if not level_perf_df.empty:
    metric_names = ['RMSE', 'MAE', 'MAPE%']
    comparison = pd.pivot_table(
        level_perf_df,
        index='Level',
        columns='Model',
        values=metric_names,
    )
    print(comparison.to_string())

# 8. Sample predictions vs actuals for the most granular series.
print("\n" + "="*80)
print("SAMPLE PREDICTIONS (First 10 bottom-level forecasts)")
print("="*80)

# The ids in cv_df_eval come from aggregate(), which joins the spec columns
# with '/', so the store/category series are labelled 'store_category' by
# classify_level and the 'bottom' label may match nothing. Fall back to the
# 'store_category' rows in that case so the sample is not silently empty.
bottom_rows = cv_df_eval[cv_df_eval['level'] == 'bottom']
if bottom_rows.empty:
    bottom_rows = cv_df_eval[cv_df_eval['level'] == 'store_category']

sample_cols = ['unique_id', 'ds', 'y'] + model_cols
sample = bottom_rows.head(10)[sample_cols].round(2)
print(sample.to_string())