# AutoML percentile-bias correction on M4 (XGBoost)

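This notebook compares baseline AutoML forecasts with forecasts fitted using `percentile_correction=True` on the M4 Weekly data, reporting RMSE and bias on a hold-out of the last forecast horizon of each series.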

In [None]:
import numpy as np
import pandas as pd

from datasetsforecast.m4 import M4, M4Info
from mlforecast.auto import AutoMLForecast, AutoXGBoost, PredictionIntervals

def eval_rmse_bias(valid_df: pd.DataFrame, preds_df: pd.DataFrame, model_col: str) -> dict:
    """Score one forecast column against the actuals.

    The two frames are inner-joined on ``unique_id`` and ``ds``, so only
    rows present in both contribute to the metrics.

    Args:
        valid_df: Frame holding the actual values in column ``y``.
        preds_df: Frame holding the forecasts in column ``model_col``.
        model_col: Name of the forecast column to evaluate.

    Returns:
        A dict with ``rows`` (number of joined rows), ``rmse`` (root mean
        squared error) and ``bias`` (mean of forecast minus actual, so a
        positive value means over-forecasting).
    """
    joined = pd.merge(valid_df, preds_df, how="inner", on=["unique_id", "ds"])
    # Signed residual: forecast minus actual.
    residual = joined[model_col].sub(joined["y"])
    mean_sq_error = np.mean(residual.mul(residual))
    mean_error = np.mean(residual)
    return dict(
        rows=len(joined),
        rmse=float(np.sqrt(mean_sq_error)),
        bias=float(mean_error),
    )

In [None]:
# Download (if needed) and load the selected M4 group into the `data` directory.
group = "Weekly"
M4.download("data", group=group)
df = M4.load(directory="data", group=group)[0]
# Cast the time column to int; the forecasters below are built with `freq=1`.
df["ds"] = df["ds"].astype(int)

# Forecast horizon and seasonality come from the dataset metadata for this group.
h, season_length = M4Info[group].horizon, M4Info[group].seasonality

# Hold out the last `h` observations of every series for validation;
# every remaining row becomes training data.
valid = df.groupby("unique_id").tail(h).copy()
train = df.loc[~df.index.isin(valid.index)].reset_index(drop=True)

# Make the series id categorical in train and give valid the exact same dtype.
train = train.assign(unique_id=train["unique_id"].astype("category"))
valid = valid.assign(unique_id=valid["unique_id"].astype(train["unique_id"].dtype))

# Tuning budget shared by both runs.
n_windows = 2
num_samples = 5  # increase for stronger tuning
optimize_kwargs = dict(timeout=300)

(train.shape, valid.shape, h, season_length)

In [None]:
# Baseline run: tune an XGBoost forecaster without percentile correction.
auto_base = AutoMLForecast(
    freq=1,
    season_length=season_length,
    num_threads=2,
    models={"xgb": AutoXGBoost()},
)

auto_base.fit(
    df=train,
    h=h,
    n_windows=n_windows,
    num_samples=num_samples,
    optimize_kwargs=optimize_kwargs,
    prediction_intervals=PredictionIntervals(h=h, n_windows=n_windows),
)

# Forecast the held-out horizon and score it against the validation rows.
preds_base = auto_base.predict(h=h)
metrics_base = eval_rmse_bias(valid_df=valid, preds_df=preds_base, model_col="xgb")
metrics_base

In [None]:
# Corrected run: same forecaster configuration and tuning budget as the
# baseline, with `percentile_correction=True` additionally passed to `fit`.
auto_corr = AutoMLForecast(
    freq=1,
    season_length=season_length,
    num_threads=2,
    models={"xgb": AutoXGBoost()},
)

auto_corr.fit(
    df=train,
    h=h,
    n_windows=n_windows,
    num_samples=num_samples,
    optimize_kwargs=optimize_kwargs,
    prediction_intervals=PredictionIntervals(h=h, n_windows=n_windows),
    percentile_correction=True,
)

# Forecast the held-out horizon and score it against the validation rows.
preds_corr = auto_corr.predict(h=h)
metrics_corr = eval_rmse_bias(valid_df=valid, preds_df=preds_corr, model_col="xgb")
metrics_corr

In [None]:
# Hold-out metrics for both runs, one row per run.
metrics_df = pd.DataFrame(
    [
        {"run": label, **scores}
        for label, scores in (
            ("baseline", metrics_base),
            ("percentile_correction", metrics_corr),
        )
    ]
).set_index("run")

# Metrics stored on each fitted object under `cv_metrics_` for the "xgb" model.
cv_metrics_df = pd.DataFrame(
    [
        {"run": label, **fitted.cv_metrics_["xgb"]}
        for label, fitted in (
            ("baseline", auto_base),
            ("percentile_correction", auto_corr),
        )
    ]
).set_index("run")

# Mapping read from the corrected run's `id_to_col` entry for "xgb".
# NOTE(review): presumably series id -> selected interval column; confirm against the library.
corr_map = auto_corr.percentile_correction_["xgb"]["id_to_col"]
corr_series = pd.Series(data=corr_map, name="selected_percentile_column")

print(f"Corrected ids: {len(corr_map)} / {train['unique_id'].nunique()}")
display(metrics_df)
display(cv_metrics_df)
display(corr_series.head(20))

In [None]:
# Optional: count how often each (direction, percentile) pair was selected.
if not corr_map:
    print("No systematic-bias ids were found for correction in this run.")
else:
    # Split the trailing "-lo-<n>" / "-hi-<n>" suffix of each selected column
    # name into a direction and an integer percentile.
    parsed = (
        pd.Series(corr_map)
        .str.extract(r"-(lo|hi)-(\d+)$")
        .set_axis(["direction", "percentile"], axis=1)
        .astype({"percentile": int})
    )
    display(parsed.value_counts().rename("count").reset_index())