# Final model
This notebook determines the best combination of predictor and feature set on the validation split and then validates that choice on the held-out test split.

## Model selection with feature combinations
We evaluate multiple feature sets (baseline, holidays, Fourier, trend, and their combination) together with different predictors. Each predictor is fine-tuned via a small hyperparameter grid, and the best-performing combination on the validation split is returned.

In [14]:
from __future__ import annotations
from pathlib import Path
import numpy as np
import pandas as pd
import polars as pl
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score
from sklearn.model_selection import ParameterGrid
from statsforecast.core import StatsForecast
from statsforecast.models import AutoARIMA, AutoETS, SeasonalNaive
from statsforecast.models import AutoARIMA, AutoETS, SeasonalNaive
import inspect

# Directory holding the processed parquet files, relative to the notebook.
DATA_DIR = Path("..") / "data" / "processed_data"
# Feature-set names; each one is used as the file-name prefix "<key>_<split>.parquet".
FEATURE_KEYS = ["none", "holidays", "fourier", "trend", "fourier+trend+holidays"]
# Splits loaded for model selection (the test split is loaded separately later).
SPLITS = ["train", "val"]
# Frequency string handed to StatsForecast(freq=...).
FREQ = "D"

def load_feature_sets():
    """Read the parquet files of every feature set listed in ``FEATURE_KEYS``.

    Returns:
        A ``(features, future)`` pair of nested dicts indexed first by
        feature-set name and then by split name. Each leaf holds the polars
        frame read from ``<key>_<split>.parquet`` (``features``) or
        ``<key>_<split>_future.parquet`` (``future``), or ``None`` when the
        file does not exist under ``DATA_DIR``.
    """
    def _read_or_none(parquet_path):
        # Missing files are tolerated; callers check for None.
        if parquet_path.exists():
            return pl.read_parquet(parquet_path)
        return None

    features = {}
    future = {}
    for key in FEATURE_KEYS:
        features[key] = {}
        future[key] = {}
        for split in SPLITS:
            features[key][split] = _read_or_none(DATA_DIR / f"{key}_{split}.parquet")
            future[key][split] = _read_or_none(DATA_DIR / f"{key}_{split}_future.parquet")
    return features, future

def prepare_target(df: pl.DataFrame) -> pd.DataFrame:
    """Return the ``unique_id``, ``ds`` and ``y`` columns of *df* as a pandas frame."""
    target_columns = ["unique_id", "ds", "y"]
    target = df.select(target_columns)
    return target.to_pandas()

# Load every feature set once; evaluate_feature_model reads these module-level dicts.
features, features_future = load_feature_sets()

In [3]:
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor

In [4]:
def get_horizon(df_future: pl.DataFrame | None, val_df: pl.DataFrame) -> int:
    """Return the forecast horizon as the row count of the first series.

    The horizon is taken from *df_future* when it is provided and from
    *val_df* otherwise: the number of rows whose ``unique_id`` equals the
    ``unique_id`` of the frame's first row.
    """
    source = val_df if df_future is None else df_future
    first_id = source["unique_id"][0]
    first_series = source.filter(pl.col("unique_id") == first_id)
    return first_series.height

# Search space for model selection: model name -> constructor plus a
# hyperparameter grid that is expanded with sklearn's ParameterGrid.
MODEL_GRID = {
    "AutoARIMA": {
        "constructor": AutoARIMA,
        "params": {
            "season_length": [7],
            "stepwise": [True],
            "approximation": [True, False],
        },
    },
    "AutoETS": {
        "constructor": AutoETS,
        "params": {
            "season_length": [7],
            "model": ["ZZZ"],
        },
    },
    "SeasonalNaive": {
        "constructor": SeasonalNaive,
        "params": {
            "season_length": [7, 14],
        },
    },
    # RandomForestRegressor handled separately (not a StatsForecast model)
    "RandomForestRegressor": {
        "constructor": RandomForestRegressor,
        "params": {
            # 3 x 3 combinations; random_state and n_jobs are fixed.
            "n_estimators": [200, 400, 600],
            "max_depth": [None, 10, 20],
            "random_state": [42],
            "n_jobs": [-1],
        },
    },
}


def _fit_with_exog(sf: StatsForecast, df: pd.DataFrame):
    """Fit *sf* on *df* and return the result of ``sf.fit``.

    Despite the name, only ``df`` is forwarded; no separate exogenous
    frame is passed.
    """
    fitted = sf.fit(df=df)
    return fitted

def _predict_with_exog(fitted: StatsForecast, h: int):
    """Forecast *h* steps ahead with the already-fitted *fitted* object.

    Despite the name, only the horizon is forwarded; no exogenous frame
    is passed to ``predict``.
    """
    forecast = fitted.predict(h=h)
    return forecast

def _score_predictions(y_true, y_pred) -> dict:
    """Return MAE, RMSE, sMAPE and R² for one set of predictions.

    sMAPE is expressed in percent (factor 200); the 1e-8 term in the
    denominator avoids a division by zero when both values are zero.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    smape = np.mean(200 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred) + 1e-8))
    return {
        "mae": mean_absolute_error(y_true, y_pred),
        "rmse": root_mean_squared_error(y_true, y_pred),
        "smape": smape,
        "r2": r2_score(y_true, y_pred),
    }


def _build_rf_matrices(train: pl.DataFrame, val: pl.DataFrame):
    """Build aligned one-hot design matrices for the RandomForest branch.

    Returns ``None`` when the dataset has no columns besides
    ``unique_id``/``ds``/``y``; otherwise ``(train_X, train_y, val_X, val_y)``.
    """
    train_df = train.to_pandas()
    val_df = val.to_pandas()
    feature_cols = [c for c in train_df.columns if c not in ("unique_id", "ds", "y")]
    if not feature_cols:
        return None

    # Fill per dtype: a blanket fillna("") would turn numeric columns that
    # contain NaN into object columns, which would then be one-hot encoded
    # value by value. The dtype seen in train decides for both frames.
    fill_values = {
        c: 0 if pd.api.types.is_numeric_dtype(train_df[c]) else ""
        for c in feature_cols
    }
    train_feat = train_df[feature_cols].fillna(value=fill_values)
    val_feat = val_df[feature_cols].fillna(value=fill_values)

    cat_cols = [c for c in feature_cols if train_feat[c].dtype == "object"]
    train_X = pd.get_dummies(train_feat, columns=cat_cols, drop_first=False)
    val_X = pd.get_dummies(val_feat, columns=cat_cols, drop_first=False)

    # Categories seen in only one split become all-zero columns in the other.
    cols = sorted(set(train_X.columns) | set(val_X.columns))
    train_X = train_X.reindex(columns=cols, fill_value=0)
    val_X = val_X.reindex(columns=cols, fill_value=0)
    return train_X, train_df["y"], val_X, val_df["y"]


def evaluate_feature_model(feature_key: str):
    """Score every model/hyperparameter combination on one feature set.

    Args:
        feature_key: Name of the feature set; must be a key of the
            module-level ``features`` / ``features_future`` dicts.

    Returns:
        A list with one dict per evaluated combination, holding
        ``feature_set``, ``model``, ``params``, ``used_exog``, ``mae``,
        ``rmse``, ``smape`` and ``r2`` computed on the validation split.
        The list is empty when the train or val frame is missing.

    Notes:
        StatsForecast models are fitted on the target columns only
        (``unique_id``, ``ds``, ``y``), so their scores do not depend on
        ``feature_key``. Only the RandomForest branch uses the extra
        feature columns, and it is skipped when there are none.
    """
    train = features[feature_key].get("train")
    val = features[feature_key].get("val")
    val_future = features_future[feature_key].get("val")
    if train is None or val is None:
        return []
    y_train = prepare_target(train)
    y_val = prepare_target(val)
    h = get_horizon(val_future, val)

    results = []
    for model_name, cfg in MODEL_GRID.items():
        if model_name == "RandomForestRegressor":
            # The design matrices do not depend on the hyperparameters, so
            # build them (and report a skip) once instead of per grid point.
            rf_data = _build_rf_matrices(train, val)
            if rf_data is None:
                print(f"Skipping RandomForest for {feature_key}: no feature columns available")
                continue
            train_X, train_y, val_X, val_y = rf_data
            for params in ParameterGrid(cfg["params"]):
                rf = cfg["constructor"](**params)
                rf.fit(train_X, train_y)
                scores = _score_predictions(val_y, rf.predict(val_X))
                results.append({
                    "feature_set": feature_key,
                    "model": model_name,
                    "params": params,
                    "used_exog": True,
                    **scores,
                })
            continue

        for params in ParameterGrid(cfg["params"]):
            model = cfg["constructor"](**params)
            sf = StatsForecast(models=[model], freq=FREQ, n_jobs=-1)
            fitted = _fit_with_exog(sf, y_train)
            fcst = _predict_with_exog(fitted, h)
            # A single model was fitted, so the only non-key column is its forecast.
            yhat_col = [c for c in fcst.columns if c not in ("unique_id", "ds")][0]
            preds = fcst.rename(columns={yhat_col: "yhat"})
            # Inner join: score only the dates present in both forecast and validation data.
            merged = y_val.merge(preds, on=["unique_id", "ds"], how="inner")
            scores = _score_predictions(merged["y"], merged["yhat"])
            results.append({
                "feature_set": feature_key,
                "model": model_name,
                "params": params,
                "used_exog": False,
                **scores,
            })
    return results

In [5]:
# Evaluate every feature set and flatten the per-set result lists.
all_results: list[dict] = [
    row
    for key in FEATURE_KEYS
    for row in evaluate_feature_model(key)
]

if len(all_results) == 0:
    raise RuntimeError("No feature/model results were produced. Ensure processed parquet files exist.")

# Rank by validation RMSE, using MAE as the tie-breaker.
ranked = pd.DataFrame(all_results).sort_values(by=["rmse", "mae"])
results_df = ranked.reset_index(drop=True)
display(results_df.head(10))

# The first row after ranking is the selected combination.
best_combo = results_df.iloc[0]
best_combo

Skipping RandomForest for none: no feature columns available
Skipping RandomForest for none: no feature columns available
Skipping RandomForest for none: no feature columns available
Skipping RandomForest for none: no feature columns available
Skipping RandomForest for none: no feature columns available
Skipping RandomForest for none: no feature columns available
Skipping RandomForest for none: no feature columns available
Skipping RandomForest for none: no feature columns available
Skipping RandomForest for none: no feature columns available


Unnamed: 0,feature_set,model,params,used_exog,mae,rmse,smape,r2
0,none,AutoETS,"{'model': 'ZZZ', 'season_length': 7}",False,78.507702,169.319382,19.019246,0.980389
1,holidays,AutoETS,"{'model': 'ZZZ', 'season_length': 7}",False,78.507702,169.319382,19.019246,0.980389
2,fourier,AutoETS,"{'model': 'ZZZ', 'season_length': 7}",False,78.507702,169.319382,19.019246,0.980389
3,trend,AutoETS,"{'model': 'ZZZ', 'season_length': 7}",False,78.507702,169.319382,19.019246,0.980389
4,fourier+trend+holidays,AutoETS,"{'model': 'ZZZ', 'season_length': 7}",False,78.507702,169.319382,19.019246,0.980389
5,none,AutoARIMA,"{'approximation': True, 'season_length': 7, 's...",False,81.335207,174.744147,19.149107,0.979113
6,holidays,AutoARIMA,"{'approximation': True, 'season_length': 7, 's...",False,81.335207,174.744147,19.149107,0.979113
7,fourier,AutoARIMA,"{'approximation': True, 'season_length': 7, 's...",False,81.335207,174.744147,19.149107,0.979113
8,trend,AutoARIMA,"{'approximation': True, 'season_length': 7, 's...",False,81.335207,174.744147,19.149107,0.979113
9,fourier+trend+holidays,AutoARIMA,"{'approximation': True, 'season_length': 7, 's...",False,81.335207,174.744147,19.149107,0.979113


feature_set                                    none
model                                       AutoETS
params         {'model': 'ZZZ', 'season_length': 7}
used_exog                                     False
mae                                       78.507702
rmse                                     169.319382
smape                                     19.019246
r2                                         0.980389
Name: 0, dtype: object

## Test set evaluation
Load the test split, retrain the best model on train+val, and compare its test-set predictions against a SeasonalNaive baseline.

In [6]:
# Load test split
# Splits read by load_all_splits: the model-selection splits plus "test".
SPLITS_TEST = ["train", "val", "test"]

def load_all_splits(feature_key: str):
    """Read the train/val/test parquet files of a single feature set.

    Args:
        feature_key: File-name prefix of the feature set to load.

    Returns:
        A ``(splits_data, futures_data)`` pair of dicts keyed by split name.
        Each value is the polars frame read from ``<key>_<split>.parquet`` /
        ``<key>_<split>_future.parquet``, or ``None`` when the file does not
        exist under ``DATA_DIR``.
    """
    def _read_or_none(parquet_path):
        if parquet_path.exists():
            return pl.read_parquet(parquet_path)
        return None

    splits_data = {}
    futures_data = {}
    for split in SPLITS_TEST:
        splits_data[split] = _read_or_none(DATA_DIR / f"{feature_key}_{split}.parquet")
        futures_data[split] = _read_or_none(DATA_DIR / f"{feature_key}_{split}_future.parquet")
    return splits_data, futures_data

# Unpack the winning row of the validation ranking into plain variables
# that the following cells use.
best_feature = best_combo["feature_set"]
best_model_name = best_combo["model"]
best_params = best_combo["params"]
used_exog = best_combo["used_exog"]

print(f"Best: {best_model_name} with {best_feature} features")
print(f"Params: {best_params}")
print(f"Used exog: {used_exog}")

# Load train/val/test (and any *_future files) for the selected feature set only.
splits, futures = load_all_splits(best_feature)
splits.keys()

Best: AutoETS with none features
Params: {'model': 'ZZZ', 'season_length': 7}
Used exog: False


dict_keys(['train', 'val', 'test'])

In [7]:
# Combine train+val for final training
# (sorted by series and date after concatenation).
y_train_val = pd.concat([
    prepare_target(splits["train"]),
    prepare_target(splits["val"])
], ignore_index=True).sort_values(["unique_id", "ds"]).reset_index(drop=True)

# Test targets; used later as the left side of the prediction merge.
y_test = prepare_target(splits["test"])

# Feature columns already present in dataset (non-unique_id/ds/y)
# NOTE: load_all_splits returns None for a missing file, in which case the
# .to_pandas() calls below (and prepare_target above) raise AttributeError.
train_full = splits["train"].to_pandas()
val_full = splits["val"].to_pandas()
test_full = splits["test"].to_pandas()
feature_cols = [c for c in train_full.columns if c not in ["unique_id", "ds", "y"]]

# Feature frames stay None when the selected feature set has no extra columns;
# the RandomForest training branch checks for that.
X_train_val = None
X_test_future = None
if feature_cols:
    X_train_val = pd.concat([
        train_full[["unique_id", "ds", *feature_cols]],
        val_full[["unique_id", "ds", *feature_cols]]
    ], ignore_index=True).sort_values(["unique_id", "ds"]).reset_index(drop=True).fillna(0)
    X_test_future = test_full[["unique_id", "ds", *feature_cols]].sort_values(["unique_id", "ds"]).reset_index(drop=True).fillna(0)

# Horizon = rows of the first series in the test "future" frame if it exists,
# otherwise in the test split itself.
h_test = get_horizon(futures["test"], splits["test"])
print(f"Train+Val shape: {y_train_val.shape}")
print(f"Test shape: {y_test.shape}")
print(f"Test horizon: {h_test}")
print(f"Feature columns available: {feature_cols}")

Train+Val shape: (19490, 3)
Test shape: (310, 3)
Test horizon: 31
Feature columns available: []


In [8]:
# Train baseline (SeasonalNaive 7-day) on train+val and forecast the test horizon.
baseline_model = SeasonalNaive(season_length=7)
sf_baseline = StatsForecast(models=[baseline_model], freq=FREQ, n_jobs=-1)
sf_baseline.fit(df=y_train_val)
fcst_baseline = sf_baseline.predict(h=h_test)

# Only one model was fitted, so the single non-key column is its forecast.
baseline_col = [c for c in fcst_baseline.columns if c not in ("unique_id", "ds")][0]
fcst_baseline = fcst_baseline.rename(columns={baseline_col: "baseline_pred"})

# Train best model with the hyperparameters selected on the validation split.
best_model_constructor = MODEL_GRID[best_model_name]["constructor"]
best_model = best_model_constructor(**best_params)

if best_model_name == "RandomForestRegressor":
    if X_train_val is None or X_test_future is None:
        raise RuntimeError("RandomForestRegressor selected but no feature columns are available.")
    train_df = y_train_val.merge(X_train_val, on=["unique_id", "ds"], how="left").fillna(0)
    test_df = y_test.merge(X_test_future, on=["unique_id", "ds"], how="left").fillna(0)
    feature_cols_rf = [c for c in train_df.columns if c not in ["unique_id", "ds", "y"]]

    # One-hot encode categorical features, as the validation run in
    # evaluate_feature_model did; fitting on raw string columns would fail.
    # For purely numeric features get_dummies leaves the frame unchanged.
    train_X = pd.get_dummies(train_df[feature_cols_rf], drop_first=False)
    test_X = pd.get_dummies(test_df[feature_cols_rf], drop_first=False)
    # Give the test matrix exactly the columns (and order) used for fitting;
    # categories unseen in train are dropped, missing ones become zeros.
    test_X = test_X.reindex(columns=train_X.columns, fill_value=0)

    best_model.fit(train_X, train_df["y"])
    test_preds = best_model.predict(test_X)
    fcst_best = test_df[["unique_id", "ds"]].copy()
    fcst_best["best_pred"] = test_preds
else:
    sf_best = StatsForecast(models=[best_model], freq=FREQ, n_jobs=-1)
    fitted_best = sf_best.fit(df=y_train_val)
    fcst_best = fitted_best.predict(h=h_test)
    best_col = [c for c in fcst_best.columns if c not in ("unique_id", "ds")][0]
    fcst_best = fcst_best.rename(columns={best_col: "best_pred"})

print("✓ Baseline and best model trained")

✓ Baseline and best model trained


In [9]:
# Merge predictions with actuals
results_test = y_test.merge(fcst_baseline, on=["unique_id", "ds"], how="left")
results_test = results_test.merge(fcst_best, on=["unique_id", "ds"], how="left")

# The left joins keep every test row, so a forecast that does not cover a test
# date shows up as NaN. Fail here with a precise message rather than inside
# the sklearn metric functions.
missing_preds = results_test[["baseline_pred", "best_pred"]].isna().sum()
if missing_preds.any():
    raise ValueError(f"Test rows without a forecast: {missing_preds.to_dict()}")


def _test_metrics(pred_col):
    """Return (mae, rmse, smape, r2) of ``results_test[pred_col]`` against ``y``."""
    y_true = results_test["y"]
    y_pred = results_test[pred_col]
    # sMAPE in percent; 1e-8 guards against a zero denominator.
    smape = np.mean(200 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred) + 1e-8))
    return (
        mean_absolute_error(y_true, y_pred),
        root_mean_squared_error(y_true, y_pred),
        smape,
        r2_score(y_true, y_pred),
    )


# Calculate metrics (the individual names are reused by the save cell below).
mae_baseline, rmse_baseline, smape_baseline, r2_baseline = _test_metrics("baseline_pred")
mae_best, rmse_best, smape_best, r2_best = _test_metrics("best_pred")

metrics = pd.DataFrame({
    "model": ["Baseline (SeasonalNaive-7)", f"Best ({best_model_name})"],
    "feature_set": ["none", best_feature],
    "mae": [mae_baseline, mae_best],
    "rmse": [rmse_baseline, rmse_best],
    "smape": [smape_baseline, smape_best],
    "r2": [r2_baseline, r2_best],
})

# Show the table once; a trailing bare `metrics` would render it a second time.
display(metrics)

Unnamed: 0,model,feature_set,mae,rmse,smape,r2
0,Baseline (SeasonalNaive-7),none,100.566129,164.751355,24.294597,0.976751
1,Best (AutoETS),none,86.852207,153.463049,17.51223,0.979828


Unnamed: 0,model,feature_set,mae,rmse,smape,r2
0,Baseline (SeasonalNaive-7),none,100.566129,164.751355,24.294597,0.976751
1,Best (AutoETS),none,86.852207,153.463049,17.51223,0.979828


In [10]:
# Save predictions and metrics
# Output directory, created on demand (relative to the notebook).
PRED_DIR = Path("..") / "data" / "predictions"
PRED_DIR.mkdir(parents=True, exist_ok=True)

# Save predictions
# (wide format: one row per series/date with y, baseline_pred and best_pred).
results_test_pl = pl.from_pandas(results_test)
pred_path = PRED_DIR / "test_predictions.parquet"
results_test_pl.write_parquet(pred_path)
print(f"Saved predictions to {pred_path}")

# Save metrics
metrics_pl = pl.from_pandas(metrics)
metrics_path = PRED_DIR / "test_metrics.parquet"
metrics_pl.write_parquet(metrics_path)
print(f"Saved metrics to {metrics_path}")

# Save best model configuration
# One-row frame combining the selected configuration with its validation
# and test scores; params is stored as its string representation.
best_config = pd.DataFrame([{
    "model": best_model_name,
    "feature_set": best_feature,
    "params": str(best_params),
    "used_exog": used_exog,
    "val_mae": best_combo["mae"],
    "val_rmse": best_combo["rmse"],
    "val_smape": best_combo["smape"],
    "val_r2": best_combo.get("r2", np.nan),
    "test_mae": mae_best,
    "test_rmse": rmse_best,
    "test_smape": smape_best,
    "test_r2": r2_best,
}])

config_pl = pl.from_pandas(best_config)
config_path = PRED_DIR / "best_model_config.parquet"
config_pl.write_parquet(config_path)
print(f"Saved best model config to {config_path}")

Saved predictions to ..\data\predictions\test_predictions.parquet
Saved metrics to ..\data\predictions\test_metrics.parquet
Saved best model config to ..\data\predictions\best_model_config.parquet


In [11]:
def _read_saved(file_name):
    """Read a saved parquet file from PRED_DIR, or return None if it is missing.

    The two *_long files are only written by the long-format cell further
    down, so on a clean top-to-bottom run they do not exist yet when this
    cell executes.
    """
    path = PRED_DIR / file_name
    if not path.exists():
        print(f"{path} not found - run the cell that writes it first")
        return None
    return pd.read_parquet(path)


# Read the saved artefacts back as a sanity check.
example_1 = _read_saved("best_model_config.parquet")
example_2 = _read_saved("test_metrics.parquet")
example_3 = _read_saved("test_predictions.parquet")
example_4 = _read_saved("test_metrics_long.parquet")
example_5 = _read_saved("test_predictions_long.parquet")
# Preview the long-format files that are available.
tuple(frame.head() for frame in (example_4, example_5) if frame is not None)

(                   model_name metrics  preditions
 0  Baseline (SeasonalNaive-7)     mae  100.566129
 1              Best (AutoETS)     mae   86.852207
 2  Baseline (SeasonalNaive-7)    rmse  164.751355
 3              Best (AutoETS)    rmse  153.463049
 4  Baseline (SeasonalNaive-7)   smape   24.294597,
                    model_name     unique_id         ds       y  preditions
 0  Baseline (SeasonalNaive-7)  "Bubble tea" 2025-11-01  1148.0      1382.0
 1  Baseline (SeasonalNaive-7)  "Bubble tea" 2025-11-02  1095.0      1316.0
 2  Baseline (SeasonalNaive-7)  "Bubble tea" 2025-11-03  1097.0      1229.0
 3  Baseline (SeasonalNaive-7)  "Bubble tea" 2025-11-04  1089.0      1429.0
 4  Baseline (SeasonalNaive-7)  "Bubble tea" 2025-11-05  1218.0      1112.0)

## Long-format outputs
Create long-format metrics and predictions and save to data/predictions.

In [12]:
# Long-format metrics: one row per (model, metric); only the numeric metrics are kept.
# NOTE(review): the value column is spelled "preditions" in the saved files;
# the spelling is kept unchanged here.
metric_cols = ["mae", "rmse", "smape", "r2"]
metrics_subset = metrics[["model", *metric_cols]].rename(columns={"model": "model_name"})
metrics_long = metrics_subset.melt(
    id_vars=["model_name"],
    var_name="metrics",
    value_name="preditions",
)

# Long-format predictions: stack the baseline and best-model forecasts,
# tagging each row with the model it came from.
key_cols = ["unique_id", "ds", "y"]
baseline_long = (
    results_test[[*key_cols, "baseline_pred"]]
    .rename(columns={"baseline_pred": "preditions"})
    .assign(model_name="Baseline (SeasonalNaive-7)")
)
best_long = (
    results_test[[*key_cols, "best_pred"]]
    .rename(columns={"best_pred": "preditions"})
    .assign(model_name=f"Best ({best_model_name})")
)
stacked = pd.concat([baseline_long, best_long], ignore_index=True)
predictions_long = stacked[["model_name", "unique_id", "ds", "y", "preditions"]]

# Write both long-format frames next to the wide-format outputs.
metrics_long_path = PRED_DIR / "test_metrics_long.parquet"
metrics_long_pl = pl.from_pandas(metrics_long)
metrics_long_pl.write_parquet(metrics_long_path)
print(f"Saved long metrics to {metrics_long_path}")

predictions_long_path = PRED_DIR / "test_predictions_long.parquet"
predictions_long_pl = pl.from_pandas(predictions_long)
predictions_long_pl.write_parquet(predictions_long_path)
print(f"Saved long predictions to {predictions_long_path}")

Saved long metrics to ..\data\predictions\test_metrics_long.parquet
Saved long predictions to ..\data\predictions\test_predictions_long.parquet


In [13]:
# Display the long-format test predictions.
predictions_long

Unnamed: 0,model_name,unique_id,ds,y,preditions
0,Baseline (SeasonalNaive-7),"""Bubble tea""",2025-11-01,1148.0,1382.000000
1,Baseline (SeasonalNaive-7),"""Bubble tea""",2025-11-02,1095.0,1316.000000
2,Baseline (SeasonalNaive-7),"""Bubble tea""",2025-11-03,1097.0,1229.000000
3,Baseline (SeasonalNaive-7),"""Bubble tea""",2025-11-04,1089.0,1429.000000
4,Baseline (SeasonalNaive-7),"""Bubble tea""",2025-11-05,1218.0,1112.000000
...,...,...,...,...,...
615,Best (AutoETS),"""Tea""",2025-11-27,1239.0,1237.329451
616,Best (AutoETS),"""Tea""",2025-11-28,1239.0,1170.729862
617,Best (AutoETS),"""Tea""",2025-11-29,1218.0,1125.839810
618,Best (AutoETS),"""Tea""",2025-11-30,1331.0,1194.476400


## Extend predictions to include December
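Generate additional daily predictions through the end of December 2025 and append them to the existing test predictions. No actual values exist for these dates, so `y` is left empty for the appended rows.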

In [15]:
# Check current prediction date range
print(f"Current predictions start: {results_test['ds'].min()}")
print(f"Current predictions end: {results_test['ds'].max()}")

# Calculate days needed to reach the end of December 2025
last_pred_date = pd.to_datetime(results_test['ds'].max())
december_end = pd.Timestamp('2025-12-31')

if last_pred_date < december_end:
    additional_days = (december_end - last_pred_date).days
    print(f"\nNeed {additional_days} additional days to cover all of December")

    # Both StatsForecast objects were fitted on train+val, so the forecast
    # starts right after that data and has to span the test horizon as well.
    h_extended = h_test + additional_days

    # Generate extended predictions with baseline model
    fcst_baseline_ext = sf_baseline.predict(h=h_extended)
    baseline_col = [c for c in fcst_baseline_ext.columns if c not in ("unique_id", "ds")][0]
    fcst_baseline_ext = fcst_baseline_ext.rename(columns={baseline_col: "baseline_pred"})

    # Generate extended predictions with best model
    if best_model_name == "RandomForestRegressor":
        # RandomForest needs a feature row for every future date.
        train_full_extended = pd.concat([
            splits["train"].to_pandas(),
            splits["val"].to_pandas(),
            splits["test"].to_pandas()
        ], ignore_index=True).sort_values(["unique_id", "ds"]).reset_index(drop=True)

        # Future dates start the day after the last observed date.
        unique_ids = train_full_extended["unique_id"].unique()
        last_date = pd.to_datetime(train_full_extended['ds'].max())
        future_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), end=december_end, freq='D')

        # One row per (series, future date), series-major like the original loops.
        future_df = pd.MultiIndex.from_product(
            [unique_ids, future_dates], names=["unique_id", "ds"]
        ).to_frame(index=False)
        # Placeholder: every feature is set to zero because real future
        # feature values are not generated in this notebook.
        for col in feature_cols:
            future_df[col] = 0

        # Align to the columns (and order) the model was fitted with. A sorted
        # union of column names can differ from the fit-time order, which
        # scikit-learn rejects at predict time.
        fitted_columns = getattr(best_model, "feature_names_in_", None)
        if fitted_columns is None:
            # Fallback: rebuild the training design matrix to recover its columns.
            train_df_check = y_train_val.merge(X_train_val, on=["unique_id", "ds"], how="left").fillna(0)
            feature_cols_rf = [c for c in train_df_check.columns if c not in ["unique_id", "ds", "y"]]
            fitted_columns = pd.get_dummies(train_df_check[feature_cols_rf], drop_first=False).columns
        future_X = future_df[feature_cols].reindex(columns=list(fitted_columns), fill_value=0)

        dec_preds = best_model.predict(future_X)
        fcst_best_ext = future_df[["unique_id", "ds"]].copy()
        fcst_best_ext["best_pred"] = dec_preds

        # Combine with existing predictions
        fcst_best_ext = pd.concat([fcst_best, fcst_best_ext], ignore_index=True)
    else:
        fcst_best_ext = sf_best.predict(h=h_extended)
        best_col = [c for c in fcst_best_ext.columns if c not in ("unique_id", "ds")][0]
        fcst_best_ext = fcst_best_ext.rename(columns={best_col: "best_pred"})

    # Build the rows beyond the test period with a single join instead of
    # scanning the best-model frame once per baseline row.
    unique_ids = results_test["unique_id"].unique()
    best_unique = fcst_best_ext[["unique_id", "ds", "best_pred"]].drop_duplicates(
        subset=["unique_id", "ds"], keep="first"
    )
    december_df = fcst_baseline_ext[["unique_id", "ds", "baseline_pred"]].merge(
        best_unique, on=["unique_id", "ds"], how="inner"
    )
    is_future = pd.to_datetime(december_df["ds"]) > last_pred_date
    is_known_series = december_df["unique_id"].isin(unique_ids)
    december_df = (
        december_df.loc[is_future & is_known_series]
        .assign(y=np.nan)  # No actual values for future dates
        [["unique_id", "ds", "y", "baseline_pred", "best_pred"]]
    )

    # Append December predictions to existing results
    results_test_extended = pd.concat([results_test, december_df], ignore_index=True).sort_values(["unique_id", "ds"]).reset_index(drop=True)

    print(f"\n✓ Extended predictions to {results_test_extended['ds'].max()}")
    print(f"Total predictions: {len(results_test_extended)} rows")
    print(f"Date range: {results_test_extended['ds'].min()} to {results_test_extended['ds'].max()}")

    # Save extended predictions
    results_extended_pl = pl.from_pandas(results_test_extended)
    pred_extended_path = PRED_DIR / "test_predictions_extended.parquet"
    results_extended_pl.write_parquet(pred_extended_path)
    print(f"Saved extended predictions to {pred_extended_path}")

    # Update long-format predictions with December
    baseline_long_ext = results_test_extended[["unique_id", "ds", "y", "baseline_pred"]].rename(columns={"baseline_pred": "preditions"})
    baseline_long_ext["model_name"] = "Baseline (SeasonalNaive-7)"
    best_long_ext = results_test_extended[["unique_id", "ds", "y", "best_pred"]].rename(columns={"best_pred": "preditions"})
    best_long_ext["model_name"] = f"Best ({best_model_name})"
    predictions_long_extended = pd.concat([baseline_long_ext, best_long_ext], ignore_index=True)[["model_name", "unique_id", "ds", "y", "preditions"]]

    # Save extended long-format predictions
    predictions_long_ext_pl = pl.from_pandas(predictions_long_extended)
    predictions_long_ext_path = PRED_DIR / "test_predictions_long_extended.parquet"
    predictions_long_ext_pl.write_parquet(predictions_long_ext_path)
    print(f"Saved extended long predictions to {predictions_long_ext_path}")

else:
    print(f"\nPredictions already cover December (end date: {last_pred_date})")
    # Copy so that later changes to the "extended" frames cannot leak into
    # the frames that were already saved.
    results_test_extended = results_test.copy()
    predictions_long_extended = predictions_long.copy()

Current predictions start: 2025-11-01 00:00:00
Current predictions end: 2025-12-01 00:00:00

Need 30 additional days to cover all of December

✓ Extended predictions to 2025-12-31 00:00:00
Total predictions: 610 rows
Date range: 2025-11-01 00:00:00 to 2025-12-31 00:00:00
Saved extended predictions to ..\data\predictions\test_predictions_extended.parquet
Saved extended long predictions to ..\data\predictions\test_predictions_long_extended.parquet


In [17]:
# Display the long-format predictions including the December extension.
predictions_long_extended

Unnamed: 0,model_name,unique_id,ds,y,preditions
0,Baseline (SeasonalNaive-7),"""Bubble tea""",2025-11-01,1148.0,1382.000000
1,Baseline (SeasonalNaive-7),"""Bubble tea""",2025-11-02,1095.0,1316.000000
2,Baseline (SeasonalNaive-7),"""Bubble tea""",2025-11-03,1097.0,1229.000000
3,Baseline (SeasonalNaive-7),"""Bubble tea""",2025-11-04,1089.0,1429.000000
4,Baseline (SeasonalNaive-7),"""Bubble tea""",2025-11-05,1218.0,1112.000000
...,...,...,...,...,...
1215,Best (AutoETS),"""Tea""",2025-12-27,,1125.839810
1216,Best (AutoETS),"""Tea""",2025-12-28,,1194.476400
1217,Best (AutoETS),"""Tea""",2025-12-29,,1255.664828
1218,Best (AutoETS),"""Tea""",2025-12-30,,1254.233791


In [16]:
# Display sample of extended predictions
print("\nSample of extended predictions (including December):")
display(results_test_extended.tail(10))
print("\nPredictions by month:")
# Group by a derived Series instead of adding a `month` column, so this
# display-only cell does not modify the frame that is saved to parquet.
month = pd.to_datetime(results_test_extended['ds']).dt.to_period('M').rename('month')
print(results_test_extended.groupby(month).size())


Sample of extended predictions (including December):


Unnamed: 0,unique_id,ds,y,baseline_pred,best_pred
600,"""Tea""",2025-12-22,,1415.0,1255.664828
601,"""Tea""",2025-12-23,,1368.0,1254.233791
602,"""Tea""",2025-12-24,,1274.0,1253.418165
603,"""Tea""",2025-12-25,,1286.0,1237.329451
604,"""Tea""",2025-12-26,,1110.0,1170.729862
605,"""Tea""",2025-12-27,,1265.0,1125.83981
606,"""Tea""",2025-12-28,,1282.0,1194.4764
607,"""Tea""",2025-12-29,,1415.0,1255.664828
608,"""Tea""",2025-12-30,,1368.0,1254.233791
609,"""Tea""",2025-12-31,,1274.0,1253.418165



Predictions by month:
month
2025-11    300
2025-12    310
Freq: M, dtype: int64
