# In [1]:  (notebook cell prompt, kept as a comment so the file parses as Python)
# ======================================================================
# 🔮 PARAMETERIZED FORECAST GENERATION SCRIPT (CFPR 2026)
# ======================================================================

import os
import time
import torch
import numpy as np
import pandas as pd
from autogluon.timeseries import TimeSeriesPredictor, TimeSeriesDataFrame


# ----------------------------------------------------------------------
# 🧾 Helper: Save Forecasts in CFPR Standardized Format
# ----------------------------------------------------------------------
def save_standard_forecast(pred_df, category, model_name, forecast_year, output_root):
    """
    Convert an AutoGluon forecast DataFrame to the standardized CFPR format and save it as CSV.

    Output columns, in this order:
    ['timestamp', 'q_0.5', 'q_0.01', 'q_0.05', 'q_0.1', 'q_0.25', 'q_0.75', 'q_0.9', 'q_0.95', 'q_0.99']

    The file is written to
    ``<output_root>/<category>/<category>_<model_name>_forecasts.csv`` where any
    "/" in the category or model name is replaced by "_".

    Parameters
    ----------
    pred_df : pandas.DataFrame
        Forecast table with a 'timestamp' column and quantile columns named
        "0.01", "0.05", ..., "0.99" and/or a "mean" column. It is not modified.
    category : str
        Category name; used for the output sub-directory and the file name.
    model_name : str
        Model name; used in the file name.
    forecast_year : str
        Currently unused; kept so existing callers keep working.
    output_root : str
        Root directory under which the per-category folder is created.
    """
    # Work on a copy so the caller's DataFrame is left untouched.
    pred_df = pred_df.copy()
    pred_df["timestamp"] = pd.to_datetime(pred_df["timestamp"], errors="coerce")

    quantile_cols = {
        "0.01": "q_0.01", "0.05": "q_0.05", "0.1": "q_0.1",
        "0.25": "q_0.25", "0.5": "q_0.5", "0.75": "q_0.75",
        "0.9": "q_0.9", "0.95": "q_0.95", "0.99": "q_0.99",
    }
    pred_df = pred_df.rename(columns=quantile_cols)

    # Fallback: if only mean exists, use it as the median column
    if "mean" in pred_df.columns and "q_0.5" not in pred_df.columns:
        pred_df["q_0.5"] = pred_df["mean"]

    # Ensure all expected columns exist (missing quantiles are filled with NaN)
    desired_cols = ["timestamp", "q_0.5", "q_0.01", "q_0.05", "q_0.1", "q_0.25",
                    "q_0.75", "q_0.9", "q_0.95", "q_0.99"]
    for col in desired_cols:
        if col not in pred_df.columns:
            pred_df[col] = np.nan
    pred_df = pred_df[desired_cols]

    pred_df = pred_df.round(3)

    # Sanitize once and reuse for BOTH the directory and the file name; a raw
    # "/" in the file name would otherwise point into a non-existent folder.
    safe_category = category.replace("/", "_")
    safe_model = str(model_name).replace("/", "_")

    out_dir = os.path.join(output_root, safe_category)
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f"{safe_category}_{safe_model}_forecasts.csv")
    pred_df.to_csv(out_path, index=False)
    print(f"💾 Saved standardized forecast → {out_path}")


# ----------------------------------------------------------------------
# ⚙️ MAIN FORECASTING PIPELINE
# ----------------------------------------------------------------------
def generate_forecasts(
    merged_path,
    output_dir,
    train_cutoff="2025-08-01",
    forecast_horizon=16,
    selected_categories=None,
    selected_models=None,
    time_limit=300,
):
    """
    Train models up to a fixed cutoff and forecast `forecast_horizon` months ahead.

    For every selected category a separate TimeSeriesPredictor is fitted on
    the observations with timestamp <= `train_cutoff`, and one standardized
    CSV per trained model is written via `save_standard_forecast`.

    Parameters
    ----------
    merged_path : str
        Path to merged CFPR dataset (must include a 'timestamp' column; every
        other column is treated as one category series)
    output_dir : str
        Directory where predictor artifacts and forecast outputs will be saved
    train_cutoff : str
        Cutoff date for training, inclusive (default '2025-08-01')
    forecast_horizon : int
        How many months ahead to forecast (default 16)
    selected_categories : list or None
        Food categories to forecast; if None → use all in dataset
    selected_models : list or None
        Models to train; if None → use default list (plus "Chronos" when CUDA
        is available)
    time_limit : int
        Training time limit per category (seconds)

    Raises
    ------
    ValueError
        If the dataset has no 'timestamp' column.
    KeyError
        If any entry of `selected_categories` is not a column of the dataset.
    """

    # ------------------------------------------------------------------
    # Load data
    # ------------------------------------------------------------------
    # Check the header first so a missing date column yields an actionable
    # message (listing what IS there) instead of the bare pandas error.
    header = pd.read_csv(merged_path, nrows=0)
    if "timestamp" not in header.columns:
        raise ValueError(
            f"Dataset {merged_path!r} has no 'timestamp' column; "
            f"available columns: {list(header.columns)}"
        )

    df = pd.read_csv(merged_path, parse_dates=["timestamp"])
    print(f"✅ Loaded dataset with shape {df.shape}")

    # Wide → long: one row per (timestamp, category) pair
    df_long = df.melt(id_vars=["timestamp"], var_name="item_id", value_name="target")
    ts_df = TimeSeriesDataFrame.from_data_frame(df_long, id_column="item_id", timestamp_column="timestamp")

    all_categories = ts_df.index.get_level_values("item_id").unique().tolist()
    if selected_categories is None:
        selected_categories = all_categories
    else:
        # Fail fast: a typo should not surface only after hours of training
        # on the categories that precede it.
        known = set(all_categories)
        unknown = [c for c in selected_categories if c not in known]
        if unknown:
            raise KeyError(f"Categories not found in dataset: {unknown}")

    # ------------------------------------------------------------------
    # Select models
    # ------------------------------------------------------------------
    if selected_models is None:
        selected_models = [
            "DeepAR",
            "AutoETS",
            "TemporalFusionTransformer",
            "SimpleFeedForward",
        ]
        if torch.cuda.is_available():
            selected_models.append("Chronos")

    print(f"🧩 Using models: {selected_models}")
    print(f"🍱 Forecasting {len(selected_categories)} categories (up to {train_cutoff})")

    # ------------------------------------------------------------------
    # Forecast Loop
    # ------------------------------------------------------------------
    os.makedirs(output_dir, exist_ok=True)
    train_cutoff_ts = pd.Timestamp(train_cutoff)
    skipped = []  # (category, model_name or None, reason) for the final summary

    for category in selected_categories:
        print("\n" + "=" * 70)
        print(f"🔮 Generating forecasts for: {category}")
        print("=" * 70)

        cat_ts = ts_df.loc[[category]]
        train_data = cat_ts.loc[cat_ts.index.get_level_values("timestamp") <= train_cutoff_ts]

        # Nothing to train on (cutoff precedes the data): skip instead of
        # crashing on NaT in the range printout below.
        if len(train_data) == 0:
            reason = f"no observations on or before {train_cutoff}"
            print(f"⚠️ Skipping {category}: {reason}")
            skipped.append((category, None, reason))
            continue

        # Sanity check
        train_times = train_data.index.get_level_values("timestamp")
        print(f"📆 Training range: {train_times.min().date()} → {train_times.max().date()}")

        save_dir = os.path.join(output_dir, category.replace("/", "_"))
        os.makedirs(save_dir, exist_ok=True)

        # Train model
        predictor = TimeSeriesPredictor(
            target="target",
            prediction_length=forecast_horizon,
            freq="MS",
            path=save_dir,
            eval_metric="MAPE",
        )

        start_time = time.time()
        predictor.fit(
            train_data=train_data,
            presets="medium_quality",
            time_limit=time_limit,
            hyperparameters={m: {} for m in selected_models},
        )
        train_duration = time.time() - start_time
        print(f"✅ Finished training {category} in {train_duration:.1f} sec")

        # Predict the `forecast_horizon` periods after the end of the training data.
        # Use `model_names` when the predictor offers it, and fall back to
        # `get_model_names` for older AutoGluon releases.
        list_models = getattr(predictor, "model_names", None)
        if not callable(list_models):
            list_models = predictor.get_model_names
        trained_models = list_models()
        print(f"🧩 Trained models: {trained_models}")

        for model_name in trained_models:
            try:
                print(f"📈 Predicting with {model_name} ...")
                pred_df = predictor.predict(train_data, model=model_name).reset_index()
                save_standard_forecast(pred_df, category, model_name, "2026", output_dir)
            except Exception as e:
                # Best effort: one failing model must not abort the whole run,
                # but it is recorded so the final summary stays truthful.
                print(f"⚠️ Skipping {model_name} for {category} due to error: {e}")
                skipped.append((category, model_name, str(e)))
                continue

    if skipped:
        print(f"\n⚠️ Finished with {len(skipped)} skipped forecast(s):")
        for category, model_name, reason in skipped:
            label = category if model_name is None else f"{category} / {model_name}"
            print(f"   - {label}: {reason}")
    else:
        print("\n✅ ALL FORECASTS GENERATED AND SAVED SUCCESSFULLY ✅")


# ----------------------------------------------------------------------
# 🏁 Example Usage
# ----------------------------------------------------------------------
if __name__ == "__main__":
    # Input dataset and output location for this run.
    merged_path = "/h/kupfersk/cfpr_2026/data_limited_2026/CFPR_2026_master_dataset.csv"
    output_dir = "/h/kupfersk/cfpr_2026/generate_forecasts/output/Forecasts_2026"

    # Example: pick categories and models
    food_categories = [
        "Bakery and cereal products (excluding baby food)",
        "Dairy products and eggs",
        "Fish, seafood and other marine products",
        "Food purchased from restaurants",
        "Food",
        "Fruit, fruit preparations and nuts",
        "Meat",
        "Other food products and non-alcoholic beverages",
        "Vegetables and vegetable preparations",
    ]

    selected_models = ["DeepAR", "TemporalFusionTransformer"]

    generate_forecasts(
        merged_path=merged_path,
        output_dir=output_dir,
        train_cutoff="2025-08-01",
        forecast_horizon=16,
        # The list defined above is `food_categories`; the previous
        # `selected_categories` name was undefined here and raised NameError.
        selected_categories=food_categories,
        selected_models=selected_models,
        time_limit=1800,  # seconds → 30 min per category
    )


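# Recorded notebook output from running the cell above:
# ValueError: Missing column provided to 'parse_dates': 'timestamp'
# (pandas raises this from read_csv when the CSV has no column named 'timestamp'.)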