In [2]:
import pickle
import pandas as pd
import numpy as np
import os

# =====================================================================
# 0. LOAD PREVIOUSLY SAVED DICTS
# =====================================================================
# Reads two pickled objects from `load_dir` and binds them to
# `category_results_mape` and `category_residual_results`.
# NOTE(review): pickle.load can run arbitrary code stored in the file, so
# only point this at pickles written by a trusted run of this project.

load_dir = "./cfpr_evaluation_dicts"

mape_pickle_path = os.path.join(load_dir, "category_results_mape.pkl")
residual_pickle_path = os.path.join(load_dir, "category_residual_results.pkl")

# The files are read one after the other; each handle is closed as soon as
# its object has been deserialised.
with open(mape_pickle_path, "rb") as f:
    category_results_mape = pickle.load(f)

with open(residual_pickle_path, "rb") as f:
    category_residual_results = pickle.load(f)

print("✅ Loaded evaluation dictionaries")

# =====================================================================
# 1. BUILD UNIFIED TABLES FOR EACH CATEGORY
# =====================================================================
# For every category present in BOTH loaded dicts, build one table indexed
# by model with the columns:
#   MAPE, YoY_Residual, Rank_MAPE, Rank_Residual, Rank_Mean
# sorted so that the lowest combined rank comes first.

combined_summaries = {}

for category in category_results_mape:
    # Categories without residual results are left out of the summary.
    if category in category_residual_results:
        # Column-wise mean of the MAPE table next to the YoY residuals.
        # NOTE(review): presumably the MAPE table has one column per model
        # and the residuals are a Series indexed by model — confirm upstream.
        per_model = pd.concat(
            [
                category_results_mape[category].mean().rename("MAPE"),
                category_residual_results[category].rename("YoY_Residual"),
            ],
            axis=1,
        ).dropna()  # keep only models that have both metrics

        # Rank 1 = best. Residuals are ranked by magnitude, so over- and
        # under-shooting by the same amount are treated alike; ties share
        # the lowest rank (method="min").
        mape_rank = per_model["MAPE"].rank(method="min", ascending=True)
        residual_rank = per_model["YoY_Residual"].abs().rank(method="min", ascending=True)

        combo = per_model.assign(Rank_MAPE=mape_rank, Rank_Residual=residual_rank)
        combo["Rank_Mean"] = combo[["Rank_MAPE", "Rank_Residual"]].mean(axis=1)

        # Best combined rank first.
        combined_summaries[category] = combo.sort_values("Rank_Mean")

# =====================================================================
# 2. PRINT TOP MODELS PER CATEGORY
# =====================================================================
# Shows the first five rows of every per-category table (the tables are
# stored sorted by Rank_Mean), rounded to three decimals.

section_title = "\n===== Agreement between MAPE & YoY Residuals (Corrected Ranks) ====="
print(section_title)

for category, df_cat in combined_summaries.items():
    print(f"\n{category}")
    top_rows = df_cat.head(5)
    print(top_rows.round(3))


✅ Loaded evaluation dictionaries

===== Agreement between MAPE & YoY Residuals (Corrected Ranks) =====

Food
                                                     MAPE  YoY_Residual  \
ag_local_TemporalFusionTransformerModel             0.480         0.246   
ag_global_all_SimpleFeedForwardModel_exp5_geopo...  0.921         0.568   
ag_local_AutoARIMAModel                             0.743         0.596   
ag_global_all_ChronosModel                          1.323        -0.459   
ag_global_all_SimpleFeedForwardModel_exp1_human_    0.928         0.804   

                                                    Rank_MAPE  Rank_Residual  \
ag_local_TemporalFusionTransformerModel                   1.0            1.0   
ag_global_all_SimpleFeedForwardModel_exp5_geopo...        3.0            3.0   
ag_local_AutoARIMAModel                                   2.0            4.0   
ag_global_all_ChronosModel                                7.0            2.0   
ag_global_all_SimpleFeedForwardModel_exp

In [3]:
# =====================================================================
# 3. GLOBAL ANALYSIS: WHICH MODELS ARE CONSISTENTLY BEST?
# =====================================================================

# Stack all category-level tables into one long DataFrame with a `category`
# column and the former index exposed as a `model` column.
# rename_axis("model") names the index *before* reset_index(), so the column
# is called "model" whether or not the per-category tables already carry an
# index name. (Renaming a column called "index" afterwards only works when
# the index is unnamed and would make groupby("model") fail otherwise.)
stacked = (
    pd.concat(
        [df.assign(category=cat) for cat, df in combined_summaries.items()],
        axis=0,
    )
    .rename_axis("model")
    .reset_index()
)

# --- Compute overall statistics per model ---
# Lower MAPE and lower abs(Residual) = better.
# Caveats when reading the result:
#   * the "YoY_Residual" column of `overall_summary` holds the mean ABSOLUTE
#     residual across categories, not a signed mean;
#   * every statistic is averaged only over the categories in which the model
#     has a row in `combined_summaries`, so a model that appears in few
#     categories is not penalised for the ones it is missing from.
overall_summary = (
    stacked.groupby("model")[["MAPE", "YoY_Residual", "Rank_Mean"]]
    .agg({
        "MAPE": "mean",
        "YoY_Residual": lambda s: s.abs().mean(),
        "Rank_Mean": "mean",
    })
    .sort_values("Rank_Mean")
)

print("\n===== Global Model Ranking (Lower = Better Across All Food Categories) =====")
display(overall_summary.round(3))

# =====================================================================
# 4. DETERMINE WHICH MODELS TO RETAIN
# =====================================================================

# --- Core set: the first N rows of `overall_summary` ---
N = 5  # size of the core set; change here to keep more or fewer models
top_models = overall_summary.index[:N].tolist()

print(f"\n🧠 Suggested Core Models to Retain ({N} models):")
for position, model_name in enumerate(top_models, start=1):
    print(f"{position}. {model_name}")

# --- Coverage: in how many categories does each model land in the top 3? ---
per_category_rank = stacked.groupby(["category", "model"])["Rank_Mean"].mean()
coverage = per_category_rank.reset_index().sort_values("Rank_Mean")

# Rank models inside each category. With method="min" tied models share the
# lowest rank, so a category may flag more than three models as top-3.
rank_within_category = coverage.groupby("category")["Rank_Mean"].rank(method="min")
coverage["Top3"] = rank_within_category <= 3

# Summing the boolean flags counts the top-3 appearances per model.
top3_counts = coverage.groupby("model")["Top3"].sum()
model_coverage = top3_counts.sort_values(ascending=False)

print("\n📊 Model Coverage — Number of Categories where Model is Top-3:")
display(model_coverage.head(10))



===== Global Model Ranking (Lower = Better Across All Food Categories) =====


Unnamed: 0_level_0,MAPE,YoY_Residual,Rank_Mean
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
gemini-1.5-pro_report_True,0.444,0.017,1.5
ag_global_all_TemporalFusionTransformerModel_exp1_llm_,1.205,0.114,1.5
gemini-1.5-pro_report_False,0.425,0.152,2.5
claude-3-5-sonnet-20240620_report_False,1.018,0.175,2.667
ag_local_AutoARIMAModel,0.57,0.35,2.75
ag_local_TemporalFusionTransformerModel,0.804,0.377,3.875
ag_global_all_SimpleFeedForwardModel_exp5_climate_,1.559,0.26,4.333
ag_local_DeepARModel,1.569,0.487,4.625
claude-3-5-sonnet-20240620_report_True,1.147,0.453,4.75
ag_global_all_DeepARModel_exp1_llm_,0.636,0.335,5.0



🧠 Suggested Core Models to Retain (5 models):
1. gemini-1.5-pro_report_True
2. ag_global_all_TemporalFusionTransformerModel_exp1_llm_
3. gemini-1.5-pro_report_False
4. claude-3-5-sonnet-20240620_report_False
5. ag_local_AutoARIMAModel

📊 Model Coverage — Number of Categories where Model is Top-3:


model
ag_local_DeepARModel                            3
claude-3-5-sonnet-20240620_report_False         3
ag_local_DLinearModel                           2
ag_local_AutoETSModel                           2
ag_local_AutoARIMAModel                         2
ag_local_TemporalFusionTransformerModel         2
ag_global_all_TemporalFusionTransformerModel    2
ag_global_all_DeepARModel                       2
ag_global_all_SimpleFeedForwardModel            1
gemini-1.5-pro_report_True                      1
Name: Top3, dtype: int64

In [4]:
# =====================================================================
# 5️⃣ SAVE CROSS-ANALYSIS RESULTS
# =====================================================================

import pickle

# Writes the three cross-analysis artefacts into `save_dir` (created if it
# does not exist yet) and prints a short confirmation plus previews.
save_dir = "./cfpr_evaluation_dicts"
os.makedirs(save_dir, exist_ok=True)

# --- 1️⃣ Detailed combined per-category tables, pickled as a single object
summaries_path = os.path.join(save_dir, "combined_summaries.pkl")
with open(summaries_path, "wb") as f:
    pickle.dump(combined_summaries, f)

# --- 2️⃣ Overall summary table; the index is turned into a regular column
#         first, so the CSV is written without a separate index column
overall_path = os.path.join(save_dir, "overall_summary_cross.csv")
overall_summary.reset_index().to_csv(overall_path, index=False)

# --- 3️⃣ Model coverage info
coverage_path = os.path.join(save_dir, "model_coverage_cross.csv")
model_coverage.to_csv(coverage_path)

print("✅ Saved cross-analysis outputs:")
saved_file_notes = (
    "• combined_summaries.pkl — detailed category-level comparison (MAPE + YoY residual + ranks)",
    "• overall_summary_cross.csv — global summary of model performance",
    "• model_coverage_cross.csv — number of categories where each model ranked top-3\n",
)
for note in saved_file_notes:
    print(note)

# --- Optional confirmation preview ---
print("\n===== Top 10 Models by Combined Rank =====")
top10_overall = overall_summary.head(10)
print(top10_overall.round(3))

print("\n===== Model Coverage Summary (Top 10) =====")
top10_coverage = model_coverage.head(10)
print(top10_coverage)


✅ Saved cross-analysis outputs:
• combined_summaries.pkl — detailed category-level comparison (MAPE + YoY residual + ranks)
• overall_summary_cross.csv — global summary of model performance
• model_coverage_cross.csv — number of categories where each model ranked top-3


===== Top 10 Models by Combined Rank =====
                                                     MAPE  YoY_Residual  \
model                                                                     
gemini-1.5-pro_report_True                          0.444         0.017   
ag_global_all_TemporalFusionTransformerModel_ex...  1.205         0.114   
gemini-1.5-pro_report_False                         0.425         0.152   
claude-3-5-sonnet-20240620_report_False             1.018         0.175   
ag_local_AutoARIMAModel                             0.570         0.350   
ag_local_TemporalFusionTransformerModel             0.804         0.377   
ag_global_all_SimpleFeedForwardModel_exp5_climate_  1.559         0.260   
ag_local_D