In [2]:
"""
Debug script to compare ensemble files with dashboard expectations
"""
import pandas as pd
from pathlib import Path

print("="*60)
print("DEBUGGING DASHBOARD DATA LOADING")
print("="*60)

# Load all the data
print("\n1. Loading files...")
try:
    forecast_data = pd.read_parquet('data/all_forecasts.parquet')
    print(f"✅ forecast_data.pq: {len(forecast_data):,} rows")
except Exception as e:
    print(f"❌ Error loading forecast_data.pq: {e}")
    forecast_data = pd.DataFrame()

try:
    ensemble_data = pd.read_parquet('data/ensemble_forecasts.pq')
    print(f"✅ ensemble_forecasts.pq: {len(ensemble_data):,} rows")
except Exception as e:
    print(f"❌ Error loading ensemble_forecasts.pq: {e}")
    ensemble_data = pd.DataFrame()

# Compare column names
print("\n2. Column comparison:")
print("\nForecast data columns:")
print(forecast_data.columns.tolist())

print("\nEnsemble data columns:")
print(ensemble_data.columns.tolist())

# Check for mismatches
forecast_cols = set(forecast_data.columns)
ensemble_cols = set(ensemble_data.columns)

missing_in_ensemble = forecast_cols - ensemble_cols
extra_in_ensemble = ensemble_cols - forecast_cols

if missing_in_ensemble:
    print(f"\n⚠️ Columns in forecast but NOT in ensemble: {missing_in_ensemble}")
if extra_in_ensemble:
    print(f"\n⚠️ Columns in ensemble but NOT in forecast: {extra_in_ensemble}")

# Check key columns exist
print("\n3. Checking key columns:")
required_cols = ['reference_date', 'target_end_date', 'location', 'horizon', 
                 'target', 'output_type', 'output_type_id', 'value']

for col in required_cols:
    in_forecast = col in forecast_data.columns
    in_ensemble = col in ensemble_data.columns
    status = "✅" if (in_forecast and in_ensemble) else "❌"
    print(f"{status} {col}: forecast={in_forecast}, ensemble={in_ensemble}")

# Check model column (could be 'model' or 'Model')
print("\n4. Model column check:")
if 'model' in forecast_data.columns:
    print(f"✅ forecast_data has 'model' column")
    print(f"   Models: {sorted(forecast_data['model'].unique())[:5]}...")
elif 'Model' in forecast_data.columns:
    print(f"✅ forecast_data has 'Model' column (capital M)")
    print(f"   Models: {sorted(forecast_data['Model'].unique())[:5]}...")
else:
    print(f"❌ forecast_data has NO model column!")

if 'model' in ensemble_data.columns:
    print(f"✅ ensemble_data has 'model' column")
    print(f"   Models: {ensemble_data['model'].unique()}")
elif 'Model' in ensemble_data.columns:
    print(f"✅ ensemble_data has 'Model' column (capital M)")
    print(f"   Models: {ensemble_data['Model'].unique()}")
else:
    print(f"❌ ensemble_data has NO model column!")

# Check data types
print("\n5. Data type comparison:")
for col in required_cols:
    if col in forecast_data.columns and col in ensemble_data.columns:
        forecast_dtype = forecast_data[col].dtype
        ensemble_dtype = ensemble_data[col].dtype
        match = "✅" if forecast_dtype == ensemble_dtype else "⚠️"
        print(f"{match} {col}: forecast={forecast_dtype}, ensemble={ensemble_dtype}")

# Check sample values
print("\n6. Sample data:")
print("\nForecast data sample:")
print(forecast_data.head(2))

print("\nEnsemble data sample:")
print(ensemble_data.head(2))

# Try combining them
print("\n7. Testing combine:")
try:
    # Standardize model column
    if 'Model' in ensemble_data.columns and 'model' not in ensemble_data.columns:
        ensemble_data['model'] = ensemble_data['Model']
        ensemble_data = ensemble_data.drop(columns=['Model'])
    
    combined = pd.concat([forecast_data, ensemble_data], ignore_index=True)
    print(f"✅ Successfully combined: {len(combined):,} rows")
    print(f"   Models: {combined['model'].nunique() if 'model' in combined.columns else 'N/A'}")
    
    # Check if Median_Ensemble is there
    if 'model' in combined.columns:
        has_ensemble = 'Median_Ensemble' in combined['model'].values
        print(f"   Median_Ensemble present: {'✅' if has_ensemble else '❌'}")
        
        if has_ensemble:
            ensemble_subset = combined[combined['model'] == 'Median_Ensemble']
            print(f"   Median_Ensemble rows: {len(ensemble_subset):,}")
            print(f"   Targets: {ensemble_subset['target'].unique()}")
            print(f"   Output types: {ensemble_subset['output_type'].unique()}")
    
except Exception as e:
    print(f"❌ Error combining: {e}")
    import traceback
    traceback.print_exc()

print("\n" + "="*60)

DEBUGGING DASHBOARD DATA LOADING

1. Loading files...
✅ forecast_data.pq: 471,731 rows
✅ ensemble_forecasts.pq: 68,698 rows

2. Column comparison:

Forecast data columns:
['reference_date', 'target', 'horizon', 'target_end_date', 'location', 'output_type', 'output_type_id', 'value', 'model']

Ensemble data columns:
['reference_date', 'location', 'horizon', 'target', 'target_end_date', 'output_type', 'output_type_id', 'value', 'model']

3. Checking key columns:
✅ reference_date: forecast=True, ensemble=True
✅ target_end_date: forecast=True, ensemble=True
✅ location: forecast=True, ensemble=True
✅ horizon: forecast=True, ensemble=True
✅ target: forecast=True, ensemble=True
✅ output_type: forecast=True, ensemble=True
✅ output_type_id: forecast=True, ensemble=True
✅ value: forecast=True, ensemble=True

4. Model column check:
✅ forecast_data has 'model' column
   Models: ['CEPH-Rtrend_fluH', 'Gatech-ensemble_prob', 'Gatech-ensemble_stat', 'MIGHTE-Joint', 'MIGHTE-Nsemble']...
✅ ensemble_data

In [3]:
# Check date formats
st.write("DEBUG: reference_date dtype:", forecast_data['reference_date'].dtype)
st.write("DEBUG: target_end_date dtype:", forecast_data['target_end_date'].dtype)

NameError: name 'st' is not defined