In [1]:
import pandas as pd
import numpy as np
import os
from datetime import datetime

# --- CONFIGURATION ---
# The project root is taken to be the parent of the current working directory,
# so this must be run from a direct sub-folder of the project.
BASE_DIR = os.path.dirname(os.getcwd())
OUTPUT_DIR = os.path.join(BASE_DIR, "outputs")
PREDICTIONS_DIR = os.path.join(BASE_DIR, "predictions")
# NOTE: despite the name, this file is loaded below as the *historical* series.
FORECAST_FILE = os.path.join(BASE_DIR, "data", "processed", "forecast_ready.parquet")

# Target cities
TARGET_CITIES = ["Boston", "Paris", "Beijing", "Tokyo", "Berlin", "London"]

# Load data
print("Loading data for report generation...")
historical = pd.read_parquet(FORECAST_FILE)
forecasts = pd.read_parquet(os.path.join(PREDICTIONS_DIR, 'pm25_forecast_2026_01.parquet'))

# Filter for target cities.
# .copy() detaches the filtered rows from the frames read from disk, so the
# column assignments below operate on independent frames and do not trigger
# pandas' SettingWithCopyWarning.
historical = historical[historical['City'].isin(TARGET_CITIES)].copy()
forecasts = forecasts[forecasts['City'].isin(TARGET_CITIES)].copy()

# Normalise the date columns and derive calendar parts used for the
# historical-January comparison.
historical['Date'] = pd.to_datetime(historical['Date'])
forecasts['Date'] = pd.to_datetime(forecasts['Date'])
historical['Month'] = historical['Date'].dt.month
historical['Year'] = historical['Date'].dt.year

# Get actual cities present (only those that really appear in the forecast file)
cities = sorted(forecasts['City'].unique())

# === GENERATE REPORT ===
# Accumulates the report one line at a time; joined with newlines when saved.
report_lines = []


def add_line(text=""):
    """Append a single line to the report buffer (blank line by default)."""
    report_lines.append(text)


def add_separator(char="=", length=70):
    """Append a horizontal rule: ``char`` repeated ``length`` times."""
    rule = char * length
    add_line(rule)

# Header: report title framed by two full-width rules
add_separator("=")
add_line("PM2.5 FORECAST EVALUATION REPORT - JANUARY 2026")
add_separator("=")
add_line()

# --- 1. FORECAST SUMMARY ---
add_line("1. FORECAST SUMMARY")
add_separator("-")

# Per-city descriptive statistics of the predicted values, rounded to 2 dp.
# Aggregating the selected column directly yields flat column names
# ('mean', 'std', 'min', 'max') without a relabelling step.
summary = (
    forecasts.groupby('City')['Prediction']
    .agg(['mean', 'std', 'min', 'max'])
    .round(2)
)

add_line(summary.to_string())
add_line()

# --- 2. COMPARISON WITH HISTORICAL JANUARY ---
add_line("2. COMPARISON WITH HISTORICAL JANUARY")
add_separator("-")

# Historical January statistics: per-city mean and std of the 'median'
# column, restricted to rows whose month is January.
january_rows = historical.loc[historical['Month'] == 1]
jan_historical = (
    january_rows.groupby('City')['median']
    .agg(['mean', 'std'])
    .round(2)
    .rename(columns={'mean': 'Historical_Jan_Mean', 'std': 'Historical_Jan_Std'})
)

# Forecast January average per city
jan_forecast = forecasts.groupby('City')['Prediction'].mean().round(2)

# Side-by-side table; all pieces share the City index, so pandas aligns them.
hist_mean = jan_historical['Historical_Jan_Mean']
comparison = pd.DataFrame({
    'Historical_Jan_Mean': hist_mean,
    'Historical_Jan_Std': jan_historical['Historical_Jan_Std'],
    'Forecast_Jan_2026': jan_forecast,
    'Difference': (jan_forecast - hist_mean).round(2),
    'Pct_Change': ((jan_forecast / hist_mean - 1) * 100).round(1),
})

add_line(comparison.to_string())
add_line()

# --- 3. UNCERTAINTY ANALYSIS ---
add_line("3. UNCERTAINTY ANALYSIS")
add_separator("-")

# Per-city averages of the point forecast and of both interval bounds.
uncertainty = (
    forecasts.groupby('City')[['Prediction', 'Lower_95', 'Upper_95']]
    .mean()
    .round(2)
)

# Average interval width, and that width expressed as a percentage of the
# average prediction.
ci_width = (uncertainty['Upper_95'] - uncertainty['Lower_95']).round(2)
uncertainty['CI_Width'] = ci_width
uncertainty['Relative_Uncertainty'] = (ci_width / uncertainty['Prediction'] * 100).round(1)

add_line(uncertainty.to_string())
add_line()

# --- 4. HEALTH RISK ASSESSMENT ---
# Section title followed by a dashed rule of the default width.
add_line(text="4. HEALTH RISK ASSESSMENT")
add_separator(char="-")

def classify_risk(value):
    """Map a numeric level to its risk category label.

    Bands are half-open: a value belongs to the first band whose upper
    bound it is strictly below. Anything that is not below the last bound
    (150) is labelled 'Very Unhealthy'.
    """
    # (exclusive upper bound, label), in ascending order.
    bands = (
        (12, 'Good'),
        (35, 'Moderate'),
        (55, 'Unhealthy for Sensitive'),
        (150, 'Unhealthy'),
    )
    for upper_bound, label in bands:
        if value < upper_bound:
            return label
    return 'Very Unhealthy'

# One summary record per city: average/max predicted level, how many forecast
# rows fall in each risk band, and the band of the average level.
risk_assessment = []

for city in cities:
    # Predicted values for this city only; selected once and reused below.
    levels = forecasts.loc[forecasts['City'] == city, 'Prediction']

    avg_level = levels.mean()
    max_level = levels.max()

    # Band counts use the vectorised Series.sum() on boolean masks rather than
    # the builtin sum(), which would loop over the Series element by element.
    # The band edges mirror the thresholds used by classify_risk.
    risk_assessment.append({
        'City': city,
        'Avg_Level': round(avg_level, 1),
        'Max_Level': round(max_level, 1),
        'Days_Good': int((levels < 12).sum()),
        'Days_Moderate': int(((levels >= 12) & (levels < 35)).sum()),
        'Days_Unhealthy_Sensitive': int(((levels >= 35) & (levels < 55)).sum()),
        'Days_Unhealthy': int(((levels >= 55) & (levels < 150)).sum()),
        'Days_Very_Unhealthy': int((levels >= 150).sum()),
        # Classified from the unrounded average.
        'Overall_Risk': classify_risk(avg_level),
    })

risk_df = pd.DataFrame(risk_assessment)
add_line(risk_df.to_string(index=False))
add_line()

# --- 5. ADDITIONAL INSIGHTS ---
add_line("5. KEY INSIGHTS")
add_separator("-")
add_line()

# Find most/least polluted cities by average predicted level
most_polluted = risk_df.loc[risk_df['Avg_Level'].idxmax()]
least_polluted = risk_df.loc[risk_df['Avg_Level'].idxmin()]

# Unit string restored to proper Unicode (micro sign, superscript three).
add_line(f"Most Polluted: {most_polluted['City']} ({most_polluted['Avg_Level']:.1f} µg/m³)")
add_line(f"Least Polluted: {least_polluted['City']} ({least_polluted['Avg_Level']:.1f} µg/m³)")
add_line()

# Biggest changes from historical. `comparison` is indexed by city, so the
# selected row's .name is the city.
biggest_increase = comparison.loc[comparison['Pct_Change'].idxmax()]
biggest_decrease = comparison.loc[comparison['Pct_Change'].idxmin()]

# The '+' format flag prints the real sign of the value. A hard-coded "+"
# prefix would render "+-5.0%" when even the largest change is a decrease.
add_line(f"Largest Increase vs Historical: {biggest_increase.name} ({biggest_increase['Pct_Change']:+.1f}%)")
add_line(f"Largest Decrease vs Historical: {biggest_decrease.name} ({biggest_decrease['Pct_Change']:+.1f}%)")
add_line()

# Uncertainty assessment: widest and narrowest average interval relative to
# the average prediction. `uncertainty` is also indexed by city.
most_uncertain = uncertainty.loc[uncertainty['Relative_Uncertainty'].idxmax()]
most_certain = uncertainty.loc[uncertainty['Relative_Uncertainty'].idxmin()]

add_line(f"Highest Uncertainty: {most_uncertain.name} ({most_uncertain['Relative_Uncertainty']:.1f}% relative CI)")
add_line(f"Lowest Uncertainty: {most_certain.name} ({most_certain['Relative_Uncertainty']:.1f}% relative CI)")
add_line()

# --- 6. RECOMMENDATIONS ---
add_line("6. ACTIONABLE RECOMMENDATIONS")
add_separator("-")
add_line()

# One recommendation paragraph per city, most severe tier checked first.
# Each tier matches on either the Overall_Risk label or the rounded
# Avg_Level reaching that tier's threshold.
# The status icons below are written as proper Unicode characters; they had
# been stored as mis-decoded byte sequences.
for _, row in risk_df.iterrows():
    city = row['City']
    risk = row['Overall_Risk']
    avg = row['Avg_Level']

    add_line(f"{city} ({avg:.1f} µg/m³ avg):")

    if risk == 'Very Unhealthy' or avg >= 150:
        add_line("  🚨 CRITICAL ACTION REQUIRED:")
        add_line("     - Issue public health emergency")
        add_line("     - Mandatory N95 masks for all outdoor activities")
        add_line("     - School closures on peak days")
        add_line("     - Implement vehicle restrictions (odd-even)")
        add_line("     - Increase hospital capacity by 30%")

    elif risk == 'Unhealthy' or avg >= 55:
        add_line("  ⚠️  HIGH RISK - PROTECTIVE MEASURES:")
        add_line("     - Health advisories for all residents")
        add_line("     - Limit outdoor activities")
        add_line("     - Sensitive groups stay indoors")
        add_line("     - Consider temporary traffic restrictions")

    elif risk == 'Unhealthy for Sensitive' or avg >= 35:
        add_line("  ⚠️  MODERATE RISK:")
        add_line("     - Advisories for children, elderly, and people with respiratory conditions")
        add_line("     - Reduce prolonged outdoor exertion")
        add_line("     - Monitor air quality regularly")

    elif risk == 'Moderate' or avg >= 12:
        add_line("  ✓ LOW RISK:")
        add_line("     - Standard monitoring")
        add_line("     - Very sensitive individuals may consider limiting prolonged outdoor activities")

    else:
        add_line("  ✅ GOOD AIR QUALITY:")
        add_line("     - No special precautions needed")
        add_line("     - Continue regular monitoring")

    add_line()

# --- 7. DATA QUALITY NOTES ---
add_line("7. DATA QUALITY & LIMITATIONS")
add_separator("-")
add_line()
# NOTE(review): the forecast period is a fixed string, not derived from the
# loaded forecast dates; confirm it matches the forecast file in use.
add_line("Forecast Period: January 1-31, 2026 (31 days)")
add_line(f"Cities Analyzed: {len(cities)}")
# Date span of the historical rows that remain after the target-city filter.
add_line(f"Historical Data: {historical['Date'].min().strftime('%Y-%m-%d')} to {historical['Date'].max().strftime('%Y-%m-%d')}")
add_line()
add_line("Limitations:")
# Bullet characters restored to proper Unicode (they were mis-decoded).
add_line("  • Forecasts do not account for sudden policy changes")
add_line("  • Weather conditions (temperature, wind) not included in model")
add_line("  • Does not predict unpredictable events (fires, industrial accidents)")
add_line("  • Prediction intervals based on historical residuals")
add_line("  • Some cities show high uncertainty (wide confidence intervals)")
add_line()

# --- FOOTER ---
add_separator("=")
# Timestamp is local time with no timezone information.
add_line(f"Report Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
add_line("Source: PM2.5 Forecasting Pipeline (9-Model Ensemble)")
add_separator("=")

# === SAVE REPORT ===
report_text = "\n".join(report_lines)

# Create the output folder if needed; open() would otherwise fail with
# FileNotFoundError when the folder does not exist yet.
os.makedirs(OUTPUT_DIR, exist_ok=True)

report_file = os.path.join(OUTPUT_DIR, 'forecast_report.txt')
with open(report_file, 'w', encoding='utf-8') as f:
    f.write(report_text)

# Also print to console
print(report_text)

# Check mark restored to proper Unicode (it was mis-decoded).
print(f"\n✅ Report saved to: {report_file}")

# === SAVE DETAILED CSV TABLES ===
# Save each section as CSV for easy import to presentations.
# (file name, table, write the index?) -- the first three tables are indexed
# by city, so their index is kept; risk_df has City as a regular column.
csv_exports = [
    ('forecast_summary.csv', summary, True),
    ('historical_comparison.csv', comparison, True),
    ('uncertainty_analysis.csv', uncertainty, True),
    ('health_risk_assessment.csv', risk_df, False),
]

for csv_name, table, keep_index in csv_exports:
    table.to_csv(os.path.join(OUTPUT_DIR, csv_name), index=keep_index)

# The listing is driven by the same table as the writes so the two cannot
# drift apart. Check mark restored to proper Unicode (it was mis-decoded).
print(f"✅ CSV tables saved to: {OUTPUT_DIR}")
for csv_name, _, _ in csv_exports:
    print(f"   - {csv_name}")

Loading data for report generation...
PM2.5 FORECAST EVALUATION REPORT - JANUARY 2026

1. FORECAST SUMMARY
----------------------------------------------------------------------
           mean    std    min     max
City                                 
Beijing   86.11  13.60  51.96  121.00
Berlin    45.12   4.78  41.23   64.17
Boston    31.59   4.13  23.03   38.02
London    39.87   1.30  37.43   43.40
Paris    101.72  19.50  74.75  134.99
Tokyo     40.66   4.03  30.92   49.20

2. COMPARISON WITH HISTORICAL JANUARY
----------------------------------------------------------------------
         Historical_Jan_Mean  Historical_Jan_Std  Forecast_Jan_2026  Difference  Pct_Change
City                                                                                       
Beijing               111.86               73.47              86.11      -25.75       -23.0
Berlin                 44.20               22.27              45.12        0.92         2.1
Boston                 28.65            