# Forecasting Showdown: Prediction Markets vs. Frontier LLMs

## Project Overview

This project compares forecasting accuracy across three categories of forecasters:

1. **Prediction Markets** (Polymarket + Kalshi) — real-money markets where participants trade on event outcomes
2. **Frontier LLMs — Vanilla** (GPT-5, Gemini, Claude) — prompted with only the question, relying on training data
3. **Frontier LLMs — Tool-Augmented** (same models with real-time data tools) — given access to FRED and EIA APIs

### Metrics

**Brier Score** measures forecast accuracy and calibration — the mean squared difference between predicted probabilities and actual outcomes:

$$BS = \frac{1}{N} \sum_{i=1}^{N} (p_i - o_i)^2$$

- $p_i$ = predicted probability, $o_i$ = outcome (0 or 1)
- Lower is better: 0 = perfect, 0.25 = no skill (always predicting 50%), 1 = worst

**Hypothetical Returns** test profitability via a threshold-based betting strategy against prediction market prices.

### Domains
- **Federal Funds Rate**: Will the Fed cut rates at upcoming FOMC meetings?
- **Gas Prices**: Will US national average gasoline prices exceed/fall below specific thresholds?

In [None]:
# Environment Setup
# Loads API keys from a .env file (local) or Colab Secrets (Google Colab).
import os
import sys
from pathlib import Path

# --- Secrets loading: Colab Secrets when hosted, a local .env file otherwise ---
running_in_colab = "google.colab" in sys.modules

if not running_in_colab:
    # Local run: read keys from a .env file in the working directory, if present.
    try:
        from dotenv import load_dotenv
    except ImportError:
        print("python-dotenv not installed -- run: pip install python-dotenv")
    else:
        env_file = Path(".env")
        if not env_file.exists():
            print("No .env file found -- using existing environment variables.")
            print("Copy .env.example to .env and fill in your keys.")
        else:
            # override=True: values in .env win over variables already exported.
            load_dotenv(env_file, override=True)
            print("Loaded API keys from .env")
else:
    print("Running in Google Colab")
    print("Add your API keys to Colab Secrets (key icon in the left sidebar).")
    try:
        from google.colab import userdata
    except ImportError:
        pass
    else:
        colab_secret_names = (
            "OPENAI_API_KEY", "GOOGLE_API_KEY", "GEMINI_API_KEY",
            "ANTHROPIC_API_KEY", "FRED_API_KEY", "EIA_API_KEY",
        )
        for key in colab_secret_names:
            try:
                os.environ[key] = userdata.get(key)
            except Exception:
                # Secret not configured in Colab -- leave the variable unset.
                pass

# --- Summarise which API keys are present in the environment ---
KEYS = dict(
    OPENAI_API_KEY="Required -- GPT models",
    GOOGLE_API_KEY="Required -- Gemini models",
    GEMINI_API_KEY="Required -- Gemini models (alias)",
    ANTHROPIC_API_KEY="Required -- Claude models",
    FRED_API_KEY="Required -- Fed rate data  (free at fred.stlouisfed.org/docs/api/api_key.html)",
    EIA_API_KEY="Required -- Gas price data (free at eia.gov/opendata)",
)

missing_keys = []
for key, desc in KEYS.items():
    # An empty string counts as missing, same as an unset variable.
    if os.environ.get(key):
        status = "set"
    else:
        status = "MISSING"
        missing_keys.append(key)
    print(f"  {status}  {key}  ({desc})")

all_set = not missing_keys

print(
    "All API keys loaded."
    if all_set
    else "Some keys are missing. See README.md for setup instructions."
)

In [2]:
import hashlib
import json
import re
import time
from datetime import datetime, timedelta
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from fredapi import Fred
from sklearn.metrics import brier_score_loss
from tqdm.auto import tqdm

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_core.messages import HumanMessage, SystemMessage, ToolMessage
from langchain_core.tools import tool

# --- LLM line-up: one entry per provider ---
MODELS = {
    key: {"name": display_name, "provider": provider, "model_id": model_id}
    for key, display_name, provider, model_id in (
        ("gemini", "Gemini 2.5 Flash Lite", "google", "gemini-2.5-flash-lite"),
        ("gpt", "GPT-5", "openai", "gpt-5-2025-08-07"),
        ("claude", "Claude Sonnet 4.5", "anthropic", "claude-sonnet-4-5-20250929"),
    )
}

# --- Data-source credentials (empty string when the variable is unset) ---
FRED_API_KEY = os.getenv("FRED_API_KEY", "")
EIA_API_KEY = os.getenv("EIA_API_KEY", "")

# --- Prediction-market REST endpoints ---
POLYMARKET_GAMMA_BASE = "https://gamma-api.polymarket.com"
POLYMARKET_CLOB_BASE = "https://clob.polymarket.com"
KALSHI_BASE = "https://api.elections.kalshi.com/trade-api/v2"

# --- FRED series identifiers, keyed by a readable name ---
FRED_SERIES = dict(
    fed_funds_rate="DFF",
    fed_funds_target_upper="DFEDTARU",
    fed_funds_target_lower="DFEDTARL",
)

# --- EIA retail gasoline price endpoint ---
EIA_GAS_PRICE_URL = "https://api.eia.gov/v2/petroleum/pri/gp/data/"

# --- 2026 FOMC meetings, dated by announcement day ---
FOMC_DATES_2026 = [
    {"meeting": meeting_name, "date": f"2026-{month_day}"}
    for meeting_name, month_day in (
        ("January", "01-28"),
        ("March", "03-18"),
        ("April", "04-29"),
        ("June", "06-17"),
        ("July", "07-29"),
        ("September", "09-16"),
        ("October", "10-28"),
        ("December", "12-09"),
    )
]

# The JSON response cache lives under ./cache; create the folder up front.
Path("cache").mkdir(exist_ok=True)

print("Configuration loaded.")
for _source_label, _api_key in (("FRED", FRED_API_KEY), ("EIA", EIA_API_KEY)):
    print(f"{_source_label} API key: {'set' if _api_key else 'MISSING'}")

Configuration loaded.
FRED API key: MISSING
EIA API key: MISSING


In [None]:
# --- Caching Infrastructure ---
# Same pattern as lab_02: JSON file-based cache to avoid redundant API calls

CACHE_FILE = Path("cache/response_cache.json")


def load_cache():
    """Load the JSON response cache from disk.

    Returns:
        dict: the cached entries. An empty dict is returned when the cache
        file does not exist, is empty/corrupt, or does not hold a JSON object,
        so a damaged cache never blocks the notebook (entries are re-fetched).
    """
    try:
        raw = CACHE_FILE.read_text(encoding="utf-8")
    except FileNotFoundError:
        return {}
    try:
        cache = json.loads(raw)
    except json.JSONDecodeError as exc:
        print(f"Warning: ignoring unreadable cache file {CACHE_FILE}: {exc}")
        return {}
    return cache if isinstance(cache, dict) else {}


def save_cache(cache):
    """Persist the cache dict to CACHE_FILE as indented JSON.

    The parent directory is created when missing, and the file is written to
    a temporary sibling first and then swapped in, so an interrupted write
    cannot leave a truncated cache behind.

    Args:
        cache: JSON-serialisable dict of cache entries.
    """
    CACHE_FILE.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(cache, indent=2)
    tmp_file = CACHE_FILE.with_suffix(".json.tmp")
    tmp_file.write_text(payload, encoding="utf-8")
    tmp_file.replace(CACHE_FILE)


def get_cache_key(prefix, *args):
    """Build a short cache key from a prefix and any number of arguments.

    The prefix and the string form of each argument are joined with ":" and
    hashed with SHA-256; the first 16 hex characters of the digest are returned.
    """
    joined_args = ":".join(map(str, args))
    digest = hashlib.sha256(f"{prefix}:{joined_args}".encode())
    return digest.hexdigest()[:16]


def cached_call(cache, key, func, *args, **kwargs):
    """Return cache[key], computing and persisting it first when absent.

    On a miss, func(*args, **kwargs) is called once, its result is stored
    under key, and the whole cache is written to disk via save_cache.
    """
    if key not in cache:
        cache[key] = func(*args, **kwargs)
        save_cache(cache)
    return cache[key]


print("Caching infrastructure ready.")

# Section 1: Data Collection

We pull data from four external sources:

| Source | What It Provides | API | Auth |
|--------|-----------------|-----|------|
| **FRED** (St. Louis Fed) | Federal funds rate (daily), target rate range | `fredapi` Python library | Free API key |
| **EIA** (Energy Information Administration) | Weekly US retail gasoline prices | REST API v2 | Free API key |
| **Polymarket** | Prediction market probabilities | Gamma + CLOB REST APIs | None |
| **Kalshi** | Prediction market probabilities | REST API v2 | None (public data) |

The FRED and EIA data serve two purposes:
1. **Ground truth** for resolving our binary questions
2. **Tool data** for the tool-augmented LLM category

In [None]:
# --- 1.1 FRED: Federal Funds Rate ---

def fetch_fred_data(observation_start="2024-01-01"):
    """Fetch the federal funds rate series from FRED.

    Series IDs are taken from the module-level FRED_SERIES mapping so the
    identifiers are defined in exactly one place.

    Args:
        observation_start: first observation date (YYYY-MM-DD) to request.

    Returns:
        dict with keys "fed_funds_rate", "target_upper" and "target_lower",
        each holding whatever ``Fred.get_series`` returns for that series.
    """
    fred = Fred(api_key=FRED_API_KEY)

    def _series(series_name):
        return fred.get_series(
            FRED_SERIES[series_name], observation_start=observation_start
        )

    return {
        "fed_funds_rate": _series("fed_funds_rate"),
        "target_upper": _series("fed_funds_target_upper"),
        "target_lower": _series("fed_funds_target_lower"),
    }


fred_data = fetch_fred_data()

# Headline numbers: series length plus the most recent non-missing values.
_effective_rate = fred_data["fed_funds_rate"]
print(f"Fed funds rate: {len(_effective_rate)} observations")
print(f"Latest effective rate: {_effective_rate.dropna().iloc[-1]:.2f}%")

_target_low = fred_data["target_lower"].dropna().iloc[-1]
_target_high = fred_data["target_upper"].dropna().iloc[-1]
print(f"Current target range: {_target_low:.2f}% - {_target_high:.2f}%")

In [None]:
# --- 1.2 EIA: US Retail Gasoline Prices ---

def fetch_eia_gas_prices(n_weeks=200, timeout=30):
    """Fetch weekly US regular gasoline retail prices from EIA API v2.

    Args:
        n_weeks: number of most-recent weekly observations to request.
        timeout: seconds to wait for the EIA server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        DataFrame sorted by ascending "period" (datetime) with a numeric
        "value" column. When the API returns no rows, an empty DataFrame
        with "period" and "value" columns is returned.

    Raises:
        requests.HTTPError: on a non-2xx response.
        requests.Timeout: when the server does not answer within ``timeout``.
    """
    params = {
        "api_key": EIA_API_KEY,
        "frequency": "weekly",
        "data[0]": "value",
        "facets[product][]": "EPMR",   # Regular gasoline, all formulations
        "facets[duoarea][]": "NUS",     # National US
        # Newest first so that `length` keeps the most recent weeks.
        "sort[0][column]": "period",
        "sort[0][direction]": "desc",
        "offset": 0,
        "length": n_weeks,
    }
    response = requests.get(EIA_GAS_PRICE_URL, params=params, timeout=timeout)
    response.raise_for_status()
    records = response.json()["response"]["data"]

    if not records:
        # Without rows the frame would have no "period"/"value" columns and
        # the conversions below would raise KeyError.
        return pd.DataFrame(columns=["period", "value"])

    df = pd.DataFrame(records)
    df["period"] = pd.to_datetime(df["period"])
    df["value"] = pd.to_numeric(df["value"])
    # Callers read .iloc[-1] as the latest week, so return oldest -> newest.
    return df.sort_values("period").reset_index(drop=True)


gas_prices = fetch_eia_gas_prices()
print(f"Gas price data: {len(gas_prices)} weekly observations")

# Rows are sorted oldest -> newest, so the last row is the latest week.
_latest_week = gas_prices.iloc[-1]
print(
    f"Latest: ${_latest_week['value']:.3f}/gal"
    f" (week of {_latest_week['period'].strftime('%Y-%m-%d')})"
)

In [None]:
# --- 1.3 Polymarket ---

def fetch_polymarket_events(search_term, limit=50, timeout=10):
    """Return open Polymarket events whose title contains ``search_term``.

    The Gamma API is asked for up to ``limit`` open events and the term is
    matched client-side (case-insensitive substring), so only that first
    batch of events is searched.

    Args:
        search_term: text to look for in event titles.
        limit: maximum number of events requested from the API.
        timeout: seconds to wait for the server before giving up.

    Returns:
        list of event dicts, in API order.

    Raises:
        requests.HTTPError: on a non-2xx response.
        requests.Timeout: when the server does not answer within ``timeout``.
    """
    url = f"{POLYMARKET_GAMMA_BASE}/events"
    params = {"closed": "false", "limit": limit}
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()

    needle = search_term.lower()
    # `or ""` guards against events whose title is present but null.
    return [
        event
        for event in response.json()
        if needle in (event.get("title") or "").lower()
    ]


def fetch_polymarket_price_history(token_id, interval="max", fidelity=60, timeout=10):
    """Fetch price history for a Polymarket CLOB token.

    Args:
        token_id: CLOB token identifier, sent as the ``market`` query parameter.
        interval: history window passed through to the API.
        fidelity: resolution value passed through to the API.
        timeout: seconds to wait for the server before giving up.

    Returns:
        DataFrame built from the response's "history" list. When it is
        non-empty, "timestamp" (datetime parsed from epoch-seconds "t") and
        "price" (float from "p") columns are added; otherwise the frame is empty.

    Raises:
        requests.HTTPError: on a non-2xx response.
        requests.Timeout: when the server does not answer within ``timeout``.
    """
    url = f"{POLYMARKET_CLOB_BASE}/prices-history"
    params = {"market": token_id, "interval": interval, "fidelity": fidelity}
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()

    # `or []` also covers a response where "history" is present but null.
    history = response.json().get("history") or []
    df = pd.DataFrame(history)
    if not df.empty:
        df["timestamp"] = pd.to_datetime(df["t"], unit="s")
        df["price"] = df["p"].astype(float)
    return df


# Look up open events for each domain and preview the first few titles
poly_fed_events = fetch_polymarket_events("fed")
poly_gas_events = fetch_polymarket_events("gas")
print(f"Polymarket: {len(poly_fed_events)} Fed-related events, {len(poly_gas_events)} gas-related events")

for _domain_label, _domain_events in (("Fed", poly_fed_events), ("Gas", poly_gas_events)):
    for e in _domain_events[:5]:
        print(f"  {_domain_label}: {e.get('title', 'N/A')}")

In [None]:
# --- 1.4 Kalshi ---

def fetch_kalshi_markets(series_ticker, status="open", limit=100, timeout=10):
    """Fetch one page of Kalshi markets for a given series.

    Args:
        series_ticker: Kalshi series identifier (e.g. "KXFED").
        status: market status filter passed to the API.
        limit: maximum number of markets requested.
        timeout: seconds to wait for the server before giving up.

    Returns:
        list of market dicts (empty when the response has no markets).

    Raises:
        requests.HTTPError: on a non-2xx response.
        requests.Timeout: when the server does not answer within ``timeout``.
    """
    url = f"{KALSHI_BASE}/markets"
    params = {"series_ticker": series_ticker, "status": status, "limit": limit}
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()
    # `or []` keeps the return type a list even if "markets" is null.
    return response.json().get("markets") or []


# Pull both Kalshi series and preview the first five markets of each
kalshi_fed = fetch_kalshi_markets("KXFED")
kalshi_gas = fetch_kalshi_markets("KXAAAGASM")
print(f"Kalshi: {len(kalshi_fed)} Fed rate markets (KXFED), {len(kalshi_gas)} gas price markets (KXAAAGASM)")

for _heading, _series_markets in (
    ("\nFed rate markets:", kalshi_fed),
    ("\nGas price markets:", kalshi_gas),
):
    print(_heading)
    for m in _series_markets[:5]:
        # Prefer the YES bid; fall back to the last price, then to "N/A".
        yes_bid = m.get("yes_bid", m.get("last_price", "N/A"))
        print(f"  {m.get('ticker', 'N/A')}: {m.get('title', 'N/A')} | Yes: {yes_bid}")

## 1.5 Temperature Markets

We add a third forecasting domain: **monthly average daily high temperatures** for six
geographically diverse US cities.

| Source | What It Provides | API | Auth |
|--------|-----------------|-----|------|
| **Open-Meteo** | Daily max temperature (ERA5 reanalysis) | REST API | None — no key required |
| **Kalshi** | Temperature market prices | REST API v2 | None (public data) |

**Question design**: Each binary question asks whether the average daily high will exceed
the 10-year historical median for that city/month.  Using the median as the threshold sets
the expected base rate at ≈ 50%, which maximises statistical power and calibration signal.

**Why these months?**  March–June 2026 resolution dates are well past all frontier LLM
training cutoffs (≤ Aug 2025), ensuring models must genuinely forecast rather than recall
outcomes from training data.

In [None]:
# --- 1.5 Open-Meteo + Kalshi: Temperature Data ---

OPEN_METEO_ARCHIVE = "https://archive-api.open-meteo.com/v1/archive"

# Six geographically diverse US cities: key -> display name and coordinates
CITIES = {
    key: {"name": display_name, "lat": latitude, "lon": longitude}
    for key, display_name, latitude, longitude in (
        ("new_york", "New York", 40.71, -74.01),
        ("chicago", "Chicago", 41.85, -87.65),
        ("miami", "Miami", 25.77, -80.19),
        ("los_angeles", "Los Angeles", 34.05, -118.24),
        ("denver", "Denver", 39.74, -104.98),
        ("seattle", "Seattle", 47.61, -122.33),
    )
}

# Forecast months — all well past any frontier LLM training cutoff (≤ Aug 2025)
TEMP_MONTHS = [
    {"year": 2026, "month": month_number, "name": month_name}
    for month_number, month_name in ((3, "March"), (4, "April"), (5, "May"), (6, "June"))
]


def fetch_temp_history(lat, lon, start_date, end_date, timeout=30):
    """Fetch daily max temperatures (°F) from the Open-Meteo archive API.

    Args:
        lat, lon: coordinates of the location.
        start_date, end_date: inclusive date range as YYYY-MM-DD strings.
        timeout: seconds to wait for the server before giving up.

    Returns:
        DataFrame with "date" (datetime) and "temp_max_f" (numeric) columns;
        days whose temperature is missing or non-numeric are dropped.

    Raises:
        requests.HTTPError: on a non-2xx response.
        requests.Timeout: when the server does not answer within ``timeout``.
    """
    params = {
        "latitude": lat, "longitude": lon,
        "start_date": start_date, "end_date": end_date,
        "daily": "temperature_2m_max",
        "temperature_unit": "fahrenheit",
        "timezone": "auto",
    }
    resp = requests.get(OPEN_METEO_ARCHIVE, params=params, timeout=timeout)
    resp.raise_for_status()
    daily = resp.json()["daily"]
    df = pd.DataFrame({
        "date": pd.to_datetime(daily["time"]),
        # errors="coerce" turns unparseable readings into NaN so dropna removes them.
        "temp_max_f": pd.to_numeric(daily["temperature_2m_max"], errors="coerce"),
    })
    return df.dropna()


def compute_city_normals(city_key, lat, lon, n_years=10):
    """
    Return {month number: median monthly average daily high (°F)} for a city.

    The window covers the ``n_years`` full calendar years ending last year.
    For each (year, month) the daily highs are averaged, then the median of
    those averages across years is taken per month.  The result is stored in
    the JSON cache under a key built from ``city_key`` and ``n_years``, so the
    history is downloaded only on the first call.
    """
    cache = load_cache()
    ck = get_cache_key("temp_normals", city_key, n_years)
    if ck in cache:
        # JSON object keys are strings; convert back to integer month numbers.
        return {int(month): value for month, value in cache[ck].items()}

    last_year = datetime.now().year - 1
    first_year = last_year - n_years + 1
    print(f"  Fetching {n_years}-year history for {city_key} ({first_year}–{last_year})...")

    history = fetch_temp_history(lat, lon, f"{first_year}-01-01", f"{last_year}-12-31")
    history = history.assign(
        year=history["date"].dt.year,
        month=history["date"].dt.month,
    )

    # Step 1: one average high per (year, month).  Step 2: median over years.
    per_year_month = history.groupby(["year", "month"])["temp_max_f"].mean()
    normals = per_year_month.groupby("month").median().to_dict()

    cache[ck] = {str(month): value for month, value in normals.items()}
    save_cache(cache)
    return normals


def fetch_kalshi_temp_markets(limit=200, timeout=10):
    """Search Kalshi open markets for temperature / weather contracts.

    Requests a single page of up to ``limit`` open markets and keeps those
    whose title contains a temperature keyword or whose category contains
    "weather".  Best-effort: any fetch failure is reported and an empty list
    is returned instead of raising.

    Args:
        limit: maximum number of open markets requested.
        timeout: seconds to wait for the server before giving up.

    Returns:
        list of matching market dicts (possibly empty).
    """
    url = f"{KALSHI_BASE}/markets"
    params = {"status": "open", "limit": limit}
    try:
        resp = requests.get(url, params=params, timeout=timeout)
        resp.raise_for_status()
        markets = resp.json().get("markets") or []
    except Exception as e:
        # Deliberately broad: this lookup is optional for the notebook.
        print(f"Warning: Kalshi temperature market fetch failed: {e}")
        return []

    temp_keywords = ("temperature", "high temp", "low temp", "weather", "degrees", "\u00b0f")

    def _is_temperature_market(market):
        # `or ""` guards against fields that are present but null.
        title = (market.get("title") or "").lower()
        category = (market.get("category") or "").lower()
        return any(kw in title for kw in temp_keywords) or "weather" in category

    return [m for m in markets if _is_temperature_market(m)]


# --- Run ---
print("Computing 10-year climate normals (ERA5, cached after first run)...")
city_normals = {}
for city_key, city_info in CITIES.items():
    city_normals[city_key] = compute_city_normals(city_key, city_info["lat"], city_info["lon"])

import calendar as _cal
print("\nHistorical median avg-high (°F):")
# Header cells are right-aligned to width 5 so they line up with the
# "  {val:5.1f}" value cells printed below.
print(f"{'City':<14}" + "".join(f"  {m['name'][:3]:>5}" for m in TEMP_MONTHS))
for city_key, normals in city_normals.items():
    row = f"{CITIES[city_key]['name']:<14}"
    for m in TEMP_MONTHS:
        # A month missing from the normals is shown as nan.
        val = normals.get(m["month"], float("nan"))
        row += f"  {val:5.1f}"
    print(row)

kalshi_temp_markets = fetch_kalshi_temp_markets()
print(f"\nKalshi temperature markets found: {len(kalshi_temp_markets)}")
for m in kalshi_temp_markets[:5]:
    print(f"  {m.get('ticker', 'N/A')}: {m.get('title', 'N/A')}")

## 1.6 Kalshi: Historic NYC Temperature Markets

We pull the full history of **finalized** Kalshi NYC high-temperature markets from the
public API.  Each market resolves as YES/NO against a daily high-temperature threshold.

This data establishes our **train / validate** benchmark for the temperature domain —
we have both the market probability (pre-settlement price) and the actual outcome.

| Split | Period | Purpose |
|-------|--------|---------|
| **Train** | before 2025-01-01 | Calibration & baseline Brier |
| **Validate** | 2025 | Threshold-tuning (e.g. betting delta) |
| **Test** | 2026 | Held-out evaluation |

All six cities are retained for LLM evaluation; Kalshi NYC data is a supplemental
historical benchmark for the temperature domain.

In [None]:
# --- 1.6 Kalshi: Historic NYC Temperature Markets ---

# ── Split boundaries ────────────────────────────────────────────────────────
SPLIT_TRAIN_END = "2025-01-01"
SPLIT_VAL_END   = "2026-01-01"

def assign_split(resolution_date_str):
    """Assign train / validate / test based on resolution date.

    The boundaries come from SPLIT_TRAIN_END and SPLIT_VAL_END (both
    exclusive upper bounds), so changing those constants changes the split.

    Args:
        resolution_date_str: date string starting with YYYY-MM-DD; anything
            after the first 10 characters (e.g. a time part) is ignored.

    Returns:
        "train" for dates before SPLIT_TRAIN_END, "validate" for dates before
        SPLIT_VAL_END, otherwise "test".

    Raises:
        ValueError: when the first 10 characters are not a valid date.
    """
    date_format = "%Y-%m-%d"
    dt = datetime.strptime(resolution_date_str[:10], date_format)
    if dt < datetime.strptime(SPLIT_TRAIN_END, date_format):
        return "train"
    if dt < datetime.strptime(SPLIT_VAL_END, date_format):
        return "validate"
    return "test"


# A number followed by a degree sign (with or without "F"), or by a standalone
# "F"/"Fahrenheit".  The trailing \b stops ordinary words that merely start
# with "f" (e.g. "5 for") from being read as a temperature.
_THRESHOLD_RE = re.compile(
    r"(\d+(?:\.\d+)?)\s*(?:\u00b0\s*[Ff]?|[Ff](?:ahrenheit)?\b)"
)

_NYC_HISTORY_COLUMNS = [
    "ticker", "title", "resolution_date", "threshold_f",
    "market_prob", "outcome", "split",
]


def _get_kalshi_page(url, params, cursor=None):
    """Fetch one page of Kalshi markets; return (markets, next_cursor)."""
    if cursor:
        params = {**params, "cursor": cursor}
    resp = requests.get(url, params=params, timeout=10)
    resp.raise_for_status()
    data = resp.json()
    return data.get("markets") or [], data.get("cursor")


def _parse_threshold_f(title):
    """Return the first temperature threshold found in a title, else NaN."""
    match = _THRESHOLD_RE.search(title)
    return float(match.group(1)) if match else np.nan


def _parse_resolution_date(market):
    """Return the first parseable close/expiry date as YYYY-MM-DD, else None."""
    # NOTE(review): the date is taken from the timestamp as given; if these
    # fields are UTC, a late-evening local close may land on the next day.
    for date_key in ("close_time", "expiration_time", "end_date"):
        raw = market.get(date_key)
        if raw:
            try:
                return pd.to_datetime(raw).strftime("%Y-%m-%d")
            except Exception:
                continue
    return None


def _parse_market_prob(market):
    """Return the pre-settlement YES probability in [0, 1], else NaN."""
    for price_key in ("last_price", "previous_price", "yes_bid"):
        val = market.get(price_key)
        if val is None:
            continue
        try:
            v = float(val)
        except (TypeError, ValueError):
            # A non-numeric price should not abort parsing of every market.
            continue
        # Skip exact settlement values (0, 1, 100).
        # NOTE(review): if prices are quoted in cents, a genuine 1-cent price
        # is also skipped here -- confirm the unit against the Kalshi API.
        if v in (0.0, 1.0, 100.0):
            continue
        # Values above 1 are treated as cents and scaled to a probability.
        return v / 100.0 if v > 1.0 else v
    return np.nan


def fetch_kalshi_nyc_weather_history(max_pages=15):
    """
    Fetch all finalized Kalshi markets for NYC daily high temperature.

    Tries a list of candidate series tickers first (stopping at the first one
    that returns markets); if none does, falls back to a keyword scan over up
    to ``max_pages`` pages of all finalized markets.  Non-empty results are
    cached; empty results are never cached, so a transient API failure is
    retried on the next run instead of being remembered forever.

    Args:
        max_pages: maximum number of pages (200 markets each) to read per
            series, and for the fallback scan.

    Returns a DataFrame with columns:
        ticker, title, resolution_date, threshold_f, market_prob, outcome, split
    Only markets with a parseable resolution date and a yes/no result are
    kept; rows are sorted by resolution_date.  The frame is empty (with the
    same columns) when nothing usable was found.
    """
    cache = load_cache()
    ck = get_cache_key("kalshi_nyc_history", "v1", max_pages)
    if cache.get(ck):
        # An empty cached entry is treated as a miss and re-fetched.
        df = pd.DataFrame(cache[ck])
        df["resolution_date"] = pd.to_datetime(df["resolution_date"])
        return df

    url = f"{KALSHI_BASE}/markets"
    all_markets = []

    # 1. Try known series tickers for NYC temperature
    for series in ["KXHIGHNYC", "HIGHNYC", "KXHIGH-NYC", "KXTEMPHIGHNYC"]:
        params = {"series_ticker": series, "status": "finalized", "limit": 200}
        found = []
        try:
            batch, cursor = _get_kalshi_page(url, params)
            found.extend(batch)
            if found:
                for _ in range(max_pages - 1):
                    if not cursor:
                        break
                    batch, cursor = _get_kalshi_page(url, params, cursor)
                    found.extend(batch)
        except Exception as e:
            # Best-effort: report the failure and keep whatever was fetched.
            print(f"  Warning: series {series} fetch failed: {e}")
        if found:
            all_markets = found
            print(f"  Found {len(all_markets)} markets via series {series}")
            break

    # 2. Fallback: keyword search through all finalized markets
    if not all_markets:
        print("  Series search returned nothing; scanning finalized markets...")
        cursor = None
        nyc_kws  = ["new york", "nyc", " ny "]
        temp_kws = ["high temp", "temperature", "degrees", "\u00b0f", "high of"]
        params = {"status": "finalized", "limit": 200}
        for page in range(max_pages):
            try:
                batch, cursor = _get_kalshi_page(url, params, cursor)
            except Exception as e:
                print(f"    page {page}: {e}")
                break
            for m in batch:
                t = (m.get("title") or "").lower()
                if any(k in t for k in nyc_kws) and any(k in t for k in temp_kws):
                    all_markets.append(m)
            if not cursor:
                break

    # 3. Parse into rows
    rows = []
    for m in all_markets:
        title = m.get("title") or ""

        # Only markets that settled yes/no carry a usable outcome.
        result = (m.get("result") or "").lower()
        if result not in ("yes", "no"):
            continue

        res_date = _parse_resolution_date(m)
        if not res_date:
            continue

        rows.append({
            "ticker":          m.get("ticker", ""),
            "title":           title,
            "resolution_date": res_date,
            "threshold_f":     _parse_threshold_f(title),
            "market_prob":     _parse_market_prob(m),
            "outcome":         1 if result == "yes" else 0,
            "split":           assign_split(res_date),
        })

    df = pd.DataFrame(rows, columns=_NYC_HISTORY_COLUMNS)
    if df.empty:
        return df

    df["resolution_date"] = pd.to_datetime(df["resolution_date"])
    df = df.sort_values("resolution_date").reset_index(drop=True)

    # Cache (dates are stored as strings so the payload is JSON-serialisable)
    cache_payload = df.copy()
    cache_payload["resolution_date"] = cache_payload["resolution_date"].astype(str)
    cache[ck] = cache_payload.to_dict("records")
    save_cache(cache)
    return df


# ── Run ─────────────────────────────────────────────────────────────────────
kalshi_nyc_history = fetch_kalshi_nyc_weather_history()
print(f"Kalshi NYC historical temperature markets: {len(kalshi_nyc_history)}")

if kalshi_nyc_history.empty:
    print("  No historical data returned (Kalshi may not expose public temperature history)")
    print("  Proceeding with LLM-only train/validate evaluation.")
else:
    # Per-split summary: market count, Brier score of the market price, and
    # the share of markets for which a pre-settlement price was available.
    for split_name, grp in kalshi_nyc_history.groupby("split"):
        cov = grp["market_prob"].notna().mean()
        valid = grp.dropna(subset=["market_prob"])
        if len(valid) > 0:
            bs = ((valid["market_prob"] - valid["outcome"]) ** 2).mean()
        else:
            bs = np.nan
        print(f"  {split_name:10s}: {len(grp):4d} markets | "
              f"Kalshi Brier={bs:.4f} | price coverage={cov:.0%}")
    print()
    print("Sample markets:")
    for _, row in kalshi_nyc_history.head(3).iterrows():
        outcome_label = "YES" if row["outcome"] == 1 else "NO"
        if pd.notna(row["market_prob"]):
            prob_label = f"mkt_prob={row['market_prob']:.2f}"
        else:
            prob_label = "mkt_prob=N/A"
        print(f"  {row['ticker']}: threshold={row['threshold_f']}\u00b0F  "
              f"outcome={outcome_label}  {prob_label}")

# Section 2: Binary Question Design

## Design Principles
1. **Clear resolution criteria**: each question has an unambiguous yes/no outcome
2. **Authoritative data source**: resolution determined by a specific FRED/EIA data release
3. **Alignment with prediction markets**: questions map to existing Kalshi/Polymarket contracts where possible
4. **Diverse time horizons**: mix of near-term and medium-term questions

## Question Categories

### Category A: Federal Funds Rate (FOMC Decisions)
**Template**: "Will the Fed cut the federal funds rate at the [Month] 2026 FOMC meeting?"
- Resolves YES if the FRED target rate upper bound (`DFEDTARU`) decreases after the meeting
- Resolves NO otherwise

### Category B: US Retail Gas Prices
**Template**: "Will the US national average gas price exceed $X.XX per gallon by [date]?"
- Resolves YES if EIA weekly price exceeds the threshold
- Thresholds set relative to the current price

In [None]:
# --- 2.1 Question Generation (all splits) ---
import calendar as _cal

# ── Helpers ─────────────────────────────────────────────────────────────────
def _add_meta(questions, split_name):
    """Stamp each question with its split and a 30-day-before forecast_date."""
    for q in questions:
        q["split"] = split_name
        res = datetime.strptime(q["resolution_date"][:10], "%Y-%m-%d")
        q.setdefault("forecast_date", (res - timedelta(days=30)).strftime("%Y-%m-%d"))
    return questions


# ── Fed-rate question generators ─────────────────────────────────────────────
def generate_fed_questions(fomc_dates, current_rate_upper):
    """Build rate-cut questions for upcoming FOMC meetings (test split).

    Only meetings dated after the current moment are included.  The year in
    each question's id and text is taken from the meeting date itself, so the
    function works for any year's schedule (2026 dates produce the same ids
    and text as before).  A single "now" is used for the whole batch so every
    question carries the same forecast date.

    Args:
        fomc_dates: list of {"meeting": <month name>, "date": "YYYY-MM-DD"}.
        current_rate_upper: current upper bound of the target range.

    Returns:
        list of question dicts, in the order of ``fomc_dates``.
    """
    now = datetime.now()
    forecast_date = now.strftime("%Y-%m-%d")
    questions = []
    for meeting in fomc_dates:
        meeting_date = datetime.strptime(meeting["date"], "%Y-%m-%d")
        if meeting_date <= now:
            continue  # already happened (or is happening today)
        meeting_name = meeting["meeting"]
        year = meeting_date.year
        questions.append({
            "id": f"fed_cut_{meeting_name.lower()}_{year}",
            "category": "fed_rate",
            "text": (f"Will the Fed cut the federal funds rate at the"
                     f" {meeting_name} {year} FOMC meeting?"),
            "resolution_date": meeting["date"],
            "resolution_source": "FRED DFEDTARU",
            "current_rate": float(current_rate_upper),
            "kalshi_series": "KXFED",
            "meeting_month": meeting_name.lower(),
            "split": "test",
            "forecast_date": forecast_date,
        })
    return questions


def generate_fed_questions_historical(fomc_dates, year):
    """Build rate-cut questions for past FOMC meetings (train or validate split).

    Each question is posed as of 30 days before its meeting.  The "current
    rate" attached to it is the last value of the module-level
    ``fred_data["target_upper"]`` series strictly before that forecast date.
    """
    questions = []
    for meeting in fomc_dates:
        meeting_name = meeting["meeting"]
        meets_on = datetime.strptime(meeting["date"], "%Y-%m-%d")
        asked_on = meets_on - timedelta(days=30)

        # Upper target bound as known on the forecast date
        upper_bound = fred_data["target_upper"]
        earlier = upper_bound[upper_bound.index < pd.Timestamp(asked_on)]
        if len(earlier) > 0:
            hist_rate = float(earlier.iloc[-1])
        else:
            # NOTE(review): `current_rate` is not defined in this function;
            # it must exist at module level whenever the series has no
            # observation before the forecast date.
            hist_rate = float(current_rate)

        question = {
            "id": f"fed_cut_{meeting_name.lower()}_{year}",
            "category": "fed_rate",
            "text": (f"Will the Fed cut the federal funds rate at the"
                     f" {meeting_name} {year} FOMC meeting?"),
            "resolution_date": meeting["date"],
            "resolution_source": "FRED DFEDTARU",
            "current_rate": hist_rate,
            "kalshi_series": "KXFED",
            "meeting_month": meeting_name.lower(),
            "forecast_date": asked_on.strftime("%Y-%m-%d"),
            "split": assign_split(meeting["date"]),
        }
        questions.append(question)
    return questions


# ── Gas-price question generators ────────────────────────────────────────────
def generate_gas_questions(current_price, weeks_ahead=(4, 8, 12)):
    """Build forward-looking gas-price questions (test split).

    For every horizon in ``weeks_ahead`` three questions are produced: price
    above current + $0.25, above current + $0.50, and below current - $0.25.
    Horizons are counted from the moment the function runs.
    """
    questions = []
    for weeks in weeks_ahead:
        target_date_str = (datetime.now() + timedelta(weeks=weeks)).strftime("%Y-%m-%d")
        for offset in (0.25, 0.50, -0.25):
            is_upside = offset > 0
            threshold = round(current_price + offset, 2)
            side_tag = "above" if is_upside else "below"
            direction = "exceed" if is_upside else "fall below"
            question = {
                "id": f"gas_{side_tag}_{threshold:.2f}_{weeks}w",
                "category": "gas_price",
                "text": (f"Will the US national average gas price {direction}"
                         f" ${threshold:.2f}/gal by the week of {target_date_str}?"),
                "resolution_date": target_date_str,
                "resolution_source": "EIA Weekly Retail Gasoline Prices",
                "threshold": threshold,
                "current_price": float(current_price),
                "kalshi_series": "KXAAAGASM",
                "split": "test",
                "forecast_date": datetime.now().strftime("%Y-%m-%d"),
            }
            questions.append(question)
    return questions


def generate_historical_gas_questions(gas_prices_df, as_of_dates, split_name):
    """Build gas-price questions as they would have been asked on past dates.

    For each as-of date the "current" price is the value in the last row of
    ``gas_prices_df`` whose period is on or before that date; dates with no
    such row are skipped.  Each remaining date yields four questions:
    horizons of 4 and 8 weeks, each with a +$0.25 and a -$0.25 threshold.
    """
    questions = []
    for as_of in as_of_dates:
        known_prices = gas_prices_df.loc[
            gas_prices_df["period"] <= pd.Timestamp(as_of), "value"
        ]
        if known_prices.empty:
            continue
        current_price = float(known_prices.iloc[-1])
        period_tag = as_of.replace("-", "")[:6]  # YYYYMM, used in the question id
        asked_on = datetime.strptime(as_of, "%Y-%m-%d")

        for weeks in (4, 8):
            target_str = (asked_on + timedelta(weeks=weeks)).strftime("%Y-%m-%d")
            # Two thresholds per horizon keeps the question count manageable.
            for offset in (0.25, -0.25):
                is_upside = offset > 0
                threshold = round(current_price + offset, 2)
                side_tag = "ab" if is_upside else "bl"
                direction = "exceed" if is_upside else "fall below"
                questions.append({
                    "id": f"gas_hist_{period_tag}_{side_tag}_{threshold:.2f}_{weeks}w",
                    "category": "gas_price",
                    "text": (f"Will the US national average gas price {direction}"
                             f" ${threshold:.2f}/gal by the week of {target_str}?"),
                    "resolution_date": target_str,
                    "resolution_source": "EIA Weekly Retail Gasoline Prices",
                    "threshold": threshold,
                    "current_price": current_price,
                    "kalshi_series": "KXAAAGASM",
                    "forecast_date": as_of,
                    "split": split_name,
                })
    return questions


# ── Temperature question generator ───────────────────────────────────────────
def generate_temperature_questions(cities, months, normals_by_city):
    """Create one binary avg-high temperature question per (month, city) pair.

    The threshold for a pair is the city's normal for that calendar month,
    rounded to one decimal. Pairs with no normal (missing or NaN) are skipped.

    Args:
        cities: Mapping of city key -> dict with "name", "lat" and "lon".
        months: Iterable of dicts with "year", "month" (1-12) and "name".
        normals_by_city: Mapping of city key -> {month number: normal avg-high}.

    Returns:
        List of question dicts; each resolves on the last day of its month.
    """
    generated = []
    for period in months:
        year = period["year"]
        month = period["month"]
        month_name = period["name"]
        days_in_month = _cal.monthrange(year, month)[1]
        resolves_on = f"{year}-{month:02d}-{days_in_month}"

        for city_key, city in cities.items():
            normal = normals_by_city.get(city_key, {}).get(month)
            has_normal = normal is not None and not np.isnan(normal)
            if not has_normal:
                continue

            threshold = round(normal, 1)
            question_text = (
                f"Will the average daily high temperature in {city['name']}"
                f" exceed {threshold}\u00b0F in {month_name} {year}?"
            )
            generated.append({
                "id": f"temp_{city_key}_{month_name.lower()}_{year}",
                "category": "temperature",
                "text": question_text,
                "resolution_date": resolves_on,
                "resolution_source": "Open-Meteo ERA5",
                "city_key": city_key,
                "city_name": city["name"],
                "lat": city["lat"],
                "lon": city["lon"],
                "month": month,
                "year": year,
                "threshold_f": threshold,
                "historical_normal_f": threshold,
            })
    return generated


# ── Historical FOMC dates ────────────────────────────────────────────────────
# Decision dates of the 2024 FOMC meetings, keyed by the month name used in question text.
FOMC_DATES_2024 = [
    {"meeting": month_label, "date": decision_day}
    for month_label, decision_day in (
        ("January",   "2024-01-31"),
        ("March",     "2024-03-20"),
        ("May",       "2024-05-01"),
        ("June",      "2024-06-12"),
        ("July",      "2024-07-31"),
        ("September", "2024-09-18"),
        ("November",  "2024-11-07"),
        ("December",  "2024-12-18"),
    )
]
# Same structure for the 2025 meetings.
FOMC_DATES_2025 = [
    {"meeting": month_label, "date": decision_day}
    for month_label, decision_day in (
        ("January",   "2025-01-29"),
        ("March",     "2025-03-19"),
        ("May",       "2025-05-07"),
        ("June",      "2025-06-18"),
        ("July",      "2025-07-30"),
        ("September", "2025-09-17"),
        ("October",   "2025-10-29"),
        ("December",  "2025-12-10"),
    )
]

# Historical temperature months.
# Train: March through December 2024.
HIST_TEMP_MONTHS_TRAIN = [
    dict(year=2024, month=month_no, name=_cal.month_name[month_no])
    for month_no in range(3, 13)
]
# Validate: every 2025 month whose last day is already in the past at run time.
HIST_TEMP_MONTHS_VAL = [
    dict(year=2025, month=month_no, name=_cal.month_name[month_no])
    for month_no in range(1, 13)
    if datetime(2025, month_no, _cal.monthrange(2025, month_no)[1]) < datetime.now()
]

# Historical gas as-of dates, one per quarter.
GAS_AS_OF_TRAIN = ["2024-01-08", "2024-04-01", "2024-07-08", "2024-10-07"]
GAS_AS_OF_VAL = ["2025-01-06", "2025-04-07", "2025-07-07", "2025-10-06"]


# ── Generate all questions ───────────────────────────────────────────────────
# Builds the TEST / TRAIN / VALIDATE question sets, concatenates them into
# `all_questions`, and prints a per-(split, category) count table.

# Latest non-null target upper bound and latest gas price row; these seed the
# forward-looking (test) Fed and gas questions.
current_rate = fred_data["target_upper"].dropna().iloc[-1]
current_gas  = gas_prices["value"].iloc[-1]

# TEST split (2026)
fed_questions  = generate_fed_questions(FOMC_DATES_2026, current_rate)
gas_questions  = generate_gas_questions(current_gas)
temp_questions = _add_meta(
    generate_temperature_questions(CITIES, TEMP_MONTHS, city_normals), "test"
)
# Test temperature questions are forecast "as of" the day the notebook runs.
for q in temp_questions:
    q["forecast_date"] = datetime.now().strftime("%Y-%m-%d")

# TRAIN split (2024)
hist_fed_train  = generate_fed_questions_historical(FOMC_DATES_2024, 2024)
hist_gas_train  = generate_historical_gas_questions(gas_prices, GAS_AS_OF_TRAIN, "train")
hist_temp_train = _add_meta(
    generate_temperature_questions(CITIES, HIST_TEMP_MONTHS_TRAIN, city_normals), "train"
)

# VALIDATE split (2025)
hist_fed_val  = generate_fed_questions_historical(FOMC_DATES_2025, 2025)
hist_gas_val  = generate_historical_gas_questions(gas_prices, GAS_AS_OF_VAL, "validate")
hist_temp_val = _add_meta(
    generate_temperature_questions(CITIES, HIST_TEMP_MONTHS_VAL, city_normals), "validate"
)

# Order: test questions first, then train, then validate.
all_questions = (
    fed_questions + gas_questions + temp_questions
    + hist_fed_train + hist_gas_train + hist_temp_train
    + hist_fed_val   + hist_gas_val   + hist_temp_val
)

# Convenience: test-only questions (for tool-augmented forecasting)
# NOTE(review): this indexes q["split"] directly, while later cells use
# q.get("split", "test"); confirm every generator sets "split".
test_questions = [q for q in all_questions if q["split"] == "test"]

# ── Summary ──────────────────────────────────────────────────────────────────
# Tally questions per (split, category) pair.
split_counts = {}
for q in all_questions:
    k = (q["split"], q["category"])
    split_counts[k] = split_counts.get(k, 0) + 1

print(f"Total questions: {len(all_questions)}")
print(f"{'Split':<12} {'Category':<14} {'Count':>5}")
for (split_name, cat), cnt in sorted(split_counts.items()):
    print(f"  {split_name:<10} {cat:<14} {cnt:>5}")
print(f"\nTest questions: {len(test_questions)}")

In [None]:
# --- 2.2 Collect Prediction Market Probabilities ---

def match_kalshi_market(markets, question):
    """Return the first Kalshi market whose title matches a Fed or gas question.

    Matching is a case-insensitive substring test on the market title:

    * ``fed_rate``: the title must contain the question's meeting month and
      either "cut" or "rate".
    * ``gas_price``: the title must contain the threshold, formatted either
      with two decimals (``3.50``) or as ``str(threshold)`` (``3.5``).

    Args:
        markets: Iterable of market dicts; only the "title" key is read.
        question: Question dict with "category" plus "meeting_month"
            (fed_rate) or "threshold" (gas_price).

    Returns:
        The first matching market dict, or None when nothing matches or the
        category is not handled here.
    """
    for m in markets:
        # A missing or null title is treated as an empty string, not a crash.
        title_lower = (m.get("title") or "").lower()
        if question["category"] == "fed_rate":
            # The title is lowercased, so the month must be lowercased too;
            # otherwise a capitalised "March" can never match.
            month = str(question["meeting_month"]).lower()
            if month in title_lower and ("cut" in title_lower or "rate" in title_lower):
                return m
        elif question["category"] == "gas_price":
            threshold_str = f"{question['threshold']:.2f}"
            if threshold_str in title_lower or str(question["threshold"]) in title_lower:
                return m
    return None


def match_kalshi_temp_market(markets, question):
    """Best-effort match of a Kalshi temperature market to a city/threshold.

    A market matches when its title contains one of the city's words longer
    than three characters (e.g. "york", "angeles", "chicago") AND mentions at
    least one number within 5 degrees of the question's threshold.

    Args:
        markets: Iterable of market dicts; only the "title" key is read.
        question: Question dict with "city_name" and "threshold_f".

    Returns:
        The first matching market dict, or None.
    """
    city_words = [w for w in question["city_name"].lower().split() if len(w) > 3]
    threshold = question["threshold_f"]
    # The previous pattern doubled the backslashes inside a raw string
    # (r"\\d+..."), which matches a literal backslash and so never found any
    # number; as a result no temperature market could match.
    number_pattern = re.compile(r"\d+(?:\.\d+)?")
    for m in markets:
        title_lower = (m.get("title") or "").lower()
        city_match = any(w in title_lower for w in city_words)
        numbers = number_pattern.findall(title_lower)
        temp_match = any(abs(float(n) - threshold) <= 5 for n in numbers)
        if city_match and temp_match:
            return m
    return None


def match_polymarket_event(events, question):
    """Return the first Polymarket event whose title matches the question.

    Matching is a case-insensitive substring test on the event title:

    * ``fed_rate``: title contains the meeting month and "fed".
    * ``gas_price``: title contains "gas". The threshold is not checked, so
      this is a loose, best-effort match.
    * ``temperature``: title contains one of the city's words longer than
      three characters and either "temperature" or "weather".

    Args:
        events: Iterable of event dicts; only the "title" key is read.
        question: Question dict with "category" plus "meeting_month"
            (fed_rate) or "city_name" (temperature).

    Returns:
        The first matching event dict, or None.
    """
    for e in events:
        # A missing or null title is treated as an empty string, not a crash.
        title_lower = (e.get("title") or "").lower()
        if question["category"] == "fed_rate":
            # Lowercase the month: the title has been lowercased, so a
            # capitalised "March" would otherwise never match.
            month = str(question["meeting_month"]).lower()
            if month in title_lower and "fed" in title_lower:
                return e
        elif question["category"] == "gas_price":
            if "gas" in title_lower:
                return e
        elif question["category"] == "temperature":
            city_words = [w for w in question["city_name"].lower().split() if len(w) > 3]
            if any(w in title_lower for w in city_words) and (
                "temperature" in title_lower or "weather" in title_lower
            ):
                return e
    return None


def get_market_price(market_obj, source):
    """Pull a YES probability out of a Kalshi market or Polymarket event dict.

    Kalshi: the first non-null of "yes_bid", "last_price", "yes_ask" is used;
    values above 1 are divided by 100, values of 1 or less are returned as-is.

    Polymarket: the first entry of the first non-empty "outcomePrices" among
    the event's "markets" is used ("outcomePrices" may be a JSON string or a
    list). If none is found, a top-level "price" key is used.

    Args:
        market_obj: Market (Kalshi) or event (Polymarket) dict.
        source: "kalshi" or "polymarket".

    Returns:
        The probability as a float, or NaN when nothing usable is present.
    """
    if source == "kalshi":
        for field in ("yes_bid", "last_price", "yes_ask"):
            raw = market_obj.get(field)
            if raw is None:
                continue
            quote = float(raw)
            if quote > 1:
                return quote / 100
            return quote
        return np.nan

    if source == "polymarket":
        for sub_market in market_obj.get("markets", []) or []:
            encoded = sub_market.get("outcomePrices")
            if not encoded:
                continue
            if isinstance(encoded, str):
                decoded = json.loads(encoded)
            else:
                decoded = encoded
            if decoded:
                return float(decoded[0])
        if "price" in market_obj:
            return float(market_obj["price"])

    return np.nan


def collect_market_probabilities(questions, kalshi_fed, kalshi_gas, kalshi_temp,
                                  poly_fed, poly_gas):
    """Look up Kalshi and Polymarket probabilities for every question.

    Args:
        questions: Iterable of question dicts ("id", "text", "category", ...).
        kalshi_fed, kalshi_gas, kalshi_temp: Kalshi market lists searched for
            fed_rate, gas_price and temperature questions respectively.
        poly_fed, poly_gas: Polymarket event lists for fed_rate questions and
            for every other non-temperature category.

    Returns:
        DataFrame with one row per question and columns question_id,
        question_text, kalshi_prob, kalshi_ticker, polymarket_prob. A missing
        match yields NaN for the probability and None for the ticker.
    """
    records = []
    for q in questions:
        category = q["category"]

        # --- Kalshi: pick the matcher/pool that fits the category ---
        if category == "fed_rate":
            kalshi_hit = match_kalshi_market(kalshi_fed, q)
        elif category == "gas_price":
            kalshi_hit = match_kalshi_market(kalshi_gas, q)
        elif category == "temperature":
            kalshi_hit = match_kalshi_temp_market(kalshi_temp, q)
        else:
            kalshi_hit = None

        if kalshi_hit:
            kalshi_prob = get_market_price(kalshi_hit, "kalshi")
            kalshi_ticker = kalshi_hit.get("ticker", "")
        else:
            kalshi_prob = np.nan
            kalshi_ticker = None

        # --- Polymarket: temperature questions get an empty pool, so they
        # always come back NaN ---
        if category == "temperature":
            candidate_events = []
        elif category == "fed_rate":
            candidate_events = poly_fed
        else:
            candidate_events = poly_gas
        poly_hit = match_polymarket_event(candidate_events, q)
        if poly_hit:
            poly_prob = get_market_price(poly_hit, "polymarket")
        else:
            poly_prob = np.nan

        records.append({
            "question_id": q["id"],
            "question_text": q["text"],
            "kalshi_prob": kalshi_prob,
            "kalshi_ticker": kalshi_ticker,
            "polymarket_prob": poly_prob,
        })
    return pd.DataFrame(records)


# Match every question (all splits) against the previously fetched Kalshi
# markets and Polymarket events.
market_probs = collect_market_probabilities(
    all_questions, kalshi_fed, kalshi_gas, kalshi_temp_markets,
    poly_fed_events, poly_gas_events,
)

print("Prediction Market Probabilities:")
print(market_probs[["question_id", "kalshi_prob", "polymarket_prob"]].to_string(index=False))

# Coverage = number of questions for which a (non-NaN) market price was found.
kalshi_cov = market_probs["kalshi_prob"].notna().sum()
poly_cov   = market_probs["polymarket_prob"].notna().sum()
print(f"\nCoverage: Kalshi {kalshi_cov}/{len(all_questions)}, Polymarket {poly_cov}/{len(all_questions)}")

# Temperature questions are identified by their "temp_" id prefix.
temp_mask = market_probs["question_id"].str.startswith("temp_")
print(f"Temperature Kalshi coverage: {market_probs.loc[temp_mask, 'kalshi_prob'].notna().sum()}"
      f"/{temp_mask.sum()}  (many will be NaN — Kalshi temp markets use daily, not monthly, contracts)")

# Section 3: LLM Forecasting

## 3.1 Vanilla Prompting (No Tools)

Each model receives:
1. A **system prompt** establishing the forecasting persona
2. A **user prompt** with the specific binary question and resolution criteria
3. Instructions to output a probability between 0.0 and 1.0

No external data access — the model relies entirely on its training data and reasoning.

**Key design choice**: prompts deliberately exclude prediction market prices to ensure LLM forecasts are independent and can be fairly compared against market prices.

In [None]:
# --- 3.1 Vanilla Prompting Setup ---

# System prompt for the no-tools condition. It fixes the output contract
# ("PROBABILITY: X.XX" on the final line) that parse_probability_from_response
# looks for.
VANILLA_SYSTEM_PROMPT = """You are an expert forecaster and superforecaster. Your task is to estimate
the probability that a specific event will occur. You must provide a single
probability estimate between 0.0 (certainly will NOT happen) and 1.0 (certainly WILL happen).

Guidelines:
- Consider base rates and historical patterns
- Account for current economic conditions based on your training data
- Be well-calibrated: events you assign 70% probability should occur about 70% of the time
- Avoid anchoring to round numbers (0.5, 0.25, 0.75) unless truly justified
- Consider both arguments for and against the event occurring

You MUST end your response with exactly one line in this format:
PROBABILITY: X.XX

where X.XX is your probability estimate between 0.00 and 1.00."""

# Per-question user prompt. Placeholders filled via str.format:
#   {today_date}          - the "as of" date presented to the model
#   {question_text}       - the question's "text" field
#   {resolution_criteria} - output of get_resolution_criteria(question)
VANILLA_USER_TEMPLATE = """Today's date is {today_date}.

Question: {question_text}

Resolution criteria: {resolution_criteria}

Please reason through this step by step, then provide your probability estimate."""


def parse_probability_from_response(text):
    """Extract the probability value from an LLM response.

    Resolution order:

    1. Outputs that start with "ERROR:" (the sentinel the forecasting loops
       store on failure) are never parsed. Error messages often contain
       decimals such as version numbers or retry delays that would otherwise
       be mistaken for a forecast.
    2. The LAST "PROBABILITY: <value>" occurrence whose value lies in [0, 1].
       The prompts ask for the final line to carry the answer, so the last
       occurrence wins over any earlier draft. Bare "0" / "1" and values
       wrapped in markdown asterisks are accepted.
    3. Fallback: the last decimal of the form 0.xx or 1.0 in the final 300
       characters.

    Args:
        text: Model output; coerced with ``str()`` so non-string content
            objects are tolerated.

    Returns:
        The probability as a float in [0, 1], or NaN when none can be found.
    """
    text = str(text)
    if text.startswith("ERROR:"):
        return np.nan

    # (?!\d) stops "10" from being read as "1"; out-of-range values such as
    # "1.5" are captured whole and rejected by the range check below.
    tagged = re.findall(r"PROBABILITY:\s*\**\s*([01](?:\.\d+)?)(?!\d)", text)
    for candidate in reversed(tagged):
        value = float(candidate)
        if 0.0 <= value <= 1.0:
            return value

    # Fallback: look for any decimal between 0 and 1 near the end
    matches = re.findall(r"\b(0\.\d+|1\.0)\b", text[-300:])
    if matches:
        return float(matches[-1])
    return np.nan


def get_llm_instance(model_key, temperature=0):
    """Instantiate the chat-model client configured under ``MODELS[model_key]``.

    Args:
        model_key: Key into the module-level ``MODELS`` mapping; the entry
            must provide "provider" and "model_id".
        temperature: Sampling temperature forwarded to the client.

    Returns:
        A ChatGoogleGenerativeAI, ChatOpenAI or ChatAnthropic instance,
        depending on the configured provider.

    Raises:
        ValueError: If the provider is not "google", "openai" or "anthropic".
    """
    settings = MODELS[model_key]
    provider = settings["provider"]

    if provider == "google":
        return ChatGoogleGenerativeAI(model=settings["model_id"], temperature=temperature)
    if provider == "openai":
        return ChatOpenAI(model=settings["model_id"], temperature=temperature)
    if provider == "anthropic":
        return ChatAnthropic(model=settings["model_id"], temperature=temperature)

    raise ValueError(f"Unknown provider: {provider}")


def get_resolution_criteria(question):
    """Build the resolution-criteria string shown to the LLM for a question.

    Handles all three question categories. Previously every non-Fed question
    fell into the gas branch, so temperature questions (which have
    "threshold_f" but no "threshold" / "current_price") raised KeyError when
    the vanilla run was given the full question list.

    Args:
        question: Question dict. Required keys by category:
            fed_rate    -> "resolution_date", "current_rate"
            gas_price   -> "threshold", "resolution_date", "current_price"
            temperature -> "city_name", "threshold_f", "year", "month",
                           "historical_normal_f"

    Returns:
        A human-readable criteria string; for an unknown category, a message
        saying that no criteria are defined.
    """
    category = question["category"]
    if category == "fed_rate":
        return (
            f"Resolves YES if the FRED federal funds target rate upper bound (DFEDTARU)"
            f" decreases after the {question['resolution_date']} FOMC meeting."
            f" Current target rate upper bound: {question['current_rate']:.2f}%."
        )
    if category == "gas_price":
        # NOTE(review): "fall below" gas questions also get this "exceeds"
        # wording; confirm that is the intended resolution rule.
        return (
            f"Resolves YES if the EIA weekly US national average retail gasoline price"
            f" exceeds ${question['threshold']:.2f}/gal by {question['resolution_date']}."
            f" Current price: ${question['current_price']:.3f}/gal."
        )
    if category == "temperature":
        return (
            f"Resolves YES if the average daily high temperature in {question['city_name']}"
            f" exceeds {question['threshold_f']}\u00b0F in"
            f" {question['year']}-{question['month']:02d},"
            f" measured as the mean of ERA5 daily maximum temperatures from Open-Meteo."
            f" Threshold = 10-year historical median ({question['historical_normal_f']}\u00b0F),"
            f" so the expected base rate is ~50%."
        )
    return f"Resolution criteria not defined for category: {category}"


# Confirm the cell ran and list the display names of the configured models.
print("Vanilla prompting setup complete.")
print(f"Models to evaluate: {', '.join(cfg['name'] for cfg in MODELS.values())}")

In [None]:
# --- 3.1 Run Vanilla Forecasting ---
# Runs on ALL questions (train, validate, test).
# For historical questions the prompt date is set to 30 days before resolution,
# so the LLM is asked to forecast "as of" that earlier date.

def run_vanilla_forecasting(questions, cache):
    """Run vanilla (no-tool) LLM forecasting across all models and questions.

    Each (model, question) pair is looked up in ``cache`` first. Only
    successful model outputs are written to the cache: a failed API call is
    reported as an "ERROR: ..." output with a NaN probability for this run,
    but is NOT persisted, and a previously cached error entry is treated as a
    miss. Transient failures (rate limits, timeouts) are therefore retried on
    the next run instead of being frozen into the results.

    Args:
        questions: Iterable of question dicts ("id", "text", optional
            "forecast_date" and "split", plus whatever
            ``get_resolution_criteria`` reads).
        cache: Mutable mapping of cache key -> {"output": ...}; updated in
            place and persisted through ``save_cache`` after each new success.

    Returns:
        DataFrame with columns question_id, model, method, probability, split
        and raw_output (last 300 characters of the model output).
    """
    results = []
    today_default = datetime.now().strftime("%Y-%m-%d")

    for model_key in MODELS:
        model_name = MODELS[model_key]["name"]
        print(f"\nForecasting with {model_name} (vanilla)...")
        llm = get_llm_instance(model_key, temperature=0)

        for q in tqdm(questions, desc=model_name):
            cache_key = get_cache_key("vanilla", model_key, q["id"])
            # Historical questions carry their own as-of date; others use today.
            forecast_date = q.get("forecast_date", today_default)

            cached_output = cache[cache_key]["output"] if cache_key in cache else None
            if cached_output is not None and not str(cached_output).startswith("ERROR:"):
                output = cached_output
            else:
                criteria = get_resolution_criteria(q)
                messages = [
                    SystemMessage(content=VANILLA_SYSTEM_PROMPT),
                    HumanMessage(content=VANILLA_USER_TEMPLATE.format(
                        today_date=forecast_date,
                        question_text=q["text"],
                        resolution_criteria=criteria,
                    )),
                ]
                try:
                    response = llm.invoke(messages)
                    output = response.content
                except Exception as e:
                    # Boundary around a remote API call: report and continue
                    # with the next question rather than aborting the run.
                    print(f"  Error ({model_key}, {q['id']}): {e}")
                    output = f"ERROR: {str(e)}"
                else:
                    cache[cache_key] = {"output": output}
                    save_cache(cache)

            # Never try to read a probability out of an error message.
            if str(output).startswith("ERROR:"):
                prob = float("nan")
            else:
                prob = parse_probability_from_response(output)
            results.append({
                "question_id":   q["id"],
                "model":         model_name,
                "method":        "vanilla",
                "probability":   prob,
                "split":         q.get("split", "test"),
                "raw_output":    str(output)[-300:],
            })

    return pd.DataFrame(results)


# Load the persisted LLM-output cache, run the vanilla condition on every
# question (all splits), and summarise the probabilities per split and model.
cache = load_cache()
vanilla_results = run_vanilla_forecasting(all_questions, cache)
print(f"\nCollected {len(vanilla_results)} vanilla forecasts")
print(vanilla_results.groupby(["split", "model"])["probability"].describe().round(3))

## 3.2 Tool-Augmented LLM Forecasting

Now we give the same models access to real-time data tools:
1. **`get_federal_funds_rate`**: Fetch current and historical federal funds rate data from FRED
2. **`get_gas_prices`**: Fetch current and historical gasoline prices from EIA
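3. **`get_fomc_schedule`**: Get the 2026 FOMC meeting schedule with past/upcoming status
4. **`get_temperature_data`**: Fetch recent average daily high temperatures and 10-year normals for a US city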

We use LangChain's `bind_tools()` interface, which works across all three providers. The model decides which tools to call, receives the results, and then produces its forecast.

**The key question: does tool access improve forecasting accuracy?**

In [None]:
# --- 3.2 Tool Definitions ---

@tool
def get_federal_funds_rate(lookback_days: int = 90) -> str:
    """Fetch the current and recent federal funds rate data from FRED.

    Args:
        lookback_days: Number of days of historical data to return (default 90)

    Returns:
        A string summary of the federal funds rate data.
    """
    # The docstring above is left verbatim: with @tool it is presumably what
    # the model sees as the tool description, so it is part of the prompt.
    client = Fred(api_key=FRED_API_KEY)
    window_start = (datetime.now() - timedelta(days=lookback_days)).strftime("%Y-%m-%d")

    # Effective rate, target upper bound, target lower bound, in that order.
    effective, upper, lower = (
        client.get_series(series_id, observation_start=window_start)
        for series_id in ("DFF", "DFEDTARU", "DFEDTARL")
    )

    effective_obs = effective.dropna()
    latest_effective = effective_obs.iloc[-1]
    latest_upper = upper.dropna().iloc[-1]
    latest_lower = lower.dropna().iloc[-1]

    # Non-zero day-over-day moves in the upper bound are counted as cuts/hikes.
    moves = upper.diff().dropna()
    cuts = moves[moves < 0]
    hikes = moves[moves > 0]

    summary_lines = [
        f"Federal Funds Rate Data (last {lookback_days} days):",
        f"- Current effective rate: {latest_effective:.2f}%",
        f"- Current target range: {latest_lower:.2f}% - {latest_upper:.2f}%",
        f"- Rate cuts in period: {len(cuts)} (total: {cuts.sum():.2f}pp)",
        f"- Rate hikes in period: {len(hikes)} (total: {hikes.sum():.2f}pp)",
        f"- Rate on {effective_obs.index[-1].strftime('%Y-%m-%d')}: {latest_effective:.2f}%",
        f"- Rate {lookback_days} days ago: {effective_obs.iloc[0]:.2f}%",
    ]
    return "\n".join(summary_lines)


@tool
def get_gas_prices(weeks: int = 12) -> str:
    """Fetch recent US retail gasoline price data from the EIA.

    Args:
        weeks: Number of weeks of historical data to return (default 12)

    Returns:
        A string summary of gasoline price data.
    """
    # The docstring above is left verbatim: with @tool it is presumably what
    # the model sees as the tool description.
    #
    # Failure handling mirrors get_temperature_data: network/HTTP/parse
    # problems and empty payloads come back as a short message for the model
    # instead of raising, because the tool loop does not catch exceptions.
    params = {
        "api_key": EIA_API_KEY,
        "frequency": "weekly",
        "data[0]": "value",
        "facets[product][]": "EPMR",
        "facets[duoarea][]": "NUS",
        "sort[0][column]": "period",
        "sort[0][direction]": "desc",
        "offset": 0,
        "length": weeks,
    }
    try:
        # A timeout keeps one stalled request from hanging the whole run.
        response = requests.get(EIA_GAS_PRICE_URL, params=params, timeout=30)
        response.raise_for_status()
        data = response.json()["response"]["data"]
    except (requests.RequestException, ValueError, KeyError) as e:
        return f"Error fetching gasoline price data from EIA: {e}"

    # Skip rows with a null value; float(None) would raise.
    prices = [(d["period"], float(d["value"])) for d in data if d.get("value") is not None]
    if not prices:
        return "No gasoline price data returned by the EIA for the requested window."
    # The API returns newest-first; re-sort oldest-first so prices[-1] is current.
    prices.sort(key=lambda x: x[0])

    current = prices[-1][1]
    high    = max(p[1] for p in prices)
    low     = min(p[1] for p in prices)
    avg     = sum(p[1] for p in prices) / len(prices)
    trend   = current - prices[0][1]

    return (
        f"US Retail Gasoline Prices (last {weeks} weeks):\n"
        f"- Current price: ${current:.3f}/gal (week of {prices[-1][0]})\n"
        f"- {weeks}-week high: ${high:.3f}/gal\n"
        f"- {weeks}-week low: ${low:.3f}/gal\n"
        f"- {weeks}-week average: ${avg:.3f}/gal\n"
        f"- Trend: {'Up' if trend > 0 else 'Down'} ${abs(trend):.3f}/gal over {weeks} weeks\n"
        f"- Recent weekly prices: {', '.join(f'${p[1]:.3f}' for p in prices[-6:])}"
    )


@tool
def get_fomc_schedule() -> str:
    """Get the 2026 FOMC meeting schedule and status.

    Returns:
        A string listing upcoming FOMC meetings with dates and status.
    """
    # The docstring above is left verbatim: with @tool it is presumably what
    # the model sees as the tool description.
    now = datetime.now()

    def _status(date_str):
        # A meeting is PAST once its date (at midnight) is earlier than now.
        return "PAST" if datetime.strptime(date_str, "%Y-%m-%d") < now else "UPCOMING"

    schedule_lines = [
        f"  {entry['meeting']} 2026 ({entry['date']}): {_status(entry['date'])}"
        for entry in FOMC_DATES_2026
    ]

    header = "2026 FOMC Meeting Schedule:\n"
    footer = "\n\nNote: The Fed announces its rate decision on the second day of each meeting."
    return header + "\n".join(schedule_lines) + footer


@tool
def get_temperature_data(city_name: str, lookback_months: int = 3) -> str:
    """Get recent average daily high temperatures and 10-year climate normals for a US city.

    Args:
        city_name: City to query. One of: New York, Chicago, Miami, Los Angeles,
                   Denver, Seattle.
        lookback_months: Months of recent history to fetch (default 3).

    Returns:
        Recent monthly avg-high temperatures compared to historical normals, plus
        the historical median for each upcoming forecast month.
    """
    # The docstring above is left verbatim: with @tool it is presumably what
    # the model sees as the tool description.
    requested = city_name.lower()

    # Pass 1: exact match on the key, the key with underscores as spaces, or
    # the display name.
    city_key = next(
        (
            key for key, info in CITIES.items()
            if requested in [key.replace("_", " "), info["name"].lower(), key]
        ),
        None,
    )
    # Pass 2: substring match in either direction against the display name.
    if city_key is None:
        city_key = next(
            (
                key for key, info in CITIES.items()
                if requested in info["name"].lower() or info["name"].lower() in requested
            ),
            None,
        )
    if city_key is None:
        available = ', '.join(info['name'] for info in CITIES.values())
        return f"City '{city_name}' not recognized. " + f"Available: {available}"

    city = CITIES[city_key]
    end_date = datetime.now().strftime("%Y-%m-%d")
    # Each lookback month is approximated as 31 days.
    start_date = (datetime.now() - timedelta(days=lookback_months * 31)).strftime("%Y-%m-%d")

    try:
        history = fetch_temp_history(city["lat"], city["lon"], start_date, end_date)
        history["month"] = history["date"].dt.month
        history["year"] = history["date"].dt.year
        monthly_means = history.groupby(["year", "month"])["temp_max_f"].mean()

        normals = city_normals.get(city_key, {})
        import calendar as _cal

        report = [
            f"Temperature data for {city['name']}:",
            "\nRecent monthly avg-high temperatures:",
        ]
        for (yr, mo), mean in monthly_means.items():
            normal = normals.get(mo, float("nan"))
            if np.isnan(normal):
                # No normal for this month: report the observed mean only.
                report.append(f"  {_cal.month_name[mo]} {yr}: {mean:.1f}\u00b0F")
                continue
            dev = mean - normal
            report.append(
                f"  {_cal.month_name[mo]} {yr}: {mean:.1f}\u00b0F"
                f"  ({dev:+.1f}\u00b0F vs 10-yr normal of {normal:.1f}\u00b0F)"
            )

        report.append("\n10-year median avg-high (forecast months):")
        for m_info in TEMP_MONTHS:
            normal = normals.get(m_info["month"], float("nan"))
            if np.isnan(normal):
                continue
            report.append(f"  {m_info['name']} 2026: {normal:.1f}\u00b0F (historical median)")

        return "\n".join(report)

    except Exception as e:
        # Best-effort tool: hand the failure to the model as text.
        return f"Error fetching temperature data for {city['name']}: {e}"


# Every tool offered to the models; this list is what gets bound with
# bind_tools() and what the tool loop dispatches on by name.
TOOLS = [get_federal_funds_rate, get_gas_prices, get_fomc_schedule, get_temperature_data]
print(f"Defined {len(TOOLS)} tools: {[t.name for t in TOOLS]}")

In [None]:
# --- 3.2 Run Tool-Augmented Forecasting ---
# Tool-augmented forecasting runs on TEST questions only.
# Tools fetch real-time data (FRED, EIA, Open-Meteo) which is irrelevant for
# historical questions; using it on train/validate would contaminate the
# train/validate Brier comparison.

# System prompt for the tool-augmented condition. It lists the four tools in
# TOOLS and keeps the same "PROBABILITY: X.XX" output contract as the vanilla
# prompt so both conditions share one parser.
TOOL_SYSTEM_PROMPT = """You are an expert forecaster with access to real-time economic data tools.
Your task is to estimate the probability that a specific event will occur.

You have access to the following tools:
- get_federal_funds_rate: Fetch current and historical federal funds rate data
- get_gas_prices: Fetch recent US retail gasoline price data
- get_fomc_schedule: Get the 2026 FOMC meeting schedule
- get_temperature_data: Fetch recent avg-high temperatures and 10-year normals for a US city

Instructions:
1. FIRST, use the relevant tools to gather current data
2. THEN, reason through the question using the data you retrieved
3. Consider base rates, trends, and current conditions
4. Provide a well-calibrated probability estimate

You MUST end your response with exactly one line in this format:
PROBABILITY: X.XX"""

# Per-question user prompt. Placeholders filled via str.format:
#   {today_date}, {question_text}, {resolution_criteria}
TOOL_USER_TEMPLATE = """Today's date is {today_date}.

Question: {question_text}

Resolution criteria: {resolution_criteria}

Please use the available tools to gather relevant data, then reason through this
step by step and provide your probability estimate."""


def run_tool_augmented_forecasting(questions, cache):
    """Run tool-augmented LLM forecasting (test questions only).

    For each (model, question) pair the model is invoked with the tools bound.
    While it keeps requesting tool calls, each call is executed and its result
    appended as a ToolMessage, for at most 5 model turns. The first turn
    without tool calls is taken as the final answer.

    Robustness:
      * A tool call naming an unknown tool, or a tool that raises, is answered
        with an error ToolMessage so the model can recover; it no longer
        aborts the whole run.
      * Only successful outputs are cached. "ERROR: ..." outputs (API failure
        or iteration limit) get a NaN probability for this run and are retried
        next time; previously cached error entries are treated as misses.

    Args:
        questions: Iterable of question dicts ("id", "text", optional "split",
            plus whatever ``get_resolution_criteria`` reads).
        cache: Mutable mapping of cache key -> {"output": ...}; updated in
            place and persisted through ``save_cache`` after each new success.

    Returns:
        DataFrame with columns question_id, model, method, probability, split
        and raw_output (last 300 characters of the model output).
    """
    results = []
    today = datetime.now().strftime("%Y-%m-%d")
    tool_map = {t.name: t for t in TOOLS}
    max_turns = 5

    for model_key in MODELS:
        model_name = MODELS[model_key]["name"]
        print(f"\nForecasting with {model_name} (with tools)...")
        llm = get_llm_instance(model_key, temperature=0)
        llm_with_tools = llm.bind_tools(TOOLS)

        for q in tqdm(questions, desc=f"{model_name} + tools"):
            cache_key = get_cache_key("tool", model_key, q["id"])

            cached_output = cache[cache_key]["output"] if cache_key in cache else None
            if cached_output is not None and not str(cached_output).startswith("ERROR:"):
                output = cached_output
            else:
                criteria = get_resolution_criteria(q)
                messages = [
                    SystemMessage(content=TOOL_SYSTEM_PROMPT),
                    HumanMessage(content=TOOL_USER_TEMPLATE.format(
                        today_date=today,
                        question_text=q["text"],
                        resolution_criteria=criteria,
                    )),
                ]
                # Stays in place if the model never stops calling tools.
                output = "ERROR: max iterations reached"
                for _ in range(max_turns):
                    try:
                        resp = llm_with_tools.invoke(messages)
                    except Exception as e:
                        # Boundary around a remote API call.
                        output = f"ERROR: {str(e)}"
                        break
                    messages.append(resp)
                    if not resp.tool_calls:
                        output = resp.content
                        break
                    for tc in resp.tool_calls:
                        tool_fn = tool_map.get(tc["name"])
                        if tool_fn is None:
                            tool_result = (
                                f"Error: unknown tool '{tc['name']}'. "
                                f"Available tools: {', '.join(tool_map)}"
                            )
                        else:
                            try:
                                tool_result = tool_fn.invoke(tc["args"])
                            except Exception as e:
                                # Tools hit external services; report the
                                # failure to the model instead of crashing.
                                tool_result = f"Error running {tc['name']}: {e}"
                        # Every tool call gets a reply with its matching id.
                        messages.append(ToolMessage(
                            content=str(tool_result),
                            tool_call_id=tc["id"],
                        ))

                if not str(output).startswith("ERROR:"):
                    cache[cache_key] = {"output": output}
                    save_cache(cache)

            # Never try to read a probability out of an error message.
            if str(output).startswith("ERROR:"):
                prob = float("nan")
            else:
                prob = parse_probability_from_response(output)
            results.append({
                "question_id": q["id"],
                "model":       model_name,
                "method":      "tool_augmented",
                "probability": prob,
                "split":       q.get("split", "test"),
                "raw_output":  str(output)[-300:],
            })

    return pd.DataFrame(results)


# Reload the cache and run the tool-augmented condition on the TEST split
# only, then summarise the probabilities per model.
cache = load_cache()
tool_results = run_tool_augmented_forecasting(test_questions, cache)
print(f"\nCollected {len(tool_results)} tool-augmented forecasts (test split only)")
print(tool_results.groupby("model")["probability"].describe().round(3))

# Section 4: Scoring and Evaluation

## 4.1 Brier Score

$$BS = \frac{1}{N} \sum_{i=1}^{N} (p_i - o_i)^2$$

Reference benchmarks:
- **Perfect forecaster**: BS = 0.000
- **Always predict 50%** (no skill): BS = 0.250
- **Always 100% confident and wrong**: BS = 1.000

## 4.2 Hypothetical Returns

We simulate a threshold-based betting strategy:
- If forecast differs from market price by more than $\delta$ (default 10pp):
  - **Buy YES** if forecast > market + $\delta$ (cost = market price, payout = \$1 if YES)
  - **Buy NO** if forecast < market - $\delta$ (cost = 1 - market price, payout = \$1 if NO)
- Each bet is \$1 notional

This tests whether the forecaster can identify **mispriced** markets.

In [None]:
# --- 4.1 Resolution & Scoring ---

def resolve_questions(questions, fred_data, gas_prices):
    """Determine the actual outcomes for resolved questions.

    Args:
        questions: Iterable of question dicts with "id", "category" and
            "resolution_date" ("YYYY-MM-DD"), plus per-category fields:
            gas_price -> "threshold"; temperature -> "year", "month",
            "city_key", "lat", "lon", "threshold_f".
        fred_data: DataFrame with a datetime index and a "target_upper"
            column (federal funds target upper bound).
        gas_prices: DataFrame with a datetime-like "period" column and a
            "value" column. The last row at or before the resolution date is
            used, so rows are presumably sorted ascending by period.

    Returns:
        Dict mapping question id -> 1 (YES), 0 (NO) or NaN (not resolvable
        yet). Questions of an unknown category get no entry.
    """
    import calendar as _cal

    outcomes  = {}
    today     = datetime.now()
    # Only temperature questions need the cache, so it is loaded on first use.
    cache_obj = None

    for q in questions:
        res_date = datetime.strptime(q["resolution_date"], "%Y-%m-%d")
        if res_date > today:
            outcomes[q["id"]] = np.nan
            continue

        if q["category"] == "fed_rate":
            # A new target range takes effect the day AFTER the FOMC
            # announcement, so the series still shows the old value on the
            # decision date itself. Compare the last observation strictly
            # before the resolution date with the first one strictly after
            # it; using the decision-day value as "post" would resolve every
            # cut as "no change". This also works if the resolution date is
            # the effective date rather than the decision date.
            target = fred_data["target_upper"].dropna()
            pre  = target[target.index < res_date]
            post = target[target.index > res_date]
            if len(pre) > 0 and len(post) > 0:
                outcomes[q["id"]] = 1 if post.iloc[0] < pre.iloc[-1] else 0
            else:
                # No observation after the meeting yet: leave unresolved.
                outcomes[q["id"]] = np.nan

        elif q["category"] == "gas_price":
            # NOTE(review): every gas question resolves as "price > threshold",
            # including those whose text asks whether the price will
            # "fall below" the threshold; confirm that is intended.
            prices_before = gas_prices[gas_prices["period"] <= pd.Timestamp(res_date)]
            if len(prices_before) > 0:
                outcomes[q["id"]] = 1 if prices_before["value"].iloc[-1] > q["threshold"] else 0
            else:
                outcomes[q["id"]] = np.nan

        elif q["category"] == "temperature":
            if cache_obj is None:
                cache_obj = load_cache()
            yr, mo = q["year"], q["month"]
            ck = get_cache_key("temp_actual", q["city_key"], yr, mo)
            cached = cache_obj[ck] if ck in cache_obj else None
            if cached is not None and not np.isnan(cached):
                actual_mean = cached
            else:
                try:
                    last_day = _cal.monthrange(yr, mo)[1]
                    df = fetch_temp_history(
                        q["lat"], q["lon"],
                        f"{yr}-{mo:02d}-01",
                        f"{yr}-{mo:02d}-{last_day}",
                    )
                    actual_mean = float(df["temp_max_f"].mean()) if len(df) > 0 else float("nan")
                except Exception as e:
                    # Best-effort fetch: report and leave the question unresolved.
                    print(f"  Could not fetch temperatures for {q['id']}: {e}")
                    actual_mean = float("nan")
                # Cache only real values. A cached NaN from a transient
                # failure would keep the question unresolved forever.
                if not np.isnan(actual_mean):
                    cache_obj[ck] = actual_mean
                    save_cache(cache_obj)

            if np.isnan(actual_mean):
                outcomes[q["id"]] = np.nan
            else:
                outcomes[q["id"]] = 1 if actual_mean > q["threshold_f"] else 0

    return outcomes


def get_resolution_criteria(question):
    """Return the resolution-criteria text inserted into the LLM prompt.

    Args:
        question: Question dict. Required keys by category:
            fed_rate    -> "resolution_date", "current_rate"
            gas_price   -> "threshold", "resolution_date", "current_price"
            temperature -> "city_name", "threshold_f", "year", "month",
                           "historical_normal_f"

    Returns:
        The criteria string for the question's category, or a message saying
        that no criteria are defined for an unknown category.
    """
    category = question["category"]

    if category == "fed_rate":
        meeting = question['resolution_date']
        rate = question['current_rate']
        return (
            f"Resolves YES if the FRED federal funds target rate upper bound (DFEDTARU)"
            f" decreases after the {meeting} FOMC meeting."
            f" Current target rate upper bound: {rate:.2f}%."
        )

    if category == "gas_price":
        return "".join([
            f"Resolves YES if the EIA weekly US national average retail gasoline price",
            f" exceeds ${question['threshold']:.2f}/gal by {question['resolution_date']}.",
            f" Current price: ${question['current_price']:.3f}/gal.",
        ])

    if category == "temperature":
        period = f"{question['year']}-{question['month']:02d}"
        return (
            f"Resolves YES if the average daily high temperature in {question['city_name']}"
            f" exceeds {question['threshold_f']}\u00b0F in"
            f" {period},"
            f" measured as the mean of ERA5 daily maximum temperatures from Open-Meteo."
            f" Threshold = 10-year historical median ({question['historical_normal_f']}\u00b0F),"
            f" so the expected base rate is ~50%."
        )

    return f"Resolution criteria not defined for category: {category}"


def compute_brier_scores(forecast_df, outcomes):
    """Compute the per-forecast Brier score (probability - outcome) ** 2.

    Rows without a resolved outcome or without a parsed probability are
    dropped. The returned frame ALWAYS has "outcome" and "brier_score"
    columns, even when no rows survive, so downstream aggregation can rely on
    the schema instead of hitting a KeyError on an empty result.

    Args:
        forecast_df: DataFrame with at least "question_id" and "probability".
        outcomes: Mapping of question id -> 1, 0 or NaN (unresolved).

    Returns:
        A copy of the scoreable rows with "outcome" and "brier_score" added.
        The input frame is not modified.
    """
    df = forecast_df.copy()
    df["outcome"] = df["question_id"].map(outcomes)
    df = df.dropna(subset=["outcome", "probability"])
    if df.empty:
        print("WARNING: No resolved questions with valid forecasts. Brier scores cannot be computed.")
    # Cast to float so the arithmetic is well defined on an empty frame too.
    df["brier_score"] = (df["probability"].astype(float) - df["outcome"].astype(float)) ** 2
    return df


def compute_returns(forecast_df, market_probs, outcomes, delta=0.10):
    """Compute hypothetical returns from threshold-based betting against Kalshi.

    For each forecast, the forecaster's probability is compared with the
    Kalshi price for the same question:

    * forecast > price + ``delta``  -> ``BUY_YES``, profit ``outcome - price``
    * forecast < price - ``delta``  -> ``BUY_NO``,  profit
      ``(1 - outcome) - (1 - price)``
    * otherwise                     -> ``NO_BET``,  profit ``0.0``

    Args:
        forecast_df: DataFrame with at least ``question_id`` and
            ``probability`` columns.  It is not modified.
        market_probs: DataFrame with ``question_id`` and ``kalshi_prob``.
        outcomes: Mapping from question id to resolved outcome (0 or 1).
        delta: Minimum gap between forecast and market price before a bet
            is placed.

    Returns:
        One row per forecast that has an outcome, a probability and a Kalshi
        price, carrying the input columns plus ``outcome``, ``kalshi_prob``,
        ``action`` and ``profit``.  The index is a fresh 0..n-1 range.  The
        ``action`` and ``profit`` columns exist even when no row qualifies.
    """
    df = forecast_df.copy()
    df["outcome"] = df["question_id"].map(outcomes)
    df = df.merge(market_probs[["question_id", "kalshi_prob"]], on="question_id", how="left")
    df = df.dropna(subset=["outcome", "probability", "kalshi_prob"]).reset_index(drop=True)

    p_f = df["probability"].to_numpy(dtype=float)
    p_m = df["kalshi_prob"].to_numpy(dtype=float)
    outcome = df["outcome"].to_numpy(dtype=float)

    buy_yes = p_f > p_m + delta
    buy_no = p_f < p_m - delta

    # Nested np.where keeps the if / elif priority: BUY_YES is checked first.
    df["action"] = np.where(buy_yes, "BUY_YES", np.where(buy_no, "BUY_NO", "NO_BET"))
    df["profit"] = np.where(
        buy_yes,
        outcome * 1.0 - p_m,
        np.where(buy_no, (1 - outcome) * 1.0 - (1 - p_m), 0.0),
    )
    return df


# Printed once the definitions above in this cell have executed.
print("Scoring functions defined.")

In [None]:
# --- 4.2 Compute Results ---

# Question id -> split; a question without an explicit split counts as "test".
split_map = {question["id"]: question.get("split", "test") for question in all_questions}

# Stack the vanilla and tool-augmented LLM forecasts into one table and make
# sure every row carries its split.
all_forecasts = pd.concat([vanilla_results, tool_results], ignore_index=True)
if "split" not in all_forecasts.columns:
    all_forecasts["split"] = all_forecasts["question_id"].map(split_map)

# Turn each available market price into a forecast row so the markets are
# scored exactly like the LLMs.  Per question, Kalshi comes before Polymarket.
market_sources = (
    ("kalshi_prob", "Kalshi (Market)"),
    ("polymarket_prob", "Polymarket (Market)"),
)
market_rows = []
for _, market_row in market_probs.iterrows():
    question_split = split_map.get(market_row["question_id"], "test")
    for prob_column, forecaster_label in market_sources:
        market_price = market_row.get(prob_column)
        if pd.isna(market_price):
            continue  # no quote from this market for this question
        market_rows.append({
            "question_id": market_row["question_id"],
            "model": forecaster_label,
            "method": "prediction_market",
            "probability": market_price,
            "split": question_split,
            "raw_output": "",
        })
if market_rows:
    all_forecasts = pd.concat([all_forecasts, pd.DataFrame(market_rows)], ignore_index=True)

print(f"Total forecasts: {len(all_forecasts)}")
forecast_counts = all_forecasts.groupby(["split", "method", "model"]).size().reset_index(name="n")
print(forecast_counts.to_string(index=False))

# Resolve questions (all splits)
outcomes = resolve_questions(all_questions, fred_data, gas_prices)

# Tally resolved vs. pending questions per split.  A question is pending when
# its outcome is missing from ``outcomes`` or is NaN.
by_split = {}
for question in all_questions:
    tally = by_split.setdefault(question.get("split", "test"), {"resolved": 0, "pending": 0})
    status = "resolved" if pd.notna(outcomes.get(question["id"])) else "pending"
    tally[status] += 1

print("\nResolution status by split:")
for split_name in sorted(by_split):
    tally = by_split[split_name]
    print(f"  {split_name}: {tally['resolved']} resolved, {tally['pending']} pending")

# Brier scores — overall and by split
scored_df = compute_brier_scores(all_forecasts, outcomes)
if len(scored_df) == 0:
    print("\nNo questions resolved yet — re-run after resolution dates.")
else:
    # Make sure the split label is available for the per-split table.
    if "split" not in scored_df.columns:
        scored_df["split"] = scored_df["question_id"].map(split_map)

    rule = "=" * 70

    # Table 1: one row per forecaster, best (lowest) mean Brier first.
    print("\n" + rule)
    print("BRIER SCORES — overall (all resolved questions)")
    print(rule)
    overall = scored_df.groupby(["method", "model"])["brier_score"].agg(["mean", "std", "count"])
    overall = overall.rename(columns={"mean": "Mean", "std": "Std", "count": "N"})
    overall = overall.sort_values("Mean").round(4)
    print(overall)

    # Table 2: the same scores broken down by split.
    print("\n" + rule)
    print("BRIER SCORES — by split")
    print(rule)
    by_split_bs = scored_df.groupby(["split", "method", "model"])["brier_score"].agg(["mean", "count"])
    by_split_bs = by_split_bs.rename(columns={"mean": "Mean Brier", "count": "N"}).round(4)
    print(by_split_bs)

    # Kalshi NYC historical benchmark
    if not kalshi_nyc_history.empty:
        nyc_valid = kalshi_nyc_history.dropna(subset=["market_prob"])
        if len(nyc_valid) > 0:
            print("\n" + rule)
            print("KALSHI NYC HISTORICAL BENCHMARK (Brier by split)")
            print(rule)
            for split_name, split_rows in nyc_valid.groupby("split"):
                squared_error = (split_rows["market_prob"] - split_rows["outcome"]) ** 2
                print(f"  {split_name}: N={len(split_rows)}, Brier={squared_error.mean():.4f}")

# Hypothetical returns vs. Kalshi.  compute_returns() receives every forecast
# and keeps only rows that have an outcome, a probability and a Kalshi price.
# NOTE(review): no explicit split filter is applied here; if returns are meant
# to cover the test split only, confirm that market_probs is limited to it.
returns_df = compute_returns(all_forecasts, market_probs, outcomes, delta=0.10)
if len(returns_df) > 0 and returns_df["profit"].abs().sum() > 0:
    print("\n" + "=" * 70)
    print("HYPOTHETICAL RETURNS vs. Kalshi (delta=0.10)")
    print("=" * 70)
    # Summarise placed bets only.  NO_BET rows have zero profit, so keeping
    # them would inflate "N Bets" and dilute "Avg/Bet"; "Total P&L" is the same
    # either way.  Forecasters that never bet are therefore not listed.
    placed_bets = returns_df[returns_df["action"] != "NO_BET"]
    ret_sum = (placed_bets.groupby(["method", "model"])["profit"]
               .agg(["sum", "mean", "count"]).rename(
               columns={"sum": "Total P&L", "mean": "Avg/Bet", "count": "N Bets"})
               .sort_values("Total P&L", ascending=False).round(4))
    print(ret_sum)

# Section 5: Visualizations

In [None]:
# --- 5.1 Brier Score and Returns Charts ---

fig, axs = plt.subplots(1, 3, figsize=(18, 6))

# Plot 1: Mean Brier Score by forecaster (overall)
overall_ax = axs[0]
if len(scored_df) == 0:
    # Nothing scored yet: show a placeholder panel instead of an empty chart.
    overall_ax.text(0.5, 0.5, "No resolved questions yet", ha="center", va="center",
                    transform=overall_ax.transAxes)
    overall_ax.set_title("Mean Brier Score (pending)")
else:
    brier_by_method = scored_df.groupby(["model", "method"], as_index=False)["brier_score"].mean()
    sns.barplot(data=brier_by_method, x="model", y="brier_score", hue="method", ax=overall_ax)
    overall_ax.set_title("Mean Brier Score by Forecaster\n(all resolved questions, lower = better)")
    overall_ax.set_xlabel("")
    overall_ax.set_ylabel("Mean Brier Score")
    overall_ax.tick_params(axis="x", rotation=45)
    # Reference line: the score of always forecasting 50%.
    overall_ax.axhline(y=0.25, color="red", linestyle="--", alpha=0.5, label="No-skill (0.25)")
    overall_ax.legend(fontsize=7)

# Plot 2: Brier Score by split — shows train/validate/test generalisation gap
split_ax = axs[1]
if len(scored_df) == 0 or "split" not in scored_df.columns:
    # Nothing scored yet (or no split labels): show a placeholder panel.
    split_ax.text(0.5, 0.5, "No resolved questions yet", ha="center", va="center",
                  transform=split_ax.transAxes)
    split_ax.set_title("Brier by Split (pending)")
else:
    split_brier = scored_df.groupby(["split", "method"], as_index=False)["brier_score"].mean()
    # Keep the canonical train -> validate -> test order, skipping absent splits.
    splits_present = set(split_brier["split"])
    split_order = [name for name in ("train", "validate", "test") if name in splits_present]
    sns.barplot(data=split_brier, x="split", y="brier_score", hue="method",
                order=split_order, ax=split_ax)
    split_ax.set_title("Brier Score by Split\n(train/validate = in-training-data)")
    split_ax.set_xlabel("")
    split_ax.set_ylabel("Mean Brier Score")
    split_ax.axhline(y=0.25, color="red", linestyle="--", alpha=0.5)
    split_ax.legend(fontsize=7)

# Plot 3: Cumulative returns
if len(returns_df) > 0 and returns_df["profit"].abs().sum() > 0:
    # Plot placed bets only so the x-axis really is the bet number; NO_BET rows
    # have zero profit and would only stretch each curve with flat segments.
    placed_bets = returns_df[returns_df["action"] != "NO_BET"]
    for (model_name, method_name), group in placed_bets.groupby(["model", "method"]):
        label = f"{model_name} ({method_name})"
        # Running P&L in the row order of returns_df.
        cumulative = group["profit"].cumsum()
        axs[2].plot(range(len(cumulative)), cumulative, label=label, marker="o", markersize=3)
    axs[2].set_title("Cumulative Hypothetical Returns")
    axs[2].set_xlabel("Bet Number")
    axs[2].set_ylabel("Cumulative P&L ($)")
    axs[2].axhline(y=0, color="black", linestyle="-", linewidth=0.5)
    axs[2].legend(fontsize=6)
else:
    axs[2].text(0.5, 0.5, "No bets placed yet", ha="center", va="center",
                transform=axs[2].transAxes)
    axs[2].set_title("Cumulative Returns (pending)")

plt.tight_layout()
plt.show()

In [None]:
# --- 5.2 Calibration Plot ---

def _calibration_curve(probs, outcomes_arr, n_bins):
    """Bin forecasts into ``n_bins`` equal-width probability bins over [0, 1].

    Args:
        probs: NumPy array of predicted probabilities.
        outcomes_arr: NumPy array of outcomes aligned with ``probs``.
        n_bins: Number of equal-width bins.

    Returns:
        ``(bin_means, bin_freqs)``: two parallel lists with one entry per
        non-empty bin, holding the mean predicted probability and the mean
        outcome of the forecasts in that bin.
    """
    bins = np.linspace(0, 1, n_bins + 1)
    bin_means = []
    bin_freqs = []
    for i in range(n_bins):
        at_or_above_lower = probs >= bins[i]
        if i == n_bins - 1:
            # The last bin is closed on the right so that forecasts of
            # exactly 1.0 are counted instead of falling outside every bin.
            below_upper = probs <= bins[i + 1]
        else:
            below_upper = probs < bins[i + 1]
        mask = at_or_above_lower & below_upper
        if mask.any():
            bin_means.append(probs[mask].mean())
            bin_freqs.append(outcomes_arr[mask].mean())
    return bin_means, bin_freqs


def plot_calibration(scored_df, n_bins=5):
    """Plot calibration curves for each forecaster.

    One curve is drawn per ``(model, method)`` group: the x value of each
    point is the mean predicted probability in a bin, the y value is the
    observed outcome frequency in that bin.  The dashed diagonal marks
    perfect calibration.

    Args:
        scored_df: DataFrame with ``model``, ``method``, ``probability`` and
            ``outcome`` columns.
        n_bins: Number of equal-width probability bins over [0, 1].
    """
    fig, ax = plt.subplots(figsize=(8, 8))

    for (model, method), group in scored_df.groupby(["model", "method"]):
        bin_means, bin_freqs = _calibration_curve(
            group["probability"].values, group["outcome"].values, n_bins
        )
        # A forecaster whose forecasts fall in no bin gets no curve.
        if bin_means:
            ax.plot(bin_means, bin_freqs, marker="o", label=f"{model} ({method})")

    ax.plot([0, 1], [0, 1], "k--", label="Perfect calibration")
    ax.set_xlabel("Predicted Probability")
    ax.set_ylabel("Observed Frequency")
    ax.set_title("Calibration Plot")
    ax.legend(loc="lower right", fontsize=7)
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    plt.tight_layout()
    plt.show()


# Draw the calibration plot only when at least one forecast has been scored.
if len(scored_df) == 0:
    print("Calibration plot will be available once questions resolve.")
else:
    plot_calibration(scored_df)

In [None]:
# --- 5.3 Forecast Comparison Heatmap ---

def plot_forecast_heatmap(forecasts, questions):
    """Draw a question-by-forecaster heatmap of forecast probabilities.

    Rows are question ids and columns are forecasters, labelled
    ``"<model>\\n(<method>)"``.  Cells come from ``pivot_table`` with its
    default aggregation, so several rows for the same question and forecaster
    are averaged.

    Args:
        forecasts: DataFrame with ``question_id``, ``model``, ``method`` and
            ``probability`` columns.  It is not modified.
        questions: Not used by this function; kept in the signature for
            existing callers.
    """
    labelled = forecasts.assign(
        forecaster=forecasts["model"] + "\n(" + forecasts["method"] + ")"
    )
    pivot = labelled.pivot_table(
        index="question_id",
        columns="forecaster",
        values="probability",
    )

    # Half an inch per question, but never shorter than 8 inches.
    fig_height = max(8, len(pivot) * 0.5)
    fig, ax = plt.subplots(figsize=(16, fig_height))

    heatmap_options = {
        "annot": True,
        "fmt": ".2f",
        "cmap": "RdYlGn",
        "center": 0.5,
        "vmin": 0,
        "vmax": 1,
        "ax": ax,
        "cbar_kws": {"label": "Probability"},
    }
    sns.heatmap(pivot, **heatmap_options)

    ax.set_title("Forecast Comparison Heatmap")
    ax.set_ylabel("Question")
    ax.set_xlabel("")
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.show()


# Uses all_forecasts (not scored_df), so pending questions are shown as well.
plot_forecast_heatmap(all_forecasts, all_questions)

# Section 6: Discussion and Conclusions

## Key Questions

1. **Do prediction markets outperform LLMs?**
   - Compare Brier scores of Kalshi/Polymarket vs. vanilla LLMs vs. tool-augmented LLMs
   - Markets aggregate information from many participants; can a single LLM match this?

2. **Does tool access improve LLM forecasting?**
   - Compare vanilla vs. tool-augmented Brier scores for each model
   - Real-time data should help, but does the model use it effectively?

3. **Which LLM is the best forecaster?**
   - Rank GPT-5, Gemini, Claude by Brier score and returns
   - Does the ranking change between vanilla and tool-augmented conditions?

4. **Are there category-specific patterns?**
   - Fed rate questions may favor models with strong economic reasoning
   - Gas price questions may favor models with access to trend data

5. **Can any forecaster generate positive returns against the market?**
   - A positive total P&L suggests the forecaster identified genuine mispricings, although with a small sample it may also reflect luck
   - How sensitive are returns to the betting threshold delta?

## Limitations

- **Small sample size**: limited by the number of resolvable questions within the project timeframe
- **Single market snapshot**: prediction market prices were captured at one point in time (markets update continuously)
- **LLM training cutoffs**: models may lack recent economic data in their training data; tool augmentation is intended to close exactly this gap
- **Question design**: our questions may not perfectly overlap with existing prediction market contracts
- **Some questions may not resolve in time**: FOMC meetings later in 2026 won't have outcomes before the semester ends, so forecasts for those meetings cannot be scored

In [None]:
# --- 6.1 Summary Statistics ---

if len(scored_df) == 0:
    # Nothing has resolved yet: describe the raw forecast probabilities instead.
    print("=" * 70)
    print("FORECAST SUMMARY (questions not yet resolved)")
    print("=" * 70)
    prob_summary = all_forecasts.groupby(["split", "method", "model"])["probability"].agg(
        ["mean", "std", "count"])
    prob_summary.columns = ["Mean Prob", "Std Prob", "N"]
    print(prob_summary.round(4))
    print("\nBrier scores will be computed after resolution dates pass.")

else:
    rule = "=" * 70

    # Overall ranking: one row per forecaster, best (lowest) mean Brier first.
    summary_spec = {
        "brier_score": ["mean", "std"],
        "probability": ["mean", "std"],
        "question_id": "count",
    }
    summary = scored_df.groupby(["method", "model"]).agg(summary_spec).round(4)
    summary.columns = ["Mean Brier", "Std Brier", "Mean Prob", "Std Prob", "N"]
    summary = summary.sort_values("Mean Brier")

    print(rule)
    print("FINAL RESULTS: Forecaster Ranking by Brier Score (all resolved)")
    print(rule)
    print(summary)
    print("\nBaseline (always 0.5): Brier = 0.2500  |  Perfect: Brier = 0.0000")

    # Split breakdown
    if "split" in scored_df.columns:
        print("\n" + rule)
        print("TRAIN / VALIDATE / TEST BREAKDOWN")
        print("(train & validate = in-LLM-training-data; test = post-cutoff)")
        print(rule)
        split_summary = (scored_df.groupby(["split", "method", "model"])
                         .agg({"brier_score": ["mean", "count"]})
                         .round(4))
        split_summary.columns = ["Mean Brier", "N"]
        print(split_summary.sort_index())

        # Generalisation gap: test Brier vs. validate Brier.  A forecaster is
        # listed only when it has scored forecasts in both splits.
        print("\n--- Generalisation gap (test - validate Brier) ---")
        for (method, model), grp in scored_df.groupby(["method", "model"]):
            val_bs = grp[grp["split"] == "validate"]["brier_score"].mean()
            test_bs = grp[grp["split"] == "test"]["brier_score"].mean()
            if pd.isna(val_bs) or pd.isna(test_bs):
                continue
            print(f"  {model} ({method}): validate={val_bs:.4f}  test={test_bs:.4f}  gap={test_bs - val_bs:+.4f}")

    # Kalshi NYC historical benchmark
    if not kalshi_nyc_history.empty:
        nyc_v = kalshi_nyc_history.dropna(subset=["market_prob"])
        if len(nyc_v) > 0:
            print("\n" + rule)
            print("KALSHI NYC HISTORICAL MARKET ACCURACY")
            print(rule)
            for split_name, split_rows in nyc_v.groupby("split"):
                squared_error = (split_rows["market_prob"] - split_rows["outcome"]) ** 2
                print(f"  {split_name}: N={len(split_rows):4d}, Brier={squared_error.mean():.4f}")