# 01 — EDA: Raw Data Inspection

**Goal:** Validate that all 41 FOMC meetings load cleanly, inspect the canonical intraday bar format, check data coverage across sources, and surface any quality issues before building the pipeline.

**Sections:**
1. Setup
2. FOMC metadata
3. Policy rates
4. Intraday bars — structure & coverage
5. Sample bars around the announcement window
6. Coverage heatmap
7. Transcript availability

## 1. Setup

In [None]:
import sys
import pathlib

# Treat the parent of the current working directory as the project root and
# put it at the front of sys.path so that `src` can be imported.
PROJECT_ROOT = pathlib.Path.cwd().parent
if not any(entry == str(PROJECT_ROOT) for entry in sys.path):
    sys.path.insert(0, str(PROJECT_ROOT))

# Input and output directories; the output directory is created if missing.
DATA_RAW, DATA_CLEAN = (PROJECT_ROOT / folder for folder in ("data-raw", "data-clean"))
DATA_CLEAN.mkdir(exist_ok=True)

print(f"Project root : {PROJECT_ROOT}")
print(f"Data raw     : {DATA_RAW}")

In [None]:
import logging
import warnings

import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import pandas as pd
import seaborn as sns

from src.config import load_config
from src.ingest import (
    load_fomc_metadata,
    load_intraday_bars,
    load_policy_rates,
    load_transcripts,
    save_transcripts_json,
)
from src.clean import coverage_report, qa_intraday_bars, write_parquet

# Suppress FutureWarning messages and emit log records as "LEVEL: message".
warnings.filterwarnings("ignore", category=FutureWarning)
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

# Load configs/config.yaml from the project root and echo the settings
# this notebook relies on.
cfg = load_config(PROJECT_ROOT.joinpath("configs", "config.yaml"))
print("Config loaded. Pairs:", cfg.pairs)
print("Windows:", cfg.windows.statement, cfg.windows.digestion)

## 2. FOMC Metadata

In [None]:
# Read the FOMC meeting table from the raw-data directory, report how many
# rows it has, and display it as the cell output.
fomc = load_fomc_metadata(DATA_RAW)
n_meetings = len(fomc)
print(f"\n{n_meetings} FOMC meetings loaded")
fomc

In [None]:
# Persist the meeting table to the clean-data directory
write_parquet(fomc, DATA_CLEAN.joinpath("fomc_metadata.parquet"))

# Meeting identifiers as a plain list; print the first five and last three
meeting_ids = fomc["meeting_id"].tolist()
first_ids, last_ids = meeting_ids[:5], meeting_ids[-3:]
print(f"Meeting IDs: {first_ids} ... {last_ids}")

In [None]:
# Two side-by-side panels: rate path (left) and tally of policy actions (right)
fig, (ax_rate, ax_actions) = plt.subplots(1, 2, figsize=(13, 4))

# Left panel — rate midpoint at each announcement
ax_rate.plot(fomc["announcement_et"], fomc["midpoint"], marker="o", ms=4, lw=1.5)
ax_rate.set_title("Fed Funds Target Midpoint (%)")
ax_rate.set_xlabel("Meeting date")
ax_rate.set_ylabel("Rate (%)")
ax_rate.grid(True, alpha=0.3)

# Right panel — sum each indicator column to count hikes / cuts / holds
action_flags = {"Hike": "is_hike", "Cut": "is_cut", "Hold": "is_hold"}
action_counts = {action: fomc[flag].sum() for action, flag in action_flags.items()}
bars = ax_actions.bar(
    action_counts.keys(),
    action_counts.values(),
    color=["#d62728", "#2ca02c", "#7f7f7f"],
)
ax_actions.bar_label(bars)
ax_actions.set_title("FOMC Actions (2021–2026)")
ax_actions.set_ylabel("Count")
ax_actions.grid(True, alpha=0.3, axis="y")

plt.tight_layout()
plt.show()

In [None]:
# Votes and dissents: frequency of each `votes_against` value, ordered by value
print("Dissent count distribution:")
dissent_counts = fomc["votes_against"].value_counts().sort_index()
print(dissent_counts)

# Descriptive statistics of the non-missing `rate_change` values
print("\nRate changes (bps):")
rate_change_stats = fomc["rate_change"].dropna().describe()
print(rate_change_stats)

## 3. Policy Rates

In [None]:
# Load the policy-rate table and save a copy with the index moved into a
# regular column, then preview the last rows and the overall shape.
policy = load_policy_rates(DATA_RAW)
policy_flat = policy.reset_index()
write_parquet(policy_flat, DATA_CLEAN / "policy_rates.parquet")
print(policy.tail())
print(f"\nShape: {policy.shape}")

In [None]:
# Two stacked panels sharing the x-axis: policy-rate levels (top) and the
# Fed-minus-foreign spreads (bottom).
fig, axes = plt.subplots(2, 1, figsize=(13, 7), sharex=True)

# Policy rates
rate_cols = ["fed_rate", "ecb_rate", "boe_rate", "boc_rate", "boj_rate"]
labels = {"fed_rate": "Fed", "ecb_rate": "ECB", "boe_rate": "BoE", "boc_rate": "BoC", "boj_rate": "BoJ"}
for col in rate_cols:
    axes[0].plot(policy.index, policy[col], label=labels[col], lw=1.5)
axes[0].set_title("Central Bank Policy Rates (%)")
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Spreads
spread_cols = ["fed_minus_ecb", "fed_minus_boe", "fed_minus_boc", "fed_minus_boj"]
for col in spread_cols:
    # Build the legend entry from the same display names as the top panel.
    # Upper-casing the whole label would render "FED − BOE" instead of
    # "Fed − BoE".
    foreign_cb = labels[col.replace("fed_minus_", "") + "_rate"]
    axes[1].plot(policy.index, policy[col], label=f"Fed − {foreign_cb}", lw=1.5)
axes[1].axhline(0, color="k", lw=0.8, ls="--")  # zero-spread reference line
axes[1].set_title("Fed Funds Spread vs. Foreign CBs (Fed minus)")
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 4. Intraday Bars — Structure & Coverage

In [None]:
# Load ALL sources for all meetings — may take ~30 seconds
bars_raw = load_intraday_bars(DATA_RAW, meeting_ids)

# Row count and the distinct values of the `source` column, then a preview
print(f"Raw bars: {len(bars_raw):,} rows")
sources_present = bars_raw["source"].unique().tolist()
print(f"Sources: {sources_present}")
bars_raw.head()

In [None]:
# Run the QA step; it returns the cleaned bars plus a dict-like report
bars_clean, qa_report = qa_intraday_bars(bars_raw)

# Print every report entry except the gap details, which are listed separately
print("QA Report:")
for key, value in qa_report.items():
    if key == "large_gap_details":
        continue
    print(f"  {key}: {value}")

# List each recorded large gap on its own line, if there are any
large_gaps = qa_report["large_gap_details"]
if large_gaps:
    print("\nLarge gaps (>10 min):")
    for gap in large_gaps:
        print(f"  {gap}")

In [None]:
# Write the cleaned bars to the clean-data directory
write_parquet(bars_clean, DATA_CLEAN.joinpath("intraday_bars.parquet"))

In [None]:
# Bar counts per source: non-null `close` values and distinct meetings
per_source = bars_clean.groupby("source").agg(
    n_bars=("close", "count"),
    n_meetings=("meeting_id", "nunique"),
)

# Flatten the group key into a column and add the per-meeting average
source_summary = per_source.reset_index()
source_summary["avg_bars_per_meeting"] = (
    source_summary["n_bars"] / source_summary["n_meetings"]
).round(1)
print(source_summary.to_string(index=False))

## 5. Sample Bars Around the Announcement Window

In [None]:
# Take the first meeting in the list and inspect its USDEUR bars on the
# announcement date
import pytz
from datetime import datetime

ET = pytz.timezone("America/New_York")

sample_mid = meeting_ids[0]  # 20210127
sample_pair = "USDEUR"

# Calendar date of the chosen meeting's announcement timestamp
ann_date = fomc.loc[fomc["meeting_id"] == sample_mid, "announcement_et"].iloc[0].date()

# Bars for this source and meeting, in timestamp order
sample = bars_clean.loc[
    bars_clean["source"].eq(sample_pair) & bars_clean["meeting_id"].eq(sample_mid)
].sort_values("timestamp_et")

# Keep only bars whose timestamp falls on the announcement date
sample = sample.loc[sample["timestamp_et"].dt.date == ann_date]

print(f"Meeting {sample_mid} | {sample_pair} | {len(sample)} bars on {ann_date}")
sample[["timestamp_et", "open", "high", "low", "close"]].head(10)

In [None]:
def _et_on_ann_date(hhmm):
    """Return the ET-localized datetime for clock time *hhmm* ("HH:MM") on ann_date."""
    clock = datetime.strptime(hhmm, "%H:%M").time()
    return ET.localize(datetime.combine(ann_date, clock))


# Announcement timestamp of the sample meeting and the fixed window boundaries
ann_et = fomc.loc[fomc["meeting_id"] == sample_mid, "announcement_et"].iloc[0]
stmt_start, stmt_end, dig_end = (_et_on_ann_date(t) for t in ("14:00", "14:30", "16:00"))

fig, ax = plt.subplots(figsize=(13, 4))
ax.plot(sample["timestamp_et"], sample["close"], lw=1.5, label=sample_pair)

# Shade the two prediction windows and mark the announcement time
ax.axvspan(stmt_start, stmt_end, alpha=0.2, color="steelblue",  label="Statement window")
ax.axvspan(stmt_end,   dig_end,  alpha=0.1, color="darkorange", label="Digestion window")
ax.axvline(ann_et, color="red", lw=1.2, ls="--", label=f"Announcement {ann_et.strftime('%H:%M')} ET")

ax.set_title(f"{sample_pair} — {sample_mid} (ET)")
ax.set_xlabel("Time (ET)")
ax.set_ylabel("Close (USD per EUR)")
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 6. Coverage Heatmap

In [None]:
# Build the coverage table from the cleaned bars and the meeting metadata
cov = coverage_report(bars_clean, fomc)

# Everything except the `announcement_et` column is treated as a source column
source_cols = [name for name in cov.columns if name != "announcement_et"]
cov_matrix = cov.loc[:, source_cols].copy()

n_rows, n_sources = len(cov), len(source_cols)
print(f"Coverage table ({n_rows} meetings × {n_sources} sources)")
print("Expected bars per meeting ≈ 288 (FX/treasury, 24h), 78 (SPX/VIX, ~6.5h)")
cov_matrix.describe().round(0)

In [None]:
# Normalise by expected bars per source for the heatmap:
# 78 for SPX and VIX, 288 for every other source.
short_session = ("SPX", "VIX")
expected = {s: (78 if s in short_session else 288) for s in source_cols}

# Fraction of expected bars per meeting and source, capped to [0, 1]
cov_pct = cov_matrix.assign(
    **{s: (cov_matrix[s] / expected[s]).clip(0, 1) for s in source_cols}
)

# Colour encodes the fraction; the annotation shows the raw bar count
heatmap_style = dict(
    cmap="YlGn",
    vmin=0, vmax=1,
    linewidths=0.3,
    annot=cov_matrix,
    fmt="d",
    annot_kws={"size": 7},
    cbar_kws={"label": "Coverage (fraction of expected bars)"},
)
fig, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(cov_pct, ax=ax, **heatmap_style)
ax.set_title("Intraday Bar Coverage (bar count in cell, colour = % of expected)")
ax.set_xlabel("Source")
ax.set_ylabel("Meeting ID")
plt.tight_layout()
plt.show()

In [None]:
# Meetings with any source below 50% coverage (row-wise minimum under 0.5)
worst_source_cov = cov_pct.min(axis=1)
low_cov = cov_pct.loc[worst_source_cov.lt(0.5)]
if not low_cov.empty:
    print(f"Meetings with low coverage ({len(low_cov)}):")
    print(low_cov)
else:
    print("✓ No meetings with <50% coverage in any source")

## 7. Transcript Availability

In [None]:
import os

# Directories holding the statement and press-conference PDFs
transcripts_root = DATA_RAW / "fomc-transcripts"
stmt_dir = transcripts_root / "statements"
pc_dir = transcripts_root / "press_conf"

# Strip the fixed file-name parts to recover the ID embedded in each PDF name
stmt_ids = set()
for pdf in stmt_dir.glob("monetary*.pdf"):
    stmt_ids.add(pdf.stem.replace("monetary", "").replace("a1", ""))

pc_ids = set()
for pdf in pc_dir.glob("FOMCpresconf*.pdf"):
    pc_ids.add(pdf.stem.replace("FOMCpresconf", ""))

print(f"Statement PDFs  : {len(stmt_ids)} (expected 41)")
print(f"Press conf PDFs : {len(pc_ids)} (expected 41)")

# Meeting IDs that have no matching PDF in each directory
missing_stmt = set(meeting_ids).difference(stmt_ids)
missing_pc = set(meeting_ids).difference(pc_ids)

if missing_stmt:
    print(f"Missing statements  : {sorted(missing_stmt)}")
else:
    print("✓ All statement PDFs present")

if missing_pc:
    print(f"Missing press confs : {sorted(missing_pc)}")
else:
    print("✓ All press conf PDFs present")

In [None]:
# Extract transcripts and save to JSON (slow: ~60s for 82 PDFs).
# If the JSON file already exists, load it instead of extracting again.
transcript_json = DATA_CLEAN / "transcripts.json"
if transcript_json.exists():
    print(f"Transcripts already saved at {transcript_json}")
    from src.ingest import load_transcripts_json
    transcripts = load_transcripts_json(transcript_json)
else:
    print("Extracting transcripts (this takes ~60 seconds) ...")
    transcripts = load_transcripts(DATA_RAW, meeting_ids)
    save_transcripts_json(transcripts, transcript_json)

In [None]:
# Check lengths — longer transcripts = more content for text features.
# Meetings without a transcript entry are skipped.
lengths = {}
for mid in meeting_ids:
    if mid not in transcripts:
        continue
    docs = transcripts[mid]
    lengths[mid] = {
        "stmt_chars": len(docs["statement"]),
        "pc_chars": len(docs["press_conf"]),
    }
len_df = pd.DataFrame(lengths).T  # one row per meeting

# One bar chart per document type, bars in row order
panels = (
    ("stmt_chars", "Statement length (chars)"),
    ("pc_chars", "Press conf length (chars)"),
)
fig, axes = plt.subplots(1, 2, figsize=(13, 4))
positions = range(len(len_df))
for ax, (col, title) in zip(axes, panels):
    ax.bar(positions, len_df[col], width=0.8)
    ax.set_title(title)
    ax.set_xlabel("Meeting index")
    ax.grid(True, alpha=0.3, axis="y")

plt.suptitle("FOMC Transcript Lengths", y=1.01)
plt.tight_layout()
plt.show()

print("\nSummary:")
length_stats = len_df.describe().round(0).astype(int)
print(length_stats)

In [None]:
# Peek at one statement: the first meeting's text, truncated to 500 characters
first_mid = meeting_ids[0]
sample_stmt = transcripts[first_mid]["statement"]
print(f"--- Statement {first_mid} (first 500 chars) ---")
print(sample_stmt[:500])

## Summary

| Check | Result |
|---|---|
| FOMC meetings loaded | 41 |
| Clean bar rows | see QA report above |
| Sources with full coverage | see heatmap above |
| Statement PDFs | 41 |
| Press conf PDFs | 41 |

**Next:** `02_eda_targets.ipynb` — compute log-returns for each meeting × pair × window.