# 02 — EDA: Target Construction

**Goal:** Build the panel of log-returns (41 meetings × 4 pairs × 2 windows = 328 rows), inspect the return distributions, compute the predict-zero baseline, and confirm there is no data leakage.

**Sections:**
1. Setup & load clean data
2. Compute targets
3. Panel completeness
4. Return distributions
5. Time-series of returns
6. Cross-pair correlations
7. Predict-zero baseline
8. Save targets

## 1. Setup

In [None]:
import sys
import pathlib

# The project root is taken to be the parent of the current working directory,
# i.e. this assumes the notebook is launched from a first-level subfolder.
PROJECT_ROOT = pathlib.Path.cwd().parent
# Put the root on sys.path so the `src.*` imports further down resolve.
# This must run before those imports, which is why they are not at the top.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# Folder the cleaned parquet inputs are read from and the targets are saved to.
DATA_CLEAN = PROJECT_ROOT / "data-clean"

import logging
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from src.config import load_config
from src.clean import read_parquet, write_parquet
from src.targets import compute_targets, windows_from_config

logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
# NOTE(review): this silences every warning category for the whole session,
# including deprecation and numerical warnings raised by the cells below.
# Consider narrowing the filter to the specific categories that are noisy.
warnings.filterwarnings("ignore")

# Project configuration; later cells read `cfg.pairs` and derive the event
# windows from it via `windows_from_config`.
cfg = load_config(PROJECT_ROOT / "configs" / "config.yaml")

In [None]:
# Inputs are the cleaned parquet files produced by notebook 01.
fomc, bars = (
    read_parquet(DATA_CLEAN / filename)
    for filename in ("fomc_metadata.parquet", "intraday_bars.parquet")
)

# No timezone fix-up is done in this cell: timestamps are used exactly as read
# (assumed tz-aware after the parquet round-trip -- TODO confirm in src.clean).
# Quick size / provenance summary of what was loaded.
print(f"FOMC meetings : {len(fomc)}")
print(f"Bar rows      : {len(bars):,}")
print(f"Sources       : {bars['source'].unique().tolist()}")

## 2. Compute Targets

In [None]:
# Event windows are derived from the project configuration.
windows = windows_from_config(cfg)
print("Windows:", windows)

# Gather the inputs once, then build the target panel. At most one row per
# (meeting, pair, window) combination is expected, as the print below states.
target_inputs = {
    "bars": bars,
    "fomc_meta": fomc,
    "pairs": cfg.pairs,
    "windows": windows,
}
panel = compute_targets(**target_inputs)

print(f"\nPanel shape: {panel.shape}  (max expected: {len(fomc)} × {len(cfg.pairs)} × 2 = {len(fomc)*len(cfg.pairs)*2})")
panel.head(8)

## 3. Panel Completeness

In [None]:
# Per (pair, window): number of cells flagged `has_data`, number of cells in
# total, and the resulting share of valid cells in percent.
completeness = (
    panel.groupby(["pair", "window"])["has_data"]
    .agg(n_valid="sum", n_total="count")
    .assign(pct_valid=lambda df: (df["n_valid"] / df["n_total"] * 100).round(1))
)
print(completeness.to_string())

In [None]:
# List every panel row that is flagged as having no data.
report_cols = ["meeting_id", "pair", "window", "n_bars"]
missing = panel.loc[~panel["has_data"], report_cols]
if not missing.empty:
    print(f"Missing cells ({len(missing)}):")
    print(missing.to_string(index=False))
else:
    print("✓ No missing target cells")

## 4. Return Distributions

In [None]:
# Keep only rows with data; work on a copy so `panel` itself is not modified.
valid = panel.loc[panel["has_data"]].copy()
valid["log_ret_bps"] = valid["log_ret"].mul(10_000)  # log-return in basis points

print("Log-return summary (bps):")
returns_by_cell = valid.groupby(["pair", "window"])["log_ret_bps"]
print(returns_by_cell.describe().round(2))

In [None]:
# Histogram of log-returns (bps) for every window (rows) × pair (columns).
# `pairs`, `window_names` and `colors` are reused by later plotting cells.
pairs = cfg.pairs
window_names = ["statement", "digestion"]
colors = {"statement": "steelblue", "digestion": "darkorange"}

# Size the grid from the data instead of hard-coding 2 × 4, so a config with a
# different number of pairs neither overflows the grid nor leaves blank panels.
# squeeze=False keeps `axes` two-dimensional even for a single row or column.
n_rows, n_cols = len(window_names), len(pairs)
fig, axes = plt.subplots(
    n_rows,
    n_cols,
    figsize=(4 * n_cols, 3.5 * n_rows),  # (16, 7) for the usual 2 × 4 layout
    sharey=False,
    squeeze=False,
)

for row_idx, window_name in enumerate(window_names):
    for col_idx, pair in enumerate(pairs):
        ax = axes[row_idx, col_idx]
        in_cell = (valid["pair"] == pair) & (valid["window"] == window_name)
        data = valid.loc[in_cell, "log_ret_bps"]
        ax.hist(data, bins=15, color=colors[window_name], alpha=0.7, edgecolor="white")
        ax.axvline(0, color="red", lw=1.2, ls="--")  # zero-return reference
        if not data.empty:
            # Only mark the mean when there is something to average; an empty
            # cell would otherwise yield a NaN line and an empty legend.
            mean_bps = data.mean()
            ax.axvline(mean_bps, color="black", lw=1, ls="-", label=f"mean={mean_bps:.1f}")
            ax.legend(fontsize=8)
        ax.set_title(f"{pair} | {window_name}")
        ax.set_xlabel("Log-return (bps)")
        ax.grid(True, alpha=0.3)

plt.suptitle("FX Log-Return Distributions by Pair × Window", y=1.01, fontsize=13)
plt.tight_layout()
plt.show()

In [None]:
# One violin panel per window, with the pairs side by side on the x-axis.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5), sharey=False)

violin_style = {"x": "pair", "y": "log_ret_bps", "palette": "Set2", "inner": "quart"}
for window_name, ax in zip(window_names, axes):
    in_window = valid["window"] == window_name
    sns.violinplot(data=valid.loc[in_window], ax=ax, **violin_style)
    ax.axhline(0, color="red", lw=1.2, ls="--")  # zero-return reference
    ax.set(
        title=f"Window: {window_name}",
        xlabel="Pair",
        ylabel="Log-return (bps)",
    )
    ax.grid(True, alpha=0.3, axis="y")

plt.suptitle("Return Distribution by Pair", y=1.01)
plt.tight_layout()
plt.show()

## 5. Time-Series of Returns

In [None]:
# Bar chart of log-returns (bps) per meeting: one stacked panel per pair,
# one colour per window (`pairs` and `colors` come from the histogram cell).
# squeeze=False keeps `axes` a 2-D array even when there is a single pair;
# without it plt.subplots returns a bare Axes and zip()/axes[-1] below fail.
fig, axes = plt.subplots(len(pairs), 1, figsize=(14, 10), sharex=True, squeeze=False)
pair_axes = axes[:, 0]  # the single column of panels, top to bottom

for ax, pair in zip(pair_axes, pairs):
    for window_name, color in colors.items():
        in_cell = (valid["pair"] == pair) & (valid["window"] == window_name)
        sub = valid.loc[in_cell].sort_values("announcement_et")
        # NOTE(review): both windows are drawn with the same width at their
        # `announcement_et` values, so bars presumably overlap per meeting;
        # alpha keeps the one underneath visible.
        ax.bar(
            sub["announcement_et"],
            sub["log_ret_bps"],
            width=20,
            color=color,
            alpha=0.7,
            label=window_name,
        )
    ax.axhline(0, color="black", lw=0.8)
    ax.set_ylabel(f"{pair}\n(bps)")
    ax.grid(True, alpha=0.3, axis="y")
    ax.legend(loc="upper right", fontsize=8)

# The x-axis is shared, so only the bottom panel gets a label.
pair_axes[-1].set_xlabel("Meeting date")
plt.suptitle("FX Log-Returns at Each FOMC Meeting", y=1.01, fontsize=13)
plt.tight_layout()
plt.show()

## 6. Cross-Pair Correlations

In [None]:
# Correlation of returns across pairs, computed separately for each window.
fig, axes = plt.subplots(1, 2, figsize=(13, 5))

heatmap_style = dict(
    cmap="RdBu_r", center=0,
    annot=True, fmt=".2f", vmin=-1, vmax=1,
    linewidths=0.5, cbar=False,
)
for window_name, ax in zip(window_names, axes):
    in_window = valid.loc[valid["window"] == window_name]
    # Reshape to one row per meeting and one column per pair before correlating.
    by_meeting = in_window.pivot(index="meeting_id", columns="pair", values="log_ret_bps")
    sns.heatmap(by_meeting.corr(), ax=ax, **heatmap_style)
    ax.set_title(f"Return correlations — {window_name}")

plt.suptitle("Cross-Pair Log-Return Correlations (same window)", y=1.01)
plt.tight_layout()
plt.show()

## 7. Predict-Zero Baseline

In [None]:
# Predict-zero baseline per (pair, window). With a forecast of 0 the error
# equals the realised return, so MAE = mean(|return|) and RMSE = sqrt(mean(return^2)).
# `dir_acc_pct` is the percentage of rows whose `direction` equals 1.


def _mean_abs(returns):
    """Mean absolute value of a series of returns."""
    return returns.abs().mean()


def _root_mean_square(returns):
    """Square root of the mean squared return."""
    return returns.pow(2).mean() ** 0.5


def _share_direction_up(direction):
    """Percentage of entries equal to 1."""
    return direction.eq(1).mean() * 100


grouped = valid.groupby(["pair", "window"])
baselines = grouped.agg(
    MAE_bps=("log_ret_bps", _mean_abs),
    RMSE_bps=("log_ret_bps", _root_mean_square),
    dir_acc_pct=("direction", _share_direction_up),  # P(USD up)
    n=("log_ret_bps", "count"),
).round(2)

print("Predict-zero baseline (MAE, RMSE in bps; dir_acc = % meetings with USD appreciation):")
print(baselines.to_string())

In [None]:
# Direction balance per (pair, window) -- ideally close to 50/50, i.e. no
# systematic bias towards one direction.
# value_counts(normalize=True) yields fractions in [0, 1]; scale them to
# percent so the numbers match the "pct" column name, the "% of meetings"
# header below, and the 0-100 scale of `dir_acc_pct` in the baseline table.
direction_balance = (
    valid.groupby(["pair", "window"])["direction"]
    .value_counts(normalize=True)
    .mul(100)
    .rename("pct")
    .reset_index()
)

# Wide layout: one column per direction value. A direction that never occurs
# for a (pair, window) has no row in the long table, so show it as 0 % rather
# than NaN.
balance_table = (
    direction_balance
    .pivot(index=["pair", "window"], columns="direction", values="pct")
    .fillna(0.0)
    .round(1)
)
print("\nDirection balance (% of meetings):")
print(balance_table)

## 8. Save Targets

In [None]:
# Persist the full panel (rows without data included) to the clean-data folder.
targets_path = DATA_CLEAN / "targets.parquet"
write_parquet(panel, targets_path)

print(f"Saved {len(panel)} rows to data-clean/targets.parquet")
print(f"Valid rows: {panel['has_data'].sum()} / {len(panel)}")

## Summary

| Metric | Value |
|---|---|
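| Panel rows (total) | 328 expected (41 meetings × 4 pairs × 2 windows); actual count printed in Section 8 |
| Valid rows | see the completeness table (Section 3) and the save output (Section 8) |
| Typical MAE (predict-zero) | see the baseline table in Section 7 |
| Direction balance | see the direction-balance table in Section 7 |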

**Next:** `03_eda_features.ipynb` — build structured features (rungs 1–3) and check for leakage/missingness.