# 04 — EDA: Text Feature Scoring

**Goal:** Run all three stages of text feature engineering and validate signal quality.

**Sections:**
1. Setup
2. Stage 4a — Keyword scoring
3. Stage 4b — LLM rubric scoring
4. Stage 4c — Embeddings + PCA
5. Sanity checks
6. Save

## 1. Setup

In [None]:
import sys
import pathlib

# The project root is taken to be the parent of the current working directory,
# i.e. the notebook is expected to be launched from a folder one level below it.
PROJECT_ROOT = pathlib.Path.cwd().parent
# Put the root at the front of sys.path (once) so `src.*` imports resolve.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# Directory layout used by the rest of the notebook.
DATA_CLEAN = PROJECT_ROOT / "data-clean"
OUTPUTS    = PROJECT_ROOT / "outputs"
PROMPTS    = OUTPUTS / "prompts"
FIGURES    = OUTPUTS / "figures"

import logging
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from src.config import load_config
from src.clean import read_parquet, write_parquet
from src.ingest import load_transcripts_json
from src.features_text import (
    score_transcripts_keywords,
    score_transcripts_llm,
    make_embeddings,
    build_text_features,
)

# INFO-level logging with a compact "LEVEL: message" format.
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
# NOTE(review): this silences every warning category for the whole session,
# including ones raised by numpy/pandas during the sanity checks below.
# Consider narrowing the filter if a check behaves unexpectedly.
warnings.filterwarnings("ignore")

# Load the project configuration and echo which LLM the rubric stage will use.
cfg = load_config(PROJECT_ROOT / "configs" / "config.yaml")
print(f"LLM model: {cfg.text.llm_model}")

In [None]:
# Read the cleaned meeting metadata and targets, plus the transcript JSON.
fomc, targets = (
    read_parquet(DATA_CLEAN / parquet_name)
    for parquet_name in ("fomc_metadata.parquet", "targets.parquet")
)
transcripts = load_transcripts_json(DATA_CLEAN / "transcripts.json")

# Meeting ids, in metadata row order, drive every scoring stage below.
meeting_ids = fomc["meeting_id"].tolist()
print(
    f"Meetings: {len(meeting_ids)}, transcripts loaded: {len(transcripts)}"
)

## 2. Stage 4a — Keyword Scoring

In [None]:
# Keyword-based scores for the loaded transcripts and meeting ids; later cells
# read the `meeting_id`, `text_source`, `net_hawkish` and `net_hawkish_norm`
# columns from the result.
kw_scores = score_transcripts_keywords(transcripts, meeting_ids)
# Preview the first rows as the cell output.
kw_scores.head(8)

In [None]:
# One bar chart per text source showing net_hawkish for each meeting in
# announcement order. Bars are red when net_hawkish > 0 and green otherwise
# (so an exactly-zero score is drawn green).
fig, axes = plt.subplots(1, 2, figsize=(14, 4))
announce_times = fomc[["meeting_id", "announcement_et"]]

for text_source, ax in zip(["statement", "press_conf"], axes):
    by_source = (
        kw_scores[kw_scores["text_source"] == text_source]
        .copy()
        .merge(announce_times, on="meeting_id")
        .sort_values("announcement_et")
    )
    net = by_source["net_hawkish"]
    bar_colors = ["#d62728" if value > 0 else "#2ca02c" for value in net]

    # x positions are simple ordinal indices, not dates.
    ax.bar(range(len(by_source)), net, color=bar_colors)
    ax.axhline(0, color="k", lw=0.8)
    ax.set_title(f"Keyword net_hawkish — {text_source}")
    ax.set_xlabel("Meeting index")
    ax.set_ylabel("Hawkish − Dovish count")
    ax.grid(True, alpha=0.3, axis="y")

plt.suptitle("Keyword-Based Hawkish/Dovish Scores", y=1.01)
plt.tight_layout()
plt.show()

## 3. Stage 4b — LLM Rubric Scoring

In [None]:
# Reuse previously saved LLM scores when the cache file exists so that
# re-running the notebook does not repeat the API calls.
# NOTE(review): the cache path does not include the model name or prompt
# version, so changing either in the config will still load the old scores —
# delete the cache file to force a re-score.
llm_cache = DATA_CLEAN / "llm_scores_cache.parquet"

if not llm_cache.exists():
    # NOTE(review): the "82" in this message is hard-coded; confirm it still
    # matches the number of texts being scored.
    print(f"Running LLM scoring (82 API calls via {cfg.text.llm_model}) ...")
    rubric_prompt = PROMPTS / f"rubric_{cfg.text.prompt_version}.txt"
    llm_scores = score_transcripts_llm(
        transcripts=transcripts,
        meeting_ids=meeting_ids,
        prompt_path=rubric_prompt,
        llm_model=cfg.text.llm_model,
        log_dir=OUTPUTS,
    )
    write_parquet(llm_scores, llm_cache)
    print(f"Cached to {llm_cache}")
else:
    print(f"Loading cached LLM scores from {llm_cache}")
    llm_scores = read_parquet(llm_cache)

llm_scores.head(6)

In [None]:
# The seven rubric dimensions produced by the LLM scoring stage.
rubric_cols = [
    "hawkish_dovish",
    "inflation_focus",
    "labor_focus",
    "recession_risk",
    "uncertainty_score",
    "forward_guidance_strength",
    "balance_sheet_mention",
]

# Summary statistics per text source, rounded for readability.
rubric_summary = llm_scores.groupby("text_source")[rubric_cols].describe()
print("LLM score distributions:")
print(rubric_summary.round(2))

In [None]:
# 2 x 4 grid of histograms: one row per text source, one panel per rubric
# dimension. With 7 dimensions the 8th panel in each row is left unused.
fig, axes = plt.subplots(2, 4, figsize=(16, 7))
rubric_display = rubric_cols[:7]  # 7 dimensions

for text_source, row_axes in zip(["statement", "press_conf"], axes):
    source_scores = llm_scores[llm_scores["text_source"] == text_source]

    for col, ax in zip(rubric_display, row_axes):
        values = source_scores[col].dropna()
        ax.hist(values, bins=10, edgecolor="white", color="steelblue")
        ax.set_title(f"{col}\n({text_source})", fontsize=8)
        ax.set_xlabel("Score")
        ax.grid(True, alpha=0.3)

    # Hide the trailing panel when there are more slots than dimensions.
    spare_panels = len(row_axes) - len(rubric_display)
    if spare_panels > 0:
        row_axes[-1].set_visible(False)

plt.suptitle("LLM Rubric Score Distributions", y=1.01)
plt.tight_layout()
plt.show()

In [None]:
# Timeline per text source: LLM hawkish_dovish score (line, left axis) against
# the meeting's rate change (bars, right axis), ordered by announcement time.
fig, axes = plt.subplots(2, 1, figsize=(14, 7), sharex=True)
meeting_info = fomc[["meeting_id", "announcement_et", "rate_change"]]

for text_source, ax in zip(["statement", "press_conf"], axes):
    timeline = (
        llm_scores[llm_scores["text_source"] == text_source]
        .copy()
        .merge(meeting_info, on="meeting_id")
        .sort_values("announcement_et")
    )
    when = timeline["announcement_et"]

    ax2 = ax.twinx()
    ax.plot(when, timeline["hawkish_dovish"], "b-o", ms=4, label="hawkish_dovish (LLM)")
    # The axis is labelled in bps; presumably rate_change is stored in
    # percentage points, hence the x100 — confirm against the metadata.
    ax2.bar(when, timeline["rate_change"] * 100, width=20, alpha=0.4, color="orange", label="Rate change (bps)")

    ax.set_ylabel("hawkish_dovish score", color="b")
    ax2.set_ylabel("Rate change (bps)", color="orange")
    ax.set_title(f"{text_source}: LLM score vs rate change")
    ax.legend(loc="upper left")
    ax2.legend(loc="upper right")
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 4. Stage 4c — Embeddings + PCA

In [None]:
# Reuse cached embedding components when available; otherwise compute them
# and write the cache for the next run.
emb_cache = DATA_CLEAN / "embeddings_cache.parquet"

if not emb_cache.exists():
    print("Computing embeddings (requires sentence-transformers) ...")
    emb_scores = make_embeddings(
        transcripts=transcripts,
        meeting_ids=meeting_ids,
        n_components_min=cfg.text.embedding_dim_min,
        n_components_max=cfg.text.embedding_dim_max,
        figures_dir=FIGURES,
    )
    write_parquet(emb_scores, emb_cache)
    print(f"Cached to {emb_cache}")
else:
    print(f"Loading cached embeddings from {emb_cache}")
    emb_scores = read_parquet(emb_cache)

# Component columns are identified by their "emb_pc_" prefix.
emb_cols = [name for name in emb_scores.columns if name.startswith("emb_pc_")]
print(f"Embedding components: {len(emb_cols)} per text_source")
emb_scores.head(4)

In [None]:
# Display the saved PCA variance figure for each text source, or say where it
# was expected when the file is missing.
for text_source in ("statement", "press_conf"):
    fig_path = FIGURES / f"pca_variance_{text_source}.png"
    if not fig_path.exists():
        print(f"PCA plot not found at {fig_path} (run embedding stage above)")
        continue

    # Imported lazily so IPython is only needed when there is a figure to show.
    from IPython.display import Image, display
    display(Image(str(fig_path)))

## 5. Sanity Checks

In [None]:
# Key sanity check: on statements, the LLM hawkish_dovish score should move
# with the keyword net_hawkish score.
is_stmt_kw = kw_scores["text_source"] == "statement"
is_stmt_llm = llm_scores["text_source"] == "statement"
stmt_kw = kw_scores.loc[is_stmt_kw, ["meeting_id", "net_hawkish", "net_hawkish_norm"]]
stmt_llm = llm_scores.loc[is_stmt_llm, ["meeting_id", "hawkish_dovish"]]
sanity = stmt_kw.merge(stmt_llm, on="meeting_id")

corr_matrix = sanity[["net_hawkish", "hawkish_dovish"]].corr()
corr_kw_llm = corr_matrix.loc["net_hawkish", "hawkish_dovish"]
print(f"Keyword net_hawkish vs LLM hawkish_dovish correlation: r = {corr_kw_llm:.3f}")

# Thresholds: > 0.3 agreement, (0, 0.3] weak, everything else (including a
# NaN correlation, which fails both comparisons) is reported as a failure.
if corr_kw_llm > 0.3:
    verdict = "✓ Positive correlation — LLM and keyword scores broadly agree"
elif corr_kw_llm > 0:
    verdict = "⚠ Weak positive correlation — check a few low-agreement meetings"
else:
    verdict = "✗ Negative or zero correlation — review LLM prompt or word lists"
print(verdict)

In [None]:
# Scatter of keyword net_hawkish vs LLM hawkish_dovish with a least-squares
# trend line.
#
# Keep only rows where BOTH scores are present. Dropping NaNs from each column
# separately would either hand np.polyfit arrays of different lengths (it
# raises) or, when the counts happen to match, pair values that belong to
# different meetings.
paired = sanity[["net_hawkish", "hawkish_dovish"]].dropna()

fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(paired["net_hawkish"], paired["hawkish_dovish"], alpha=0.7, s=50)

# A straight-line fit needs at least two distinct x values; otherwise show the
# scatter alone instead of failing or drawing a meaningless line.
if paired["net_hawkish"].nunique() >= 2:
    z = np.polyfit(paired["net_hawkish"], paired["hawkish_dovish"], 1)
    p = np.poly1d(z)
    x_line = np.linspace(paired["net_hawkish"].min(), paired["net_hawkish"].max(), 50)
    ax.plot(x_line, p(x_line), "r--", lw=1.5)

ax.set_title(f"Keyword vs LLM Hawkish Score (r={corr_kw_llm:.2f})")
ax.set_xlabel("Keyword net_hawkish")
ax.set_ylabel("LLM hawkish_dovish")
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Sanity check: hawkish_dovish should correlate POSITIVELY with the 2Y yield
# change (hawkish = market expects tighter policy = 2Y yield rises AFTER the
# announcement).
# There is no 2Y target available here, so the statement-level score is
# compared against the fomc rate_change column as a proxy instead.
# Rows with a missing value in any merged column are dropped before correlating.
sanity2 = stmt_llm.merge(fomc[["meeting_id", "rate_change"]], on="meeting_id").dropna()
corr_hd_rc = sanity2[["hawkish_dovish", "rate_change"]].corr().iloc[0, 1]
print(f"hawkish_dovish vs rate_change correlation: r = {corr_hd_rc:.3f}")
print("Expected: positive (hawkish meetings tend to be hike meetings)")

In [None]:
# Spread check — a rubric dimension with zero spread cannot tell meetings
# apart. The value printed is the standard deviation per dimension (statement
# rows only), which is zero exactly when the variance is zero.
print("LLM score variance (should be >0 for all dims):")
is_statement = llm_scores["text_source"] == "statement"
stmt_llm_scores = llm_scores[is_statement]
per_dim_std = stmt_llm_scores[rubric_cols].std()
print(per_dim_std.round(3))

## 6. Save

In [None]:
# Combine keyword, LLM and embedding features into one table keyed by
# (meeting_id, text_source). Outer joins keep rows that are missing from any
# single stage; the llm_model_id column is dropped when present.
merge_keys = ["meeting_id", "text_source"]
llm_features = llm_scores.drop(columns=["llm_model_id"], errors="ignore")

feat_text = kw_scores.merge(llm_features, on=merge_keys, how="outer")
feat_text = feat_text.merge(emb_scores, on=merge_keys, how="outer")

write_parquet(feat_text, DATA_CLEAN / "features_text.parquet")
n_rows, n_cols = feat_text.shape
print(f"Saved {n_rows} rows, {n_cols} columns")
print(f"Columns: {feat_text.columns.tolist()}")

## Summary

| Stage | Features | Notes |
|---|---|---|
| 4a Keyword | net_hawkish, net_hawkish_norm, balance_sheet_kw, uncertainty_kw | Deterministic |
| 4b LLM Rubric | 7 rubric dimensions | Claude API, cached |
| 4c Embeddings | emb_pc_1 ... emb_pc_N | PCA from sentence-transformers |

**Next:** `05_model_results.ipynb` — run the full model ladder and compare RMSE across rungs.