# 03 — Evaluate Transaction Categorization Accuracy

**Objective:** Evaluate LLM classification results against the Master Fee Table ground truth,
compute accuracy metrics, analyze failures, and support cross-version comparison.

### Metrics

| Metric | Description |
|--------|-------------|
| Per-Level Accuracy | L1, L2, L3, L4 independently (case-insensitive, null-safe) |
| Exact Match | All 4 levels correct |
| Partial Match | Two separate metrics: L1+L2 correct, and L1+L2+L3 correct |
| Volume-Weighted | Weighted by raw transaction count |
| Per-Layer | Obvious (Layer 1) vs Ambiguous (Layer 2) vs Unknown (Layer 3) |
| Failure Analysis | Categorize mismatches by root cause |
| Cost Summary | Total tokens and estimated cost for the run |
| Cross-Version | Compare accuracy across prompt versions |

### Layer Evaluation Rules

- **Layer 1 (Obvious):** Standard per-level comparison against the code's single GT mapping.
- **Layer 2 (Ambiguous):** Match against ANY valid GT mapping for that transaction_code.
- **Layer 3 (Unknown):** Flagged as NEEDS MANUAL REVIEW — no accuracy calculation.

**Runs on:** Databricks Runtime 15.4 LTS or above.

In [None]:
# ── Configuration ─────────────────────────────────────────────────
# Unity Catalog location shared by every table this notebook reads or writes.
CATALOG_NAME = "ciq-bp_dummy-dev"
SCHEMA_NAME = "default"

# Prompt version to evaluate. Set to None to evaluate the latest run.
PROMPT_VERSION = "v1.0"

# Target: >= 80% volume-weighted exact match accuracy
TARGET_ACCURACY = 0.80

# Catalog and schema are backtick-quoted once here and reused for all tables.
_NAMESPACE = f"`{CATALOG_NAME}`.`{SCHEMA_NAME}`"
RESULTS_TABLE = _NAMESPACE + ".classification_results"
GT_TABLE = _NAMESPACE + ".ground_truth_normalized"
EVAL_TABLE = _NAMESPACE + ".evaluation_results"

# Echo the effective settings so the run output records what was evaluated.
print(
    f"Results table:  {RESULTS_TABLE}",
    f"GT table:       {GT_TABLE}",
    f"Eval table:     {EVAL_TABLE}",
    f"Prompt version: {PROMPT_VERSION or '(latest)'}",
    f"Target:         >= {TARGET_ACCURACY:.0%} vol-weighted exact match",
    sep="\n",
)

---
## Step 1 — Validate upstream tables

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

# Probe for the `spark` global on its own. Keeping the try body this small
# means a NameError raised by anything else (for example table-name constants
# from a configuration cell that was not run) surfaces as itself instead of
# being misreported as a missing Spark session.
try:
    spark
except NameError:
    raise SystemExit("Spark session not found — run this notebook in Databricks.") from None

# Count the rows of each upstream table; a missing or unreadable table fails
# here with Spark's own error.
for table, label in [(RESULTS_TABLE, "classification_results"), (GT_TABLE, "ground_truth_normalized")]:
    count = spark.sql(f"SELECT COUNT(*) as cnt FROM {table}").collect()[0]["cnt"]
    print(f"  OK  {label}: {count} rows")
print("\nUpstream tables validated.")

---
## Step 2 — Load results and ground truth from Unity Catalog

In [None]:
# ── Load classification results ──────────────────────────────────
# Decide which prompt version to load, and remember how to describe the load.
if PROMPT_VERSION:
    selected_version = PROMPT_VERSION
    loaded_msg = f"Loaded results for prompt_version='{PROMPT_VERSION}'"
else:
    # No version pinned: take the prompt_version of the most recent row.
    latest_row = spark.sql(
        f"SELECT prompt_version FROM {RESULTS_TABLE} ORDER BY run_timestamp DESC LIMIT 1"
    ).collect()[0]
    latest_version = latest_row["prompt_version"]
    selected_version = latest_version
    loaded_msg = f"Loaded latest results (prompt_version='{latest_version}')"

results_query = f"SELECT * FROM {RESULTS_TABLE} WHERE prompt_version = '{selected_version}'"
df_results = spark.sql(results_query).toPandas()
print(f"{loaded_msg}: {len(df_results)} rows")

# Codes are compared as strings on both sides of every join below.
df_results["transaction_code"] = df_results["transaction_code"].astype(str)

# ── Load ground truth ─────────────────────────────────────────────
df_gt = spark.table(GT_TABLE).toPandas()
df_gt["transaction_code"] = df_gt["transaction_code"].astype(str)

n_gt_codes = df_gt["transaction_code"].nunique()
print(f"Ground truth: {len(df_gt)} rows, {n_gt_codes} unique codes")
print(f"\nResults columns: {list(df_results.columns)}")
print(f"GT columns: {list(df_gt.columns)}")
print(f"\nLayer distribution in results:")
layer_col = df_results["layer"]
for layer in sorted(layer_col.unique()):
    n = (layer_col == layer).sum()
    print(f"  Layer {layer}: {n} codes")

---
## Step 3 — Comparison helpers

In [None]:
# Sentinel that every "missing" value collapses to, so that two missing
# values compare equal to each other.
_NULL = "__null__"


def _canon(val):
    """Canonicalize a value for comparison: lowercase, strip, null-safe.

    Args:
        val: Any scalar taken from a results or ground-truth cell.

    Returns:
        ``_NULL`` when ``val`` is missing (``None``, float NaN, ``pd.NA``,
        ``pd.NaT``, or a blank / "none" / "nan" / "null" / "n/a" string in
        any letter case); otherwise ``str(val)`` stripped and lowercased.
    """
    # pd.isna recognises None, NaN of any float width, pd.NA and pd.NaT.
    # It returns an array for list-likes, so only a scalar boolean answer is
    # trusted; anything else falls through to the string comparison.
    missing = pd.isna(val)
    if isinstance(missing, (bool, np.bool_)) and missing:
        return _NULL
    s = str(val).strip().lower()
    if s in ("", "none", "nan", "null", "n/a"):
        return _NULL
    return s


def levels_match(row, llm_col, gt_col):
    """Return True when the LLM value and the GT value of ``row`` agree.

    Args:
        row: Mapping-like row (e.g. a pandas Series) indexable by column name.
        llm_col: Name of the column holding the LLM prediction.
        gt_col: Name of the column holding the ground-truth value.

    Returns:
        True if both cells canonicalize (see ``_canon``) to the same string,
        which includes the case where both are missing.
    """
    return _canon(row[llm_col]) == _canon(row[gt_col])


def add_match_columns(df):
    """Return a copy of ``df`` with per-level and aggregate match booleans.

    For each level N in 1..4 a ``match_LN`` column compares ``category_N``
    with ``gt_LN`` via ``levels_match``. Three aggregate columns follow:
    ``exact_match`` (all four levels), ``partial_match_L1L2`` and
    ``partial_match_L1L2L3``. The input frame is not modified.
    """
    out = df.copy()

    level_flags = []
    for level in (1, 2, 3, 4):
        flag = f"match_L{level}"
        out[flag] = out.apply(
            levels_match,
            axis=1,
            llm_col=f"category_{level}",
            gt_col=f"gt_L{level}",
        )
        level_flags.append(flag)

    # Aggregates are prefixes of the level list: all four, first two, first three.
    out["exact_match"] = out[level_flags].all(axis=1)
    out["partial_match_L1L2"] = out[level_flags[:2]].all(axis=1)
    out["partial_match_L1L2L3"] = out[level_flags[:3]].all(axis=1)
    return out


# Human-readable label for each evaluation layer number (1-based).
LAYER_NAMES = dict(enumerate(("Obvious", "Ambiguous", "Unknown"), start=1))

print("Comparison helpers ready.")

---
## Step 4 — Prepare GT lookups

In [None]:
# Count, per transaction code, how many distinct L1–L4 paths the ground truth
# holds. Exactly one path means a single mapping; more than one means the code
# is multi-mapped.
_gt_levels = ["gt_L1", "gt_L2", "gt_L3", "gt_L4"]
gt_mapping_counts = (
    df_gt
    .groupby("transaction_code")
    .apply(lambda grp: len(grp[_gt_levels].drop_duplicates()))
    .reset_index(name="n_mappings")
)

_n_map = gt_mapping_counts["n_mappings"]
_codes = gt_mapping_counts["transaction_code"]
multi_codes = set(_codes[_n_map > 1])
single_codes = set(_codes[_n_map == 1])
all_gt_codes = set(df_gt["transaction_code"].unique())

# GT lookup for single-mapping codes: keep the first row seen for each code.
gt_cols = ["transaction_code", "gt_desc", "gt_L1", "gt_L2", "gt_L3", "gt_L4", "gt_credit_debit"]
_is_single = df_gt["transaction_code"].isin(single_codes)
df_gt_single = df_gt.loc[_is_single, gt_cols].drop_duplicates(
    subset="transaction_code", keep="first"
)

print(f"GT codes with 1 mapping (Layer 1):  {len(single_codes)}")
print(f"GT codes with 2+ mappings (Layer 2): {len(multi_codes)}")
print(f"Multi-mapping codes: {sorted(multi_codes)}")

---
## Step 5 — Layer 1 evaluation (Obvious codes)

In [None]:
# Attach the single GT mapping to every Layer 1 result. It is a left join, so
# results whose code has no row in the lookup are kept with empty GT columns.
layer1_results = df_results[df_results["layer"] == 1]
df_l1 = add_match_columns(
    layer1_results.merge(df_gt_single, on="transaction_code", how="left")
)
n_l1 = len(df_l1)
vol_l1 = df_l1["volume"].sum()

banner = "=" * 65
print(banner)
print(f"LAYER 1 — OBVIOUS CODES  (n = {n_l1}, volume = {vol_l1:,})")
print(banner)

# Unweighted accuracy: the mean of each boolean match column.
for prefix, flag in [
    ("  L1 (Fee vs Non-fee):      ", "match_L1"),
    ("  L2 (Category):            ", "match_L2"),
    ("  L3 (Channel):             ", "match_L3"),
    ("  L4 (Subtype):             ", "match_L4"),
]:
    print(f"{prefix}{df_l1[flag].mean():.1%}")
print("  ─────────────────────────────")
for prefix, flag in [
    ("  Partial (L1+L2):          ", "partial_match_L1L2"),
    ("  Partial (L1+L2+L3):       ", "partial_match_L1L2L3"),
    ("  Exact Match (all 4):      ", "exact_match"),
]:
    print(f"{prefix}{df_l1[flag].mean():.1%}")

In [None]:
# ── Volume-weighted accuracy ──────────────────────────────────────
# Each Layer 1 code contributes its transaction volume instead of a flat 1,
# so high-volume codes dominate the score.
print("=" * 65)
print(f"VOLUME-WEIGHTED ACCURACY  (total: {vol_l1:,} transactions)")
print("=" * 65)

# Guard the denominator the same way the cross-version and summary cells of
# this notebook do, so an empty or zero-volume Layer 1 set reports 0.0%
# instead of dividing by zero.
vol_denominator = max(vol_l1, 1)

for col, label in [
    ("match_L1",              "L1 (Fee vs Non-fee)"),
    ("match_L2",              "L2 (Category)"),
    ("match_L3",              "L3 (Channel)"),
    ("match_L4",              "L4 (Subtype)"),
    ("partial_match_L1L2",    "Partial (L1+L2)"),
    ("partial_match_L1L2L3",  "Partial (L1+L2+L3)"),
    ("exact_match",           "Exact (all 4)"),
]:
    # Boolean * volume keeps the volume of matching rows and zeroes the rest.
    w = (df_l1[col] * df_l1["volume"]).sum() / vol_denominator
    print(f"  {label:<25} {w:.1%}")

---
## Step 6 — Layer 2 evaluation (Ambiguous codes)

A prediction is correct if it matches **any** of the valid GT mappings for that code.

In [None]:
_llm_levels = ["category_1", "category_2", "category_3", "category_4"]
_gt_path_cols = ["gt_L1", "gt_L2", "gt_L3", "gt_L4"]

df_l2_src = df_results[df_results["layer"] == 2].copy()

l2_rows = []
for _, row in df_l2_src.iterrows():
    code = row["transaction_code"]

    # Every distinct GT path recorded for this code.
    gt_maps = df_gt.loc[df_gt["transaction_code"] == code, _gt_path_cols].drop_duplicates()
    gt_paths = [
        tuple(_canon(value) for value in path)
        for path in gt_maps.itertuples(index=False, name=None)
    ]
    llm = tuple(_canon(row[col]) for col in _llm_levels)

    # Number of levels agreeing with each GT path, compared position by position.
    overlaps = [sum(a == b for a, b in zip(llm, gt)) for gt in gt_paths]

    l2_rows.append({
        "transaction_code": code,
        "description_1":    row["description_1"],
        "volume":           row["volume"],
        "llm_path":         f"{row['category_1']} > {row['category_2']} > {row['category_3']}",
        "exact_match_any":  llm in gt_paths,
        "best_levels_matched": max(overlaps, default=0),
        "n_valid_mappings": len(gt_maps),
        "confidence":       row["confidence"],
    })

if l2_rows:
    df_l2 = pd.DataFrame(l2_rows)
else:
    df_l2 = pd.DataFrame()

banner = "=" * 65
print(banner)
print(f"LAYER 2 — AMBIGUOUS CODES  (n = {len(df_l2)})")
print(banner)
if len(df_l2) == 0:
    print("  No ambiguous codes in results.")
else:
    print(f"  Exact match (any valid mapping):  {df_l2['exact_match_any'].mean():.1%}")
    print(f"  Avg best levels matched:          {df_l2['best_levels_matched'].mean():.1f} / 4")
    print()
    print(df_l2.to_string(index=False))

In [None]:
# Show all valid GT mappings for each ambiguous code, next to the path the
# LLM chose, so mismatches can be inspected by eye.
if len(df_l2) > 0:
    print("Valid GT mappings for ambiguous codes:")
    for code in sorted(df_l2["transaction_code"].unique()):
        maps = (
            df_gt[df_gt["transaction_code"] == code][["gt_desc", "gt_L1", "gt_L2", "gt_L3"]]
            .drop_duplicates()
        )
        # First results row for this code supplies the LLM path shown in the header.
        llm_row = df_l2_src[df_l2_src["transaction_code"] == code].iloc[0]
        print(f"\n  code={code}  (LLM: {llm_row['category_1']} > {llm_row['category_2']} > {llm_row['category_3']})")
        for _, m in maps.iterrows():
            # Every level goes through str() before padding: a width spec such
            # as :<15 raises TypeError when applied directly to None.
            print(f"    GT: {str(m['gt_L1']):<15} > {str(m['gt_L2']):<22} > {str(m['gt_L3']):<25} | {str(m['gt_desc'])[:45]}")

---
## Step 7 — Layer 3 (Unknown codes — manual review)

In [None]:
df_l3 = df_results[df_results["layer"] == 3].copy()

banner = "=" * 65
print(banner)
print(f"LAYER 3 — UNKNOWN CODES  (n = {len(df_l3)}, no ground truth)")
print(banner)
print("These codes are absent from the Master Fee Table and need manual review.\n")

# Highest-volume codes first; one blank-line-terminated entry per code.
by_volume = df_l3.sort_values("volume", ascending=False)
for _, r in by_volume.iterrows():
    entry = [
        f"  code={str(r['transaction_code']):>5} | vol={r['volume']:>6,} | conf={r['confidence']:.2f}",
        f"    desc: {r['description_1']}",
        f"    LLM:  {r['category_1']} > {r['category_2']} > {r['category_3']} > {r['category_4']}",
        f"    scoring: {r['include_in_scoring']}",
        "",
    ]
    print("\n".join(entry))

---
## Step 8 — Failure analysis (Layer 1)

In [None]:
# Layer 1 rows where at least one of the four levels did not match.
failures = df_l1.loc[~df_l1["exact_match"]].copy()

def failure_type(row):
    """Name the shallowest hierarchy level at which ``row`` mismatched.

    Levels are checked in order L1 → L4 and the first failing one decides the
    label. Returns "UNKNOWN" when all four ``match_L*`` flags are truthy.
    """
    ordered_checks = (
        ("match_L1", "WRONG BLOCK (L1)"),
        ("match_L2", "WRONG CATEGORY (L2)"),
        ("match_L3", "WRONG CHANNEL (L3)"),
        ("match_L4", "WRONG SUBTYPE (L4)"),
    )
    for flag, label in ordered_checks:
        if not row[flag]:
            return label
    return "UNKNOWN"

# Label every mismatching row with the first level that went wrong.
failures["failure_type"] = failures.apply(failure_type, axis=1)

banner = "=" * 65
print(banner)
print(f"FAILURE ANALYSIS — {len(failures)} mismatches out of {len(df_l1)} obvious codes")
print(banner)

if len(failures) == 0:
    print("\nNo failures — all Layer 1 codes matched exactly.")
else:
    # Per failure type: number of codes, their total volume, and up to four
    # example codes; most frequent type first.
    per_type = failures.groupby("failure_type").agg(
        count=("transaction_code", "count"),
        volume=("volume", "sum"),
        examples=("transaction_code", lambda codes: ", ".join(codes.iloc[:4])),
    )
    ft = per_type.sort_values("count", ascending=False).reset_index()
    print("\nFailure type distribution:")
    print(ft.to_string(index=False))

In [None]:
# ── Detailed mismatch table ───────────────────────────────────────
if len(failures) > 0:
    banner = "=" * 65
    print(banner)
    print("DETAILED FAILURES (sorted by volume, highest impact first)")
    print(banner)

    ranked = failures.sort_values("volume", ascending=False)
    for _, r in ranked.iterrows():
        header = (
            f"\n  code={str(r['transaction_code']):>5} | vol={r['volume']:>6,}"
            f" | conf={r['confidence']:.2f} | {r['failure_type']}"
        )
        print(header)
        print(f"    description: {str(r['description_1'])[:55]}")
        print(f"    gt desc:     {str(r.get('gt_desc', ''))[:55]}")

        # One line per level: matching levels show the agreed value only,
        # mismatching levels show the LLM value and the GT value side by side.
        for level in (1, 2, 3, 4):
            tag = f"L{level}"
            llm_val = str(r[f"category_{level}"])
            gt_val = str(r[f"gt_L{level}"])
            if r[f"match_{tag}"]:
                print(f'    {tag}: [Y] "{llm_val}"')
            else:
                print(f'    {tag}: [X] LLM="{llm_val}" vs GT="{gt_val}"')

---
## Step 9 — Cost summary

In [None]:
banner = "=" * 65
print(banner)
print("COST SUMMARY")
print(banner)

# Totals over every loaded results row, regardless of layer.
total_tokens_in, total_tokens_out, total_cost = (
    df_results[col].sum() for col in ("tokens_in", "tokens_out", "estimated_cost")
)
# The model name is read from the first row; "N/A" when there are no rows.
model_name = "N/A" if len(df_results) == 0 else df_results["model_name"].iloc[0]

print(
    f"  Model:            {model_name}",
    f"  Total tokens in:  {total_tokens_in:,}",
    f"  Total tokens out: {total_tokens_out:,}",
    f"  Total tokens:     {total_tokens_in + total_tokens_out:,}",
    f"  Estimated cost:   ${total_cost:.4f}",
    f"  Codes classified: {len(df_results)}",
    f"  Cost per code:    ${total_cost / max(len(df_results), 1):.4f}",
    sep="\n",
)

---
## Step 10 — Cross-version comparison

In [None]:
def _pct(value):
    """Format a fraction as a percentage with one decimal place."""
    return f"{value:.1%}"


def _version_summary(version):
    """Load one prompt version and return its Layer 1 metrics as display strings."""
    v_results = spark.sql(
        f"SELECT * FROM {RESULTS_TABLE} WHERE prompt_version = '{version}'"
    ).toPandas()
    v_results["transaction_code"] = v_results["transaction_code"].astype(str)

    # Layer 1 evaluation for this version
    v_l1 = add_match_columns(
        v_results[v_results["layer"] == 1].merge(df_gt_single, on="transaction_code", how="left")
    )
    # A zero total volume is replaced by 1 so the weighted ratios stay defined.
    denominator = max(v_l1["volume"].sum(), 1)

    def vol_weighted(flag):
        return (v_l1[flag] * v_l1["volume"]).sum() / denominator

    return {
        "prompt_version":     version,
        "codes":              len(v_results),
        "l1_codes":           len(v_l1),
        "exact_match":        _pct(v_l1["exact_match"].mean()),
        "partial_L1L2":       _pct(v_l1["partial_match_L1L2"].mean()),
        "vol_weighted_exact": _pct(vol_weighted("exact_match")),
        "vol_weighted_partial": _pct(vol_weighted("partial_match_L1L2")),
        "L1_accuracy":        _pct(v_l1["match_L1"].mean()),
        "L2_accuracy":        _pct(v_l1["match_L2"].mean()),
        "L3_accuracy":        _pct(v_l1["match_L3"].mean()),
        "L4_accuracy":        _pct(v_l1["match_L4"].mean()),
        "estimated_cost":     f"${v_results['estimated_cost'].sum():.4f}",
    }


# Check if multiple prompt versions exist in the results table.
# NOTE(review): versions are ordered as strings by the query, so e.g. "v1.10"
# would sort before "v1.2" — confirm this matches the naming scheme in use.
all_versions = (
    spark.sql(f"SELECT DISTINCT prompt_version FROM {RESULTS_TABLE} ORDER BY prompt_version")
    .toPandas()["prompt_version"]
    .tolist()
)

print(f"Prompt versions in results table: {all_versions}")

if len(all_versions) <= 1:
    print("Only one prompt version found — cross-version comparison will be available after additional runs.")
else:
    version_metrics = [_version_summary(version) for version in all_versions]
    df_comparison = pd.DataFrame(version_metrics)

    print("\n" + "=" * 65)
    print("CROSS-VERSION COMPARISON")
    print("=" * 65)
    print(df_comparison.to_string(index=False))

    # Highlight regressions and improvements between the last two versions
    # in the list (this branch always has at least two).
    prev, curr = version_metrics[-2], version_metrics[-1]
    print(f"\n  Comparing {prev['prompt_version']} → {curr['prompt_version']}:")
    print(f"    Vol-weighted exact: {prev['vol_weighted_exact']} → {curr['vol_weighted_exact']}")
    print(f"    Vol-weighted partial: {prev['vol_weighted_partial']} → {curr['vol_weighted_partial']}")
    print(f"    Cost: {prev['estimated_cost']} → {curr['estimated_cost']}")

---
## Step 11 — Summary report

In [None]:
# Headline numbers. The volume-weighted figures cover Layer 1 only and use a
# denominator of at least 1.
vol_total = df_results["volume"].sum()
_l1_denominator = max(vol_l1, 1)
vw_exact = (df_l1["exact_match"] * df_l1["volume"]).sum() / _l1_denominator
vw_partial = (df_l1["partial_match_L1L2"] * df_l1["volume"]).sum() / _l1_denominator
amb_match = 0 if len(df_l2) == 0 else df_l2["exact_match_any"].mean()
target_met = vw_exact >= TARGET_ACCURACY
status = "MET" if target_met else "BELOW TARGET"

banner = "=" * 65
report = [
    banner,
    "  TRANSACTION CATEGORIZATION — ACCURACY SUMMARY",
    banner,
    "",
    f"  Prompt version:                  {df_results['prompt_version'].iloc[0]}",
    f"  Model:                           {model_name}",
    f"  Total codes evaluated:           {len(df_results)}",
    f"  Total transaction volume:        {vol_total:,}",
    "",
    "  LAYER 1 — Obvious (single-mapping)",
    f"    Codes: {len(df_l1)}    Volume: {vol_l1:,}",
    f"    L1 (Block):           {df_l1['match_L1'].mean():.1%}",
    f"    L2 (Category):        {df_l1['match_L2'].mean():.1%}",
    f"    L3 (Channel):         {df_l1['match_L3'].mean():.1%}",
    f"    L4 (Subtype):         {df_l1['match_L4'].mean():.1%}",
    f"    Exact Match (all 4):  {df_l1['exact_match'].mean():.1%}  (vol-wt: {vw_exact:.1%})",
    f"    Partial (L1+L2):      {df_l1['partial_match_L1L2'].mean():.1%}  (vol-wt: {vw_partial:.1%})",
    "",
    f"  LAYER 2 — Ambiguous: {len(df_l2)} codes",
    f"    Match (any valid):    {amb_match:.1%}",
    "",
    f"  LAYER 3 — Unknown: {len(df_l3)} codes (NEEDS MANUAL REVIEW)",
    "",
]

# Failure types, most frequent first, only when there is something to report.
if len(failures) > 0:
    report.append("  KEY FAILURES:")
    for ft_name, ft_count in failures["failure_type"].value_counts().items():
        report.append(f"    {ft_name}: {ft_count} codes")
    report.append("")

report.extend([
    f"  COST: ${total_cost:.4f} ({len(df_results)} codes)",
    "",
    f"  TARGET: >= {TARGET_ACCURACY:.0%} vol-weighted exact match → {status}",
    banner,
])
print("\n".join(report))

---
## Step 12 — Save evaluation results to Unity Catalog

In [None]:
# Build the evaluation output DataFrame
# Layer 1: full match details
eval_l1 = df_l1[[
    "transaction_code", "layer", "prompt_version", "model_name",
    "description_1", "volume", "source_file",
    "category_1", "category_2", "category_3", "category_4",
    "gt_L1", "gt_L2", "gt_L3", "gt_L4",
    "match_L1", "match_L2", "match_L3", "match_L4",
    "exact_match", "partial_match_L1L2", "partial_match_L1L2L3",
    "confidence",
]].copy()
eval_l1["failure_type"] = None
if len(failures) > 0:
    # `failures` is a row subset of df_l1 and eval_l1 is a column subset of
    # it, so both carry df_l1's row index. Assigning through that shared index
    # puts each label on its own row. Assigning a Series indexed by
    # transaction_code instead would be aligned against the integer row index,
    # match nothing, and leave every failure_type empty.
    eval_l1.loc[failures.index, "failure_type"] = failures["failure_type"]
eval_l1["review_status"] = "EVALUATED"

# Layer 2: use match_any as exact_match
# Columns copied straight from the results row, and columns left empty because
# an ambiguous code has no single GT path to compare level by level.
_l2_carried = [
    "transaction_code", "layer", "prompt_version", "model_name",
    "description_1", "volume", "source_file",
    "category_1", "category_2", "category_3", "category_4",
]
_l2_blank = [
    "gt_L1", "gt_L2", "gt_L3", "gt_L4",
    "match_L1", "match_L2", "match_L3", "match_L4",
]

eval_l2_records = []
for _, row in df_l2_src.iterrows():
    # Look up the any-mapping verdict for this code; absent means no match.
    hits = df_l2[df_l2["transaction_code"] == row["transaction_code"]]
    is_match = len(hits) > 0 and bool(hits["exact_match_any"].iloc[0])

    record = {col: row[col] for col in _l2_carried}
    record.update(dict.fromkeys(_l2_blank))
    record.update({
        "exact_match":          is_match,
        "partial_match_L1L2":   None,
        "partial_match_L1L2L3": None,
        "confidence":           row["confidence"],
        "failure_type":         None if is_match else "AMBIGUOUS_NO_MATCH",
        "review_status":        "EVALUATED",
    })
    eval_l2_records.append(record)

if eval_l2_records:
    eval_l2 = pd.DataFrame(eval_l2_records)
else:
    eval_l2 = pd.DataFrame()

# Layer 3: flagged for manual review
# Only the results-side columns carry data; every GT, match and failure field
# is left empty for these rows.
_l3_carried = [
    "transaction_code", "layer", "prompt_version", "model_name",
    "description_1", "volume", "source_file",
    "category_1", "category_2", "category_3", "category_4",
]
_l3_blank = [
    "gt_L1", "gt_L2", "gt_L3", "gt_L4",
    "match_L1", "match_L2", "match_L3", "match_L4",
    "exact_match", "partial_match_L1L2", "partial_match_L1L2L3",
]

eval_l3_records = [
    {
        **{col: row[col] for col in _l3_carried},
        **dict.fromkeys(_l3_blank),
        "confidence":    row["confidence"],
        "failure_type":  None,
        "review_status": "NEEDS MANUAL REVIEW",
    }
    for _, row in df_l3.iterrows()
]

if eval_l3_records:
    eval_l3_df = pd.DataFrame(eval_l3_records)
else:
    eval_l3_df = pd.DataFrame()

# Combine all layers, skipping any layer that produced no rows.
eval_parts = []
for layer_frame in (eval_l1, eval_l2, eval_l3_df):
    if len(layer_frame) > 0:
        eval_parts.append(layer_frame)

if eval_parts:
    df_eval = pd.concat(eval_parts, ignore_index=True)
else:
    df_eval = pd.DataFrame()

print(
    f"Evaluation results: {len(df_eval)} rows",
    f"  Layer 1: {len(eval_l1)}",
    f"  Layer 2: {len(eval_l2)}",
    f"  Layer 3: {len(eval_l3_df)}",
    sep="\n",
)

In [None]:
# Write the combined evaluation frame to the evaluation table, replacing its
# previous contents. Without a Spark session the write is skipped.
try:
    sdf_eval = spark.createDataFrame(df_eval)
    table_writer = sdf_eval.write.mode("overwrite")
    table_writer.saveAsTable(EVAL_TABLE)
    print(f"Saved {len(df_eval)} rows to {EVAL_TABLE}")
except NameError:
    print(
        "Spark session not found — skipping UC write.",
        f"DataFrame ready with {len(df_eval)} rows.",
        sep="\n",
    )

---
## Validation

In [None]:
# Read the saved table back: row count, required columns, and the split of
# rows by review status. Without a Spark session only the local frame is
# described.
try:
    n_saved = spark.sql(f"SELECT COUNT(*) as cnt FROM {EVAL_TABLE}").collect()[0]["cnt"]
    print(f"  OK  {EVAL_TABLE}: {n_saved} rows")

    present = {field.name for field in spark.table(EVAL_TABLE).schema.fields}
    required = [
        "transaction_code", "layer", "prompt_version", "model_name",
        "exact_match", "failure_type", "review_status",
        "category_1", "category_2", "gt_L1", "gt_L2",
    ]
    missing = [col for col in required if col not in present]
    assert not missing, f"Missing columns: {missing}"
    print(f"  OK  All required columns present")

    # Check review_status distribution
    status_counts = spark.sql(
        f"SELECT review_status, COUNT(*) as cnt FROM {EVAL_TABLE} GROUP BY review_status"
    ).toPandas()
    print(f"\n  Review status:")
    for review_status, n_codes in zip(status_counts["review_status"], status_counts["cnt"]):
        print(f"    {review_status}: {n_codes} codes")

    print("\nAll validations passed.")
except NameError:
    print("Spark session not found — skipping UC validation.")
    print(f"Local DataFrame ready: {len(df_eval)} rows, columns: {list(df_eval.columns)}")