# 03 — Evaluate Product Categorization Accuracy

**Objective:** Evaluate LLM product classification results against the Product Catalog ground truth,
compute accuracy metrics, analyze failures, and support cross-version comparison.

### Metrics

| Metric | Description |
|--------|-------------|
| Per-Level Accuracy | L1 through L5 independently (case-insensitive, null-safe) |
| Exact Match | All 5 levels correct (null-to-null counts as match) |
| Partial Match | Two separate metrics: L1+L2 both correct; L1+L2+L3 all correct |
| Account-Weighted | Weighted by raw account count |
| Per-Layer | Obvious (Layer 1) vs Ambiguous (Layer 2) vs Unknown (Layer 3) |
| Failure Analysis | Categorize mismatches by root cause |
| Cost Summary | Total tokens and estimated cost for the run |
| Cross-Version | Compare accuracy across prompt versions |

### Layer Evaluation Rules

- **Layer 1 (Obvious):** Standard per-level comparison against the code's single GT mapping.
- **Layer 2 (Ambiguous):** Match against ANY valid GT mapping for that product_code.
- **Layer 3 (Unknown):** Flagged as NEEDS MANUAL REVIEW — no accuracy calculation.

**Runs on:** Databricks Runtime 15.4 LTS or above.

In [None]:
# ── Configuration ─────────────────────────────────────────────────
# Unity Catalog location of the pipeline tables and the run to evaluate.
CATALOG_NAME    = "ciq-bp_dummy-dev"
SCHEMA_NAME     = "default"
PROMPT_VERSION  = "v1.0"  # Set to None to evaluate the prompt version of the most recent run

# Catalog and schema are backtick-quoted because the catalog name contains a hyphen.
_NAMESPACE = f"`{CATALOG_NAME}`.`{SCHEMA_NAME}`"

RESULTS_TABLE = f"{_NAMESPACE}.product_classification_results"
GT_TABLE      = f"{_NAMESPACE}.product_ground_truth_normalized"
EVAL_TABLE    = f"{_NAMESPACE}.product_evaluation_results"

# Pass/fail threshold applied to the account-weighted exact-match rate.
TARGET_ACCURACY = 0.80

# Echo the effective settings so the run is self-describing in the notebook output.
for _label, _value in (
    ("Results table:", RESULTS_TABLE),
    ("GT table:", GT_TABLE),
    ("Eval table:", EVAL_TABLE),
    ("Prompt version:", PROMPT_VERSION or "(latest)"),
    ("Target:", f">= {TARGET_ACCURACY:.0%} account-weighted exact match"),
):
    print(f"{_label:<16}{_value}")

---
## Step 1 — Validate upstream tables

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

# Count the rows of each upstream table so an unreadable table fails here
# rather than part-way through the evaluation. A NameError (for example an
# undefined `spark`) aborts the notebook with an explanatory message.
try:
    upstream_tables = {
        "product_classification_results": RESULTS_TABLE,
        "product_ground_truth_normalized": GT_TABLE,
    }
    for label, table in upstream_tables.items():
        n_rows = spark.sql(f"SELECT COUNT(*) as cnt FROM {table}").collect()[0]["cnt"]
        print(f"  OK  {label}: {n_rows} rows")
    print("\nUpstream tables validated.")
except NameError:
    raise SystemExit("Spark session not found — run this notebook in Databricks.")

---
## Step 2 — Load results and ground truth from Unity Catalog

In [None]:
# Resolve which prompt version to evaluate. With PROMPT_VERSION unset, use the
# version carried by the row with the newest run_timestamp.
if PROMPT_VERSION:
    eval_version = PROMPT_VERSION
    loaded_what = f"results for prompt_version='{eval_version}'"
else:
    latest_rows = spark.sql(
        f"SELECT prompt_version FROM {RESULTS_TABLE} ORDER BY run_timestamp DESC LIMIT 1"
    ).collect()
    if not latest_rows:
        # Previously surfaced as a bare IndexError from `.collect()[0]`.
        raise ValueError(f"{RESULTS_TABLE} is empty — there is no latest run to evaluate.")
    latest_version = latest_rows[0]["prompt_version"]
    eval_version = latest_version
    loaded_what = f"latest results (prompt_version='{eval_version}')"

# Filter through the DataFrame API instead of interpolating the value into a
# SQL string, so a version containing a quote character cannot break the query.
sdf_results = spark.table(RESULTS_TABLE)
df_results = sdf_results.filter(sdf_results["prompt_version"] == eval_version).toPandas()
print(f"Loaded {loaded_what}: {len(df_results)} rows")
if df_results.empty:
    print(f"WARNING: no rows found for prompt_version='{eval_version}' — downstream metrics will be empty.")

# Normalise the join key on both sides. Note that astype(str) turns a missing
# code into the literal string "None"/"nan" rather than leaving it null.
df_results["product_code"] = df_results["product_code"].astype(str).str.strip()

df_gt = spark.table(GT_TABLE).toPandas()
df_gt["product_code"] = df_gt["product_code"].astype(str).str.strip()

print(f"Ground truth: {len(df_gt)} rows, {df_gt['product_code'].nunique()} unique codes")
print(f"\nResults columns: {list(df_results.columns)}")
print(f"GT columns: {list(df_gt.columns)}")
print(f"\nLayer distribution in results:")
for layer in sorted(df_results["layer"].unique()):
    n = int((df_results["layer"] == layer).sum())
    print(f"  Layer {layer}: {n} codes")
print(f"\nSource file distribution in results:")
print(df_results["source_file"].value_counts().to_string())

---
## Step 3 — Comparison helpers

In [None]:
# Sentinel that every "missing" value collapses to, so null-to-null compares equal.
_NULL = "__null__"

# String spellings treated as missing after strip + lowercase. "<na>" is how
# pandas' NA scalar looks once a column has been cast to str.
_NULL_TOKENS = frozenset({"", "none", "nan", "null", "n/a", "<na>"})


def _canon(val):
    """Canonicalize a value for comparison: stripped, lowercased, null-safe.

    Args:
        val: Any cell value (string, number, None, NaN, pd.NA, pd.NaT, ...).

    Returns:
        ``_NULL`` for every missing value or null-like string, otherwise the
        stripped, lowercased string form of ``val``.
    """
    # pd.isna covers None, float/NumPy NaN, pd.NA and pd.NaT. The is_scalar
    # guard keeps list-like values from producing an ambiguous array result.
    if val is None or (pd.api.types.is_scalar(val) and pd.isna(val)):
        return _NULL
    s = str(val).strip().lower()
    return _NULL if s in _NULL_TOKENS else s


def levels_match(row, llm_col, gt_col):
    """Return True when the LLM value and the GT value agree after canonicalization.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) holding both columns.
        llm_col: Key of the predicted value.
        gt_col: Key of the ground-truth value.
    """
    return _canon(row[llm_col]) == _canon(row[gt_col])


def add_match_columns(df):
    """Return a copy of ``df`` with match flags for the 5-level product taxonomy.

    Adds ``match_L1`` .. ``match_L5`` (one per level, via ``levels_match``) plus
    the aggregates ``exact_match`` (all five levels), ``partial_match_L1L2`` and
    ``partial_match_L1L2L3``. The input frame is not modified.
    """
    scored = df.copy()

    # (flag column, LLM column, GT column), ordered L1 → L5.
    level_pairs = (
        ("match_L1", "line_of_business", "gt_L1_line_of_business"),
        ("match_L2", "product_type", "gt_L2_type"),
        ("match_L3", "product_category", "gt_L3_category"),
        ("match_L4", "product_subcategory", "gt_L4_subcategory"),
        ("match_L5", "product_special", "gt_L5_special"),
    )
    for flag, llm_col, gt_col in level_pairs:
        scored[flag] = scored.apply(levels_match, axis=1, llm_col=llm_col, gt_col=gt_col)

    flags = [flag for flag, _, _ in level_pairs]
    scored["exact_match"] = scored[flags].all(axis=1)
    scored["partial_match_L1L2"] = scored[flags[:2]].all(axis=1)
    scored["partial_match_L1L2L3"] = scored[flags[:3]].all(axis=1)
    return scored


# Human-readable label for each evaluation layer, keyed by layer number (1-3).
LAYER_NAMES = dict(enumerate(("Obvious", "Ambiguous", "Unknown"), start=1))

print("Comparison helpers ready.")

---
## Step 4 — Prepare GT lookups

In [None]:
# The five ground-truth taxonomy columns, ordered L1 → L5.
gt_level_cols = [
    "gt_L1_line_of_business", "gt_L2_type", "gt_L3_category",
    "gt_L4_subcategory", "gt_L5_special",
]

# Number of distinct GT taxonomy paths per product_code. De-duplicating once and
# counting with size() gives the same numbers as a per-group lambda, without a
# Python call per code and without relying on DataFrameGroupBy.apply (which
# emits a deprecation warning about grouping columns on recent pandas).
gt_mapping_counts = (
    df_gt[["product_code"] + gt_level_cols]
    .drop_duplicates()
    .groupby("product_code")
    .size()
    .reset_index(name="n_mappings")
)

# Codes with several distinct paths are "ambiguous"; codes with exactly one are "obvious".
multi_codes  = set(gt_mapping_counts.loc[gt_mapping_counts["n_mappings"] > 1, "product_code"])
single_codes = set(gt_mapping_counts.loc[gt_mapping_counts["n_mappings"] == 1, "product_code"])
all_gt_codes = set(df_gt["product_code"].unique())

# One GT row per single-mapping code, used as the join target for Layer 1.
gt_all_cols = ["product_code", "product_name"] + gt_level_cols + ["product_domain"]
df_gt_single = (
    df_gt[df_gt["product_code"].isin(single_codes)]
    .drop_duplicates(subset="product_code", keep="first")[gt_all_cols]
)

print(f"GT codes with 1 mapping (Layer 1):  {len(single_codes)}")
print(f"GT codes with 2+ mappings (Layer 2): {len(multi_codes)}")
if multi_codes:
    print(f"Multi-mapping codes: {sorted(multi_codes)}")

---
## Step 5 — Layer 1 evaluation (Obvious codes)

In [None]:
# Attach the single GT mapping to every Layer 1 result. how="left" keeps results
# whose code has no row in df_gt_single; their gt_* columns come back as NaN.
df_l1 = pd.merge(
    df_results[df_results["layer"] == 1],
    df_gt_single,
    on="product_code",
    how="left",
    suffixes=("", "_gt"),
)

# Rows without a GT partner are still scored below (against an all-null GT
# path), which would otherwise skew the metrics silently — report them.
n_l1_without_gt = int((~df_l1["product_code"].isin(df_gt_single["product_code"])).sum())
if n_l1_without_gt:
    print(
        f"WARNING: {n_l1_without_gt} Layer 1 row(s) have no single-mapping GT entry "
        "and are scored against an all-null GT path."
    )

df_l1 = add_match_columns(df_l1)
n_l1 = len(df_l1)
accts_l1 = df_l1["account_count"].sum()

print("=" * 65)
print(f"LAYER 1 — OBVIOUS CODES  (n = {n_l1}, accounts = {accts_l1:,})")
print("=" * 65)
# Unweighted rates: each code counts once regardless of its account volume.
for col, label in [
    ("match_L1", "L1 (Line of Business):"),
    ("match_L2", "L2 (Product Type):"),
    ("match_L3", "L3 (Category):"),
    ("match_L4", "L4 (Sub-category):"),
    ("match_L5", "L5 (Special):"),
]:
    print(f"  {label:<26}{df_l1[col].mean():.1%}")
print(f"  ─────────────────────────────")
for col, label in [
    ("partial_match_L1L2",   "Partial (L1+L2):"),
    ("partial_match_L1L2L3", "Partial (L1+L2+L3):"),
    ("exact_match",          "Exact Match (all 5):"),
]:
    print(f"  {label:<26}{df_l1[col].mean():.1%}")

In [None]:
banner = "=" * 65
print(banner)
print(f"ACCOUNT-WEIGHTED ACCURACY  (total: {accts_l1:,} accounts)")
print(banner)

# Each code's match flag is weighted by its account count; the denominator is
# floored at 1 so an empty Layer 1 set does not divide by zero.
account_weights = df_l1["account_count"]
total_weight = max(accts_l1, 1)

weighted_metrics = {
    "L1 (Line of Business)": "match_L1",
    "L2 (Product Type)":     "match_L2",
    "L3 (Category)":         "match_L3",
    "L4 (Sub-category)":     "match_L4",
    "L5 (Special)":          "match_L5",
    "Partial (L1+L2)":       "partial_match_L1L2",
    "Partial (L1+L2+L3)":    "partial_match_L1L2L3",
    "Exact (all 5)":         "exact_match",
}
for label, col in weighted_metrics.items():
    w = (df_l1[col] * account_weights).sum() / total_weight
    print(f"  {label:<25} {w:.1%}")

---
## Step 6 — Layer 2 evaluation (Ambiguous codes)

A prediction is correct if it matches **any** of the valid GT mappings for that code.

In [None]:
df_l2_src = df_results[df_results["layer"] == 2].copy()

# LLM output columns, in the same L1 → L5 order as gt_level_cols.
llm_cols = ["line_of_business", "product_type", "product_category", "product_subcategory", "product_special"]

l2_rows = []
for _, pred in df_l2_src.iterrows():
    code = pred["product_code"]

    # Every distinct GT path recorded for this code.
    gt_maps = df_gt.loc[df_gt["product_code"] == code, gt_level_cols].drop_duplicates()

    llm = tuple(_canon(pred[c]) for c in llm_cols)
    gt_paths = [
        tuple(_canon(g[c]) for c in gt_level_cols)
        for _, g in gt_maps.iterrows()
    ]
    # Position-wise agreement with each candidate path; 0 when there is none.
    overlaps = [sum(a == b for a, b in zip(llm, path)) for path in gt_paths]

    l2_rows.append({
        "product_code":        code,
        "product_name":        pred["product_name"],
        "account_count":       pred["account_count"],
        "llm_path":            f"{pred['line_of_business']} > {pred['product_type']} > {pred['product_category']}",
        "exact_match_any":     llm in gt_paths,
        "best_levels_matched": max(overlaps, default=0),
        "n_valid_mappings":    len(gt_maps),
        "confidence":          pred["confidence"],
    })

df_l2 = pd.DataFrame(l2_rows) if l2_rows else pd.DataFrame()

print("=" * 65)
print(f"LAYER 2 — AMBIGUOUS CODES  (n = {len(df_l2)})")
print("=" * 65)
if len(df_l2) == 0:
    print("  No ambiguous codes in results.")
else:
    print(f"  Exact match (any valid mapping):  {df_l2['exact_match_any'].mean():.1%}")
    print(f"  Avg best levels matched:          {df_l2['best_levels_matched'].mean():.1f} / 5")
    print()
    print(df_l2.to_string(index=False))

---
## Step 7 — Layer 3 (Unknown codes — manual review)

In [None]:
df_l3 = df_results[df_results["layer"] == 3].copy()

banner = "=" * 65
print(banner)
print(f"LAYER 3 — UNKNOWN CODES  (n = {len(df_l3)}, no ground truth)")
print(banner)
print("These codes are absent from the Product Catalog GT and need manual review.\n")

# Columns forming the predicted taxonomy path, L1 → L5.
path_cols = (
    "line_of_business", "product_type", "product_category",
    "product_subcategory", "product_special",
)

# Largest account volume first, so the most impactful codes are reviewed first.
for _, r in df_l3.sort_values("account_count", ascending=False).iterrows():
    header = " | ".join((
        f"  code={str(r['product_code']):>3}",
        f"accts={r['account_count']:>6,}",
        f"{r['source_file']:<7}",
        f"conf={r['confidence']:.2f}",
    ))
    llm_path = " > ".join(f"{r[c]}" for c in path_cols)

    print(header)
    print(f"    desc: {r['product_name']}")
    print(f"    LLM:  {llm_path}")
    print()

---
## Step 8 — Failure analysis (Layer 1)

In [None]:
# Layer 1 rows where at least one of the five levels disagrees with GT.
# The copy keeps the later failure_type assignment from touching df_l1.
is_exact = df_l1["exact_match"]
failures = df_l1.loc[~is_exact].copy()


def failure_type(row):
    """Label a mismatch by the shallowest taxonomy level that is wrong.

    Levels are checked in order L1 → L5 and the first flag that is falsy
    decides the label; "UNKNOWN" is returned when all five flags are truthy.
    """
    ordered_checks = (
        ("match_L1", "WRONG LOB (L1)"),
        ("match_L2", "WRONG TYPE (L2)"),
        ("match_L3", "WRONG CATEGORY (L3)"),
        ("match_L4", "WRONG SUBCATEGORY (L4)"),
        ("match_L5", "WRONG SPECIAL (L5)"),
    )
    for flag, label in ordered_checks:
        if not row[flag]:
            return label
    return "UNKNOWN"


failures["failure_type"] = failures.apply(failure_type, axis=1)

banner = "=" * 65
print(banner)
print(f"FAILURE ANALYSIS — {len(failures)} mismatches out of {len(df_l1)} obvious codes")
print(banner)

if len(failures) == 0:
    print("\nNo failures — all Layer 1 codes matched exactly.")
else:
    # Per failure type: number of codes, affected accounts, and up to four example codes.
    by_type = failures.groupby("failure_type")
    ft = by_type.agg(
        count=("product_code", "count"),
        accounts=("account_count", "sum"),
        examples=("product_code", lambda codes: ", ".join(codes.iloc[:4])),
    )
    ft = ft.sort_values("count", ascending=False).reset_index()
    print("\nFailure type distribution:")
    print(ft.to_string(index=False))

In [None]:
if len(failures) > 0:
    print("=" * 65)
    print("DETAILED FAILURES (sorted by account count, highest impact first)")
    print("=" * 65)

    # (level tag, LLM column, GT column), L1 → L5.
    level_columns = (
        ("L1", "line_of_business", "gt_L1_line_of_business"),
        ("L2", "product_type", "gt_L2_type"),
        ("L3", "product_category", "gt_L3_category"),
        ("L4", "product_subcategory", "gt_L4_subcategory"),
        ("L5", "product_special", "gt_L5_special"),
    )

    ranked = failures.sort_values("account_count", ascending=False)
    for _, r in ranked.iterrows():
        summary = " | ".join((
            f"\n  code={str(r['product_code']):>3}",
            f"accts={r['account_count']:>6,}",
            f"conf={r['confidence']:.2f}",
            f"{r['failure_type']}",
        ))
        print(summary)
        print(f"    name: {str(r['product_name'])[:55]}")

        # Matching levels show the agreed value; mismatches show both sides.
        for lvl, llm_col, gt_col in level_columns:
            llm_val, gt_val = str(r[llm_col]), str(r[gt_col])
            if r[f"match_{lvl}"]:
                print(f'    {lvl}: [Y] "{llm_val}"')
            else:
                print(f'    {lvl}: [X] LLM="{llm_val}" vs GT="{gt_val}"')

---
## Step 9 — Cost summary

In [None]:
banner = "=" * 65
print(banner)
print("COST SUMMARY")
print(banner)

# Totals across every result row loaded for this evaluation (all layers).
total_tokens_in, total_tokens_out, total_cost = (
    df_results[col].sum() for col in ("tokens_in", "tokens_out", "estimated_cost")
)
n_codes = len(df_results)
# The model name is read from the first row only.
model_name = "N/A" if n_codes == 0 else df_results["model_name"].iloc[0]

print(f"  Model:            {model_name}")
print(f"  Total tokens in:  {total_tokens_in:,}")
print(f"  Total tokens out: {total_tokens_out:,}")
print(f"  Total tokens:     {total_tokens_in + total_tokens_out:,}")
print(f"  Estimated cost:   ${total_cost:.4f}")
print(f"  Codes classified: {n_codes}")
# Denominator floored at 1 so an empty result set prints $0 instead of failing.
print(f"  Cost per code:    ${total_cost / max(n_codes, 1):.4f}")

---
## Step 10 — Cross-version comparison

In [None]:
all_versions = (
    spark.sql(f"SELECT DISTINCT prompt_version FROM {RESULTS_TABLE} ORDER BY prompt_version")
    .toPandas()["prompt_version"]
    .tolist()
)

print(f"Prompt versions in results table: {all_versions}")

if len(all_versions) > 1:
    sdf_all_results = spark.table(RESULTS_TABLE)
    version_metrics = []

    for version in all_versions:
        # Filter through the DataFrame API: version strings come from table data,
        # and one containing a quote would break a string-built SQL predicate.
        v_results = (
            sdf_all_results
            .filter(sdf_all_results["prompt_version"] == version)
            .toPandas()
        )
        v_results["product_code"] = v_results["product_code"].astype(str).str.strip()

        # Same Layer 1 scoring as the main evaluation: join to the single GT mapping.
        v_l1 = pd.merge(
            v_results[v_results["layer"] == 1],
            df_gt_single,
            on="product_code",
            how="left",
            suffixes=("", "_gt"),
        )
        v_l1 = add_match_columns(v_l1)
        v_accts = v_l1["account_count"].sum()

        # Account-weighted rates; denominator floored at 1 for empty versions.
        vw_exact   = (v_l1["exact_match"] * v_l1["account_count"]).sum() / max(v_accts, 1)
        vw_partial = (v_l1["partial_match_L1L2"] * v_l1["account_count"]).sum() / max(v_accts, 1)
        v_cost     = v_results["estimated_cost"].sum()

        version_metrics.append({
            "prompt_version":       version,
            "codes":                len(v_results),
            "l1_codes":             len(v_l1),
            "exact_match":          f"{v_l1['exact_match'].mean():.1%}",
            "partial_L1L2":         f"{v_l1['partial_match_L1L2'].mean():.1%}",
            "vol_weighted_exact":   f"{vw_exact:.1%}",
            "vol_weighted_partial": f"{vw_partial:.1%}",
            "L1_accuracy":          f"{v_l1['match_L1'].mean():.1%}",
            "L2_accuracy":          f"{v_l1['match_L2'].mean():.1%}",
            "L3_accuracy":          f"{v_l1['match_L3'].mean():.1%}",
            "L4_accuracy":          f"{v_l1['match_L4'].mean():.1%}",
            "L5_accuracy":          f"{v_l1['match_L5'].mean():.1%}",
            "estimated_cost":       f"${v_cost:.4f}",
        })

    df_comparison = pd.DataFrame(version_metrics)

    print("\n" + "=" * 65)
    print("CROSS-VERSION COMPARISON")
    print("=" * 65)
    print(df_comparison.to_string(index=False))

    # At least two entries exist here (one per version, and len(all_versions) > 1).
    # NOTE(review): versions are ordered lexicographically by the query above, so
    # prev/curr are the last two names in sort order (e.g. "v1.10" sorts before
    # "v1.9"), not necessarily the two most recent runs — confirm this is intended.
    prev, curr = version_metrics[-2], version_metrics[-1]
    print(f"\n  Comparing {prev['prompt_version']} -> {curr['prompt_version']}:")
    print(f"    Acct-weighted exact: {prev['vol_weighted_exact']} -> {curr['vol_weighted_exact']}")
    print(f"    Acct-weighted partial: {prev['vol_weighted_partial']} -> {curr['vol_weighted_partial']}")
    print(f"    Cost: {prev['estimated_cost']} -> {curr['estimated_cost']}")
else:
    print("Only one prompt version found — cross-version comparison will be available after additional runs.")

---
## Step 11 — Summary report

In [None]:
# Headline numbers. Account-weighted rates use Layer 1 only; the target check
# is made on the account-weighted exact-match rate.
accts_total = df_results["account_count"].sum()
l1_weights  = df_l1["account_count"]
l1_denom    = max(accts_l1, 1)
vw_exact    = (df_l1["exact_match"] * l1_weights).sum() / l1_denom
vw_partial  = (df_l1["partial_match_L1L2"] * l1_weights).sum() / l1_denom
amb_match   = df_l2["exact_match_any"].mean() if len(df_l2) > 0 else 0
target_met  = vw_exact >= TARGET_ACCURACY
status      = "MET" if target_met else "BELOW TARGET"

banner = "=" * 65
print(banner)
print("  PRODUCT CATEGORIZATION — ACCURACY SUMMARY")
print(banner)
print()
print(f"  Prompt version:                  {df_results['prompt_version'].iloc[0]}")
print(f"  Model:                           {model_name}")
print(f"  Total codes evaluated:           {len(df_results)}")
print(f"  Total accounts:                  {accts_total:,}")
print()
print("  LAYER 1 — Obvious (single-mapping)")
print(f"    Codes: {len(df_l1)}    Accounts: {accts_l1:,}")
for label, col in (
    ("L1 (LoB):", "match_L1"),
    ("L2 (Type):", "match_L2"),
    ("L3 (Category):", "match_L3"),
    ("L4 (Sub-category):", "match_L4"),
    ("L5 (Special):", "match_L5"),
):
    print(f"    {label:<22}{df_l1[col].mean():.1%}")
print(f"    {'Exact Match (all 5):':<22}{df_l1['exact_match'].mean():.1%}  (acct-wt: {vw_exact:.1%})")
print(f"    {'Partial (L1+L2):':<22}{df_l1['partial_match_L1L2'].mean():.1%}  (acct-wt: {vw_partial:.1%})")
print()
print(f"  LAYER 2 — Ambiguous: {len(df_l2)} codes")
print(f"    Match (any valid):    {amb_match:.1%}")
print()
print(f"  LAYER 3 — Unknown: {len(df_l3)} codes (NEEDS MANUAL REVIEW)")
print()
if len(failures) > 0:
    print("  KEY FAILURES:")
    # value_counts yields the types most frequent first, with their counts.
    for ft_name, ft_count in failures["failure_type"].value_counts().items():
        print(f"    {ft_name}: {ft_count} codes")
    print()
print(f"  COST: ${total_cost:.4f} ({len(df_results)} codes)")
print()
print(f"  TARGET: >= {TARGET_ACCURACY:.0%} acct-weighted exact match -> {status}")
print(banner)

---
## Step 12 — Save evaluation results to Unity Catalog

In [None]:
# Column groups shared by all three layers, so every per-layer frame has the
# same columns in the same order and the frames concatenate cleanly.
_EVAL_RESULT_COLS = [
    "product_code", "layer", "prompt_version", "model_name",
    "product_name", "account_count", "source_file",
    "line_of_business", "product_type", "product_category",
    "product_subcategory", "product_special",
]
_EVAL_GT_COLS = [
    "gt_L1_line_of_business", "gt_L2_type", "gt_L3_category",
    "gt_L4_subcategory", "gt_L5_special",
]
_EVAL_MATCH_COLS = ["match_L1", "match_L2", "match_L3", "match_L4", "match_L5"]
_EVAL_AGG_COLS = ["exact_match", "partial_match_L1L2", "partial_match_L1L2L3"]


def _unscored_record(row, exact_match, failure_label, review_status):
    """Build an evaluation record for a row with no per-level GT comparison.

    GT columns, per-level match flags and the partial-match flags are left
    null; only the overall ``exact_match``, ``failure_type`` and
    ``review_status`` are supplied by the caller.
    """
    record = {col: row[col] for col in _EVAL_RESULT_COLS}
    record.update(dict.fromkeys(_EVAL_GT_COLS))
    record.update(dict.fromkeys(_EVAL_MATCH_COLS))
    record["exact_match"] = exact_match
    record["partial_match_L1L2"] = None
    record["partial_match_L1L2L3"] = None
    record["confidence"] = row["confidence"]
    record["failure_type"] = failure_label
    record["review_status"] = review_status
    return record


# Layer 1: full match details
eval_l1 = df_l1[
    _EVAL_RESULT_COLS + _EVAL_GT_COLS + _EVAL_MATCH_COLS + _EVAL_AGG_COLS + ["confidence"]
].copy()
eval_l1["failure_type"] = None
if len(failures) > 0:
    # `failures` is a row subset of df_l1, so it shares row labels with eval_l1.
    # Assign on that shared index: a Series re-indexed by product_code would not
    # align with eval_l1's row labels and would write NaN into every row.
    eval_l1.loc[failures.index, "failure_type"] = failures["failure_type"]
eval_l1["review_status"] = "EVALUATED"

# Layer 2: only the any-valid-mapping verdict is recorded. Build the
# code → verdict lookup once (first result per code wins) instead of
# filtering df_l2 for every row.
l2_match_by_code = {}
if len(df_l2) > 0:
    for l2_code, l2_matched in zip(df_l2["product_code"], df_l2["exact_match_any"]):
        l2_match_by_code.setdefault(l2_code, bool(l2_matched))

eval_l2_records = []
for _, row in df_l2_src.iterrows():
    is_match = l2_match_by_code.get(row["product_code"], False)
    eval_l2_records.append(
        _unscored_record(
            row,
            exact_match=is_match,
            failure_label=None if is_match else "AMBIGUOUS_NO_MATCH",
            review_status="EVALUATED",
        )
    )
eval_l2 = pd.DataFrame(eval_l2_records) if eval_l2_records else pd.DataFrame()

# Layer 3: no ground truth, so nothing is scored — rows are flagged for review.
eval_l3_records = [
    _unscored_record(
        row,
        exact_match=None,
        failure_label=None,
        review_status="NEEDS MANUAL REVIEW",
    )
    for _, row in df_l3.iterrows()
]
eval_l3_df = pd.DataFrame(eval_l3_records) if eval_l3_records else pd.DataFrame()

# Combine all layers, skipping empty frames so they do not disturb column dtypes.
eval_parts = [part for part in (eval_l1, eval_l2, eval_l3_df) if len(part) > 0]
df_eval = pd.concat(eval_parts, ignore_index=True) if eval_parts else pd.DataFrame()

print(f"Evaluation results: {len(df_eval)} rows")
print(f"  Layer 1: {len(eval_l1)}")
print(f"  Layer 2: {len(eval_l2)}")
print(f"  Layer 3: {len(eval_l3_df)}")

In [None]:
# Persist the combined evaluation frame, replacing the table's previous
# contents and schema. Without a Spark session the write is skipped.
try:
    sdf_eval = spark.createDataFrame(df_eval)
    eval_writer = sdf_eval.write.mode("overwrite").option("overwriteSchema", "true")
    eval_writer.saveAsTable(EVAL_TABLE)
    print(f"Saved {len(df_eval)} rows to {EVAL_TABLE}")
except NameError:
    for message in (
        "Spark session not found — skipping UC write.",
        f"DataFrame ready with {len(df_eval)} rows.",
    ):
        print(message)

---
## Validation

In [None]:
# Read the saved table back: row count, required columns, and the split of
# rows by review status. Without a Spark session only the local frame is described.
try:
    count = spark.sql(f"SELECT COUNT(*) as cnt FROM {EVAL_TABLE}").collect()[0]["cnt"]
    print(f"  OK  {EVAL_TABLE}: {count} rows")

    eval_cols = spark.table(EVAL_TABLE).columns
    present = set(eval_cols)
    required = [
        "product_code", "layer", "prompt_version", "model_name",
        "exact_match", "failure_type", "review_status",
        "line_of_business", "product_type",
        "gt_L1_line_of_business", "gt_L2_type",
    ]
    missing = [c for c in required if c not in present]
    assert not missing, f"Missing columns: {missing}"
    print(f"  OK  All required columns present")

    status_counts = spark.sql(
        f"SELECT review_status, COUNT(*) as cnt FROM {EVAL_TABLE} GROUP BY review_status"
    ).toPandas()
    print(f"\n  Review status:")
    for review_status, n_codes in zip(status_counts["review_status"], status_counts["cnt"]):
        print(f"    {review_status}: {n_codes} codes")

    print("\nAll validations passed.")
except NameError:
    print("Spark session not found — skipping UC validation.")
    print(f"Local DataFrame ready: {len(df_eval)} rows, columns: {list(df_eval.columns)}")