# 05 — Evaluate Accuracy

**Objective:** Quantify the performance of the transaction categorization model by comparing LLM predictions against the normalized ground truth.

### Metrics to Calculate:
1. **Exact Match Rate:** All four levels (L1 through L4) match the ground truth.
2. **Partial Match Rate:** L1 and L2 match (correct block and primary category).
3. **Per-Level Accuracy:** Accuracy scores for L1, L2, L3, and L4 independently.
4. **Per-Layer Accuracy:** Accuracy breakdown by test layer (Obvious vs. Ambiguous).
5. **Failure Analysis:** Identification of specific codes where the model consistently fails.

In [None]:
import pandas as pd
import numpy as np
import os
import json

# Input locations, relative to the notebook's working directory.
gt_path = "../taxonomy/data/ground_truth_normalized.csv"  # normalized ground truth
results_path = "../results/latest_run.csv"  # predictions exported by Step 04

print("Loading datasets...")
# TRANCD is forced to text in both frames so the join key has the same dtype
# on each side.
df_gt = pd.read_csv(gt_path, dtype={'TRANCD': str})

if not os.path.exists(results_path):
    print(f"Results file not found at {results_path}. Run Notebook 04 first.")
    # Empty placeholder carrying the prediction columns; cells that test
    # `df_res.empty` will skip their work.
    df_res = pd.DataFrame(columns=['TRANCD', 'category_1', 'category_2', 'category_3', 'category_4'])
else:
    df_res = pd.read_csv(results_path, dtype={'TRANCD': str})
    print(f"Loaded {len(df_res)} result rows.")

In [None]:
def calculate_metrics(df_merged):
    """Add per-level and combined match flags to a merged evaluation frame.

    Compares each predicted column ``category_N`` with its ground-truth
    column ``LN`` for N in 1..4 and stores the result in ``match_lN``.
    Levels 3 and 4 replace missing values with the string ``'null'`` on both
    sides before comparing, so two missing values count as a match; levels 1
    and 2 are compared as-is.

    Two summary columns are then derived:
    ``exact_match`` (all four levels match) and ``partial_match`` (levels 1
    and 2 match).

    The columns are added to ``df_merged`` in place, and the same object is
    returned.
    """
    flag_columns = []
    for level in (1, 2, 3, 4):
        predicted = df_merged[f'category_{level}']
        expected = df_merged[f'L{level}']
        if level >= 3:
            # Deeper levels may legitimately be absent on both sides.
            predicted = predicted.fillna('null')
            expected = expected.fillna('null')
        flag = f'match_l{level}'
        df_merged[flag] = predicted == expected
        flag_columns.append(flag)

    df_merged['exact_match'] = df_merged[flag_columns].all(axis=1)
    # Partial credit looks only at the first two levels.
    df_merged['partial_match'] = df_merged[flag_columns[:2]].all(axis=1)

    return df_merged

if not df_res.empty:
    # Merge results with ground truth on TRANCD. The inner join keeps only
    # transaction codes present in BOTH frames, so anything else is not scored.
    df_eval = pd.merge(df_res, df_gt, on='TRANCD', how='inner', suffixes=('_llm', '_gt'))
    df_eval = calculate_metrics(df_eval)

    # Make the join's coverage visible: result rows without ground truth are
    # dropped by the inner merge and would otherwise vanish silently.
    unmatched = (~df_res['TRANCD'].isin(df_gt['TRANCD'])).sum()
    if unmatched:
        print(f"Warning: {unmatched} result rows have no ground truth and were excluded.")
    print(f"Rows evaluated: {len(df_eval)} (of {len(df_res)} result rows)")

    print("--- Overall Accuracy ---")
    if df_eval.empty:
        # Averaging an empty frame would print 'nan%' for every metric.
        print("No overlapping TRANCD values between results and ground truth; nothing to score.")
    else:
        print(f"Exact Match Rate: {df_eval['exact_match'].mean():.2%}")
        print(f"Partial Match (L1+L2): {df_eval['partial_match'].mean():.2%}")
        for level in (1, 2, 3, 4):
            print(f"L{level} Accuracy: {df_eval[f'match_l{level}'].mean():.2%}")

In [None]:
if not df_res.empty:
    print("--- Top 10 Failures ---")
    # A failure is any row whose exact_match flag is False.
    is_failure = ~df_eval['exact_match']
    failures = df_eval.loc[is_failure].copy()
    # Ground truth (L*) next to the prediction (category_*) for levels 1 and 2;
    # only the first ten failing rows, in frame order, are printed.
    cols_to_show = ['TRANCD', 'sample_desc_1', 'L1', 'category_1', 'L2', 'category_2']
    print(failures.loc[:, cols_to_show].head(10))

### Save Evaluation Report
Outputs a detailed CSV of matches/mismatches for manual review.

In [None]:
if not df_res.empty:
    # Write the whole evaluation frame, without the index column, for manual
    # review.
    output_eval_path = "../results/eval_report_latest.csv"
    df_eval.to_csv(path_or_buf=output_eval_path, index=False)
    print(f"Evaluation report saved to: {output_eval_path}")