# 01 — Prepare Data

**Objective:** Build the two foundational Unity Catalog tables that the rest of the pipeline depends on, and assign every catalogued transaction code to a test layer.

| Section | Output Table | Description |
|---------|-------------|-------------|
| **A** | `ground_truth_normalized` | Cleaned, normalized Master Fee Table (431 codes) |
| **B** | `transaction_code_catalog` | Unique TRANCD codes from raw data with sample descriptions and volume |
| **C** | Layer assignment | Adds `layer` column to the catalog (Obvious / Ambiguous / Unknown) |

**Runs on:** Databricks Runtime 15.4 LTS or above.

In [None]:
# ── Configuration ─────────────────────────────────────────────────
# Unity Catalog location that receives both output tables.
CATALOG_NAME = "ciq-bp_dummy-dev"
SCHEMA_NAME = "default"

# Input CSVs (relative paths).
GT_PATH = "../data/bank-plus-data/source-of-truth/Master Fee Table(Master).csv"
RAW_NON_POS_PATH = "../data/bank-plus-data/raw/CheckingIQ_NON_POS_Daily_012626_rerun.csv"
RAW_POS_PATH = "../data/bank-plus-data/raw/CheckingIQ_POS_Daily_012626_rerun.csv"

# Fully qualified table names. Catalog and schema are backtick-quoted
# (the catalog name contains a hyphen).
GT_TABLE = "`{}`.`{}`.ground_truth_normalized".format(CATALOG_NAME, SCHEMA_NAME)
CATALOG_TABLE = "`{}`.`{}`.transaction_code_catalog".format(CATALOG_NAME, SCHEMA_NAME)

# Echo the effective settings so a run log shows where data will land.
print(
    f"Catalog:        {CATALOG_NAME}",
    f"Schema:         {SCHEMA_NAME}",
    f"GT table:       {GT_TABLE}",
    f"Catalog table:  {CATALOG_TABLE}",
    sep="\n",
)

In [None]:
# ── Validation: check that input files exist ─────────────────────
import os

required_inputs = (
    ("Master Fee Table", GT_PATH),
    ("NON_POS raw data", RAW_NON_POS_PATH),
    ("POS raw data", RAW_POS_PATH),
)

for label, path in required_inputs:
    # Fail fast: stop at the first input that is not on disk.
    if not os.path.exists(path):
        raise FileNotFoundError(f"Missing {label}: {path}")
    print(f"  OK  {label}: {path}")

print("\nAll input files found.")

---
## Section A — Ground Truth Normalization

Read the Master Fee Table, clean it, normalize casing inconsistencies, and save to Unity Catalog.

In [None]:
import pandas as pd
import numpy as np

# Read every column as text so transaction codes keep their exact spelling:
# with inferred dtypes a code column holding only numbers and blanks would be
# parsed as float and later render as "101.0", which the integer-code filter
# rejects. The raw daily files are likewise read with TRANCD as str.
df_gt_raw = pd.read_csv(GT_PATH, encoding="latin-1", dtype=str)

# Strip whitespace from column names so later lookups by header text match.
df_gt_raw.columns = [c.strip() for c in df_gt_raw.columns]

print(f"Raw rows loaded: {len(df_gt_raw)}")
print(f"Columns: {list(df_gt_raw.columns)}")
df_gt_raw.head(5)

In [None]:
df_gt = df_gt_raw.copy()

# Trim surrounding whitespace in every text column. Note that astype(str)
# also turns missing cells into the literal string "nan"; the filters below
# account for that spelling.
for col in df_gt.columns:
    if df_gt[col].dtype != object:
        continue
    df_gt[col] = df_gt[col].astype(str).str.strip()

code_col = "External Transaction Code"

# Keep only rows that carry a transaction code. Rows without one are
# section headers (e.g. "ATM activities"), generic descriptions (no TRANCD),
# and trailing blank rows.
code_text = df_gt[code_col].astype(str).str.strip()
has_code = df_gt[code_col].notna() & ~code_text.isin(["", "nan"])
df_gt = df_gt[has_code].copy()

df_gt[code_col] = df_gt[code_col].astype(str).str.strip()

# Remove header-leak rows (the header row repeated mid-file at the
# Fee Items boundary).
is_header_leak = df_gt["Scoring Category 1"].astype(str).str.strip() == "Scoring Category 1"
df_gt = df_gt[~is_header_leak].copy()

# Finally, keep only rows whose code consists solely of digits.
is_integer_code = df_gt[code_col].str.match(r"^\d+$")
df_gt = df_gt[is_integer_code].copy()

print(f"After cleaning: {len(df_gt)} rows, {df_gt['External Transaction Code'].nunique()} unique codes")

In [None]:
# ── Normalization maps ────────────────────────────────────────────
# Each map sends a spelling found in the spreadsheet to its canonical form.
# Identity entries list the canonical spellings themselves; the remaining
# entries fix casing / spacing variants.

# Level 1: only the capitalisation of "item" varies.
L1_NORM = dict.fromkeys(("Fee Item", "Fee item"), "Fee item")
L1_NORM["Non-fee item"] = "Non-fee item"

# Level 2: canonical labels first, then the variant spellings.
_L2_CANONICAL = (
    "NSF/OD",
    "Money movement",
    "Account operations",
    "All others",
    "Service Charges",
    "Interchange",
    "Miscellaneous",
    "Unclassified",
)
L2_NORM = {label: label for label in _L2_CANONICAL}
L2_NORM.update(
    {
        "NSF /OD": "NSF/OD",
        "Money Movement": "Money movement",
        "Account Operations": "Account operations",
    }
)

# Levels 3 and 4: "N/A" and the stringified-missing marker "nan" both mean
# "no value" and map to None.
_NULL_MARKERS = dict.fromkeys(("N/A", "nan"))

L3_NORM = {
    **_NULL_MARKERS,
    "Money Movement": "Money movement",
    "Account Operations": "Account operations",
}

L4_NORM = dict(_NULL_MARKERS)


def _apply_map(series, norm_map):
    """Map values through a normalization dict, keeping unmapped values as-is."""
    mapped = series.map(norm_map)
    # Where the map returned a value (including explicit None), use it.
    # Where the key was not in the map, keep the original.
    has_mapping = series.isin(norm_map.keys())
    return mapped.where(has_mapping, series)


# Normalize each scoring level through its own map.
df_gt["Scoring Category 1"] = _apply_map(df_gt["Scoring Category 1"], L1_NORM)
df_gt["Scoring Category 2"] = _apply_map(df_gt["Scoring Category 2"], L2_NORM)
df_gt["Scoring Category 3"] = _apply_map(df_gt["Scoring Category 3"], L3_NORM)
df_gt["Scoring Category 4"] = _apply_map(df_gt["Scoring Category 4"], L4_NORM)

# Drop rows that carry no level-1 category (shouldn't happen after cleaning,
# but safety). The cleaning step stringifies every text cell, so a missing
# value reaches this point as the literal "nan" (or "" / "None") rather than
# as a real null; a bare .notna() test never matches those, so the
# placeholder spellings are checked as well.
l1_values = df_gt["Scoring Category 1"]
l1_missing = l1_values.isna() | l1_values.astype(str).str.strip().isin(["", "nan", "None"])
if l1_missing.any():
    print(f"Dropping {int(l1_missing.sum())} row(s) with no Scoring Category 1")
df_gt = df_gt[~l1_missing].copy()

# Show the distinct values per level so leftover spelling variants are easy
# to spot.
print("After normalization:")
print(f"  L1 values: {sorted(df_gt['Scoring Category 1'].dropna().unique())}")
print(f"  L2 values: {sorted(df_gt['Scoring Category 2'].dropna().unique())}")
print(f"  L3 values: {sorted(df_gt['Scoring Category 3'].dropna().unique())}")
l4_vals = df_gt['Scoring Category 4'].dropna().unique()
print(f"  L4 values: {sorted(l4_vals)}")

In [None]:
# ── Rename to canonical column names ──────────────────────────────
# Spreadsheet header -> canonical name. The dict order is also the column
# order of the final frame.
canonical_names = {
    "External Transaction Code": "TRANCD",
    "External Transaction Description": "gt_desc",
    "Scoring Category 1": "gt_L1",
    "Scoring Category 2": "gt_L2",
    "Scoring Category 3": "gt_L3",
    "Scoring Category 4": "gt_L4",
    "Credit / Debit": "gt_credit_debit",
}
df_gt = df_gt.rename(columns=canonical_names)

# Keep only the canonical columns; anything else in the sheet is dropped.
gt_cols = list(canonical_names.values())
df_gt = df_gt[gt_cols].copy()

# Stringified placeholders ('None' / 'nan') left over from earlier steps
# become real nulls.
df_gt.replace({"None": None, "nan": None}, inplace=True)

print(f"Ground truth ready: {len(df_gt)} rows, {df_gt['TRANCD'].nunique()} unique codes")
df_gt.head(10)

In [None]:
# ── Save ground truth to Unity Catalog ────────────────────────────
# `spark` is expected to be supplied by the Databricks runtime. Probe for it
# on its own so that a NameError raised by anything else (for example a
# variable from a skipped cell) is not misreported as a missing Spark session.
try:
    spark_session = spark
except NameError:
    print("Spark session not found — skipping UC write (run in Databricks).")
    print(f"DataFrame ready with {len(df_gt)} rows.")
else:
    sdf_gt = spark_session.createDataFrame(df_gt)
    # Overwrite so that re-running the notebook replaces the previous table.
    sdf_gt.write.mode("overwrite").saveAsTable(GT_TABLE)
    print(f"Saved {len(df_gt)} rows to {GT_TABLE}")

---
## Section B — Transaction Code Catalog

Build a catalog of unique transaction codes from the raw NON_POS and POS daily files.
For each code, capture one sample description and the total transaction volume.

In [None]:
# ── Load raw NON_POS data ─────────────────────────────────────────
# TRANCD is read as text, header names are trimmed, and the code values
# themselves are converted to str and trimmed.
df_non_pos = (
    pd.read_csv(RAW_NON_POS_PATH, dtype={"TRANCD": str})
    .rename(columns=str.strip)
    .assign(TRANCD=lambda frame: frame["TRANCD"].astype(str).str.strip())
)

print(f"NON_POS rows: {len(df_non_pos):,}")
print(f"NON_POS unique TRANCD: {df_non_pos['TRANCD'].nunique()}")
print(f"Columns: {list(df_non_pos.columns)}")

In [None]:
# ── Load raw POS data ─────────────────────────────────────────────
# TRANCD is read as text, header names are trimmed, and the code values
# themselves are converted to str and trimmed.
df_pos = (
    pd.read_csv(RAW_POS_PATH, dtype={"TRANCD": str})
    .rename(columns=str.strip)
    .assign(TRANCD=lambda frame: frame["TRANCD"].astype(str).str.strip())
)

print(f"POS rows: {len(df_pos):,}")
print(f"POS unique TRANCD: {df_pos['TRANCD'].nunique()}")
print(f"Columns: {list(df_pos.columns)}")

In [None]:
# ── Build NON_POS catalog entries ─────────────────────────────────
# EFHDS1 is the primary description field (the `description` column is always empty)
# One output row per TRANCD: a sample description taken with the "first"
# aggregation, and the number of rows carrying that code.
non_pos_by_code = df_non_pos.groupby("TRANCD")
non_pos_catalog = non_pos_by_code.agg(
    sample_desc_1=pd.NamedAgg(column="EFHDS1", aggfunc="first"),
    volume=pd.NamedAgg(column="TRANCD", aggfunc="size"),
).reset_index()

# Tag the origin file and tidy the sample text.
non_pos_catalog = non_pos_catalog.assign(
    source_file="NON_POS",
    sample_desc_1=lambda cat: cat["sample_desc_1"].astype(str).str.strip(),
)

print(f"NON_POS catalog: {len(non_pos_catalog)} unique codes")
non_pos_catalog.sort_values("volume", ascending=False).head(10)

In [None]:
# ── Build POS catalog entries ─────────────────────────────────────
# POS has a populated `description` column (unlike NON_POS)
# One output row per TRANCD: a sample description taken with the "first"
# aggregation, and the number of rows carrying that code.
pos_by_code = df_pos.groupby("TRANCD")
pos_catalog = pos_by_code.agg(
    sample_desc_1=pd.NamedAgg(column="description", aggfunc="first"),
    volume=pd.NamedAgg(column="TRANCD", aggfunc="size"),
).reset_index()

# Tag the origin file and tidy the sample text.
pos_catalog = pos_catalog.assign(
    source_file="POS",
    sample_desc_1=lambda cat: cat["sample_desc_1"].astype(str).str.strip(),
)

print(f"POS catalog: {len(pos_catalog)} unique codes")
pos_catalog.sort_values("volume", ascending=False).head(10)

In [None]:
# ── Combine into a single catalog ─────────────────────────────────
# Stack the two per-file catalogs and make sure TRANCD is text.
# NOTE(review): the frames are concatenated without de-duplication, so a
# TRANCD present in both files yields two rows (one per source_file) —
# confirm that is intended.
catalog_parts = [non_pos_catalog, pos_catalog]
df_catalog = pd.concat(catalog_parts, ignore_index=True).astype({"TRANCD": str})

print(f"Combined catalog: {len(df_catalog)} codes")
print(f"Total transaction volume: {df_catalog['volume'].sum():,}")
df_catalog.sort_values("volume", ascending=False)

---
## Section C — Layer Assignment

Assign each catalog code to a test layer based on how it appears in the ground truth:

| Layer | Name | Rule |
|-------|------|------|
| 1 | Obvious | Exactly 1 unique (L1,L2,L3,L4) mapping in GT |
| 2 | Ambiguous | 2+ distinct mappings in GT |
| 3 | Unknown | TRANCD absent from GT entirely |

In [None]:
# Count distinct (L1, L2, L3, L4) mappings per TRANCD in ground truth.
# De-duplicating the (code, categories) tuples once and then counting rows
# per code gives the same numbers as a per-group drop_duplicates, without
# running a Python lambda per group and without the pandas >= 2.2
# deprecation warning about apply() operating on the grouping columns.
mapping_cols = ["gt_L1", "gt_L2", "gt_L3", "gt_L4"]
gt_mapping_counts = (
    df_gt[["TRANCD", *mapping_cols]]
    .drop_duplicates()
    .groupby("TRANCD")
    .size()
    .reset_index(name="n_mappings")
)

# Partition the ground-truth codes by how many distinct mappings they have.
multi_codes  = set(gt_mapping_counts.loc[gt_mapping_counts["n_mappings"] > 1, "TRANCD"])
single_codes = set(gt_mapping_counts.loc[gt_mapping_counts["n_mappings"] == 1, "TRANCD"])
all_gt_codes = set(df_gt["TRANCD"].unique())

print(f"GT codes with 1 mapping (Layer 1):  {len(single_codes)}")
print(f"GT codes with 2+ mappings (Layer 2): {len(multi_codes)}")
print(f"Multi-mapping codes: {sorted(multi_codes)}")

In [None]:
def assign_layer(trancd):
    """Return the test layer (1, 2 or 3) for a transaction code.

    3 (Unknown) when the code is not in ``all_gt_codes``; otherwise
    2 (Ambiguous) when it is in ``multi_codes``; otherwise 1 (Obvious).
    """
    in_ground_truth = trancd in all_gt_codes
    if in_ground_truth and trancd in multi_codes:
        return 2  # Ambiguous
    return 1 if in_ground_truth else 3  # Obvious / Unknown


# Tag every catalog row with its test layer.
df_catalog["layer"] = df_catalog["TRANCD"].apply(assign_layer)

# ── Summary ───────────────────────────────────────────────────────
# Per layer: number of distinct codes, summed volume, and share of volume.
catalog_by_layer = df_catalog.groupby("layer")
layer_summary = catalog_by_layer.agg(
    codes=("TRANCD", "nunique"),
    total_volume=("volume", "sum"),
).reset_index()

grand_total_volume = layer_summary["total_volume"].sum()
layer_summary["pct_volume"] = (
    layer_summary["total_volume"] / grand_total_volume * 100
).round(1)

# Human-readable layer labels.
layer_names = {1: "Obvious", 2: "Ambiguous", 3: "Unknown"}
layer_summary["name"] = layer_summary["layer"].map(layer_names)

report_columns = ["layer", "name", "codes", "total_volume", "pct_volume"]
print("Layer assignment summary:")
print(layer_summary[report_columns].to_string(index=False))
print(f"\nTotal codes: {len(df_catalog)}")

In [None]:
# ── Show codes per layer ──────────────────────────────────────────
# For each layer, list its codes from highest to lowest volume.
for layer_num, layer_name in layer_names.items():
    in_layer = df_catalog["layer"] == layer_num
    layer_df = df_catalog[in_layer].sort_values("volume", ascending=False)

    print(f"\n{'='*60}")
    print(f"Layer {layer_num} — {layer_name} ({len(layer_df)} codes)")
    print(f"{'='*60}")

    # Walk the four displayed columns in lockstep, one catalog row at a time.
    listing = zip(
        layer_df["TRANCD"],
        layer_df["volume"],
        layer_df["source_file"],
        layer_df["sample_desc_1"],
    )
    for code, vol, source, desc in listing:
        # The description is truncated to 50 characters to keep rows short.
        print(f"  TRANCD={code:>5} | vol={vol:>7,} | {source:<7} | {str(desc)[:50]}")

In [None]:
# ── Save catalog (with layers) to Unity Catalog ───────────────────
# `spark` is expected to be supplied by the Databricks runtime. Probe for it
# on its own so that a NameError raised by anything else (for example a
# variable from a skipped cell) is not misreported as a missing Spark session.
try:
    spark_session = spark
except NameError:
    print("Spark session not found — skipping UC write (run in Databricks).")
    print(f"DataFrame ready with {len(df_catalog)} rows.")
else:
    sdf_catalog = spark_session.createDataFrame(df_catalog)
    # Overwrite so that re-running the notebook replaces the previous table.
    sdf_catalog.write.mode("overwrite").saveAsTable(CATALOG_TABLE)
    print(f"Saved {len(df_catalog)} rows to {CATALOG_TABLE}")

---
## Validation

Verify that all Unity Catalog tables were created successfully.

In [None]:
# ── Validate UC tables ────────────────────────────────────────────
# `spark` is expected to be supplied by the Databricks runtime. Probe for it
# on its own so that a NameError raised during validation itself is not
# misreported as a missing Spark session.
try:
    spark_session = spark
except NameError:
    print("Spark session not found — skipping UC validation (run in Databricks).")
    print("Local DataFrames are ready:")
    print(f"  df_gt:      {len(df_gt)} rows, {df_gt['TRANCD'].nunique()} unique codes")
    print(f"  df_catalog: {len(df_catalog)} rows, columns: {list(df_catalog.columns)}")
else:
    # Reading each table proves it exists; the row count is reported for
    # a quick sanity check.
    for table_name, expected_label in [
        (GT_TABLE, "ground_truth_normalized"),
        (CATALOG_TABLE, "transaction_code_catalog"),
    ]:
        count = spark_session.table(table_name).count()
        print(f"  OK  {expected_label}: {count} rows")

    # Verify the catalog has a layer column. Raised explicitly (rather than
    # with `assert`) so the check still runs when assertions are disabled.
    catalog_cols = spark_session.table(CATALOG_TABLE).columns
    if "layer" not in catalog_cols:
        raise AssertionError("Missing 'layer' column in catalog table")
    print("  OK  catalog has 'layer' column")

    print("\nAll validations passed.")