# 01 — Prepare Product Data

**Objective:** Build the foundational Unity Catalog tables for the product categorization pipeline.

| Section | Output Table | Description |
|---------|-------------|-------------|
| **A** | `product_ground_truth_normalized` | Flat ground truth (GT) rows parsed from the hierarchical Deposits + Loans product catalogs |
| **B** | `product_code_catalog` | Unique ACTYPE codes from raw Deposit/Loan/CD data with descriptions and account counts |
| **C** | Layer assignment | Adds `layer` column to the catalog (Obvious / Ambiguous / Unknown) |

**Runs on:** Databricks Runtime 15.4 LTS or above.

In [None]:
# ── Configuration ─────────────────────────────────────────────────
# Unity Catalog destination for the tables written by this notebook.
CATALOG_NAME = "ciq-bp_dummy-dev"
SCHEMA_NAME  = "default"

# Source-of-truth product catalog CSVs (hierarchical GT + code/description references).
DEPOSITS_GT_PATH = "../../data/bank-plus-data/source-of-truth/products/Product catalog(Deposits).csv"
LOANS_GT_PATH    = "../../data/bank-plus-data/source-of-truth/products/Product catalog(Loans).csv"
DDA_TYPES_PATH   = "../../data/bank-plus-data/source-of-truth/products/Product catalog(DDA Types).csv"
LOAN_TYPES_PATH  = "../../data/bank-plus-data/source-of-truth/products/Product catalog(Loan Types).csv"

# Raw account extracts; these are glob patterns, not single files.
RAW_DEPOSIT_PATH = "../../data/bank-plus-data/raw/CheckingIQ_Deposit_ALL_*.csv"
RAW_LOAN_PATH    = "../../data/bank-plus-data/raw/CheckingIQ_Loan_13Month_All_*.csv"
RAW_CD_PATH      = "../../data/bank-plus-data/raw/CheckingIQ_CD_All_*.csv"

# Fully qualified output tables; catalog and schema are back-quoted
# (the catalog name contains hyphens).
_UC_NAMESPACE = f"`{CATALOG_NAME}`.`{SCHEMA_NAME}`"
GT_TABLE      = f"{_UC_NAMESPACE}.product_ground_truth_normalized"
CATALOG_TABLE = f"{_UC_NAMESPACE}.product_code_catalog"

# Echo the effective settings, labels left-aligned in a 16-character column.
for _label, _value in (
    ("Catalog:", CATALOG_NAME),
    ("Schema:", SCHEMA_NAME),
    ("GT table:", GT_TABLE),
    ("Catalog table:", CATALOG_TABLE),
):
    print(f"{_label:<16}{_value}")

In [None]:
# ── Validation: check that input files exist ─────────────────────
import os, glob as globmod

# Source-of-truth CSVs are mandatory: stop at the first one that is missing.
required_inputs = {
    "Deposits GT": DEPOSITS_GT_PATH,
    "Loans GT": LOANS_GT_PATH,
    "DDA Types reference": DDA_TYPES_PATH,
    "Loan Types reference": LOAN_TYPES_PATH,
}
for label, path in required_inputs.items():
    if not os.path.exists(path):
        raise FileNotFoundError(f"Missing {label}: {path}")
    print(f"  OK  {label}: {path}")

# Raw extracts are glob patterns; a pattern with no local match only warns.
raw_inputs = {
    "Raw Deposits": RAW_DEPOSIT_PATH,
    "Raw Loans": RAW_LOAN_PATH,
    "Raw CDs": RAW_CD_PATH,
}
for label, pattern in raw_inputs.items():
    matches = globmod.glob(pattern)
    if not matches:
        print(f"  WARN  {label}: no files matching {pattern} (will use Databricks paths)")
        continue
    print(f"  OK  {label}: {len(matches)} file(s)")

print("\nInput file check complete.")

---
## Section A — Ground Truth Normalization

Parse the hierarchical Product Catalog CSVs (Deposits + Loans) into flat ground truth rows.

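The CSV files encode the hierarchy by **column position** (an indentation-style layout):
the first five columns (indexes 0-4 in the parser) map to taxonomy Levels 1-5, followed by
the product code and product name columns. Rows that only populate a level column (without
a product code) are hierarchy headers that set context for subsequent code rows.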

In [None]:
import pandas as pd
import numpy as np


# Output schema of parse_hierarchical_product_gt(); also used to give an
# empty result its columns so callers can always index by name.
_GT_OUTPUT_COLUMNS = [
    "product_code",
    "product_name",
    "gt_L1_line_of_business",
    "gt_L2_type",
    "gt_L3_category",
    "gt_L4_subcategory",
    "gt_L5_special",
    "product_domain",
]


def _clean_gt_cell(value):
    """Return the stripped text of a CSV cell, or None when it is missing or blank."""
    if pd.isna(value):
        return None
    text = str(value).strip()
    return text or None


def parse_hierarchical_product_gt(path, product_domain, skip_unmapped_rows=0):
    """
    Parse a hierarchical product catalog CSV into flat ground truth rows.

    The CSV is read without a header and with every cell as text. Layout:
      - Optional unmapped codes at the top (loan file only)
      - A blank row + header rows; parsing starts on the row after the first
        row whose first non-empty cell is "LoB"
      - Hierarchy rows: a value in columns 0-4 sets that level (L1-L5) and
        clears every deeper level
      - Code rows: column 5 (product_code) populated, column 6 = product_name

    The current L1-L5 context is carried forward onto every code row. A row
    may set levels and carry a product code at the same time.

    Args:
        path: Path of the catalog CSV (read as latin-1).
        product_domain: Stored in the ``product_domain`` column; also used as
            ``gt_L2_type`` when no L2 value is in effect for a code row.
        skip_unmapped_rows: Unused; kept so existing callers keep working.

    Returns:
        DataFrame with the columns in ``_GT_OUTPUT_COLUMNS``, one row per code
        row. Empty (but with those columns) when the file has no code rows.

    Raises:
        ValueError: If no "LoB" header row is found.
    """
    # dtype=str keeps product codes exactly as written in the file: without
    # it, a numeric-only column with blanks is inferred as float and codes
    # come out as "1.0" (and "001" loses its leading zeros).
    df_raw = pd.read_csv(path, header=None, encoding="latin-1", dtype=str)
    raw_rows = df_raw.to_numpy()

    # Locate the "LoB,Type,Category..." header; the hierarchy starts right after it.
    start_idx = None
    for pos, raw in enumerate(raw_rows):
        first_value = next(
            (text for text in map(_clean_gt_cell, raw) if text is not None), None
        )
        if first_value == "LoB":
            start_idx = pos + 1
            break

    if start_idx is None:
        raise ValueError(f"Could not find hierarchy start in {path}")

    # Columns: 0=L1(LoB), 1=L2(Type), 2=L3(Category), 3=L4(Sub-category), 4=L5(Special),
    #          5=product_code, 6=product_name
    current = [None] * 5
    rows = []

    for raw in raw_rows[start_idx:]:
        cells = [_clean_gt_cell(v) for v in raw[:7]]
        cells += [None] * (7 - len(cells))  # tolerate files with fewer than 7 columns
        *levels, code, name = cells

        # A populated level replaces the context at that depth and resets
        # everything below it.
        for depth, label in enumerate(levels):
            if label:
                current[depth] = label
                current[depth + 1:] = [None] * (4 - depth)

        # If there's a product code, this is a code row
        if code:
            l1, l2, l3, l4, l5 = current
            rows.append({
                "product_code":           code,
                "product_name":           name,
                "gt_L1_line_of_business":  l1,
                "gt_L2_type":              l2 if l2 else product_domain,
                "gt_L3_category":          l3,
                "gt_L4_subcategory":       l4,
                "gt_L5_special":           l5,
                "product_domain":          product_domain,
            })

    # Explicit columns so a file without code rows still yields the full schema.
    df = pd.DataFrame(rows, columns=_GT_OUTPUT_COLUMNS)

    # Strip whitespace and turn stringified nulls back into real nulls.
    # Note: a cell whose text is literally "None" or "nan" is also nulled here.
    for col in df.columns:
        if df[col].dtype == object:
            df[col] = df[col].astype(str).str.strip()
            df[col] = df[col].replace({"None": None, "nan": None, "": None})

    return df


print("parse_hierarchical_product_gt() defined.")

### Section A.1 — Deposits Ground Truth

In [None]:
# Flatten the Deposits catalog, then print row counts and the L1 / L3 value distributions.
df_gt_deposits = parse_hierarchical_product_gt(DEPOSITS_GT_PATH, product_domain="Deposits")

n_deposit_codes = df_gt_deposits["product_code"].nunique()
print(f"Deposit GT rows: {len(df_gt_deposits)}")
print(f"Unique product codes: {n_deposit_codes}")
for heading, column in (
    ("LoB", "gt_L1_line_of_business"),
    ("Category", "gt_L3_category"),
):
    print(f"\n{heading} distribution:")
    print(df_gt_deposits[column].value_counts().to_string())
print()
df_gt_deposits.head(10)

### Section A.2 — Loans Ground Truth

In [None]:
# Collect the unmapped loan codes listed at the top of the Loans catalog,
# i.e. every row before the first blank row / configuration header.
df_loans_raw = pd.read_csv(LOANS_GT_PATH, header=None, encoding="latin-1")

unmapped_rows = []
for _, raw_row in df_loans_raw.iterrows():
    # Normalise each cell to stripped text, with None for missing/blank cells.
    cells = []
    for cell in raw_row.values:
        text = str(cell).strip() if pd.notna(cell) else ""
        cells.append(text or None)

    leading = cells[0]
    # An empty first cell or the configuration banner ends the unmapped section.
    if not leading or leading == "StrategyCorps level configuration":
        break

    description = cells[1] if len(cells) > 1 else None
    unmapped_rows.append({"product_code": leading, "product_name": description})

df_unmapped_loans = pd.DataFrame(unmapped_rows)
if not df_unmapped_loans.empty:
    text_columns = [
        name for name in df_unmapped_loans.columns
        if df_unmapped_loans[name].dtype == object
    ]
    for name in text_columns:
        df_unmapped_loans[name] = df_unmapped_loans[name].str.strip()

print(f"Unmapped loan codes (top of file): {len(df_unmapped_loans)}")
print(df_unmapped_loans.to_string(index=False))

In [None]:
# Flatten the Loans catalog, then print row counts and the L1 / L3 value distributions.
df_gt_loans = parse_hierarchical_product_gt(LOANS_GT_PATH, product_domain="Loans")

n_loan_codes = df_gt_loans["product_code"].nunique()
print(f"Loan GT rows: {len(df_gt_loans)}")
print(f"Unique product codes: {n_loan_codes}")
for heading, column in (
    ("LoB", "gt_L1_line_of_business"),
    ("Category", "gt_L3_category"),
):
    print(f"\n{heading} distribution:")
    print(df_gt_loans[column].value_counts().to_string())
print()
df_gt_loans.head(10)

### Section A.3 — Merge and Save Ground Truth

In [None]:
# Stack the Deposits and Loans ground truth into one frame with a fresh index.
gt_parts = [df_gt_deposits, df_gt_loans]
df_gt = pd.concat(gt_parts, ignore_index=True)

# Replace remaining stringified nulls with real nulls
df_gt.replace({"None": None, "nan": None}, inplace=True)

n_rows = len(df_gt)
n_codes = df_gt["product_code"].nunique()
print(f"Combined ground truth: {n_rows} rows, {n_codes} unique codes")

# One value-count listing per taxonomy column of interest.
for heading, column in (
    ("Line of Business", "gt_L1_line_of_business"),
    ("Product Type (L2)", "gt_L2_type"),
    ("Category (L3)", "gt_L3_category"),
    ("Domain", "product_domain"),
):
    print(f"\n{heading}:")
    print(df_gt[column].value_counts().to_string())
print()
df_gt

In [None]:
# ── Save ground truth to Unity Catalog ────────────────────────────
# Only the lookup of the `spark` global sits inside the try, so a NameError
# raised for any other reason (e.g. GT_TABLE or df_gt not defined because an
# earlier cell was skipped) surfaces instead of being reported as
# "Spark session not found".
try:
    _spark_session = spark
except NameError:
    print("Spark session not found — skipping UC write (run in Databricks).")
    print(f"DataFrame ready with {len(df_gt)} rows.")
else:
    sdf_gt = _spark_session.createDataFrame(df_gt)
    # Overwrite both the data and the schema of any existing table.
    sdf_gt.write.mode("overwrite").option("overwriteSchema", "true").saveAsTable(GT_TABLE)
    print(f"Saved {len(df_gt)} rows to {GT_TABLE}")

---
## Section B — Product Code Catalog

Build a catalog of unique product codes (ACTYPE) from raw Deposit, Loan, and CD files.
For each code, capture a description and the account count.

In [None]:
# ── Load reference lookups ────────────────────────────────────────
def _load_type_reference(csv_path):
    """Read a reference CSV and build a lookup from its first column to its second.

    Column names and string cells are whitespace-stripped. Returns the cleaned
    frame, the code column name, the description column name, and the
    ``{str(code): description}`` dict.
    """
    frame = pd.read_csv(csv_path, encoding="latin-1")
    frame.columns = [name.strip() for name in frame.columns]
    for name in frame.columns:
        if frame[name].dtype == object:
            frame[name] = frame[name].str.strip()

    # Build code-to-description lookup from the first two columns
    code_col = frame.columns[0]
    desc_col = frame.columns[1]
    lookup = dict(zip(frame[code_col].astype(str), frame[desc_col]))
    return frame, code_col, desc_col, lookup


df_dda_types, dda_code_col, dda_desc_col, dda_lookup = _load_type_reference(DDA_TYPES_PATH)
print(f"DDA Types lookup: {len(dda_lookup)} entries")

df_loan_types, loan_code_col, loan_desc_col, loan_lookup = _load_type_reference(LOAN_TYPES_PATH)
print(f"Loan Types lookup: {len(loan_lookup)} entries")

In [None]:
# ── Build Deposit catalog from raw data ────────────────────────────
# Read through Spark when a session exists; otherwise read the matching
# local files with pandas.
try:
    sdf_dep = spark.read.csv(RAW_DEPOSIT_PATH, header=True, inferSchema=True)
    df_dep_raw = sdf_dep.toPandas()
except NameError:
    import glob as globmod
    dep_files = globmod.glob(RAW_DEPOSIT_PATH)
    if not dep_files:
        raise FileNotFoundError(f"No files matching {RAW_DEPOSIT_PATH}")
    dep_frames = [pd.read_csv(f, encoding="latin-1") for f in dep_files]
    df_dep_raw = pd.concat(dep_frames, ignore_index=True)

# Normalise headers and the product code column to stripped text.
df_dep_raw.columns = [header.strip() for header in df_dep_raw.columns]
df_dep_raw["ACTYPE"] = df_dep_raw["ACTYPE"].astype(str).str.strip()

# One row per ACTYPE with the number of raw rows carrying it.
dep_counts = df_dep_raw.groupby("ACTYPE").size().reset_index(name="account_count")

# Describe each code from the DDA Types lookup; unmatched codes get a placeholder.
dep_names = dep_counts["ACTYPE"].map(dda_lookup).fillna("(unknown)")
dep_catalog = (
    dep_counts
    .assign(product_name=dep_names, source_file="Deposit")
    .rename(columns={"ACTYPE": "product_code"})
)

n_dep_codes = len(dep_catalog)
print(f"Deposit catalog: {n_dep_codes} unique ACTYPE codes")
print(f"Total accounts:  {dep_catalog['account_count'].sum():,}")
dep_catalog.sort_values("account_count", ascending=False).head(10)

In [None]:
# ── Build Loan catalog from raw data ──────────────────────────────
# Read through Spark when a session exists; otherwise read the matching
# local files with pandas.
try:
    sdf_loan = spark.read.csv(RAW_LOAN_PATH, header=True, inferSchema=True)
    df_loan_raw = sdf_loan.toPandas()
except NameError:
    import glob as globmod
    loan_files = globmod.glob(RAW_LOAN_PATH)
    if not loan_files:
        raise FileNotFoundError(f"No files matching {RAW_LOAN_PATH}")
    loan_frames = [pd.read_csv(f, encoding="latin-1") for f in loan_files]
    df_loan_raw = pd.concat(loan_frames, ignore_index=True)

# Normalise headers and the product code column to stripped text.
df_loan_raw.columns = [header.strip() for header in df_loan_raw.columns]
df_loan_raw["ACTYPE"] = df_loan_raw["ACTYPE"].astype(str).str.strip()

# Per ACTYPE: row count plus the first PURCOD, PurposeDescription and
# LoanTypeDesc seen for that code, kept as context.
loan_aggregations = {
    "account_count": ("ACTYPE", "size"),
    "sample_purcod": ("PURCOD", "first"),
    "sample_purpose_desc": ("PurposeDescription", "first"),
    "sample_loan_type_desc": ("LoanTypeDesc", "first"),
}
loan_catalog = df_loan_raw.groupby("ACTYPE").agg(**loan_aggregations).reset_index()

# Name resolution order: Loan Types lookup, then the sampled LoanTypeDesc,
# then a placeholder.
fallback_names = loan_catalog["sample_loan_type_desc"].fillna("(unknown)")
loan_catalog["product_name"] = loan_catalog["ACTYPE"].map(loan_lookup)
loan_catalog["product_name"] = loan_catalog["product_name"].fillna(fallback_names)
loan_catalog["source_file"] = "Loan"
loan_catalog = loan_catalog.rename(columns={"ACTYPE": "product_code"})

n_loan_catalog_codes = len(loan_catalog)
print(f"Loan catalog: {n_loan_catalog_codes} unique ACTYPE codes")
print(f"Total accounts: {loan_catalog['account_count'].sum():,}")
loan_catalog.sort_values("account_count", ascending=False).head(10)

In [None]:
# ── Build CD catalog from raw data ────────────────────────────────
# Read through Spark when a session exists; otherwise read the matching
# local files with pandas.
try:
    sdf_cd = spark.read.csv(RAW_CD_PATH, header=True, inferSchema=True)
    df_cd_raw = sdf_cd.toPandas()
except NameError:
    import glob as globmod
    cd_files = globmod.glob(RAW_CD_PATH)
    if not cd_files:
        raise FileNotFoundError(f"No files matching {RAW_CD_PATH}")
    cd_frames = [pd.read_csv(f, encoding="latin-1") for f in cd_files]
    df_cd_raw = pd.concat(cd_frames, ignore_index=True)

# Normalise headers and the product code column to stripped text.
df_cd_raw.columns = [header.strip() for header in df_cd_raw.columns]
df_cd_raw["ACTYPE"] = df_cd_raw["ACTYPE"].astype(str).str.strip()

# One row per ACTYPE with the number of raw rows carrying it.
cd_counts = df_cd_raw.groupby("ACTYPE").size().reset_index(name="account_count")

# No reference lookup is applied to CDs here; the name is synthesised from the code.
cd_names = "(CD type " + cd_counts["ACTYPE"] + ")"
cd_catalog = (
    cd_counts
    .assign(product_name=cd_names, source_file="CD")
    .rename(columns={"ACTYPE": "product_code"})
)

n_cd_codes = len(cd_catalog)
print(f"CD catalog: {n_cd_codes} unique ACTYPE codes")
print(f"Total accounts: {cd_catalog['account_count'].sum():,}")
cd_catalog.sort_values("account_count", ascending=False).head(10)

In [None]:
# ── Combine into a single product catalog ─────────────────────────
# Deposit and CD catalogs contribute only the common columns; the loan
# catalog also contributes its sampled context columns.
loan_extra_cols = ["sample_purcod", "sample_purpose_desc", "sample_loan_type_desc"]
common_cols = ["product_code", "product_name", "account_count", "source_file"]

df_catalog = pd.concat([
    dep_catalog[common_cols],
    loan_catalog[common_cols + loan_extra_cols],
    cd_catalog[common_cols],
], ignore_index=True)

# pd.concat has already created the loan-only columns for the deposit/CD rows
# and filled them with NaN, so a "create the column if missing" step can never
# fire. What is still needed is to store real nulls (None) instead of float
# NaN, so these columns do not mix NaN floats with strings.
# NOTE(review): mixed NaN/str object columns are a common cause of Spark
# schema-inference failures in createDataFrame — confirm on the target runtime.
for col in loan_extra_cols:
    df_catalog[col] = pd.Series(
        [None if pd.isna(value) else value for value in df_catalog[col]],
        index=df_catalog.index,
        dtype=object,
    )

df_catalog["product_code"] = df_catalog["product_code"].astype(str)

print(f"Combined product catalog: {len(df_catalog)} codes")
print(f"  Deposit: {len(dep_catalog)}")
print(f"  Loan:    {len(loan_catalog)}")
print(f"  CD:      {len(cd_catalog)}")
print(f"Total accounts: {df_catalog['account_count'].sum():,}")
df_catalog.sort_values("account_count", ascending=False)

---
## Section C — Layer Assignment

Assign each catalog code to a test layer based on how it appears in the ground truth:

| Layer | Name | Rule |
|-------|------|------|
| 1 | Obvious | Exactly 1 unique (L1,L2,L3,L4,L5) mapping in GT |
| 2 | Ambiguous | 2+ distinct mappings in GT |
| 3 | Unknown | product_code absent from GT entirely |

In [None]:
# Count, per product code, how many distinct (L1..L5) mappings the ground
# truth holds. De-duplicating (code + levels) once and counting the rows left
# per code gives the same numbers as de-duplicating inside every group, but
# without running a Python lambda per group, and it also works when df_gt has
# no rows.
gt_level_cols = [
    "gt_L1_line_of_business", "gt_L2_type", "gt_L3_category",
    "gt_L4_subcategory", "gt_L5_special",
]
gt_mapping_counts = (
    df_gt[["product_code"] + gt_level_cols]
    .drop_duplicates()
    .groupby("product_code")
    .size()
    .reset_index(name="n_mappings")
)

# Codes with several distinct mappings are "ambiguous"; exactly one is "obvious".
multi_codes  = set(gt_mapping_counts.loc[gt_mapping_counts["n_mappings"] > 1, "product_code"])
single_codes = set(gt_mapping_counts.loc[gt_mapping_counts["n_mappings"] == 1, "product_code"])
all_gt_codes = set(df_gt["product_code"].unique())

print(f"GT codes with 1 mapping (Layer 1):  {len(single_codes)}")
print(f"GT codes with 2+ mappings (Layer 2): {len(multi_codes)}")
if multi_codes:
    print(f"Multi-mapping codes: {sorted(multi_codes)}")

In [None]:
def assign_layer(code):
    """Return the test layer for a catalog code.

    3 (Unknown) when the code is not in ``all_gt_codes``, 2 (Ambiguous) when
    it is in ``multi_codes``, otherwise 1 (Obvious).
    """
    is_known = code in all_gt_codes
    if not is_known:
        return 3  # Unknown
    return 2 if code in multi_codes else 1  # Ambiguous / Obvious


# Tag every catalog row with its layer number.
df_catalog["layer"] = df_catalog["product_code"].apply(assign_layer)

# ── Summary ───────────────────────────────────────────────────────
layer_names = {1: "Obvious", 2: "Ambiguous", 3: "Unknown"}

# Distinct codes and summed account counts per layer.
layer_summary = (
    df_catalog
    .groupby("layer")
    .agg(codes=("product_code", "nunique"), total_accounts=("account_count", "sum"))
    .reset_index()
)

# Share of all accounts that falls in each layer, as a percentage with one decimal.
grand_total_accounts = layer_summary["total_accounts"].sum()
account_share = layer_summary["total_accounts"] / grand_total_accounts * 100
layer_summary["pct_accounts"] = account_share.round(1)
layer_summary["name"] = layer_summary["layer"].map(layer_names)

summary_columns = ["layer", "name", "codes", "total_accounts", "pct_accounts"]
print("Layer assignment summary:")
print(layer_summary[summary_columns].to_string(index=False))
print(f"\nTotal codes: {len(df_catalog)}")

In [None]:
# ── Show codes per layer ──────────────────────────────────────────
# For each layer, list its catalog rows from most to fewest accounts.
banner = "=" * 60
for layer_num, layer_name in layer_names.items():
    in_layer = df_catalog["layer"] == layer_num
    layer_df = df_catalog.loc[in_layer].sort_values("account_count", ascending=False)

    print(f"\n{banner}")
    print(f"Layer {layer_num} — {layer_name} ({len(layer_df)} codes)")
    print(banner)

    for _, row in layer_df.iterrows():
        code_text = str(row["product_code"])
        name_text = str(row["product_name"])[:45]  # keep long names on one line
        fields = [
            f"  code={code_text:>3}",
            f"accts={row['account_count']:>7,}",
            f"{row['source_file']:<7}",
            name_text,
        ]
        print(" | ".join(fields))

In [None]:
# ── Save catalog (with layers) to Unity Catalog ───────────────────
# Only the lookup of the `spark` global sits inside the try, so a NameError
# raised for any other reason (e.g. CATALOG_TABLE or df_catalog not defined
# because an earlier cell was skipped) surfaces instead of being reported as
# "Spark session not found".
try:
    _spark_session = spark
except NameError:
    print("Spark session not found — skipping UC write (run in Databricks).")
    print(f"DataFrame ready with {len(df_catalog)} rows.")
else:
    sdf_catalog = _spark_session.createDataFrame(df_catalog)
    # Overwrite both the data and the schema of any existing table.
    sdf_catalog.write.mode("overwrite").option("overwriteSchema", "true").saveAsTable(CATALOG_TABLE)
    print(f"Saved {len(df_catalog)} rows to {CATALOG_TABLE}")

---
## Validation

Verify that all Unity Catalog tables were created successfully.

In [None]:
# Only the lookup of the `spark` global sits inside the try, so a NameError
# from anything else in the validation is not mistaken for a missing Spark
# session.
try:
    _spark_session = spark
except NameError:
    print("Spark session not found — skipping UC validation (run in Databricks).")
    print("Local DataFrames are ready:")
    print(f"  df_gt:      {len(df_gt)} rows, {df_gt['product_code'].nunique()} unique codes")
    print(f"  df_catalog: {len(df_catalog)} rows, columns: {list(df_catalog.columns)}")
else:
    # Each table must be queryable; report its row count.
    for table_name, expected_label in [
        (GT_TABLE, "product_ground_truth_normalized"),
        (CATALOG_TABLE, "product_code_catalog"),
    ]:
        count = _spark_session.sql(f"SELECT COUNT(*) as cnt FROM {table_name}").collect()[0]["cnt"]
        print(f"  OK  {expected_label}: {count} rows")

    # The saved catalog must carry the layer column added in Section C.
    # Raised explicitly (same exception type as before) so the check still
    # runs when Python is started with -O, which strips assert statements.
    catalog_cols = [f.name for f in _spark_session.table(CATALOG_TABLE).schema.fields]
    if "layer" not in catalog_cols:
        raise AssertionError("Missing 'layer' column in catalog table")
    print(f"  OK  catalog has 'layer' column")

    print("\nAll validations passed.")