# 00 — Prepare Ground Truth

**Objective:** Clean and normalize the Master Fee Table to create a reliable "ground truth" dataset for model evaluation.

### Tasks:
1. Read `taxonomy/data/Master Fee Table(Master).csv`.
2. Normalize column values (casing, whitespace, "N/A" -> null).
3. Filter to transaction codes present in the BankPlus raw data.
4. Resolve multi-mapping ambiguities by prioritizing checking/DDA contexts.
5. Save normalized ground truth to `taxonomy/data/ground_truth_normalized.csv`.

In [None]:
import pandas as pd
import numpy as np
import os

# Locations of the raw Master Fee Table and of the cleaned output.
# Both are relative paths (note the leading ".."), so they resolve against
# the directory the notebook is run from.
input_path = "../taxonomy/data/Master Fee Table(Master).csv"
output_path = "../taxonomy/data/ground_truth_normalized.csv"

print("Reading Master Fee Table from: {}".format(input_path))

In [None]:
# 1. Read the CSV
# Note: The export contains extra header/description rows, and its column
# names are not reliably clean (the description header was previously matched
# with a trailing space). Headers are therefore matched on their
# whitespace-stripped form, so a re-export with tidied or padded headers does
# not silently skip the rename.
column_map = {
    'External Transaction Code': 'TRANCD',
    'External Transaction Description': 'description',
    'Scoring Category 1': 'L1',
    'Scoring Category 2': 'L2',
    'Scoring Category 3': 'L3',
    'Scoring Category 4': 'L4',
    'Credit / Debit': 'credit_debit',
}

# Peek at the header row only, to find the code column under its exact
# (possibly padded) name. It is read as text so the codes are kept as strings
# rather than being parsed as numbers.
header = pd.read_csv(input_path, nrows=0).columns
code_dtypes = {
    col: str for col in header
    if isinstance(col, str) and col.strip() == 'External Transaction Code'
}
df = pd.read_csv(input_path, dtype=code_dtypes or None)

# Rename only the mapped columns; every other column keeps its original name.
df = df.rename(
    columns=lambda col: column_map.get(col.strip(), col) if isinstance(col, str) else col
)

# Fail early with a clear message if the columns this step relies on are absent.
missing = [name for name in ('TRANCD', 'L1') if name not in df.columns]
if missing:
    raise KeyError(f"Master Fee Table is missing expected column(s) after rename: {missing}")

# Drop rows where TRANCD or L1 is missing (usually header/description rows in the CSV)
df = df[df['TRANCD'].notna() & df['L1'].notna()]

print(f"Loaded {len(df)} candidate mapping rows.")

In [None]:
# 2. Normalization Logic

def normalize_l1(val):
    """Map a raw "Scoring Category 1" value onto its canonical label.

    Returns ``'Non-fee item'`` or ``'Fee item'`` for recognised values,
    ``None`` for missing values, and the stripped, lower-cased text for
    anything else.
    """
    if pd.isna(val):
        return None
    val = str(val).strip().lower()
    # Match on a form with hyphens and all whitespace removed so that
    # "Non Fee Item", "non-fee  item" and "nonfee item" are all recognised.
    # The non-fee check must run first: "non fee item" also contains
    # "fee item" and would otherwise be mislabelled as a fee.
    compact = ''.join(val.replace('-', ' ').split())
    if 'nonfee' in compact:
        return 'Non-fee item'
    if 'feeitem' in compact:
        return 'Fee item'
    return val

def normalize_l2(val):
    """Tidy a raw "Scoring Category 2" value.

    Missing values become ``None``. Otherwise the text is stripped of
    surrounding whitespace and a fixed set of known spelling variants is
    rewritten to the canonical spelling.
    """
    if pd.isna(val):
        return None
    cleaned = str(val).strip()
    # (variant, canonical) pairs, applied in this order.
    fixes = (
        ('NSF /OD', 'NSF/OD'),
        ('NSF / OD', 'NSF/OD'),
        ('Money Movement', 'Money movement'),
        ('Account Operations', 'Account operations'),
    )
    for variant, canonical in fixes:
        cleaned = cleaned.replace(variant, canonical)
    return cleaned

def normalize_nulls(val):
    """Return the stripped text of ``val``, or ``None`` for null-like values.

    Null-like means a pandas-missing value, an empty string, or one of
    "N/A", "NONE", "NULL" in any letter case (after stripping whitespace).
    """
    if pd.isna(val):
        return None
    text = str(val).strip()
    null_markers = ('N/A', 'NONE', 'NULL', '')
    return None if text.upper() in null_markers else text

# Apply normalizations
# Each category column goes through its own normalizer, value by value.
for column, normalizer in (
    ('L1', normalize_l1),
    ('L2', normalize_l2),
    ('L3', normalize_nulls),
    ('L4', normalize_nulls),
):
    df[column] = df[column].apply(normalizer)

# The code and description columns only have surrounding whitespace removed.
for column in ('TRANCD', 'description'):
    df[column] = df[column].str.strip()

print("Normalization complete.")

In [None]:
# 3. Filter to codes present in raw data
# Transaction codes recorded (per earlier analysis) as the unique codes in the
# NON_POS and POS raw files.
# NOTE(review): the original comment refers to a "full list of 61" codes, but
# only 54 are listed here — confirm whether any are missing.
raw_data_codes = [
    '183', '163', '227', '144', '83', '141', '222', '299', '223', '142',
    '146', '145', '56', '6', '42', '644', '333', '67', '34', '46',
    '66', '228', '229', '240', '123', '127', '368', '174', '120', '119',
    '9', '49', '59', '8', '297', '296', '212', '281', '261', '287',
    '242', '283', '237', '285', '32', '33', '30', '31', '54', '50',
    '52', '972', '40', '473',
]

# Keep only mapping rows whose code is in the list above; codes are compared
# as strings.
initial_len = len(df)
in_raw_data = df['TRANCD'].isin(raw_data_codes)
df = df[in_raw_data]
print(f"Filtered from {initial_len} to {len(df)} rows matching raw data codes.")

In [None]:
# 4. Resolve multi-mapping ambiguities
# A code can appear on several rows of the table. Report how many codes are
# affected, then keep exactly one row per code.
is_repeated = df.duplicated(subset=['TRANCD'], keep=False)
duplicates = df[is_repeated]
if is_repeated.any():
    print(f"Found {duplicates['TRANCD'].nunique()} codes with multiple mappings. Resolving...")

# Rule: for this test, keep the FIRST row for each code, in the table's order.
# NOTE(review): this relies on the first row usually being the primary
# DDA/Checking mapping in the source sheet; nothing here checks that — confirm.
df_final = df.drop_duplicates(subset=['TRANCD'], keep='first').copy()

# Derive include_in_scoring based on taxonomy rules
def determine_scoring(row):
    """Return True when a mapping row should be included in scoring.

    A row qualifies only when its ``L1`` is ``'Non-fee item'`` and its
    ``L2`` is ``'NSF/OD'`` or ``'Money movement'``; every other row yields
    False. ``L2`` is only looked up when the ``L1`` condition holds.
    """
    # NOTE(review): the rule keys on 'Non-fee item' (not 'Fee item') — confirm
    # this matches the intended taxonomy rule.
    scored_l2 = ('NSF/OD', 'Money movement')
    return bool(row['L1'] == 'Non-fee item' and row['L2'] in scored_l2)

# Evaluate the scoring rule row by row. The column is built from an explicit
# list rather than DataFrame.apply(axis=1) so this still works when filtering
# left no rows: on an empty frame apply() can return an empty DataFrame
# instead of a Series, which cannot be assigned to a single column.
df_final['include_in_scoring'] = pd.Series(
    [determine_scoring(row) for _, row in df_final.iterrows()],
    index=df_final.index,
    dtype=bool,
)

print(f"Final ground truth has {len(df_final)} unique transaction codes.")

In [None]:
# 5. Save Output
# Create the destination folder if needed. os.path.dirname() returns '' for a
# bare file name and os.makedirs('') raises, so a directory is only created
# when the path actually has one.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
df_final.to_csv(output_path, index=False)
print(f"Saved normalized ground truth to: {output_path}")

# Preview
df_final.head(10)