# 01 — Prepare Transaction Catalog

**Objective:** Extract all unique transaction codes from the raw BankPlus data files and pick sample descriptions for each to provide context to the LLM.

### Tasks:
1. Read `NON_POS` and `POS` raw data files.
2. Identify unique `TRANCD` values.
3. Pick 3 sample descriptions for each code (from `EFHDS1` for NON_POS, `description` for POS).
4. Calculate volume (count) per code.
5. Save catalog to `taxonomy/data/transaction_code_catalog.csv`.

In [None]:
import pandas as pd
import numpy as np
import os

# Input files (raw daily extracts) and the output catalog location,
# all relative to the directory the notebook is run from.
non_pos_path, pos_path, output_path = (
    "../data/CheckingIQ_NON_POS_Daily_012626_rerun.csv",
    "../data/CheckingIQ_POS_Daily_012626_rerun.csv",
    "../taxonomy/data/transaction_code_catalog.csv",
)

print("Paths initialized.")

In [None]:
def extract_catalog(file_path, desc_col, source_name, n_samples=3):
    """Build a per-transaction-code catalog from one raw CSV file.

    Reads only the ``TRANCD`` column and ``desc_col`` from ``file_path``,
    then produces one row per distinct ``TRANCD`` value.

    Args:
        file_path: Path of the CSV file to read.
        desc_col: Name of the column holding the transaction description.
        source_name: Label written to the ``source_file`` column and used
            in the progress message.
        n_samples: How many distinct, non-null descriptions to keep per
            code (default 3, giving ``sample_desc_1`` .. ``sample_desc_3``).

    Returns:
        A DataFrame with columns ``TRANCD``, ``volume`` (row count for the
        code), ``sample_desc_1`` .. ``sample_desc_<n_samples>`` (``None``
        where a code has fewer distinct descriptions) and ``source_file``.
    """
    print(f"Processing {source_name} file...")
    # Read only the necessary columns to save memory. Both columns are read
    # as strings: codes must keep leading zeros, and descriptions must not be
    # type-inferred (an all-numeric description such as "007" would otherwise
    # come back as the number 7, and large files can yield mixed-type chunks).
    df = pd.read_csv(
        file_path,
        usecols=['TRANCD', desc_col],
        dtype={'TRANCD': str, desc_col: str},
    )

    # One row per code: total row count plus the first few distinct,
    # non-null descriptions in order of appearance.
    agg = df.groupby('TRANCD').agg(
        volume=('TRANCD', 'count'),
        samples=(desc_col, lambda x: list(x.dropna().unique()[:n_samples])),
    ).reset_index()

    # Expand the sample list into fixed columns, padding with None when a
    # code has fewer than n_samples distinct descriptions.
    for i in range(n_samples):
        agg[f'sample_desc_{i+1}'] = agg['samples'].apply(
            lambda x, i=i: x[i] if len(x) > i else None
        )

    agg['source_file'] = source_name
    return agg.drop(columns=['samples'])

# 1-2. Build one catalog per raw file. NON_POS (the primary source) is
# processed first and takes its descriptions from EFHDS1; POS takes them
# from its `description` column.
catalog_non_pos, catalog_pos = (
    extract_catalog(non_pos_path, 'EFHDS1', 'NON_POS'),
    extract_catalog(pos_path, 'description', 'POS'),
)

print(f"Extracted {len(catalog_non_pos)} codes from NON_POS and {len(catalog_pos)} codes from POS.")

In [None]:
# 3. Merge catalogs
# NON_POS rows are stacked ahead of POS rows, so for a code present in both
# files 'first' favours the NON_POS values while the volumes are added up.
catalog_combined = pd.concat([catalog_non_pos, catalog_pos], ignore_index=True)

# Per-column reduction rules. Note that 'first' picks the first non-null
# value in each column, so a sample slot left empty by NON_POS can be filled
# from the POS row of the same code.
_merge_rules = {
    'volume': 'sum',
    **{f'sample_desc_{slot}': 'first' for slot in (1, 2, 3)},
    'source_file': 'first',
}
catalog_final = catalog_combined.groupby('TRANCD').agg(_merge_rules).reset_index()

print(f"Final combined catalog has {len(catalog_final)} unique codes.")

In [None]:
# 4. Save results
# Make sure the destination folder exists before writing the CSV.
_output_dir = os.path.dirname(output_path)
os.makedirs(_output_dir, exist_ok=True)
catalog_final.to_csv(output_path, index=False)
print(f"Saved transaction catalog to: {output_path}")

# Preview the ten highest-volume codes (kept as the last expression of the cell).
catalog_final.sort_values(by='volume', ascending=False).head(10)