In [1]:
"""
FFIEC Bank Splitter (Step 2 Track 2) - Per-Bank Output
======================================================
Filters the large FFIEC CSV to specific banks using chunked reading,
then saves each bank to its own CSV file for per-bank ML pipelines.

Input:  ffiec_all_banks.csv (6GB+)
Output: per_bank/ffiec_<bank_name>.csv  (one file per bank)
        per_bank/ffiec_filtered_banks.csv (combined file, for reference)

Author: Wake Forest MSBA Practicum Team 4
Date: January 2026
"""

import pandas as pd
from pathlib import Path
import gc
import warnings
print("start")
# Suppress the mixed dtype warnings (expected with FFIEC data)
warnings.filterwarnings('ignore', category=pd.errors.DtypeWarning)

# =============================================================================
# CONFIGURATION
# =============================================================================

# Source CSV streamed by main(); a relative path, so it is resolved against
# the current working directory.
INPUT_FILE = "ffiec_all_banks.csv"
# Directory that receives the per-bank CSVs and the combined CSV.
OUTPUT_DIR = Path("per_bank")
# Number of rows per chunk, passed to pd.read_csv as `chunksize`.
CHUNK_SIZE = 10_000

# Maps an IDRSSD value to the bank's display name. Keys are strings because
# main() reads the IDRSSD column with dtype=str and matches on exact equality.
# The display name is used in console messages and, via sanitize_name(), in
# the per-bank output file name.
BANKS_TO_INCLUDE = {
    '480228': 'Bank of America',
    '852218': 'JPMorgan Chase Bank',
    '476810': 'Citibank',
    '451965': 'Wells Fargo Bank',
    '2182786': 'Goldman Sachs Bank USA',
    '1456501': 'Morgan Stanley Bank',
}

# =============================================================================
# HELPER: Sanitize bank name for filenames
# =============================================================================

def sanitize_name(name):
    """Return a lowercase filename slug for a bank name.

    Spaces become underscores; periods and commas are dropped. Every other
    character is passed through unchanged (after lowercasing).
    """
    # Single translation pass: ' ' -> '_', while '.' and ',' are deleted.
    slug_table = str.maketrans(' ', '_', '.,')
    lowered = name.lower()
    return lowered.translate(slug_table)

# =============================================================================
# MAIN
# =============================================================================

def main():
    """Extract the configured banks from the FFIEC CSV and write per-bank CSVs.

    Streams ``INPUT_FILE`` in ``CHUNK_SIZE``-row chunks, keeps only the rows
    whose ``IDRSSD`` is a key of ``BANKS_TO_INCLUDE``, sorts the kept rows by
    ``IDRSSD`` then ``quarter``, and writes into ``OUTPUT_DIR``:

    * ``ffiec_<slug>.csv`` for every configured bank that has at least one row
    * ``ffiec_filtered_banks.csv`` containing all kept rows

    Progress and a final summary are printed to stdout.

    Returns:
        The combined, sorted DataFrame of all kept rows, or ``None`` when no
        row matches any configured bank (no CSV is written in that case).
    """
    # parents=True allows OUTPUT_DIR to be a nested path whose parent
    # directories do not exist yet.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    banner = '=' * 60
    print(f"\n{banner}")
    print("FFIEC BANK SPLITTER (Step 2 Track 2)")
    print(banner)
    print(f"\nInput:  {INPUT_FILE}")
    print(f"Output: {OUTPUT_DIR}/  (one CSV per bank)")
    print(f"\nBanks to extract ({len(BANKS_TO_INCLUDE)}):")
    for rssd, name in BANKS_TO_INCLUDE.items():
        print(f"  - {name} (RSSD: {rssd})")

    # -----------------------------------------------------------------
    # Phase 1: Chunked reading & filtering
    # -----------------------------------------------------------------
    print(f"\nPhase 1: Loading {INPUT_FILE} in chunks...")

    rssd_ids = set(BANKS_TO_INCLUDE)
    filtered_chunks = []
    total_rows_read = 0

    # The chunked reader keeps the input file open while iterating; the
    # with-block guarantees it is closed even if processing a chunk raises.
    with pd.read_csv(
        INPUT_FILE,
        dtype={'IDRSSD': str},  # read IDs as text so they compare equal to the config keys
        chunksize=CHUNK_SIZE,
        low_memory=True,
    ) as chunk_reader:
        for chunk_count, chunk in enumerate(chunk_reader, start=1):
            total_rows_read += len(chunk)

            filtered_chunk = chunk[chunk['IDRSSD'].isin(rssd_ids)].copy()
            if not filtered_chunk.empty:
                filtered_chunks.append(filtered_chunk)

            if chunk_count % 20 == 0:
                print(f"  Processed {chunk_count} chunks ({total_rows_read:,} rows)")

            # Drop the full chunk before the reader parses the next one so
            # two full chunks are not held in memory at the same time.
            del chunk
            gc.collect()

    print(f"\nTotal rows read: {total_rows_read:,}")

    if not filtered_chunks:
        print("ERROR: No matching banks found!")
        return None

    df = pd.concat(filtered_chunks, ignore_index=True)
    df = df.sort_values(['IDRSSD', 'quarter']).reset_index(drop=True)
    del filtered_chunks
    gc.collect()

    print(f"\nCombined shape: {df.shape[0]:,} rows x {df.shape[1]:,} columns")

    # -----------------------------------------------------------------
    # Phase 2: Split and save per-bank files
    # -----------------------------------------------------------------
    print("\nPhase 2: Splitting into per-bank files...")
    print("-" * 60)

    saved_files = {}

    for rssd_id, bank_name in BANKS_TO_INCLUDE.items():
        # The slice is only written out (without its index) and counted, so
        # no extra copy or index reset is needed.
        bank_df = df[df['IDRSSD'] == rssd_id]

        if bank_df.empty:
            print(f"  WARNING: {bank_name} (RSSD: {rssd_id}) - no data found, skipping")
            continue

        filename = f"ffiec_{sanitize_name(bank_name)}.csv"
        bank_df.to_csv(OUTPUT_DIR / filename, index=False)

        # NOTE(review): 'quarters' is the row count; this presumes one row
        # per bank per quarter in the input - confirm against the data.
        row_count, column_count = bank_df.shape
        saved_files[bank_name] = {
            'file': filename,
            'quarters': row_count,
            'columns': column_count,
        }

        print(f"  {bank_name:30s} -> {filename}")
        print(f"    {row_count:>4} quarters | {column_count:,} columns")

    # Also save the combined file for reference
    combined_path = OUTPUT_DIR / "ffiec_filtered_banks.csv"
    df.to_csv(combined_path, index=False)

    # -----------------------------------------------------------------
    # Summary
    # -----------------------------------------------------------------
    print(f"\n{banner}")
    print("SUMMARY")
    print(banner)
    print(f"\nFiles saved to: {OUTPUT_DIR.resolve()}/")
    print(f"\n  Per-bank files ({len(saved_files)}):")
    for info in saved_files.values():
        print(f"    {info['file']:45s} ({info['quarters']} quarters)")
    print("\n  Combined file:")
    print(f"    ffiec_filtered_banks.csv (all {len(df):,} rows)")
    print("\nDone.")

    return df


if __name__ == "__main__":
    # Bind the result at module level so the combined DataFrame (or None when
    # no configured bank matched) stays available after the run, e.g. for
    # inspection in an interactive session.
    df = main()

start

FFIEC BANK SPLITTER (Step 2 Track 2)

Input:  ffiec_all_banks.csv
Output: per_bank/  (one CSV per bank)

Banks to extract (6):
  - Bank of America (RSSD: 480228)
  - JPMorgan Chase Bank (RSSD: 852218)
  - Citibank (RSSD: 476810)
  - Wells Fargo Bank (RSSD: 451965)
  - Goldman Sachs Bank USA (RSSD: 2182786)
  - Morgan Stanley Bank (RSSD: 1456501)

Phase 1: Loading ffiec_all_banks.csv in chunks...
  Processed 20 chunks (200,000 rows)
  Processed 40 chunks (400,000 rows)
  Processed 60 chunks (600,000 rows)

Total rows read: 657,973


  df = pd.concat(filtered_chunks, ignore_index=True)



Combined shape: 563 rows x 6,444 columns

Phase 2: Splitting into per-bank files...
------------------------------------------------------------
  Bank of America                -> ffiec_bank_of_america.csv
      99 quarters | 6,444 columns
  JPMorgan Chase Bank            -> ffiec_jpmorgan_chase_bank.csv
      99 quarters | 6,444 columns
  Citibank                       -> ffiec_citibank.csv
      99 quarters | 6,444 columns
  Wells Fargo Bank               -> ffiec_wells_fargo_bank.csv
      99 quarters | 6,444 columns
  Goldman Sachs Bank USA         -> ffiec_goldman_sachs_bank_usa.csv
      68 quarters | 6,444 columns
  Morgan Stanley Bank            -> ffiec_morgan_stanley_bank.csv
      99 quarters | 6,444 columns

SUMMARY

Files saved to: C:\Users\olive\per_bank/

  Per-bank files (6):
    ffiec_bank_of_america.csv                     (99 quarters)
    ffiec_jpmorgan_chase_bank.csv                 (99 quarters)
    ffiec_citibank.csv                            (99 quarters)
   