In [5]:
"""
FFIEC Bank Filter (Step 2) - Memory Efficient Version
=====================================================
Filters large FFIEC CSV to specific banks using chunked reading.

Input:  ffiec_all_banks.csv (6GB+)
Output: ffiec_filtered_banks.csv (only selected banks)
"""

import pandas as pd
from pathlib import Path
import gc
import warnings

# Suppress pandas' DtypeWarning (columns whose values parse to mixed types),
# which is expected with FFIEC data.
# NOTE: this filter is installed when the module is imported/executed and is
# process-wide; it is not limited to the CSV read performed in main().
warnings.filterwarnings('ignore', category=pd.errors.DtypeWarning)

# =============================================================================
# CONFIGURATION
# =============================================================================

# CSV that main() streams through pd.read_csv.
INPUT_FILE = "ffiec_all_banks.csv"
# CSV that main() writes the filtered, sorted rows to.
OUTPUT_FILE = "ffiec_filtered_banks.csv"
# Number of rows per chunk handed to pd.read_csv(chunksize=...).
CHUNK_SIZE = 10_000

# Banks to keep, keyed by the value expected in the 'IDRSSD' column.
# Keys must be strings: main() reads 'IDRSSD' with dtype=str and matches rows
# with .isin() against these keys. The values are display names used only in
# the printed per-bank summary.
BANKS_TO_INCLUDE = {
    '480228': 'Bank of America',
    '852218': 'JPMorgan Chase Bank',
    '476810': 'Citibank',
    '451965': 'Wells Fargo Bank',
    '2182786': 'Goldman Sachs Bank USA',
    '1456501': 'Morgan Stanley Bank',
}

# =============================================================================
# MAIN
# =============================================================================

def main(input_file=None, output_file=None, banks=None, chunk_size=None):
    """Filter the FFIEC CSV down to the selected banks and save the result.

    The input is streamed chunk by chunk so that only the current chunk and
    the matching rows collected so far are held in memory. Matching rows are
    concatenated, sorted by ``IDRSSD`` then ``quarter``, written to CSV and
    returned. Progress and a per-bank summary are printed to stdout.

    Args:
        input_file: Path (or file-like object) of the CSV to read.
            Defaults to ``INPUT_FILE``.
        output_file: Path (or file-like object) the filtered CSV is written
            to. Defaults to ``OUTPUT_FILE``.
        banks: Mapping of RSSD ID -> display name. Keys are converted to
            ``str`` because the ``IDRSSD`` column is read as text.
            Defaults to ``BANKS_TO_INCLUDE``.
        chunk_size: Number of rows per chunk. Defaults to ``CHUNK_SIZE``.

    Returns:
        The filtered and sorted ``DataFrame``, or ``None`` when no row
        matched any requested bank (nothing is written in that case).

    Raises:
        KeyError: If the input has no ``IDRSSD`` or no ``quarter`` column.
            This is detected on the first chunk rather than after the whole
            file has been streamed.
    """
    # Fall back to the module-level configuration for anything not supplied.
    input_file = INPUT_FILE if input_file is None else input_file
    output_file = OUTPUT_FILE if output_file is None else output_file
    chunk_size = CHUNK_SIZE if chunk_size is None else chunk_size
    banks = BANKS_TO_INCLUDE if banks is None else banks

    # 'IDRSSD' is parsed as str below, so integer keys would never match.
    banks = {str(rssd): name for rssd, name in banks.items()}
    rssd_ids = set(banks)

    print(f"\nLoading {input_file} in chunks...")

    filtered_chunks = []
    total_rows_read = 0

    # The reader is a context manager; using it guarantees the underlying
    # file handle is closed even if a chunk fails to process.
    with pd.read_csv(
        input_file,
        dtype={'IDRSSD': str},
        chunksize=chunk_size,
        low_memory=True,
    ) as chunk_reader:
        for chunk_count, chunk in enumerate(chunk_reader, start=1):
            if chunk_count == 1:
                # Fail fast: without this, a missing 'quarter' column is only
                # discovered by sort_values() after reading the entire file.
                missing_columns = [
                    col for col in ('IDRSSD', 'quarter')
                    if col not in chunk.columns
                ]
                if missing_columns:
                    raise KeyError(
                        f"Required column(s) missing from input: "
                        f"{missing_columns}"
                    )

            total_rows_read += len(chunk)

            # .copy() detaches the matches from the chunk so the chunk's
            # memory can be released below.
            filtered_chunk = chunk[chunk['IDRSSD'].isin(rssd_ids)].copy()
            if not filtered_chunk.empty:
                filtered_chunks.append(filtered_chunk)

            if chunk_count % 20 == 0:
                print(f"  Processed {chunk_count} chunks ({total_rows_read:,} rows)")

            # Drop the reference before the next chunk is parsed so two full
            # chunks are never alive at once.
            del chunk
            gc.collect()

    print(f"\nTotal rows read: {total_rows_read:,}")

    if not filtered_chunks:
        print("ERROR: No matching banks found!")
        return None

    df = pd.concat(filtered_chunks, ignore_index=True)
    df = df.sort_values(['IDRSSD', 'quarter']).reset_index(drop=True)

    # One pass over the key column instead of re-filtering the frame per bank.
    # sort=False keeps first-appearance order, i.e. the sorted IDRSSD order.
    # The figure is a row count; it equals the number of quarters assuming
    # one row per bank per quarter (TODO confirm against the input data).
    rows_per_bank = df.groupby('IDRSSD', sort=False).size()

    print("\nBanks found:")
    for rssd, n in rows_per_bank.items():
        name = banks.get(rssd, "Unknown")
        print(f"  {name}: {n} quarters")

    # Surface requested banks that produced no rows instead of silently
    # omitting them from the summary.
    not_found = {
        rssd: name for rssd, name in banks.items()
        if rssd not in rows_per_bank.index
    }
    if not_found:
        print("\nWARNING: requested banks with no rows in the input:")
        for rssd, name in not_found.items():
            print(f"  {name} (IDRSSD {rssd})")

    df.to_csv(output_file, index=False)
    print(f"\nSaved: {output_file}")
    print(f"Shape: {df.shape[0]:,} rows x {df.shape[1]:,} columns")

    return df


# Run the filter only when executed directly (script or notebook cell), not on
# import. The result is bound to a module-level `df`, presumably so it can be
# inspected interactively afterwards; it is None when no bank matched.
if __name__ == "__main__":
    df = main()


Loading ffiec_all_banks.csv in chunks...
  Processed 20 chunks (200,000 rows)
  Processed 40 chunks (400,000 rows)
  Processed 60 chunks (600,000 rows)

Total rows read: 657,973


  df = pd.concat(filtered_chunks, ignore_index=True)



Banks found:
  Morgan Stanley Bank: 99 quarters
  Goldman Sachs Bank USA: 68 quarters
  Wells Fargo Bank: 99 quarters
  Citibank: 99 quarters
  Bank of America: 99 quarters
  JPMorgan Chase Bank: 99 quarters

Saved: ffiec_filtered_banks.csv
Shape: 563 rows x 6,444 columns
