In [2]:
"""
FFIEC Quarter-over-Quarter Change Computation (Vectorized) - FIXED VERSION
============================================================================
Purpose: Transform filtered FFIEC data from raw values to quarter-over-quarter
         percentage changes. Anomalies in CHANGES are more meaningful for
         audit purposes than anomalies in raw levels.

Input:  ffiec_filtered_features.csv (output from filtering script)
Output: ffiec_qoq_changes.csv (same banks/quarters, but values are % changes)
        qoq_computation_report.txt (documents the transformation)

Why QoQ changes?
- A bank with $1B in deposits is not "anomalous" just because it's large
- A bank whose deposits DROPPED 40% in one quarter IS anomalous
- Changes normalize for bank size and capture the behavior we care about

FIX in v0.2:
- Properly handles divide-by-zero cases when previous quarter value is 0
- Case 1: previous=0, current=0  -> 0% change (no change)
- Case 2: previous=0, current≠0  -> capped at configurable max (default 100%)
- Added tracking of how many values were affected by these edge cases

Author: Wake Forest MSBA Practicum Team 4
Date: January 2026 (Updated February 2026)
"""

import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime
import re


# =============================================================================
# CONFIGURATION
# =============================================================================

INPUT_FILE = "ffiec_filtered_features.csv"
OUTPUT_FILE = "ffiec_qoq_changes_v2.csv"
REPORT_FILE = "qoq_computation_report.txt"

# Columns that identify the observation (not features)
ID_COLUMN = "IDRSSD"
QUARTER_COLUMN = "quarter"

# Additional metadata columns to exclude from QoQ computation
# These are identifiers that got through filtering because they're numeric
EXCLUDE_COLUMNS = [
    'FDIC Certificate Number',
    'OCC Charter Number',
    'OTS Docket Number',
    'Primary ABA Routing Number',
    'Financial Institution Name',
    'Financial Institution Address',
    'Financial Institution City',
    'Financial Institution State',
    'Financial Institution Zip Code',
]

# How to handle edge cases
MIN_QUARTERS_PER_BANK = 2  # Banks with fewer quarters cannot have QoQ changes
WINSORIZE_PERCENTILE = 99  # Cap extreme changes at this percentile (both tails)

# NEW: Configuration for divide-by-zero handling
# When previous=0 and current≠0, what % change should we assign?
# Options:
#   - A fixed cap (e.g., 100% means "grew from zero to something")
#   - np.nan to exclude these cases
#   - A very large number to flag them as extreme
MAX_CHANGE_FROM_ZERO = 100.0  # Cap at 100% when going from 0 to non-zero


# =============================================================================
# HELPER FUNCTIONS
# =============================================================================

def parse_quarter_to_date(quarter_str):
    """
    Convert FFIEC quarter string to a sortable date.
    
    Input format: "FFIEC CDR Call Bulk All Schedules 03312024"
    Output: datetime(2024, 3, 31)
    
    This ensures quarters sort chronologically, not alphabetically.
    """
    match = re.search(r'(\d{8})$', quarter_str)
    if match:
        date_str = match.group(1)
        # Format: MMDDYYYY
        month = int(date_str[0:2])
        day = int(date_str[2:4])
        year = int(date_str[4:8])
        return datetime(year, month, day)
    else:
        return quarter_str


def compute_qoq_changes_safe(current, previous, max_change_from_zero=MAX_CHANGE_FROM_ZERO):
    """
    Compute quarter-over-quarter percentage changes with proper handling of edge cases.
    
    Formula: (current - previous) / |previous| * 100
    
    Edge cases handled:
    1. previous=0, current=0  -> 0% (no change)
    2. previous=0, current>0  -> +max_change_from_zero% (capped positive growth)
    3. previous=0, current<0  -> -max_change_from_zero% (capped negative growth)
    4. previous=NaN           -> NaN (first quarter, no previous data)
    
    Returns:
        pct_change: DataFrame with percentage changes
        stats: dict with counts of how each edge case was handled
    """
    # Initialize stats tracking
    stats = {
        'total_cells': current.size,
        'normal_computation': 0,
        'both_zero': 0,
        'from_zero_positive': 0,
        'from_zero_negative': 0,
        'previous_nan': 0,
    }
    
    # Create masks for different cases BEFORE computation
    # These need to be computed on the original DataFrames
    mask_prev_nan = previous.isna()
    mask_prev_zero = (previous == 0) & ~mask_prev_nan
    mask_curr_zero = (current == 0)
    mask_curr_positive = (current > 0)
    mask_curr_negative = (current < 0)
    
    # Case 1: Both zero -> 0% change
    mask_both_zero = mask_prev_zero & mask_curr_zero
    
    # Case 2: Previous zero, current positive -> cap at +max%
    mask_from_zero_positive = mask_prev_zero & mask_curr_positive
    
    # Case 3: Previous zero, current negative -> cap at -max%
    mask_from_zero_negative = mask_prev_zero & mask_curr_negative
    
    # Compute the standard percentage change
    # Suppress warnings for divide-by-zero (we handle it explicitly)
    with np.errstate(divide='ignore', invalid='ignore'):
        pct_change = (current - previous) / previous.abs() * 100
    
    # Replace inf/-inf with NaN temporarily
    pct_change = pct_change.replace([np.inf, -np.inf], np.nan)
    
    # Now apply our edge case fixes
    # Case 1: Both zero -> 0%
    pct_change = pct_change.where(~mask_both_zero, 0.0)
    
    # Case 2: From zero to positive -> +max_change_from_zero%
    pct_change = pct_change.where(~mask_from_zero_positive, max_change_from_zero)
    
    # Case 3: From zero to negative -> -max_change_from_zero%
    pct_change = pct_change.where(~mask_from_zero_negative, -max_change_from_zero)
    
    # Compute statistics
    stats['both_zero'] = mask_both_zero.sum().sum()
    stats['from_zero_positive'] = mask_from_zero_positive.sum().sum()
    stats['from_zero_negative'] = mask_from_zero_negative.sum().sum()
    stats['previous_nan'] = mask_prev_nan.sum().sum()
    stats['normal_computation'] = (
        stats['total_cells'] 
        - stats['both_zero'] 
        - stats['from_zero_positive'] 
        - stats['from_zero_negative']
        - stats['previous_nan']
    )
    
    return pct_change, stats


def generate_report(df_in, df_out, feature_cols, change_cols, banks_removed, edge_case_stats):
    """
    Generate a report documenting the QoQ transformation.
    """
    lines = [
        "=" * 70,
        "FFIEC QUARTER-OVER-QUARTER CHANGE COMPUTATION REPORT",
        f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "=" * 70,
        "",
        "INPUT SUMMARY",
        "-" * 40,
        f"Input file: {INPUT_FILE}",
        f"Rows (bank-quarters): {len(df_in):,}",
        f"Unique banks: {df_in[ID_COLUMN].nunique():,}",
        f"Unique quarters: {df_in[QUARTER_COLUMN].nunique():,}",
        f"Feature columns: {len(feature_cols):,}",
        "",
        "TRANSFORMATION",
        "-" * 40,
        "For each bank, sorted by quarter date:",
        "  change = (current - previous) / |previous| * 100",
        "",
        f"Banks removed (fewer than {MIN_QUARTERS_PER_BANK} quarters): {banks_removed:,}",
        f"Winsorization: Values capped at {WINSORIZE_PERCENTILE}th percentile",
        "",
        "EDGE CASE HANDLING (Divide-by-Zero Fix)",
        "-" * 40,
        f"Total cells computed: {edge_case_stats['total_cells']:,}",
        f"Normal computations: {edge_case_stats['normal_computation']:,}",
        f"Both zero (0->0 = 0%): {edge_case_stats['both_zero']:,}",
        f"From zero positive (0->+ = +{MAX_CHANGE_FROM_ZERO}%): {edge_case_stats['from_zero_positive']:,}",
        f"From zero negative (0->- = -{MAX_CHANGE_FROM_ZERO}%): {edge_case_stats['from_zero_negative']:,}",
        f"Previous NaN (first quarter): {edge_case_stats['previous_nan']:,}",
        "",
        "OUTPUT SUMMARY",
        "-" * 40,
        f"Output file: {OUTPUT_FILE}",
        f"Rows (bank-quarters with valid changes): {len(df_out):,}",
        f"Unique banks: {df_out[ID_COLUMN].nunique():,}",
        f"Change columns: {len(change_cols):,}",
        "",
        "NOTES",
        "-" * 40,
        "- First quarter for each bank has NaN changes (no previous quarter)",
        "- Rows with all-NaN changes are removed",
        "- Original feature values are NOT included (only changes)",
        "- Use IDRSSD + quarter to join back to original data if needed",
        "- v0.2 FIX: Divide-by-zero cases now handled properly",
        "",
        "SAMPLE CHANGE COLUMNS",
        "-" * 40,
    ]
    
    for col in change_cols[:20]:
        lines.append(f"  {col}")
    if len(change_cols) > 20:
        lines.append(f"  ... and {len(change_cols) - 20} more")
    
    lines.extend([
        "",
        "SAMPLE STATISTICS (after winsorization)",
        "-" * 40,
    ])
    
    sample_cols = change_cols[:5]
    for col in sample_cols:
        if col in df_out.columns:
            stats = df_out[col].describe()
            null_count = df_out[col].isna().sum()
            lines.append(f"\n{col}:")
            lines.append(f"  Mean:   {stats['mean']:,.2f}%")
            lines.append(f"  Std:    {stats['std']:,.2f}%")
            lines.append(f"  Min:    {stats['min']:,.2f}%")
            lines.append(f"  Max:    {stats['max']:,.2f}%")
            lines.append(f"  NaN:    {null_count:,}")
    
    return "\n".join(lines)


# =============================================================================
# MAIN PIPELINE
# =============================================================================

def main():
    """
    Main QoQ computation pipeline (vectorized for speed).
    """
    
    print("\n" + "=" * 70)
    print("FFIEC QUARTER-OVER-QUARTER CHANGE COMPUTATION (v0.2 - FIXED)")
    print("=" * 70 + "\n")
    
    # Step 1: Load data
    print(f"Loading {INPUT_FILE}...")
    df = pd.read_csv(INPUT_FILE, low_memory=False)
    print(f"  Loaded: {df.shape[0]:,} rows x {df.shape[1]:,} columns")
    print(f"  Banks: {df[ID_COLUMN].nunique():,}")
    print(f"  Quarters: {df[QUARTER_COLUMN].nunique():,}")
    
    # Identify feature columns (everything except ID, quarter, and other metadata)
    exclude_cols = [ID_COLUMN, QUARTER_COLUMN] + EXCLUDE_COLUMNS
    feature_cols = [c for c in df.columns if c not in exclude_cols]
    print(f"  Feature columns: {len(feature_cols):,}")
    
    # Force all feature columns to numeric
    print("\nConverting feature columns to numeric...")
    for col in feature_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    
    # Step 2: Parse quarter strings to dates
    print("\nParsing quarter dates for sorting...")
    df['quarter_date'] = df[QUARTER_COLUMN].apply(parse_quarter_to_date)
    
    # Show quarter order
    quarter_order = df.groupby(QUARTER_COLUMN)['quarter_date'].first().sort_values()
    print("  Quarter order (chronological):")
    for q, d in quarter_order.items():
        date_str = d.strftime('%Y-%m-%d') if isinstance(d, datetime) else str(d)
        print(f"    {date_str}: {q}")
    
    # Step 3: Remove banks with insufficient quarters
    print(f"\nRemoving banks with fewer than {MIN_QUARTERS_PER_BANK} quarters...")
    quarters_per_bank = df.groupby(ID_COLUMN).size()
    banks_to_keep = quarters_per_bank[quarters_per_bank >= MIN_QUARTERS_PER_BANK].index
    banks_removed = df[ID_COLUMN].nunique() - len(banks_to_keep)
    
    df = df[df[ID_COLUMN].isin(banks_to_keep)].copy()
    print(f"  Banks removed: {banks_removed:,}")
    print(f"  Banks remaining: {len(banks_to_keep):,}")
    
    # Step 4: Filter features to only those with data in ALL quarters
    # This ensures consistent coverage across the full time period
    print("\nFiltering features to those available in all quarters...")
    n_quarters = df[QUARTER_COLUMN].nunique()
    
    features_to_keep = []
    features_removed = 0
    
    for col in feature_cols:
        # Count how many quarters have at least one non-null value for this feature
        quarters_with_data = df.groupby(QUARTER_COLUMN)[col].apply(lambda x: x.notna().any()).sum()
        
        if quarters_with_data == n_quarters:
            features_to_keep.append(col)
        else:
            features_removed += 1
    
    print(f"  Features with data in all {n_quarters} quarters: {len(features_to_keep):,}")
    print(f"  Features removed (incomplete coverage): {features_removed:,}")
    
    feature_cols = features_to_keep
    
    # Step 5: Sort by bank and quarter date (required for correct shift)
    print("\nSorting data by bank and quarter...")
    df = df.sort_values([ID_COLUMN, 'quarter_date']).reset_index(drop=True)
    
    # Step 6: Compute QoQ changes (vectorized) - NOW WITH PROPER EDGE CASE HANDLING
    print("\nComputing quarter-over-quarter changes (with divide-by-zero fix)...")
    
    # Get current values
    current = df[feature_cols].copy()
    
    # Get previous values (shift within each bank group)
    previous = df.groupby(ID_COLUMN)[feature_cols].shift(1)
    
    # Use our safe computation function
    pct_change, edge_case_stats = compute_qoq_changes_safe(current, previous)
    
    # Print edge case statistics
    print(f"  Edge case handling:")
    print(f"    - Both zero (0%): {edge_case_stats['both_zero']:,} cells")
    print(f"    - From zero (+{MAX_CHANGE_FROM_ZERO}%): {edge_case_stats['from_zero_positive']:,} cells")
    print(f"    - From zero (-{MAX_CHANGE_FROM_ZERO}%): {edge_case_stats['from_zero_negative']:,} cells")
    
    # Rename columns to indicate QoQ
    pct_change.columns = [col + '_qoq' for col in feature_cols]
    change_cols = pct_change.columns.tolist()
    
    # Build output dataframe
    df_changes = pd.concat([
        df[[ID_COLUMN, QUARTER_COLUMN]].reset_index(drop=True),
        pct_change.reset_index(drop=True)
    ], axis=1)
    
    # Step 7: Remove rows that are all NaN (first quarter for each bank)
    print("\nRemoving rows with no valid changes (first quarter per bank)...")
    rows_before = len(df_changes)
    df_changes = df_changes.dropna(subset=change_cols, how='all')
    rows_removed = rows_before - len(df_changes)
    print(f"  Rows removed: {rows_removed:,}")
    print(f"  Rows remaining: {len(df_changes):,}")
    
    # Step 8: Winsorize extreme values
    print(f"\nWinsorizing extreme values at {WINSORIZE_PERCENTILE}th percentile...")
    for col in change_cols:
        lower = df_changes[col].quantile((100 - WINSORIZE_PERCENTILE) / 100)
        upper = df_changes[col].quantile(WINSORIZE_PERCENTILE / 100)
        df_changes[col] = df_changes[col].clip(lower=lower, upper=upper)
    
    # Step 9: Reorder columns
    output_cols = [ID_COLUMN, QUARTER_COLUMN] + sorted(change_cols)
    df_out = df_changes[output_cols]
    
    # Step 10: Save outputs
    print(f"\nSaving QoQ changes to {OUTPUT_FILE}...")
    df_out.to_csv(OUTPUT_FILE, index=False)
    print(f"  Shape: {df_out.shape[0]:,} rows x {df_out.shape[1]:,} columns")
    
    # Check for remaining NaN values
    total_nan = df_out[change_cols].isna().sum().sum()
    total_cells = len(df_out) * len(change_cols)
    print(f"  Remaining NaN values: {total_nan:,} / {total_cells:,} ({total_nan/total_cells:.2%})")
    
    # Generate and save report
    print(f"Saving report to {REPORT_FILE}...")
    report = generate_report(df, df_out, feature_cols, change_cols, banks_removed, edge_case_stats)
    with open(REPORT_FILE, 'w') as f:
        f.write(report)
    
    # Print summary
    print("\n" + "=" * 70)
    print("COMPLETE")
    print("=" * 70)
    print(f"Input:  {INPUT_FILE}")
    print(f"Output: {OUTPUT_FILE}")
    print(f"  Rows: {len(df_out):,} bank-quarter observations")
    print(f"  Columns: {len(change_cols):,} QoQ change features")
    print("=" * 70 + "\n")
    
    # Show sample statistics
    print("SAMPLE QoQ CHANGE STATISTICS:")
    print("-" * 40)
    for col in change_cols[:5]:
        mean = df_out[col].mean()
        std = df_out[col].std()
        nan_count = df_out[col].isna().sum()
        print(f"  {col}: mean={mean:+.2f}%, std={std:.2f}%, NaN={nan_count}")
    
    return df_out, change_cols


if __name__ == "__main__":
    df_qoq, change_cols = main()


FFIEC QUARTER-OVER-QUARTER CHANGE COMPUTATION (v0.2 - FIXED)

Loading ffiec_filtered_features.csv...
  Loaded: 563 rows x 168 columns
  Banks: 6
  Quarters: 99
  Feature columns: 157

Converting feature columns to numeric...

Parsing quarter dates for sorting...
  Quarter order (chronological):
    03/31/2001: 03/31/2001
    03/31/2002: 03/31/2002
    03/31/2003: 03/31/2003
    03/31/2004: 03/31/2004
    03/31/2005: 03/31/2005
    03/31/2006: 03/31/2006
    03/31/2007: 03/31/2007
    03/31/2008: 03/31/2008
    03/31/2009: 03/31/2009
    03/31/2010: 03/31/2010
    03/31/2011: 03/31/2011
    03/31/2012: 03/31/2012
    03/31/2013: 03/31/2013
    03/31/2014: 03/31/2014
    03/31/2015: 03/31/2015
    03/31/2016: 03/31/2016
    03/31/2017: 03/31/2017
    03/31/2018: 03/31/2018
    03/31/2019: 03/31/2019
    03/31/2020: 03/31/2020
    03/31/2021: 03/31/2021
    03/31/2022: 03/31/2022
    03/31/2023: 03/31/2023
    03/31/2024: 03/31/2024
    03/31/2025: 03/31/2025
    06/30/2001: 06/30/2001
 

  df['quarter_date'] = df[QUARTER_COLUMN].apply(parse_quarter_to_date)


  Features with data in all 99 quarters: 157
  Features removed (incomplete coverage): 0

Sorting data by bank and quarter...

Computing quarter-over-quarter changes (with divide-by-zero fix)...
  Edge case handling:
    - Both zero (0%): 19,555 cells
    - From zero (+100.0%): 1,944 cells
    - From zero (-100.0%): 149 cells

Removing rows with no valid changes (first quarter per bank)...
  Rows removed: 6
  Rows remaining: 557

Winsorizing extreme values at 99th percentile...

Saving QoQ changes to ffiec_qoq_changes_v2.csv...
  Shape: 557 rows x 159 columns
  Remaining NaN values: 0 / 87,449 (0.00%)
Saving report to qoq_computation_report.txt...

COMPLETE
Input:  ffiec_filtered_features.csv
Output: ffiec_qoq_changes_v2.csv
  Rows: 557 bank-quarter observations
  Columns: 157 QoQ change features

SAMPLE QoQ CHANGE STATISTICS:
----------------------------------------
  RCON2236_qoq: mean=+40.43%, std=196.16%, NaN=0
  RCONA564_qoq: mean=+45.09%, std=254.96%, NaN=0
  RIAD5411_qoq: mean=+