In [5]:
"""
FFIEC Quarter-over-Quarter Change Computation (Step 4 Track 2 - v0.2)
=========================================================================
Purpose: Transform per-bank feature CSVs from Step 3 Track 2 v0.2 into
         quarter-over-quarter percentage changes, one file per bank.

INPUT FORMAT (v0.2): Transposed CSVs from Step 3 Track 2 v0.2:
  - Rows    = feature names (index column named 'feature')
  - Columns = quarters (e.g., '03/31/2000', '06/30/2000', ...)

OUTPUT FORMAT: Same transposed orientation:
  - Rows    = feature names (with '_qoq' suffix)
  - Columns = quarters (first quarter dropped since there's no prior)

The QoQ computation operates along the column axis (across quarters).

Input:  per_bank_features/ffiec_<bank_name>_features.csv  (from Step 3 Track 2 v0.2)
Output: per_bank_qoq/ffiec_<bank_name>_qoq.csv  (one per bank, transposed)
        per_bank_qoq/qoq_report_<bank_name>.txt
        per_bank_qoq/qoq_summary.txt  (cross-bank summary)

Author: Wake Forest MSBA Practicum Team 4
Date: February 2026
"""

import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime
import re


# =============================================================================
# CONFIGURATION
# =============================================================================

# Directory scanned for per-bank input files named ffiec_<slug>_features.csv.
# Relative path, so it is resolved against the current working directory.
INPUT_DIR = Path("per_bank_features")
# Directory that receives the per-bank QoQ CSVs, per-bank reports and the
# cross-bank summary; main() creates it if it does not exist.
OUTPUT_DIR = Path("per_bank_qoq")

# Edge case handling
# A bank with fewer parseable quarter columns than this is skipped: at least
# two quarters are needed to form a single quarter-over-quarter change.
MIN_QUARTERS_PER_BANK = 2
# Percentage (in percent units, i.e. 100.0 == 100%) substituted for the
# undefined change when the previous value is 0 and the current value is not;
# the sign follows the sign of the current value.
MAX_CHANGE_FROM_ZERO = 100.0

# Bank name mapping (for display)
# Values are the display names; main() lower-cases each one, replaces spaces
# with underscores and drops '.'/',' to match it against a filename slug.
# NOTE(review): keys look like RSSD identifiers — they are not used for any
# lookup in this script, so confirm against the upstream step.
BANKS = {
    '480228': 'Bank of America',
    '852218': 'JPMorgan Chase Bank',
    '476810': 'Citibank',
    '451965': 'Wells Fargo Bank',
    '2182786': 'Goldman Sachs Bank USA',
    '1456501': 'Morgan Stanley Bank',
}


# =============================================================================
# HELPER FUNCTIONS
# =============================================================================

def parse_quarter_label_to_date(label):
    """
    Turn a quarter column header into a datetime usable as a sort key.

    The label is stringified and stripped, then matched against each known
    layout in order; the first layout that parses wins:
      1. 'MM/DD/YYYY'           (also accepts 'M/D/YYYY' without leading zeros)
      2. 'YYYY-MM-DD'
      3. 'MM/DD/YY'             (two-digit year)
      4. 'YYYY-MM-DD HH:MM:SS'
      5. 'MM-DD-YYYY'
      6. 'DD/MM/YYYY'           (only reached when month-first parsing failed)

    Returns the parsed datetime, or None when no layout matches.
    """
    text = str(label).strip()
    known_layouts = (
        '%m/%d/%Y',
        '%Y-%m-%d',
        '%m/%d/%y',
        '%Y-%m-%d %H:%M:%S',
        '%m-%d-%Y',
        '%d/%m/%Y',
    )

    parsed = None
    for layout in known_layouts:
        try:
            parsed = datetime.strptime(text, layout)
        except (ValueError, TypeError):
            # This layout does not fit; fall through to the next candidate.
            continue
        break
    return parsed


def compute_qoq_changes_safe(current, previous, max_change_from_zero=None):
    """
    Compute QoQ percentage changes with explicit zero/NaN handling.

    The change is (current - previous) / |previous| * 100, evaluated
    element-wise on two pandas objects (DataFrames or Series) that carry the
    same labels.

    Parameters:
      current              -- values for the later period(s)
      previous             -- values for the preceding period(s), labelled
                              identically to `current`
      max_change_from_zero -- percentage assigned when previous is 0 and
                              current is non-zero. None (the default) means
                              "use MAX_CHANGE_FROM_ZERO as it is at call
                              time", so the cap applied always matches the
                              module setting that reports display.

    Edge cases:
      previous=0, current=0  -> 0%
      previous=0, current>0  -> +max_change_from_zero%
      previous=0, current<0  -> -max_change_from_zero%
      previous=NaN           -> NaN
      current=NaN            -> NaN

    Returns (pct_change, stats).  `pct_change` has the same shape and labels
    as the inputs.  `stats` is a dict of cell counts whose categories are
    mutually exclusive and add up to 'total_cells':
      'normal_computation'  previous non-zero and both values present
      'both_zero'           previous=0, current=0
      'from_zero_positive'  previous=0, current>0
      'from_zero_negative'  previous=0, current<0
      'previous_nan'        previous missing (current may be anything)
      'current_nan'         current missing while previous is present
    """
    if max_change_from_zero is None:
        max_change_from_zero = MAX_CHANGE_FROM_ZERO

    def _count(mask):
        # .sum().sum() collapses a DataFrame mask (per-column, then total)
        # and is equally valid on a Series mask.
        return int(mask.sum().sum())

    mask_prev_nan = previous.isna()
    mask_prev_zero = (previous == 0) & ~mask_prev_nan
    # Only count a missing current value here when previous is present, so
    # this category never overlaps with 'previous_nan'.
    mask_curr_nan = current.isna() & ~mask_prev_nan

    mask_both_zero = mask_prev_zero & (current == 0)
    mask_from_zero_positive = mask_prev_zero & (current > 0)
    mask_from_zero_negative = mask_prev_zero & (current < 0)

    with np.errstate(divide='ignore', invalid='ignore'):
        pct_change = (current - previous) / previous.abs() * 100

    # Division by a zero base yields +/-inf (or NaN for 0/0); neutralise those
    # first, then overwrite the zero-base cells with their defined values.
    pct_change = pct_change.replace([np.inf, -np.inf], np.nan)
    pct_change = pct_change.where(~mask_both_zero, 0.0)
    pct_change = pct_change.where(~mask_from_zero_positive, max_change_from_zero)
    pct_change = pct_change.where(~mask_from_zero_negative, -max_change_from_zero)

    stats = {
        'total_cells': current.size,
        'normal_computation': 0,
        'both_zero': _count(mask_both_zero),
        'from_zero_positive': _count(mask_from_zero_positive),
        'from_zero_negative': _count(mask_from_zero_negative),
        'previous_nan': _count(mask_prev_nan),
        'current_nan': _count(mask_curr_nan),
    }
    # Whatever is left had a non-zero base and a present current value, i.e.
    # the plain formula produced a real number.
    stats['normal_computation'] = (
        stats['total_cells']
        - stats['both_zero']
        - stats['from_zero_positive']
        - stats['from_zero_negative']
        - stats['previous_nan']
        - stats['current_nan']
    )

    return pct_change, stats


def generate_bank_report(bank_name, n_quarters_in, n_features, n_quarters_out,
                         change_feature_names, edge_case_stats, df_qoq):
    """
    Build the plain-text QoQ report for one bank and return it as a string.

    The report has four sections: input summary, edge-case counts (read from
    `edge_case_stats`), output summary, and the mean / standard deviation
    across quarters for up to the first ten entries of
    `change_feature_names` that are present in `df_qoq`'s index.
    `df_qoq` is the transposed QoQ table (one row per feature).
    """
    banner = "=" * 70
    rule = "-" * 40
    generated_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    counts = edge_case_stats

    title_block = [
        banner,
        f"FFIEC QoQ CHANGE REPORT - {bank_name}",
        f"Generated: {generated_at}",
        banner,
        "",
    ]
    input_block = [
        "INPUT SUMMARY",
        rule,
        f"Quarters: {n_quarters_in:,}",
        f"Features: {n_features:,}",
        "Input format: Transposed (rows=features, cols=quarters)",
        "",
    ]
    edge_block = [
        "EDGE CASE HANDLING",
        rule,
        f"Total cells computed: {counts['total_cells']:,}",
        f"Normal computations: {counts['normal_computation']:,}",
        f"Both zero (0%): {counts['both_zero']:,}",
        f"From zero positive (+{MAX_CHANGE_FROM_ZERO}%): {counts['from_zero_positive']:,}",
        f"From zero negative (-{MAX_CHANGE_FROM_ZERO}%): {counts['from_zero_negative']:,}",
        f"Previous NaN (first quarter): {counts['previous_nan']:,}",
        "",
    ]
    output_block = [
        "OUTPUT SUMMARY",
        rule,
        f"Features (rows): {len(change_feature_names):,}",
        f"Quarters (columns): {n_quarters_out:,}",
        "Output format: Transposed (rows=features_qoq, cols=quarters)",
        "",
    ]
    sample_heading = [
        "SAMPLE STATISTICS (across quarters for first 10 features)",
        rule,
    ]

    report_lines = title_block + input_block + edge_block + output_block + sample_heading

    # Each row of the transposed table is one feature, so the statistics below
    # are taken along that row, i.e. across the quarter columns.
    for feature_label in change_feature_names[:10]:
        if feature_label not in df_qoq.index:
            continue
        series = df_qoq.loc[feature_label].astype(float)
        report_lines.append(
            f"  {feature_label}: mean={series.mean():+.2f}%, std={series.std():.2f}%"
        )

    return "\n".join(report_lines)


# =============================================================================
# PER-BANK QoQ PIPELINE (transposed input/output)
# =============================================================================

def process_single_bank(filepath, bank_name):
    """
    Run the QoQ pipeline for one bank's transposed feature CSV.

    Input CSV:  one row per feature (first column is the index), one column
                per quarter, headers being date strings.
    Steps:      parse the headers as dates, drop headers that cannot be
                parsed (with a warning), order the remaining columns
                chronologically, coerce every cell to numeric, then compare
                each column with the one before it.
    Output:     a DataFrame with one row per '<feature>_qoq' and one column
                per quarter except the earliest.

    Progress is printed along the way.

    Returns (df_qoq_transposed, change_feature_names, report_text,
    summary_dict), or (None, None, None, None) when no header parses as a
    date or fewer than MIN_QUARTERS_PER_BANK quarters remain.
    """
    print(f"\n  Loading {filepath.name}...")
    features_by_quarter = pd.read_csv(filepath, index_col=0)  # first column holds feature names
    n_features, n_loaded_quarters = features_by_quarter.shape
    print(f"    Loaded: {n_features:,} features x {n_loaded_quarters} quarters (transposed)")

    raw_headers = list(features_by_quarter.columns)
    print(f"    First 3 column headers: {raw_headers[:3]}")
    print(f"    Last 3 column headers:  {raw_headers[-3:]}")

    # Split the headers into those that parse as dates and those that do not.
    dated_headers = []
    unparsed_headers = []
    for header in raw_headers:
        parsed = parse_quarter_label_to_date(header)
        if parsed is None:
            unparsed_headers.append(header)
        else:
            dated_headers.append((parsed, header))

    if unparsed_headers:
        print(f"    WARNING: {len(unparsed_headers)} column(s) could not be parsed as dates: {unparsed_headers[:5]}")

    if not dated_headers:
        print("    ERROR: No quarter columns could be parsed. Aborting this bank.")
        return None, None, None, None

    # Chronological order; the sort is stable, so equal dates keep file order.
    dated_headers.sort(key=lambda pair: pair[0])
    quarter_labels = [header for _, header in dated_headers]
    features_by_quarter = features_by_quarter[quarter_labels]
    n_quarters = len(quarter_labels)

    print(f"    Quarter range: {quarter_labels[0]} -> {quarter_labels[-1]}")

    if n_quarters < MIN_QUARTERS_PER_BANK:
        print(f"    SKIPPED: Only {n_quarters} quarter(s), need at least {MIN_QUARTERS_PER_BANK}")
        return None, None, None, None

    # Non-numeric cells become NaN rather than raising.
    numeric = features_by_quarter.apply(pd.to_numeric, errors='coerce')

    # Pair every quarter (from the second on) with the quarter before it.
    # The "previous" slice borrows the "current" column labels so that the
    # element-wise arithmetic lines the two up position by position.
    current = numeric.iloc[:, 1:].copy()
    previous = numeric.iloc[:, :-1].copy()
    previous.columns = current.columns

    pct_change, edge_case_stats = compute_qoq_changes_safe(current, previous)

    print(f"    Edge cases: {edge_case_stats['both_zero']:,} both-zero, "
          f"{edge_case_stats['from_zero_positive']:,} from-zero-pos, "
          f"{edge_case_stats['from_zero_negative']:,} from-zero-neg")

    # Tag every feature name so downstream files are unambiguous about content.
    pct_change.index = [f"{name}_qoq" for name in pct_change.index]
    pct_change.index.name = 'feature'
    change_feature_names = list(pct_change.index)

    n_quarters_out = pct_change.shape[1]

    # Share of output cells that ended up missing.
    cell_count = pct_change.size
    if cell_count > 0:
        nan_pct = pct_change.isna().sum().sum() / cell_count
    else:
        nan_pct = 0

    print(f"    => {len(change_feature_names):,} QoQ features x {n_quarters_out} quarters | NaN: {nan_pct:.2%}")

    report = generate_bank_report(
        bank_name, n_quarters, n_features, n_quarters_out,
        change_feature_names, edge_case_stats, pct_change
    )

    summary = {
        'n_quarters_in': n_quarters,
        'n_quarters_out': n_quarters_out,
        'n_features': n_features,
        'n_qoq_features': len(change_feature_names),
        'nan_pct': nan_pct,
        'edge_cases': edge_case_stats,
    }

    return pct_change, change_feature_names, report, summary


# =============================================================================
# MAIN
# =============================================================================

def main():
    """
    Process every per-bank feature CSV in INPUT_DIR and write the QoQ outputs.

    For each file matching 'ffiec_*_features.csv' this derives the bank slug
    from the filename, resolves a display name (from BANKS when a name maps
    to the same slug, otherwise the title-cased slug), runs
    process_single_bank(), and writes 'ffiec_<slug>_qoq.csv' and
    'qoq_report_<slug>.txt' into OUTPUT_DIR.  A cross-bank
    'qoq_summary.txt' is written at the end.  Banks for which
    process_single_bank() returns no data are left out of all outputs.

    Returns a dict mapping display name -> summary dict, or None when no
    input files were found.
    """
    # parents=True so a nested OUTPUT_DIR works as well as a single folder.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    print("=" * 70)
    print("FFIEC QUARTER-OVER-QUARTER COMPUTATION (Step 4 Track 2 - v0.2)")
    print("  Input/Output format: TRANSPOSED (rows=features, columns=quarters)")
    print("=" * 70)

    # Discover per-bank feature CSVs
    bank_files = sorted(INPUT_DIR.glob("ffiec_*_features.csv"))

    if not bank_files:
        print(f"\nERROR: No per-bank feature CSV files found in {INPUT_DIR}/")
        print("  Run Step 3 Track 2 first.")
        return None

    print(f"\nFound {len(bank_files)} per-bank feature files in {INPUT_DIR}/")

    file_prefix = 'ffiec_'
    file_suffix = '_features'
    all_summaries = {}

    for filepath in bank_files:
        # e.g., ffiec_bank_of_america_features.csv -> bank_of_america
        # Strip only the leading prefix and trailing suffix: a slug that
        # happens to contain either token in the middle must stay intact,
        # otherwise output filenames and name matching would be corrupted.
        slug = filepath.stem
        if slug.startswith(file_prefix):
            slug = slug[len(file_prefix):]
        if slug.endswith(file_suffix):
            slug = slug[:-len(file_suffix)]

        # Fall back to a title-cased slug; prefer the curated display name
        # when one of them normalises to this slug (first match wins).
        bank_name = slug.replace('_', ' ').title()
        for name in BANKS.values():
            if name.lower().replace(' ', '_').replace('.', '').replace(',', '') == slug:
                bank_name = name
                break

        print(f"\n{'─'*70}")
        print(f"  {bank_name}")
        print(f"{'─'*70}")

        df_qoq, change_features, report, summary = process_single_bank(filepath, bank_name)

        if df_qoq is None:
            continue

        # Save outputs (transposed format: rows=features, cols=quarters)
        out_csv = OUTPUT_DIR / f"ffiec_{slug}_qoq.csv"
        out_report = OUTPUT_DIR / f"qoq_report_{slug}.txt"

        df_qoq.to_csv(out_csv, index=True)  # index=True keeps the 'feature' column
        # Explicit encoding so the file is identical regardless of platform locale.
        with open(out_report, 'w', encoding='utf-8') as f:
            f.write(report)

        all_summaries[bank_name] = summary

    # =========================================================================
    # Cross-bank summary
    # =========================================================================
    print(f"\n\n{'='*70}")
    print("SUMMARY - ALL BANKS")
    print(f"{'='*70}\n")

    summary_lines = [
        "=" * 70,
        "FFIEC QoQ COMPUTATION SUMMARY - ALL BANKS",
        f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "=" * 70,
        "",
        "Output format: TRANSPOSED (rows=features_qoq, columns=quarters)",
        "",
    ]

    # The same table goes to the console and into the summary file.
    table_header = f"{'Bank':30s} | {'Qtrs In':>7} | {'Qtrs Out':>8} | {'Features':>8} | {'NaN %':>7}"
    table_rule = "-" * 70
    print(table_header)
    print(table_rule)
    summary_lines.append(table_header)
    summary_lines.append(table_rule)

    for bank_name, s in all_summaries.items():
        line = f"{bank_name:30s} | {s['n_quarters_in']:>7} | {s['n_quarters_out']:>8} | {s['n_qoq_features']:>8} | {s['nan_pct']:>6.2%}"
        print(line)
        summary_lines.append(line)

    summary_lines.extend([
        "",
        "NOTES",
        "-" * 40,
        f"- Max change from zero: +/-{MAX_CHANGE_FROM_ZERO}%",
        "- First quarter per bank dropped (no previous quarter for QoQ)",
        "- Each bank's QoQ file is ready for per-bank ML / anomaly detection",
        "- Format: rows=features_qoq, columns=quarters",
    ])

    with open(OUTPUT_DIR / "qoq_summary.txt", 'w', encoding='utf-8') as f:
        f.write("\n".join(summary_lines))

    # Final
    print(f"\n{'='*70}")
    print("COMPLETE")
    print(f"{'='*70}")
    print(f"\nOutputs saved to {OUTPUT_DIR.resolve()}/")
    print(f"  Per-bank QoQ CSVs:  {len(all_summaries)}  (transposed: rows=features, cols=quarters)")
    print(f"  Per-bank reports:   {len(all_summaries)}")
    print("  Cross-bank summary: qoq_summary.txt")
    print(f"{'='*70}\n")

    return all_summaries


# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported. The value returned by main() (the
# per-bank summaries, or None when no input files were found) is kept in a
# module-level name so it stays available for inspection after the run.
if __name__ == "__main__":
    summaries = main()

FFIEC QUARTER-OVER-QUARTER COMPUTATION (Step 4 Track 2 - v0.2)
  Input/Output format: TRANSPOSED (rows=features, columns=quarters)

Found 6 per-bank feature files in per_bank_features/

──────────────────────────────────────────────────────────────────────
  Bank of America
──────────────────────────────────────────────────────────────────────

  Loading ffiec_bank_of_america_features.csv...
    Loaded: 430 features x 99 quarters (transposed)
    First 3 column headers: ['03/31/2001', '06/30/2001', '09/30/2001']
    Last 3 column headers:  ['03/31/2025', '06/30/2025', '09/30/2025']
    Quarter range: 03/31/2001 -> 09/30/2025
    Edge cases: 6,130 both-zero, 495 from-zero-pos, 21 from-zero-neg
    => 430 QoQ features x 98 quarters | NaN: 0.00%

──────────────────────────────────────────────────────────────────────
  Citibank
──────────────────────────────────────────────────────────────────────

  Loading ffiec_citibank_features.csv...
    Loaded: 421 features x 99 quarters (transposed)