In [12]:
"""
FFIEC Call Report Feature Filtering Script (Step 3 - Updated)
==============================================================
Purpose: Take the raw combined FFIEC CSV and filter down to
         a clean set of numeric features suitable for anomaly detection.

Approach: Pure data-driven filtering - no subjective domain judgment.
          Let the data tell us which columns are useful.

Input:  ffiec_complete_call_reports.csv (from Step 1; path set by INPUT_FILE)
        - 99 rows (quarters) x 6,444 columns
        
Output: ffiec_filtered_features.csv (same rows, fewer columns)
        filtering_report.txt (documents what was removed and why)

Author: Wake Forest MSBA Practicum Team 4
Date: January 2026
"""

import pandas as pd
import numpy as np
from pathlib import Path
import re
from datetime import datetime


# =============================================================================
# CONFIGURATION - Adjust these thresholds as needed
# =============================================================================

# Path to the combined CSV file (from Step 1); main() passes it to load_data().
INPUT_FILE = "ffiec_complete_call_reports.csv"

# Output files written by main():
#   OUTPUT_FILE - metadata columns plus the surviving feature columns
#   REPORT_FILE - plain-text summary of what each filter step removed
OUTPUT_FILE = "ffiec_filtered_features.csv"
REPORT_FILE = "filtering_report.txt"

# Filtering thresholds
# NULL_THRESHOLD is the largest null fraction a column may have and still be
# kept (columns strictly above it are dropped), so 0.0 keeps only columns
# with no nulls at all.
NULL_THRESHOLD = 0.0        # Drop columns with ANY null values (complete case analysis)
# Passed to filter_by_variance() as its cut-off.
VARIANCE_THRESHOLD = 0.001  # Drop columns with near-zero variance (after scaling)
# Minimum (distinct values / total rows) a column needs to be kept.
# NOTE(review): with 99 rows, a ratio below 0.01 would need fewer than one
# distinct value, so at this dataset size only the separate "n_unique <= 3"
# check inside filter_by_unique_ratio() actually removes columns.
UNIQUE_RATIO_THRESHOLD = 0.01  # Adjusted for 99 rows (need at least ~1 unique value per 100 rows)

# Columns to always exclude (metadata, identifiers - not features)
# These are kept in the output but not used for anomaly detection.
# Only names that are actually present in the loaded data are treated as
# metadata (identify_column_types intersects this list with df.columns).
METADATA_COLUMNS = [
    'IDRSSD',                           # Bank identifier - need this for grouping
    'quarter',                          # Time period - need this for QoQ analysis
    'FDIC Certificate Number',
    'OCC Charter Number', 
    'OTS Docket Number',
    'Primary ABA Routing Number',
    'Financial Institution Name',
    'Financial Institution Address',
    'Financial Institution City',
    'Financial Institution State',
    'Financial Institution Zip Code',
]


# =============================================================================
# HELPER FUNCTIONS
# =============================================================================

def load_data(filepath):
    """
    Read the combined FFIEC CSV into a DataFrame and log its shape.

    The file is read with low_memory=False so pandas infers each column's
    dtype from the whole file rather than chunk by chunk; the original
    author notes that some FFIEC columns mix numbers and strings.

    Args:
        filepath: Path (or anything pandas.read_csv accepts) to the CSV.

    Returns:
        The loaded DataFrame.
    """
    print(f"Loading {filepath}...")
    frame = pd.read_csv(filepath, low_memory=False)

    n_rows, n_cols = frame.shape
    print(f"  Loaded: {n_rows:,} rows x {n_cols:,} columns")
    return frame


def identify_column_types(df):
    """
    Categorize columns into metadata, numeric, and non-numeric.

    FFIEC columns follow patterns (per the original project notes):
    - RCFD/RCON = Balance sheet items (domestic/foreign consolidated)
    - RIAD/RIAE = Income statement items
    - UBPR = Uniform Bank Performance Report ratios

    Classification rules, applied to every column not listed in
    METADATA_COLUMNS:
    - names starting with 'Unnamed:' are treated as non-numeric junk;
    - columns pandas already reports as numeric are numeric;
    - any other column is numeric when more than 50% of its rows survive
      pd.to_numeric(..., errors='coerce'), otherwise non-numeric.

    Ordering is deterministic: metadata follows METADATA_COLUMNS order and
    the numeric / non-numeric lists follow the DataFrame's column order, so
    repeated runs produce the same output layout.

    Args:
        df: DataFrame holding the raw combined data.

    Returns:
        dict with keys 'metadata', 'numeric' and 'non_numeric', each mapping
        to a list of column names.
    """
    available = set(df.columns)

    # Preserve the configured order instead of relying on set iteration,
    # which varies between interpreter runs for string keys.
    metadata = [col for col in METADATA_COLUMNS if col in available]
    metadata_lookup = set(metadata)

    numeric_cols = []
    non_numeric_cols = []

    # dict.fromkeys de-duplicates labels while keeping first-seen order.
    for col in dict.fromkeys(df.columns):
        if col in metadata_lookup:
            continue

        # Unnamed columns come from parsing errors; str() guards against
        # non-string labels, which have no .startswith.
        if str(col).startswith('Unnamed:'):
            non_numeric_cols.append(col)
            continue

        series = df[col]
        if pd.api.types.is_numeric_dtype(series):
            numeric_cols.append(col)
            continue

        # Some numeric data is stored as strings - try converting it.
        try:
            converted = pd.to_numeric(series, errors='coerce')
        except (TypeError, ValueError):
            # Values pandas cannot even attempt to coerce.
            non_numeric_cols.append(col)
            continue

        # If more than 50% converted successfully, treat as numeric.
        if converted.notna().mean() > 0.5:
            numeric_cols.append(col)
        else:
            non_numeric_cols.append(col)

    return {
        'metadata': metadata,
        'numeric': numeric_cols,
        'non_numeric': non_numeric_cols
    }


def filter_by_null_percentage(df, columns, threshold):
    """
    Split columns by their fraction of null values.

    A column is kept when its null fraction is less than or equal to
    `threshold` and removed when it is strictly greater, so a threshold of
    0.0 keeps only columns with no nulls.

    Why: heavily-null columns are often line items reported only by some
    bank types, discontinued items, or items reported only in certain
    quarters; imputing them would dominate anomaly detection.

    Args:
        df: Source DataFrame.
        columns: Column names to evaluate.
        threshold: Maximum allowed null fraction (0.0 - 1.0).

    Returns:
        (kept_columns, removed_columns_with_reasons) where the second item
        maps each removed column to a description of its null percentage.
    """
    null_fraction = df[columns].isna().mean()

    kept = []
    removed_with_reasons = {}

    for name, fraction in null_fraction.items():
        if fraction <= threshold:
            kept.append(name)
        elif fraction > threshold:
            removed_with_reasons[name] = f"Null percentage: {fraction:.1%}"

    return kept, removed_with_reasons


def filter_by_variance(df, columns, threshold):
    """
    Remove columns with zero or near-zero variance.

    Why: Columns where almost every value is the same provide no signal
    for anomaly detection. Common causes:
    - Flag columns that are almost always 0 or 1
    - Line items that apply to very few banks
    - Deprecated items filled with a constant

    Process:
    1. Coerce the column to numeric and drop nulls.
    2. If max == min the column is constant and is removed.
    3. Otherwise rescale to [0, 1] with min-max scaling and compute the
       sample variance; remove the column when it is below `threshold`.

    Min-max scaling is used rather than z-scoring because the variance of a
    z-scored column is always 1, which would make `threshold` meaningless.
    The min-max variance is unit-free, so a single threshold is comparable
    across columns of very different magnitudes.

    Args:
        df: Source DataFrame.
        columns: Column names to evaluate.
        threshold: Minimum variance (after min-max scaling) to keep a column.

    Returns:
        (kept_columns, removed_columns_with_reasons)
    """
    kept = []
    removed_with_reasons = {}

    for col in columns:
        numeric = pd.to_numeric(df[col], errors='coerce').dropna()

        # Skip if all null after conversion
        if numeric.empty:
            removed_with_reasons[col] = "All values null after numeric conversion"
            continue

        # Cast so boolean / nullable-integer columns support subtraction.
        values = numeric.astype(float)
        lowest = values.min()
        spread = values.max() - lowest

        if not np.isfinite(spread):
            # inf values (or overflow) make the scaled variance undefined.
            removed_with_reasons[col] = "Non-finite values (variance undefined)"
            continue

        if spread == 0:
            # Exact range check: robust even when float rounding would give
            # a constant column a tiny non-zero standard deviation.
            removed_with_reasons[col] = f"Zero variance (constant value: {numeric.iloc[0]})"
            continue

        scaled_variance = ((values - lowest) / spread).var()

        if scaled_variance < threshold:
            removed_with_reasons[col] = f"Near-zero variance: {scaled_variance:.6f}"
        else:
            kept.append(col)

    return kept, removed_with_reasons


def filter_by_unique_ratio(df, columns, threshold):
    """
    Drop columns that have too few distinct numeric values.

    Why: a column with only a handful of distinct values behaves like a
    categorical flag rather than a continuous feature (boolean 0/1 flags,
    small code sets, binned data stored as numbers) and can skew anomaly
    detection.

    For each column the values are coerced to numeric and nulls dropped.
    The column is removed when:
    - nothing numeric remains;
    - it has 3 or fewer distinct values; or
    - distinct values / total DataFrame rows is below `threshold`.

    Args:
        df: Source DataFrame (its full length is the ratio's denominator).
        columns: Column names to evaluate.
        threshold: Minimum distinct-to-rows ratio required to keep a column.

    Returns:
        (kept_columns, removed_columns_with_reasons)
    """
    total_rows = len(df)
    kept = []
    removed_with_reasons = {}

    for name in columns:
        values = pd.to_numeric(df[name], errors='coerce').dropna()

        if len(values) == 0:
            removed_with_reasons[name] = "No valid numeric values"
            continue

        distinct = values.nunique()

        # Absolute floor first: 3 or fewer distinct values looks categorical.
        if distinct <= 3:
            removed_with_reasons[name] = f"Only {distinct} unique values (likely categorical/flag)"
            continue

        share = distinct / total_rows
        if share < threshold:
            removed_with_reasons[name] = f"Low unique ratio: {distinct} unique / {total_rows} rows = {share:.6f}"
            continue

        kept.append(name)

    return kept, removed_with_reasons


def filter_duplicate_columns(df, columns):
    """
    Remove columns that are name-level duplicates of each other.

    Why: The FFIEC data merge process created some duplicate columns
    (e.g., RCON1234_dup2, RCON1234_dup3). These are redundant and
    will artificially inflate the importance of those items in
    anomaly detection.

    Strategy (purely name-based; column values are not compared):
    - A column is a duplicate when its name ends in ``_dup<digits>``; the
      text before that suffix is its base name.
    - If the base column itself is present anywhere in `columns`, every
      ``_dup`` variant is removed, regardless of where it appears in the
      list.
    - If the base is absent, the first ``_dup`` variant encountered is kept
      as the representative and later variants are removed.

    Args:
        df: Unused; accepted so all filter functions share a call shape.
        columns: Column names to de-duplicate, in the desired output order.

    Returns:
        (kept_columns, removed_columns_with_reasons)
    """
    dup_pattern = re.compile(r'(.+)_dup\d+$')

    # Bases already covered: start with every non-duplicate column so that a
    # _dup variant listed before its base is still recognised as redundant.
    represented = {col for col in columns if dup_pattern.match(col) is None}

    kept = []
    removed_with_reasons = {}

    for col in columns:
        dup_match = dup_pattern.match(col)

        if dup_match is None:
            # Not a duplicate column
            kept.append(col)
            continue

        base = dup_match.group(1)
        if base in represented:
            removed_with_reasons[col] = f"Duplicate of {base}"
        else:
            # Base doesn't exist in our columns, keep this dup as representative
            kept.append(col)
            represented.add(base)

    return kept, removed_with_reasons


def convert_to_numeric(df, columns):
    """
    Return a copy of `df` with the listed columns coerced to numeric dtype.

    Values that cannot be parsed become NaN (errors='coerce'). Names in
    `columns` that are not present in the DataFrame are ignored, and the
    input DataFrame itself is left unmodified.

    Args:
        df: Source DataFrame.
        columns: Names of the columns to convert.

    Returns:
        A new DataFrame with the requested columns converted.
    """
    result = df.copy()

    present = [name for name in columns if name in result.columns]
    for name in present:
        result[name] = pd.to_numeric(result[name], errors='coerce')

    return result


def generate_report(original_count, col_types, filter_results, final_columns):
    """
    Build the plain-text filtering report.

    The report has three parts: a summary of column counts, one section per
    filtering step (with up to 10 example removals each), and a count of the
    final feature columns grouped by the first four characters of their name.

    Args:
        original_count: Number of columns in the raw input.
        col_types: dict with 'metadata', 'numeric' and 'non_numeric' lists.
        filter_results: Mapping of step name -> (kept_count, removed_dict),
            where removed_dict maps column name to removal reason.
        final_columns: Names of the feature columns that survived filtering.

    Returns:
        The report as a single newline-joined string.
    """
    heavy_rule = "=" * 70
    light_rule = "-" * 40
    generated_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    report = [
        heavy_rule,
        "FFIEC FEATURE FILTERING REPORT",
        f"Generated: {generated_at}",
        heavy_rule,
        "",
        "SUMMARY",
        light_rule,
        f"Original columns:           {original_count:,}",
        f"Metadata columns (kept):    {len(col_types['metadata']):,}",
        f"Non-numeric (removed):      {len(col_types['non_numeric']):,}",
        f"Numeric columns analyzed:   {len(col_types['numeric']):,}",
        f"Final feature columns:      {len(final_columns):,}",
        "",
        "FILTERING STEPS",
        light_rule,
    ]

    # One section per filtering step; cap the examples because a single step
    # can remove thousands of columns.
    max_examples = 10
    for step_name, (kept_count, removed_dict) in filter_results.items():
        report.append(f"\n{step_name}:")
        report.append(f"  Kept: {kept_count:,} columns")
        report.append(f"  Removed: {len(removed_dict):,} columns")

        if not removed_dict:
            continue

        report.append("  Sample of removed columns:")
        examples = list(removed_dict.items())[:max_examples]
        report.extend(f"    - {col}: {reason}" for col, reason in examples)

        overflow = len(removed_dict) - max_examples
        if overflow > 0:
            report.append(f"    ... and {overflow} more")

    report += [
        "",
        "FINAL FEATURE COLUMNS",
        light_rule,
        f"Total: {len(final_columns)} columns",
        "",
    ]

    # Tally final columns by their leading four characters for readability.
    prefix_counts = {}
    for col in sorted(final_columns):
        key = col[:4]
        prefix_counts[key] = prefix_counts.get(key, 0) + 1

    report.extend(
        f"{key}*: {prefix_counts[key]} columns" for key in sorted(prefix_counts)
    )

    return "\n".join(report)


# =============================================================================
# MAIN PIPELINE
# =============================================================================

def main():
    """
    Run the end-to-end feature filtering pipeline.

    Steps:
    1. Load the raw data from INPUT_FILE.
    2. Categorize columns (metadata, numeric, non-numeric).
    3. Pass the numeric columns through four filters, each one receiving
       the survivors of the previous one:
       a. Null percentage (cut-off: NULL_THRESHOLD)
       b. Variance (cut-off: VARIANCE_THRESHOLD)
       c. Unique ratio (cut-off: UNIQUE_RATIO_THRESHOLD)
       d. Duplicates (_dup columns)
    4. Build the output frame (metadata + surviving features, features
       coerced to numeric).
    5. Write OUTPUT_FILE and REPORT_FILE, then print a summary.

    Returns:
        (df_out, final_feature_columns): the filtered DataFrame and the list
        of feature column names it contains.
    """
    banner = "=" * 70

    print("\n" + banner)
    print("FFIEC FEATURE FILTERING PIPELINE (Step 3)")
    print(banner + "\n")

    # Step 1: Load data
    df = load_data(INPUT_FILE)
    original_count = len(df.columns)

    # Step 2: Categorize columns
    print("\nCategorizing columns...")
    col_types = identify_column_types(df)
    category_labels = (
        ("  Metadata columns:    ", 'metadata'),
        ("  Numeric columns:     ", 'numeric'),
        ("  Non-numeric columns: ", 'non_numeric'),
    )
    for label, key in category_labels:
        print(f"{label}{len(col_types[key]):,}")

    # Step 3: each entry is (report section name, progress message, filter).
    # Every filter takes the current column list and returns (kept, removed).
    filter_steps = [
        (
            'Null Percentage Filter',
            f"\nFiltering by null percentage (threshold: {NULL_THRESHOLD:.0%})...",
            lambda cols: filter_by_null_percentage(df, cols, NULL_THRESHOLD),
        ),
        (
            'Variance Filter',
            f"\nFiltering by variance (threshold: {VARIANCE_THRESHOLD})...",
            lambda cols: filter_by_variance(df, cols, VARIANCE_THRESHOLD),
        ),
        (
            'Unique Ratio Filter',
            f"\nFiltering by unique value ratio (threshold: {UNIQUE_RATIO_THRESHOLD})...",
            lambda cols: filter_by_unique_ratio(df, cols, UNIQUE_RATIO_THRESHOLD),
        ),
        (
            'Duplicate Filter',
            "\nFiltering duplicate columns...",
            lambda cols: filter_duplicate_columns(df, cols),
        ),
    ]

    # Track filtering results for report
    filter_results = {}
    surviving = col_types['numeric']
    for section_name, progress_message, run_filter in filter_steps:
        print(progress_message)
        surviving, dropped = run_filter(surviving)
        filter_results[section_name] = (len(surviving), dropped)
        print(f"  Kept: {len(surviving):,}  |  Removed: {len(dropped):,}")
    final_feature_columns = surviving

    # Step 4: Build output dataframe (metadata first, then filtered features)
    print("\nBuilding output dataset...")
    candidate_columns = col_types['metadata'] + final_feature_columns
    # Safety check: only request columns that exist in the loaded frame.
    output_columns = [c for c in candidate_columns if c in df.columns]
    df_out = convert_to_numeric(df[output_columns].copy(), final_feature_columns)

    # Step 5: Save outputs
    print(f"\nSaving filtered data to {OUTPUT_FILE}...")
    df_out.to_csv(OUTPUT_FILE, index=False)
    out_rows, out_cols = df_out.shape
    print(f"  Shape: {out_rows:,} rows x {out_cols:,} columns")

    print(f"Saving filtering report to {REPORT_FILE}...")
    report = generate_report(original_count, col_types, filter_results, final_feature_columns)
    with open(REPORT_FILE, 'w') as f:
        f.write(report)

    # Print summary
    reduction = 1 - len(output_columns) / original_count
    print("\n" + banner)
    print("COMPLETE")
    print(banner)
    print(f"Original:  {original_count:,} columns")
    print(f"Final:     {len(output_columns):,} columns ({len(final_feature_columns):,} features + {len(col_types['metadata']):,} metadata)")
    print(f"Reduction: {reduction:.1%}")
    print(banner + "\n")

    return df_out, final_feature_columns


# Script entry point: run the pipeline, then preview the selected features.
if __name__ == "__main__":
    # The results are bound at module level, so they remain available in the
    # session (e.g. a notebook) after the pipeline has run.
    df_filtered, feature_cols = main()
    
    # Print the feature columns for reference: at most the first 50 names in
    # sorted order, numbered from 1, followed by a count of any that were
    # left out of the preview.
    print("\nFEATURE COLUMNS FOR ANOMALY DETECTION:")
    print("-" * 40)
    for i, col in enumerate(sorted(feature_cols)[:50], 1):
        print(f"  {i:3}. {col}")
    if len(feature_cols) > 50:
        print(f"  ... and {len(feature_cols) - 50} more")


FFIEC FEATURE FILTERING PIPELINE (Step 3)

Loading ffiec_complete_call_reports.csv...
  Loaded: 99 rows x 6,444 columns

Categorizing columns...
  Metadata columns:    11
  Numeric columns:     5,731
  Non-numeric columns: 702

Filtering by null percentage (threshold: 0%)...
  Kept: 462  |  Removed: 5,269

Filtering by variance (threshold: 0.001)...
  Kept: 433  |  Removed: 29

Filtering by unique value ratio (threshold: 0.01)...
  Kept: 430  |  Removed: 3

Filtering duplicate columns...
  Kept: 430  |  Removed: 0

Building output dataset...

Saving filtered data to ffiec_filtered_features.csv...
  Shape: 99 rows x 441 columns
Saving filtering report to filtering_report.txt...

COMPLETE
Original:  6,444 columns
Final:     441 columns (430 features + 11 metadata)
Reduction: 93.2%


FEATURE COLUMNS FOR ANOMALY DETECTION:
----------------------------------------
    1. RCFD0010
    2. RCFD0022
    3. RCFD0071
    4. RCFD0081
    5. RCFD0090
    6. RCFD0211
    7. RCFD0213
    8. RCFD0416