In [105]:
import pandas as pd
import glob

In [106]:
"Reload the CSV file after code execution state reset"
financial_files_path = "./data_csv/Financial_*.csv"
all_files = glob.glob(financial_files_path)

In [107]:
df_list = []
for file in all_files:
    temp_df = pd.read_csv(file)
    quarter = file.split("_")[2] + "_" + file.split("_")[3].split(".")[0]
    temp_df["quarter"] = quarter
    df_list.append(temp_df)

In [108]:
merged_df = pd.concat(df_list)

In [109]:
# Define all additional CAMELS ratios from the available columns
camels_ratios = pd.DataFrame({
    "cert" : merged_df["CERT"],
    "bank_name" : merged_df["NAMEFULL"],
    "quarter" : merged_df["quarter"],
    # Capital Adequacy (C)
    "equity_to_assets": merged_df["EQV"] / merged_df["NAASSET"],
    "tier1_capital_ratio": merged_df["IDT1CER"],
    "tier1_rwa_ratio": merged_df["IDT1RWAJR"],
    "total_equity_ratio": merged_df["EQTOT"] / merged_df["NAASSET"],
    "dividends_to_equity": merged_df["EQCDIVNTINC"] / merged_df["EQV"],

    # Asset Quality (A)
    "loan_loss_reserve_to_loans": merged_df["LNATRESR"] / merged_df["NTLNLSR"],
    "noncurrent_loans_to_loans": merged_df["NCLNLSR"] / merged_df["NTLNLSR"],
    "net_loans_to_assets": merged_df["LNLSNET"] / merged_df["NAASSET"],
    "loan_depreciation_to_loans": merged_df["LNLSDEPR"] / merged_df["NTLNLSR"],
    "ore_to_assets": merged_df["LSAORE"] / merged_df["NAASSET"],
    "assets_past_due_30_89_to_assets": merged_df["P3ASSET"] / merged_df["NAASSET"],
    "noncurrent_real_estate_to_assets": merged_df["NCRER"] / merged_df["NAASSET"],
    "noncurrent_re_to_loans": merged_df["NCRERESR"] / merged_df["NTLNLSR"],

    # Management (M)
    "efficiency_ratio_proxy": merged_df["EINTEXP"] / merged_df["INTINC"],
    "noninterest_income_to_assets": merged_df["NONII"] / merged_df["NAASSET"],
    "operating_income_to_assets": merged_df["NOIJ"] / merged_df["NAASSET"],
    "assets_per_employee": merged_df["NAASSET"] / merged_df["NUMEMP"],

    # Earnings (E)
    "return_on_assets": merged_df["ROA"],
    "pretax_return_on_assets": merged_df["ROAPTX"],
    "return_on_equity": merged_df["ROE"],
    "net_interest_margin": merged_df["NIM"],
    "net_income_to_assets": merged_df["NETINC"] / merged_df["NAASSET"],
    "pretax_income_to_assets": merged_df["PTAXNETINC"] / merged_df["NAASSET"],

    # Liquidity (L)
    "net_loans_to_total_deposits": merged_df["LNLSNET"] / merged_df["DEP"],
    "core_deposits_to_assets": merged_df["COREDEP"] / merged_df["NAASSET"],
    "uninsured_deposits_to_total_deposits": merged_df["DEPUNINS"] / merged_df["DEP"],
    "nonint_bearing_deposits_to_assets": merged_df["DEPNIDOM"] / merged_df["NAASSET"],
    "insured_deposits_to_total_deposits": merged_df["DEPINS"] / merged_df["DEP"],

    # Sensitivity to Market Risk (S)
    "securities_to_assets": merged_df["IGLSEC"] / merged_df["NAASSET"],
    "asset_sensitivity_proxy": merged_df["ASDRRES"] / merged_df["NAASSET"],
})

In [110]:
camels_ratios["cert"].nunique

<bound method IndexOpsMixin.nunique of 0           9
1          14
2          35
3          39
4          41
        ...  
8779    91005
8780    91280
8781    91322
8782    91325
8783    91385
Name: cert, Length: 107971, dtype: int64>

In [None]:
label_df = pd.read_csv("./data_csv/failed_bank-data_2008_2010.csv")
label_df["LABEL"] = 1
label_df = label_df[["CERT","LABEL"]]

FileNotFoundError: [Errno 2] No such file or directory: './data_csv/failed_bank-data.csv'

In [None]:
label_df

Unnamed: 0,CERT,LABEL
0,19040,1
1,23306,1
2,34578,1
3,35065,1
4,58052,1
...,...,...
330,29730,1
331,12736,1
332,33901,1
333,1971,1


In [112]:
def parse_quarter_to_date(quarter_str:str) -> pd.Timestamp:
    year,qtr = quarter_str.split("_")
    quarter_month_map = {"Q1": "01", "Q2": "04", "Q3": "07", "Q4": "10"}
    month = quarter_month_map[qtr]
    return pd.to_datetime(f"{year}-{month}-01")

In [113]:
camels_ratios["date"] = camels_ratios["quarter"].apply(parse_quarter_to_date)

In [114]:
def engineer_lag_features_optimized(df: pd.DataFrame, features: list, max_lag: int = 8,
                                    bank_id_col: str = "cert") -> pd.DataFrame:
    """
    Add lag features for specified CAMELS variables.
    
    """
    df = df.copy()
    
    
    df = df.sort_values(by=[bank_id_col, "date"])

    lag_dfs = [df]

    # Generate lag features for each specified variable
    for lag in range(1, max_lag + 1):
        lagged = (
            df.groupby(bank_id_col)[features]
              .shift(lag)
              .add_suffix(f"_lag{lag}")
        )
        lag_dfs.append(lagged)

    # Concatenate all in one go for performance
    df_with_lags = pd.concat(lag_dfs, axis=1)

    return df_with_lags



In [116]:
engineered_df = engineer_lag_features_optimized(camels_ratios,features = camels_ratios.columns[3:])

In [120]:
engineered_df[engineered_df["quarter"] == '2006_Q4']

Unnamed: 0,cert,bank_name,quarter,equity_to_assets,tier1_capital_ratio,tier1_rwa_ratio,total_equity_ratio,dividends_to_equity,loan_loss_reserve_to_loans,noncurrent_loans_to_loans,...,net_income_to_assets_lag8,pretax_income_to_assets_lag8,net_loans_to_total_deposits_lag8,core_deposits_to_assets_lag8,uninsured_deposits_to_total_deposits_lag8,nonint_bearing_deposits_to_assets_lag8,insured_deposits_to_total_deposits_lag8,securities_to_assets_lag8,asset_sensitivity_proxy_lag8,date_lag8
0,9,UNION TRUST COMPANY,2006_Q4,0.005653,0,12.147068,30.505919,12.088355,-566.483279,-451.849706,...,4.344406,5.924825,1.002659,250.666958,0.161235,21.631119,0.838765,0.207168,0.0,2004-10-01
1,14,STATE STREET BANK AND TRUST COMPANY,2006_Q4,0.001768,0,11.958736,1702.802767,5.807949,inf,,...,113.901050,167.894721,0.081167,347.869404,0.287859,2111.182464,0.005215,2.559278,0.0,2004-10-01
2,35,AUBURNBANK,2006_Q4,0.111326,0,14.444217,703.875000,12.427506,31.591146,0.562454,...,9.604782,11.853727,0.592105,472.962025,0.413019,93.558368,0.586981,1.333333,0.0,2004-10-01
3,39,ROBERTSON BANKING COMPANY,2006_Q4,0.018809,0,14.548683,41.089720,11.021083,5.205299,1.081800,...,0.168471,0.038817,0.867470,32.744389,0.178007,4.990494,0.821993,0.092686,0.0,2004-10-01
4,41,PHENIX-GIRARD BANK,2006_Q4,0.080263,0,21.058015,121.901961,3.625643,2.832083,0.969891,...,3.190073,3.181598,0.612607,97.375303,0.342715,25.814770,0.657285,0.196126,0.0,2004-10-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8779,91005,5 STAR BANK,2006_Q4,3.341680,0,34.490712,5538.500000,0.000000,2.781270,0.523959,...,inf,inf,0.561551,inf,0.207891,inf,0.792109,inf,,2004-10-01
8780,91280,BANK OF LITTLE ROCK,2006_Q4,0.006914,0,12.489915,9.730263,3.732819,10.183448,15.723427,...,1.958264,2.921536,0.770934,130.365609,0.399525,27.694491,0.600475,0.000000,0.0,2004-10-01
8781,91322,FISERV TRUST COMPANY,2006_Q4,inf,0,16.125524,inf,75.239078,,,...,inf,inf,0.000000,inf,0.255897,,0.744103,inf,,2004-10-01
8782,91325,AMVESCAP NATIONAL TRUST COMPANY,2006_Q4,inf,0,189.645143,inf,3.188618,,,...,inf,inf,0.000000,inf,0.001479,,0.998521,,,2004-10-01
