In [54]:
import pandas as pd
import glob
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore


# Machine Learning Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Preprocessing and Evaluation
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
    accuracy_score
)



In [55]:
label_df = pd.read_csv("./data_csv/failed_bank-data_2008_2010.csv")
label_df["LABEL"] = 1
label_df = label_df[["CERT","LABEL"]]
label_df = label_df.rename(columns={"CERT" : "cert", "LABEL" : "label"})

In [56]:
label_df = pd.read_csv("./data_csv/failed_bank-data_2008_2010.csv")

In [57]:
"Reload the CSV file after code execution state reset"
financial_files_path = "./data_csv/Financial_*.csv"
all_files = glob.glob(financial_files_path)

In [58]:
df_list = []
for file in all_files:
    temp_df = pd.read_csv(file)
    quarter = file.split("_")[2] + "_" + file.split("_")[3].split(".")[0]
    temp_df["QUARTER"] = quarter
    df_list.append(temp_df)

In [59]:
merged_df = pd.concat(df_list)

In [60]:
merged_df = merged_df[merged_df['IDT1RWAJR'] < 100] # added after first round

In [61]:
merged_df["EXTRA"].describe()

count    1.218770e+05
mean     3.202341e+01
std      5.733672e+03
min     -3.740000e+05
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.520000e+06
Name: EXTRA, dtype: float64

In [62]:
merged_df["TFRA"].describe()

count    1.221480e+05
mean     1.952207e+06
std      6.280274e+07
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      5.589705e+09
Name: TFRA, dtype: float64

In [63]:
# Define useful features for CAMELS-based feature engineering

camel_core_features = [
    # Capital Adequacy
    'EQV', 'EQTOT', 'IDT1RWAJR', 'RBCRWAJ',

    # Asset Quality
    'LNATRESR', 'NTLNLSR', 'NCLNLSR', 'LNLSDEPR', 'LNLSNET',
    'ORE', 'P3ASSET', 'NCRER', 'NCRERESR', 'NTRECOSR','EQCDIVNTINC',

    # Management
    'EINTEXP', 'INTINC', 'NUMEMP',

    # Earnings
    'ROA', 'ROAPTX', 'ROE', 'PTAXNETINC', 'NETINC',
    'NOIJ', 'NONII', 'NIM',

    # Liquidity
    'DEP', 'COREDEP', 'DEPINS', 'DEPUNINS', 'DEPNIDOM',

    # Sensitivity to Market Risk
    'IGLSEC', 'ASDRRES',

    # Total Assets
    'NAASSET'
]

# Optional metadata for grouping, merging, filtering
metadata_fields = [
    'CERT', 'NAMEFULL', 'ZIP', 'STNAME', 'CITY',
    'RSSDID', 'BKCLASS', 'MUTUAL', 'TRUST', 'QUARTER'
]

# Labeling or time metadata
labeling_fields = [
    'INSDATE', 'ESTYMD'
]

# Final full set to keep
columns_to_keep = camel_core_features + metadata_fields + labeling_fields

In [64]:
selected_merged_df = merged_df[columns_to_keep]

In [65]:
selected_merged_df.describe()

Unnamed: 0,EQV,EQTOT,IDT1RWAJR,RBCRWAJ,LNATRESR,NTLNLSR,NCLNLSR,LNLSDEPR,LNLSNET,ORE,...,IGLSEC,ASDRRES,NAASSET,CERT,ZIP,RSSDID,MUTUAL,TRUST,INSDATE,ESTYMD
count,122148.0,121877.0,122148.0,122148.0,122148.0,122148.0,122148.0,122148.0,122148.0,121877.0,...,121877.0,110518.0,122148.0,122148.0,122148.0,122148.0,122148.0,122148.0,122148.0,122148.0
mean,11.811863,128925.7,16.61487,21.251737,1.395141,0.262867,1.143004,1020.168,745764.6,1446.708,...,456.2897,6720.117,7165.643,21146.176303,52557.558863,962166.1,0.067336,0.2835,20474140.0,19378200.0
std,9.046395,1983144.0,10.077985,102.66435,1.096435,0.929028,2.227,90611.79,10561580.0,33413.33,...,21123.58,271458.4,209390.5,15861.200129,23489.123251,951209.0,0.250605,0.450699,8309366.0,432474.2
min,-16.065911,-13112.0,-18.955043,-18.955043,0.0,-42.105263,0.0,0.0,0.0,-19399.0,...,-1463000.0,0.0,0.0,9.0,0.0,37.0,0.0,0.0,19331200.0,17840100.0
25%,8.395889,6328.0,10.959607,12.187411,0.970249,0.0,0.127098,64.89055,33227.75,0.0,...,0.0,0.0,12.0,9359.0,35474.0,325141.0,0.0,0.0,19340100.0,19040100.0
50%,9.851081,12505.0,13.660306,14.914276,1.227722,0.04923,0.518708,80.15823,76815.5,1.0,...,0.0,0.0,260.5,17719.0,55931.0,652753.0,0.0,0.0,19530120.0,19280520.0
75%,12.296586,28154.0,18.569188,19.971723,1.574595,0.226146,1.316324,92.96359,187286.8,277.0,...,3.0,0.0,1105.0,30076.0,68959.0,971379.0,0.0,1.0,19890810.0,19830130.0
max,100.049505,179813500.0,99.9627,26200.0,73.684211,98.372377,100.0,21193700.0,713727000.0,5794000.0,...,2975000.0,26985000.0,30135520.0,91385.0,99901.0,4210227.0,1.0,1.0,99991230.0,20101210.0


In [66]:
selected_merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 122148 entries, 0 to 7747
Data columns (total 46 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   EQV          122148 non-null  float64
 1   EQTOT        121877 non-null  float64
 2   IDT1RWAJR    122148 non-null  float64
 3   RBCRWAJ      122148 non-null  float64
 4   LNATRESR     122148 non-null  float64
 5   NTLNLSR      122148 non-null  float64
 6   NCLNLSR      122148 non-null  float64
 7   LNLSDEPR     122148 non-null  float64
 8   LNLSNET      122148 non-null  int64  
 9   ORE          121877 non-null  float64
 10  P3ASSET      122148 non-null  int64  
 11  NCRER        122148 non-null  float64
 12  NCRERESR     122148 non-null  float64
 13  NTRECOSR     122148 non-null  float64
 14  EQCDIVNTINC  122148 non-null  float64
 15  EINTEXP      121877 non-null  float64
 16  INTINC       121877 non-null  float64
 17  NUMEMP       121877 non-null  float64
 18  ROA          122148 non-null  f

In [67]:
# non_null_counts = selected_merged_df.notnull().sum()

In [68]:
# selected_merged_df = selected_merged_df.dropna()

In [69]:
# Set the visual style
sns.set(style="whitegrid", palette="muted", color_codes=True)

# Plot distributions
def plot_feature_distributions(df, features, bins=100):
    n_cols = 4
    n_rows = (len(features) + n_cols - 1) // n_cols

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 4 * n_rows))
    axes = axes.flatten()

    for i, feature in enumerate(features):
        if feature in df.columns:
            sns.histplot(df[feature].dropna(), kde=True, bins=bins, ax=axes[i])
            axes[i].set_title(feature)
        else:
            axes[i].axis("off")

    # # Turn off any unused subplots
    # for j in range(i + 1, len(axes)):
    #     axes[j].axis("off")

    plt.tight_layout()
    plt.show()



In [70]:
# # Run the plot function
# plot_feature_distributions(selected_merged_df, camel_core_features)

In [71]:
camels_ratios_df = pd.DataFrame({
    "cert": selected_merged_df["CERT"],
    "bank_name": selected_merged_df["NAMEFULL"],
    "quarter": selected_merged_df["QUARTER"],

    # Capital Adequacy (C)
    "equity_to_assets": selected_merged_df["EQV"] / selected_merged_df["NAASSET"],
    "tier1_rwa_ratio": selected_merged_df["IDT1RWAJR"],
    "total_equity_ratio": selected_merged_df["EQTOT"] / selected_merged_df["NAASSET"],
    "dividends_to_equity": selected_merged_df["EQCDIVNTINC"] / selected_merged_df["EQV"],

    # Asset Quality (A)
    "loan_loss_reserve_to_loans": selected_merged_df["LNATRESR"] / selected_merged_df["NTLNLSR"],
    "noncurrent_loans_to_loans": selected_merged_df["NCLNLSR"] / selected_merged_df["NTLNLSR"],
    "ore_to_assets": selected_merged_df["ORE"] / selected_merged_df["NAASSET"],
    "assets_past_due_30_89_to_assets": selected_merged_df["P3ASSET"] / selected_merged_df["NAASSET"],
    "noncurrent_real_estate_to_assets": selected_merged_df["NCRER"] / selected_merged_df["NAASSET"],
    "noncurrent_re_to_loans": selected_merged_df["NCRERESR"] / selected_merged_df["NTLNLSR"],

    # Management (M)
    "efficiency_ratio_proxy": selected_merged_df["EINTEXP"] / selected_merged_df["INTINC"],
    "noninterest_income_to_assets": selected_merged_df["NONII"] / selected_merged_df["NAASSET"],
    "operating_income_to_assets": selected_merged_df["NOIJ"] / selected_merged_df["NAASSET"],
    "assets_per_employee": selected_merged_df["NAASSET"] / selected_merged_df["NUMEMP"],

    # Earnings (E)
    "return_on_assets": selected_merged_df["ROA"],
    "pretax_return_on_assets": selected_merged_df["ROAPTX"],
    "return_on_equity": selected_merged_df["ROE"],
    "net_income_to_assets": selected_merged_df["NETINC"] / selected_merged_df["NAASSET"],
    "pretax_income_to_assets": selected_merged_df["PTAXNETINC"] / selected_merged_df["NAASSET"],

    # Liquidity (L)
    "net_loans_to_total_deposits": selected_merged_df["LNLSNET"] / selected_merged_df["DEP"],
    "uninsured_deposits_to_total_deposits": selected_merged_df["DEPUNINS"] / selected_merged_df["DEP"],
    "insured_deposits_to_total_deposits": selected_merged_df["DEPINS"] / selected_merged_df["DEP"],

    # Sensitivity (S) — intentionally excluded due to poor data
    # "securities_to_assets" and "asset_sensitivity_proxy" excluded
})

In [72]:
# # Run the plot function
# plot_feature_distributions(camels_ratios_df, camels_ratios_df.columns[3:])

In [73]:
camels_ratios_df = camels_ratios_df.replace([np.inf, -np.inf], np.nan)

In [74]:
print(camels_ratios_df.describe())

                cert  equity_to_assets  tier1_rwa_ratio  total_equity_ratio  \
count  122148.000000      95675.000000    122148.000000        95556.000000   
mean    21146.176303          0.209989        16.614870          250.003775   
std     15861.200129          1.053369        10.077985         2709.086680   
min         9.000000         -0.004659       -18.955043           -2.219512   
25%      9359.000000          0.006018        10.959607           12.629838   
50%     17719.000000          0.020341        13.660306           30.673811   
75%     30076.000000          0.074329        18.569188           88.370073   
max     91385.000000         46.992215        99.962700       415196.000000   

       dividends_to_equity  loan_loss_reserve_to_loans  \
count        121877.000000               104360.000000   
mean              4.569520                    8.649288   
std              38.759150                  224.201544   
min               0.000000               -12703.362999  

In [75]:
camels_ratios_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 122148 entries, 0 to 7747
Data columns (total 25 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   cert                                  122148 non-null  int64  
 1   bank_name                             122148 non-null  object 
 2   quarter                               122148 non-null  object 
 3   equity_to_assets                      95675 non-null   float64
 4   tier1_rwa_ratio                       122148 non-null  float64
 5   total_equity_ratio                    95556 non-null   float64
 6   dividends_to_equity                   121877 non-null  float64
 7   loan_loss_reserve_to_loans            104360 non-null  float64
 8   noncurrent_loans_to_loans             104360 non-null  float64
 9   ore_to_assets                         95556 non-null   float64
 10  assets_past_due_30_89_to_assets       95675 non-null   float64
 11  noncurr

In [76]:
not_null = camels_ratios_df.notnull().sum()

In [77]:
not_null

cert                                    122148
bank_name                               122148
quarter                                 122148
equity_to_assets                         95675
tier1_rwa_ratio                         122148
total_equity_ratio                       95556
dividends_to_equity                     121877
loan_loss_reserve_to_loans              104360
noncurrent_loans_to_loans               104360
ore_to_assets                            95556
assets_past_due_30_89_to_assets          95675
noncurrent_real_estate_to_assets         95675
noncurrent_re_to_loans                  104360
efficiency_ratio_proxy                  121803
noninterest_income_to_assets             95556
operating_income_to_assets               95556
assets_per_employee                     121707
return_on_assets                        122148
pretax_return_on_assets                 122148
return_on_equity                        121874
net_income_to_assets                     95556
pretax_income

In [78]:
def clean_camels_ratios(df):
    df = df.copy()

    # Clip extreme outliers (winsorization)
    clip_limits = {
    "equity_to_assets": (-1, 1),
    "tier1_rwa_ratio": (0, 100),  # 100% Tier 1 cap
    "total_equity_ratio": (0, 1000),
    "dividends_to_equity": (0, 20),
    "loan_loss_reserve_to_loans": (0, 100),
    "noncurrent_loans_to_loans": (0, 100),
    "ore_to_assets": (0, 1),
    "assets_past_due_30_89_to_assets": (0, 1),
    "noncurrent_real_estate_to_assets": (0, 1),
    "efficiency_ratio_proxy": (0, 2),
    "noninterest_income_to_assets": (-1, 1),
    "operating_income_to_assets": (-1, 20),  # beyond that looks like noise
    "assets_per_employee": (0, np.nanpercentile(camels_ratios_df["assets_per_employee"], 99)),
    "return_on_assets": (-1, 5),
    "pretax_return_on_assets": (-1, 5),
    "return_on_equity": (-50, 50),  # allow a broader range
    "net_income_to_assets": (-10, 10),
    "pretax_income_to_assets": (-10, 10),
    "net_loans_to_total_deposits": (0, 2),
    "uninsured_deposits_to_total_deposits": (0, 2),
    "insured_deposits_to_total_deposits": (0, 1.5),
    }

    for col, (low, high) in clip_limits.items():
        df[col] = df[col].clip(lower=low, upper=high)

    # Optional: drop rows with too many NaNs
    df = df.dropna(thresh=int(df.shape[1] * 0.8))  # keep rows with at least 80% non-NaN

    return df

In [79]:
camels_ratios_df.describe()

Unnamed: 0,cert,equity_to_assets,tier1_rwa_ratio,total_equity_ratio,dividends_to_equity,loan_loss_reserve_to_loans,noncurrent_loans_to_loans,ore_to_assets,assets_past_due_30_89_to_assets,noncurrent_real_estate_to_assets,...,operating_income_to_assets,assets_per_employee,return_on_assets,pretax_return_on_assets,return_on_equity,net_income_to_assets,pretax_income_to_assets,net_loans_to_total_deposits,uninsured_deposits_to_total_deposits,insured_deposits_to_total_deposits
count,122148.0,95675.0,122148.0,95556.0,121877.0,104360.0,104360.0,95556.0,95675.0,95675.0,...,95556.0,121707.0,122148.0,122148.0,121874.0,95556.0,95675.0,120933.0,120932.0,120933.0
mean,21146.176303,0.209989,16.61487,250.003775,4.56952,8.649288,6.673168,1.720812,12.293012,0.006531,...,17.768332,33.401527,1.022992,1.372687,9.420844,18.055717,23.069041,10.304171,0.213026,0.7849
std,15861.200129,1.053369,10.077985,2709.08668,38.75915,224.201544,181.380901,19.56671,182.676989,0.069307,...,297.219021,1132.712646,2.489833,3.557419,14.220838,296.889412,333.996539,910.657838,0.161574,0.154556
min,9.0,-0.004659,-18.955043,-2.219512,0.0,-12703.362999,-8756.146012,-4.285714,0.0,0.0,...,-11298.25,0.0,-132.957028,-143.290493,-1794.57,-9926.0,-13325.0,0.0,-1.616178,0.0
25%,9359.0,0.006018,10.959607,12.629838,0.0,1.105584,0.062526,0.0,0.589061,0.000228,...,0.515906,0.440668,0.606213,0.799678,5.57,0.537104,0.666667,0.654802,0.107461,0.716724
50%,17719.0,0.020341,13.660306,30.673811,1.570337,5.515419,2.520016,0.073055,1.615672,0.000891,...,1.687975,6.395833,0.999315,1.330642,9.71,1.724345,2.193966,0.80412,0.181632,0.817938
75%,30076.0,0.074329,18.569188,88.370073,5.955355,18.627293,8.583587,0.504885,4.568976,0.002675,...,5.496385,20.387993,1.409555,1.831962,14.2775,5.581208,7.181612,0.930978,0.281773,0.892483
max,91385.0,46.992215,99.9627,415196.0,7571.369245,18554.620817,28635.576144,2481.0,37623.0,8.486419,...,59857.0,232600.0,184.960644,286.022819,589.99,59872.666667,48666.5,211937.0,15.655395,2.616178


In [80]:
# # Run the plot function
# plot_feature_distributions(camels_ratios_df, camels_ratios_df.columns[3:])

In [81]:
cleaned_camels_ratios_df = clean_camels_ratios(camels_ratios_df)

In [82]:
# # Run the plot function
# plot_feature_distributions(cleaned_camels_ratios_df, camels_ratios_df.columns[3:])

In [83]:
print(cleaned_camels_ratios_df.describe())

               cert  equity_to_assets  tier1_rwa_ratio  total_equity_ratio  \
count  95556.000000      95556.000000     95556.000000        95556.000000   
mean   19388.842846          0.106428        15.392430          113.017935   
std    14488.624947          0.225275         7.520891          219.211037   
min        9.000000         -0.004659         0.000000            0.000000   
25%     8800.750000          0.006050        10.820852           12.629838   
50%    16583.000000          0.020411        13.162137           30.673811   
75%    28555.000000          0.074486        17.255818           88.370073   
max    91363.000000          1.000000        99.694220         1000.000000   

       dividends_to_equity  loan_loss_reserve_to_loans  \
count         95556.000000                89002.000000   
mean              3.853094                   16.827259   
std               4.835326                   26.804673   
min               0.000000                    0.000000   
25%    

In [84]:
def parse_quarter_to_date(quarter_str:str) -> pd.Timestamp:
    year,qtr = quarter_str.split("_")
    quarter_month_map = {"Q1": "01", "Q2": "04", "Q3": "07", "Q4": "10"}
    month = quarter_month_map[qtr]
    return pd.to_datetime(f"{year}-{month}-01")

In [85]:
cleaned_camels_ratios_df["date"] = cleaned_camels_ratios_df["quarter"].apply(parse_quarter_to_date)

In [86]:
def engineer_lag_features_optimized(df: pd.DataFrame, features: list, max_lag: int = 8,
                                    bank_id_col: str = "cert") -> pd.DataFrame:
    """
    Add lag features for specified CAMELS variables.
    
    """
    df = df.copy()
    
    
    df = df.sort_values(by=[bank_id_col, "date"])

    lag_dfs = [df]

    # Generate lag features for each specified variable
    for lag in range(1, max_lag + 1):
        lagged = (
            df.groupby(bank_id_col)[features]
              .shift(lag)
              .add_suffix(f"_lag{lag}")
        )
        lag_dfs.append(lagged)

    # Concatenate all in one go for performance
    df_with_lags = pd.concat(lag_dfs, axis=1)

    return df_with_lags

In [87]:
# cleaned_camels_ratios_df.columns[3:-1]

In [88]:
engineered_df = engineer_lag_features_optimized(cleaned_camels_ratios_df,features = camels_ratios_df.columns[3:-1])

In [89]:
engineered_df = engineered_df[engineered_df["quarter"] == '2006_Q4']

In [90]:
len(engineered_df)

6549

In [91]:
with pd.option_context('display.max_rows', None):
    print(engineered_df.isnull().sum())

cert                                            0
bank_name                                       0
quarter                                         0
equity_to_assets                                0
tier1_rwa_ratio                                 0
total_equity_ratio                              0
dividends_to_equity                             0
loan_loss_reserve_to_loans                    309
noncurrent_loans_to_loans                     309
ore_to_assets                                   0
assets_past_due_30_89_to_assets                 0
noncurrent_real_estate_to_assets                0
noncurrent_re_to_loans                        309
efficiency_ratio_proxy                          1
noninterest_income_to_assets                    0
operating_income_to_assets                      0
assets_per_employee                             1
return_on_assets                                0
pretax_return_on_assets                         0
return_on_equity                                0


In [92]:
# def impute_selected_with_mean(df, columns):
#     df = df.copy()
#     for col in columns:
#         mean_value = df[col].mean()
#         df[col] = df[col].fillna(mean_value)
#     return df

# # Columns you want to impute with mean
# cols_to_impute_with_mean = ["efficiency_ratio_proxy","return_on_assets", "return_on_equity",]

# # Apply the function
# engineered_df = impute_selected_with_mean(engineered_df, cols_to_impute_with_mean)

In [93]:
def impute_selected_with_mean(df, columns):
    df = df.copy()
    for col in columns:
        mean_value = df[col].median()
        df[col] = df[col].fillna(mean_value)
    return df

cols_to_impute_with_median = engineered_df.columns[3:]

# Apply the functio
engineered_df = impute_selected_with_mean(engineered_df, cols_to_impute_with_median)

In [94]:
# # Run the plot function
# plot_feature_distributions(engineered_df, engineered_df.columns[3:])

In [95]:
len(engineered_df.columns)

194

In [96]:
merged_df = engineered_df.merge(label_df, on = "cert", how = "left")
merged_df["label"] = merged_df["label"].fillna(0).astype(int) 

KeyError: 'cert'

In [None]:
X = merged_df.drop(columns=['cert', 'bank_name', 'quarter', 'label'], errors='ignore')
y = merged_df['label']

In [None]:
X = merged_df.drop(columns=['cert', 'bank_name', 'quarter', 'date', 'label'], errors='ignore')
y = merged_df['label']

In [None]:
len(merged_df)
len(merged_df[merged_df['label']== 1])

261

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [101]:
y_test[y_test == 1]

6005    1
3639    1
5637    1
5140    1
4653    1
       ..
3527    1
639     1
5597    1
6422    1
1375    1
Name: label, Length: 78, dtype: int64

In [35]:
scaler = StandardScaler()
# Re-standardize for Logistic Regression
X_train_scaled = scaler.fit_transform(X_train.fillna(0))
X_test_scaled = scaler.transform(X_test.fillna(0))

In [36]:
# Ensure target is integer type
y_train = y_train.astype(int)
y_test = y_test.astype(int)

# Define models
lr_model = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
rf_model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
xgb_model = XGBClassifier(
    n_estimators=100,
    eval_metric='logloss',
    scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum(),
    random_state=42
)

# Train models
lr_model.fit(X_train_scaled, y_train)
lr_preds = lr_model.predict(X_test_scaled)
lr_probs = lr_model.predict_proba(X_test_scaled)[:, 1]

rf_model.fit(X_train_scaled, y_train)
rf_preds = rf_model.predict(X_test_scaled)
rf_probs = rf_model.predict_proba(X_test_scaled)[:, 1]

xgb_model.fit(X_train_scaled, y_train)
xgb_preds = xgb_model.predict(X_test_scaled)
xgb_probs = xgb_model.predict_proba(X_test_scaled)[:, 1]

# Collect results
results_cleaned = {
    'Logistic Regression': {
        'ROC AUC': roc_auc_score(y_test, lr_probs),
        'Confusion Matrix': confusion_matrix(y_test, lr_preds),
        'Classification Report': classification_report(y_test, lr_preds, output_dict=True)
    },
    'Random Forest': {
        'ROC AUC': roc_auc_score(y_test, rf_probs),
        'Confusion Matrix': confusion_matrix(y_test, rf_preds),
        'Classification Report': classification_report(y_test, rf_preds, output_dict=True)
    },
    'XGBoost': {
        'ROC AUC': roc_auc_score(y_test, xgb_probs),
        'Confusion Matrix': confusion_matrix(y_test, xgb_preds),
        'Classification Report': classification_report(y_test, xgb_preds, output_dict=True)
    }
}

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [37]:
results_cleaned

{'Logistic Regression': {'ROC AUC': np.float64(0.7385688856277091),
  'Confusion Matrix': array([[1553,  334],
         [  36,   42]]),
  'Classification Report': {'0': {'precision': 0.9773442416614223,
    'recall': 0.8229994700582935,
    'f1-score': 0.8935558112773303,
    'support': 1887.0},
   '1': {'precision': 0.11170212765957446,
    'recall': 0.5384615384615384,
    'f1-score': 0.18502202643171806,
    'support': 78.0},
   'accuracy': 0.811704834605598,
   'macro avg': {'precision': 0.5445231846604984,
    'recall': 0.680730504259916,
    'f1-score': 0.5392889188545242,
    'support': 1965.0},
   'weighted avg': {'precision': 0.9429828753040971,
    'recall': 0.811704834605598,
    'f1-score': 0.8654308060773518,
    'support': 1965.0}}},
 'Random Forest': {'ROC AUC': np.float64(0.7358478387890153),
  'Confusion Matrix': array([[1887,    0],
         [  78,    0]]),
  'Classification Report': {'0': {'precision': 0.9603053435114504,
    'recall': 1.0,
    'f1-score': 0.97975077

Results are bad , lets resample out minority class by using SMOTE technique

In [38]:
from imblearn.over_sampling import SMOTE

In [39]:
# Initialize SMOTE
smote = SMOTE(random_state=42)

# Apply on scaled features
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

In [40]:
import numpy as np
unique, counts = np.unique(y_train_resampled, return_counts=True)
print(dict(zip(unique, counts)))

{np.int64(0): np.int64(4401), np.int64(1): np.int64(4401)}


In [41]:
# Predict probabilities
lr_probs = lr_model.predict_proba(X_test_scaled)[:, 1]
rf_probs = rf_model.predict_proba(X_test_scaled)[:, 1]
xgb_probs = xgb_model.predict_proba(X_test_scaled)[:, 1]

# Predict class labels
lr_preds = lr_model.predict(X_test_scaled)
rf_preds = rf_model.predict(X_test_scaled)
xgb_preds = xgb_model.predict(X_test_scaled)

In [42]:
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report

results_after_smote = {
    'Logistic Regression (SMOTE)': {
        'ROC AUC': roc_auc_score(y_test, lr_probs),
        'Confusion Matrix': confusion_matrix(y_test, lr_preds),
        'Classification Report': classification_report(y_test, lr_preds, output_dict=True)
    },
    'Random Forest (SMOTE)': {
        'ROC AUC': roc_auc_score(y_test, rf_probs),
        'Confusion Matrix': confusion_matrix(y_test, rf_preds),
        'Classification Report': classification_report(y_test, rf_preds, output_dict=True)
    },
    'XGBoost (SMOTE)': {
        'ROC AUC': roc_auc_score(y_test, xgb_probs),
        'Confusion Matrix': confusion_matrix(y_test, xgb_preds),
        'Classification Report': classification_report(y_test, xgb_preds, output_dict=True)
    }
}

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [43]:
for model_name, metrics in results_after_smote.items():
    print(f"\n=== {model_name} ===")
    print(f"ROC AUC: {metrics['ROC AUC']:.3f}")
    print("Confusion Matrix:")
    print(metrics['Confusion Matrix'])
    print("Classification Report:")
    print(classification_report(y_test, 
                                lr_preds if "Logistic" in model_name else 
                                rf_preds if "Random" in model_name else 
                                xgb_preds,
                                zero_division=0))


=== Logistic Regression (SMOTE) ===
ROC AUC: 0.739
Confusion Matrix:
[[1553  334]
 [  36   42]]
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.82      0.89      1887
           1       0.11      0.54      0.19        78

    accuracy                           0.81      1965
   macro avg       0.54      0.68      0.54      1965
weighted avg       0.94      0.81      0.87      1965


=== Random Forest (SMOTE) ===
ROC AUC: 0.736
Confusion Matrix:
[[1887    0]
 [  78    0]]
Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1887
           1       0.00      0.00      0.00        78

    accuracy                           0.96      1965
   macro avg       0.48      0.50      0.49      1965
weighted avg       0.92      0.96      0.94      1965


=== XGBoost (SMOTE) ===
ROC AUC: 0.766
Confusion Matrix:
[[1872   15]
 [  74    4]]
Classification Report:
    

In [44]:
from sklearn.metrics import precision_recall_curve

probs = xgb_model.predict_proba(X_test_scaled)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, probs)

# Choose a threshold with higher recall (e.g., 0.2)
custom_preds = (probs >= 0.2).astype(int)

In [45]:
print(confusion_matrix(y_test, custom_preds))
print(classification_report(y_test, custom_preds))

[[1851   36]
 [  69    9]]
              precision    recall  f1-score   support

           0       0.96      0.98      0.97      1887
           1       0.20      0.12      0.15        78

    accuracy                           0.95      1965
   macro avg       0.58      0.55      0.56      1965
weighted avg       0.93      0.95      0.94      1965



In [46]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

# Feature selector based on feature importance
selector = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42), threshold='median')

# Fit on the resampled training data
selector.fit(X_train_resampled, y_train_resampled)

# Transform train and test data
X_train_sel = selector.transform(X_train_resampled)
X_test_sel = selector.transform(X_test_scaled)

In [47]:
print("Selected features:", X_train_sel.shape[1], "out of", X_train_resampled.shape[1])

Selected features: 95 out of 190


In [48]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

# Define base models
lr = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
xgb = XGBClassifier(n_estimators=100, eval_metric='logloss', scale_pos_weight=(y_train_resampled == 0).sum() / (y_train_resampled == 1).sum(), random_state=42)

# Combine in a soft voting ensemble
voting_model = VotingClassifier(
    estimators=[('lr', lr), ('xgb', xgb)],
    voting='soft'
)

# Fit ensemble on reduced training set
voting_model.fit(X_train_sel, y_train_resampled)

In [49]:
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report

# Predict
voting_probs = voting_model.predict_proba(X_test_sel)[:, 1]
voting_preds = voting_model.predict(X_test_sel)

# Evaluate
print("=== Voting Ensemble ===")
print(f"ROC AUC: {roc_auc_score(y_test, voting_probs):.3f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, voting_preds))
print("Classification Report:")
print(classification_report(y_test, voting_preds, zero_division=0))


=== Voting Ensemble ===
ROC AUC: 0.774
Confusion Matrix:
[[1809   78]
 [  64   14]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.96      0.96      1887
           1       0.15      0.18      0.16        78

    accuracy                           0.93      1965
   macro avg       0.56      0.57      0.56      1965
weighted avg       0.93      0.93      0.93      1965

