# Bank Loan Defaulter Hackathon Project
### Submitted by Himanshu Saini
#### Dataset Link: https://www.kaggle.com/datasets/ankitkalauni/bank-loan-defaulter-prediction-hackathon

In [1]:
import warnings
warnings.filterwarnings("ignore", category = DeprecationWarning)
warnings.filterwarnings("ignore", category = FutureWarning)
warnings.filterwarnings("ignore", category = UserWarning)

In [21]:
# Basic libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from scipy import stats
import joblib


# Apply the seaborn theme FIRST. sns.set_theme() rewrites matplotlib's
# rcParams (font family, font sizes, axes styling), so calling it after the
# custom update below would silently discard most of those settings.
sns.set_theme(context="notebook", style="ticks")

# Global style for plots (layered on top of the seaborn theme)
plt.rcParams.update({
    "font.family": "Times New Roman",
    "font.size": 12,
    "axes.titlesize": 14,
    "axes.labelsize": 12,
    "axes.edgecolor": "black",
    "axes.linewidth": 1,
    "xtick.color": "black",
    "ytick.color": "black",
    "xtick.direction": "out",
    "ytick.direction": "out",
    "xtick.bottom": True,
    "ytick.left": True,
    "xtick.top": False,
    "ytick.right": False,
    "figure.dpi": 150,
    "legend.frameon": True,
    "legend.facecolor": "white",
    "legend.edgecolor": "black",
    "legend.fontsize": 12
})

# Reproducibility
rnd_num = 42
np.random.seed(rnd_num)

# Root directory
ROOT = Path(os.getcwd()).parent

In [32]:
# Figure saver helper (use after each plot)
def savefig(name, out_dir):
    """Save the current matplotlib figure to ``out_dir/name``.

    Parameters
    ----------
    name : str
        File name of the figure (e.g. ``"01_plot.png"``).
    out_dir : str or path-like
        Target directory; it is created (including parents) if missing.

    Side effects
    ------------
    Calls ``plt.tight_layout()`` on the current figure, writes the image at
    300 dpi with a tight bounding box, and prints the path that was written.
    """
    out = Path(out_dir)
    out.mkdir(exist_ok=True, parents=True)
    target = out / name
    plt.tight_layout()
    plt.savefig(target, dpi=300, bbox_inches="tight")
    # Report the file itself, not just the directory it was written into.
    print("Saved:", target)

In [4]:
# Load the raw train and test CSV files from the local data directory
df_train, df_test = map(
    pd.read_csv,
    ("../data/raw/train.csv", "../data/raw/test.csv"),
)

### Inspecting training data.

In [10]:
# First five rows.
df_train.head()

In [11]:
df_train.shape

In [12]:
df_train.info()

In [13]:
# List the training columns grouped by dtype.
# DataFrame.groupby(..., axis=1) is deprecated in recent pandas releases, so
# group the dtypes Series (indexed by column name) by its own values instead.
for dtype, cols in df_train.dtypes.groupby(df_train.dtypes):
    print(f"{dtype} ({len(cols)} columns):")
    print(list(cols.index))

In [14]:
# Summary statistics.
df_train.describe()

### Inspecting test data.

In [15]:
# First five rows.
df_test.head()

In [16]:
df_test.shape

In [17]:
df_test.info()

In [18]:
# List the test columns grouped by dtype.
# DataFrame.groupby(..., axis=1) is deprecated in recent pandas releases, so
# group the dtypes Series (indexed by column name) by its own values instead.
for dtype, cols in df_test.dtypes.groupby(df_test.dtypes):
    print(f"{dtype} ({len(cols)} columns):")
    print(list(cols.index))

In [19]:
# Summary statistics.
df_test.describe()

In [20]:
# Clean the test data (remove target column).
# Rebinding with errors="ignore" (instead of an in-place drop) keeps this cell
# safe to re-run: a second execution no longer raises KeyError.
df_test = df_test.drop(columns="Loan Status", errors="ignore")

In [23]:
# Split the training columns into categorical (object) and numerical
# (int64/float64) name lists.
_train_object_part = df_train.select_dtypes(include="object")
_train_number_part = df_train.select_dtypes(include=["int64", "float64"])
df_train_categorical_cols = list(_train_object_part.columns)
df_train_numerical_cols = list(_train_number_part.columns)

In [24]:
# Split the test columns into categorical (object) and numerical
# (int64/float64) name lists.
_test_object_part = df_test.select_dtypes(include="object")
_test_number_part = df_test.select_dtypes(include=["int64", "float64"])
df_test_categorical_cols = list(_test_object_part.columns)
df_test_numerical_cols = list(_test_number_part.columns)

In [25]:
# Print every numerical training column that has at most 200 distinct values,
# together with that distinct-value count.
for col in df_train_numerical_cols:
    n_distinct = df_train[col].nunique()
    if n_distinct > 200:
        continue
    print(col, n_distinct)

In [26]:
# Print every categorical training column that has at most 200 distinct
# values, together with that distinct-value count.
for col in df_train_categorical_cols:
    n_distinct = df_train[col].nunique()
    if n_distinct > 200:
        continue
    print(col, n_distinct)

In [33]:
# Horizontal bar chart of the number of distinct values per column (log scale),
# with a dashed reference line at 40 distinct values.
unique_counts = df_train.nunique().sort_values()
ax = unique_counts.plot(kind="barh", figsize=(10, 12))
ax.set_xscale("log")
ax.set_title("Unique Values per Column")
ax.axvline(x=40, color='red', linestyle='--', label='Threshold')
ax.legend()
plt.tight_layout()
savefig("01_UniqueValuesPerColumn.png", "../results")
plt.show()

In [34]:
# Keep only the training columns with at most 40 distinct values
# (treated here as categorical-like features) and inspect them.
is_low_cardinality = df_train.nunique() <= 40
df_train_filter = df_train.loc[:, is_low_cardinality]
df_train_filter.info()

In [35]:
# Define outlier capping function (capping and flooring).
def outlier_processing(x, lower_q=0.05, upper_q=0.95):
    """Cap a numeric Series at two of its own quantiles.

    Values below the ``lower_q`` quantile are raised to it and values above
    the ``upper_q`` quantile are lowered to it; everything in between is
    returned unchanged.

    Parameters
    ----------
    x : pandas.Series
        Numeric values to cap.
    lower_q : float, default 0.05
        Quantile used as the floor.
    upper_q : float, default 0.95
        Quantile used as the ceiling.

    Returns
    -------
    pandas.Series
        A new Series with the same index as ``x``; ``x`` is not modified.
    """
    lower_bound = x.quantile(lower_q)
    upper_bound = x.quantile(upper_q)
    return x.clip(lower=lower_bound, upper=upper_bound)

In [36]:
# Apply capping to the numeric FEATURE columns of the train dataset.
# The binary target ("Loan Status") and the row identifier ("ID") are excluded:
# clipping the target at its 5th/95th percentiles would wipe out the minority
# class whenever it makes up less than 5% of the rows, and capping an
# identifier has no meaning.
cols_to_cap = [col for col in df_train_numerical_cols if col not in ("ID", "Loan Status")]
df_train[cols_to_cap] = df_train[cols_to_cap].apply(outlier_processing)

In [37]:
# Describe Data After Capping
df_train[df_train_numerical_cols].describe(percentiles=[0.01,0.05,0.25,0.50,0.75,0.95,0.99])

In [38]:
# Variance of Columns
df_train[df_test_numerical_cols].var().sort_values()

In [39]:
# Variance cut-offs: below drop_threshold a column is a drop candidate,
# between drop_threshold and flag_threshold it is only flagged for review.
drop_threshold = 0.0001
flag_threshold = 0.05

# Per-column variance, computed once and reused for both lists below
feature_variances = df_train[df_test_numerical_cols].var(numeric_only=True)

# Zero/near-zero variance columns
zero_var_cols = feature_variances[feature_variances < drop_threshold].index.tolist()

# Low-variance columns to review (not dropped)
in_review_band = (feature_variances >= drop_threshold) & (feature_variances < flag_threshold)
low_var_cols = feature_variances[in_review_band].index.tolist()

print("Drop these (zero or near-zero variance):", zero_var_cols)
print("Review these (low variance):", low_var_cols)

In [40]:
# Thresholds
dominance_threshold = 0.95
high_card_threshold = 100


def _profile_categorical(col):
    """Summarise one categorical training column: cardinality, share of its
    most frequent value, and the two drop flags derived from the thresholds."""
    n_unique = df_train[col].nunique(dropna=False)
    top_freq = df_train[col].value_counts(normalize=True, dropna=False).iloc[0]
    return {
        "Unique Values": n_unique,
        "Top Category %": round(top_freq * 100, 2),
        "Drop (High Cardinality)": n_unique > high_card_threshold,
        "Drop (Dominant Category)": top_freq > dominance_threshold
    }


# One summary per categorical column, keyed by column name
cat_analysis = {col: _profile_categorical(col) for col in df_test_categorical_cols}

# One row per column, ordered by cardinality
cat_analysis_df = pd.DataFrame(cat_analysis).T.sort_values(by="Unique Values")

In [41]:
cat_analysis_df

### From the above analysis we found the following:
### 1. Drop the numerical columns "Collection 12 months Medical" and "Accounts Delinquent" because they have zero variance.
### 2. Drop the categorical columns "Payment Plan" and "Application Type" because they are constant or highly imbalanced.

In [42]:
# Columns removed from both datasets: the row identifier plus the
# zero-variance / dominant-category columns identified above.
drop_col = ['ID', 'Collection 12 months Medical', 'Accounts Delinquent', 'Payment Plan', 'Application Type']

# Rebind instead of dropping in place, and ignore columns that are already
# gone, so this cell can be re-run without raising KeyError.
df_train = df_train.drop(columns=drop_col, errors="ignore")
df_test = df_test.drop(columns=drop_col, errors="ignore")

In [43]:
df_test.head()

In [44]:
# Combine Train & Test for Uniform Processing.
# The keys "train"/"test" become the outer index level, which is what the
# later df_combined.loc["train"] / .loc["test"] lookups use to split again.
# NOTE(review): "Loan Status" was dropped from df_test earlier, so the test
# rows presumably carry NaN in that column after the concat - confirm.
df_combined = pd.concat([df_train, df_test], axis=0, keys=["train", "test"])

In [45]:
df_train.columns

In [46]:
df_combined.columns

In [47]:
# Select categorical columns
cat_cols = df_train.select_dtypes(include=['object', 'category']).columns


def _suggest_encoding(col, n_levels):
    """Pick an encoding recommendation from a column's name and cardinality."""
    if n_levels <= 10:
        # 'Grade' is a known ordinal from domain knowledge
        return "Ordinal Encoding" if col in ['Grade'] else "One-Hot Encoding"
    if n_levels <= 50:
        return "Label Encoding"
    if n_levels > 50:
        return "Frequency Encoding / Target Encoding"
    # Defensive fallback for a count that compares false everywhere
    return "Review Manually"


# One summary record per categorical column
encoding_suggestions = []
for col in cat_cols:
    unique_vals = df_train[col].nunique(dropna=False)
    # Share of rows that belong to the most frequent category of this column
    level_shares = df_train[col].value_counts(normalize=True, dropna=False)
    top_cat_pct = level_shares.iloc[0]

    encoding_suggestions.append({
        "Column": col,
        "Unique Values": unique_vals,
        "Top Category %": round(top_cat_pct * 100, 2),
        "Suggested Encoding": _suggest_encoding(col, unique_vals),
        "Note": "Dominant Category" if top_cat_pct > 0.95 else ""
    })

# Convert to DataFrame, ordered by cardinality
encoding_df = pd.DataFrame(encoding_suggestions).sort_values(by="Unique Values")

In [48]:
encoding_df

In [49]:
# Implementing encoding

from sklearn.preprocessing import LabelEncoder

# --- 1. Ordinal Encoding for 'Grade'
# Grades outside A-G (or missing) are mapped to NaN by Series.map.
grade_order = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7}
df_combined["Grade"] = df_combined["Grade"].map(grade_order)

# --- 2. Label Encoding for 'Sub Grade' and 'Batch Enrolled'
# A fresh encoder is fitted per column and kept in `label_encoders`, so each
# column's label <-> integer mapping stays available (a single shared encoder
# would only remember the last column it was fitted on).
label_cols = ["Sub Grade", "Batch Enrolled"]
label_encoders = {}
for col in label_cols:
    le = LabelEncoder()
    df_combined[col] = le.fit_transform(df_combined[col])
    label_encoders[col] = le

# --- 3. Frequency Encoding for 'Loan Title'
# Counts are taken over the combined train + test rows.
freq_map = df_combined["Loan Title"].value_counts().to_dict()
df_combined["Loan Title"] = df_combined["Loan Title"].map(freq_map)

# --- 4. One-Hot Encoding for 'Initial List Status', 'Employment Duration', 'Verification Status'
# drop_first=True removes one dummy per feature to avoid redundant columns.
one_hot_cols = ["Initial List Status", "Employment Duration", "Verification Status"]
df_combined = pd.get_dummies(df_combined, columns=one_hot_cols, drop_first=True)

In [50]:
df_combined.head()

In [51]:
df_combined.columns

In [52]:
# Restore train and test from MultiIndex
# Selecting the outer key ("train"/"test", set in the earlier pd.concat) gives
# back each dataset; .copy() makes the results independent of df_combined.
df_train_encoded = df_combined.loc["train"].copy()
df_test_encoded = df_combined.loc["test"].copy()

In [53]:
print ("Loan Status" in df_train_encoded.columns)

In [54]:
print ("Loan Status" in df_test_encoded.columns)

In [55]:
df_test_encoded["Loan Status"].nunique()

In [56]:
df_test_encoded.head()

In [57]:
df_test_encoded.info()

### Distribution Analysis.

In [59]:
# Import Libraries
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from collections import Counter

In [60]:
# Separate the target from the features for the training data, and build the
# matching feature frame for the test data.
y = df_train_encoded["Loan Status"]
X = df_train_encoded.drop(columns=["Loan Status"])
X_test = df_test_encoded.drop(columns=["Loan Status"])

# Class counts before any resampling
print("Original Class Distribution:", Counter(y), sep="\n")

In [61]:
# Over-sample with SMOTE (seeded) and show the resulting class counts
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)
print("After SMOTE", Counter(y_smote), sep="\n")

In [62]:
# Under-sample with NearMiss (version 1) and show the resulting class counts
nearmiss = NearMiss(version=1)
X_nm, y_nm = nearmiss.fit_resample(X, y)
print("After NearMiss:", Counter(y_nm), sep="\n")

In [63]:
# One row per sampling strategy, one column per class label.
comparison_df = pd.DataFrame({
    "Original": pd.Series(Counter(y)),
    "SMOTE": pd.Series(Counter(y_smote)),
    "NearMiss": pd.Series(Counter(y_nm))
}).T
# Counter keeps first-seen order, so the class columns can arrive as [1, 0].
# Sort them by class label before attaching the positional display names,
# otherwise the two labels could be swapped.
comparison_df = comparison_df.sort_index(axis=1)
comparison_df.columns = ["Non-Defaulter (0)", "Defaulter (1)"]
print("Class Distribution Comparison:\n")
display(comparison_df)

In [64]:
# Grouped bar chart of the class counts for each sampling strategy
ax = comparison_df.plot(kind="bar", figsize=(8, 4), colormap="viridis")
ax.set_title("Class Distribution: Original vs SMOTE vs NearMiss")
ax.set_ylabel("Number of Samples")
plt.xticks(rotation=0)
ax.grid(axis='y')
plt.tight_layout()
savefig("02_DistributionPlot.png", "../results")
plt.show()

### ML Modeling

In [65]:
X.head()

In [66]:
y.head()

In [67]:
X_test.head()

In [139]:
# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Metrics (for classification)
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, make_scorer
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.metrics import roc_curve, precision_recall_curve
from sklearn.metrics import log_loss

# Model Selection (Optional for tuning)
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV, StratifiedKFold

# Pipeline
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline


In [69]:
def evaluate_model(model, X_train, y_train, X_test, y_test, name=""):
    """Fit ``model`` and report binary-classification metrics on a hold-out set.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict``; it is fitted IN PLACE on
        ``(X_train, y_train)``. ``predict_proba`` or ``decision_function`` is
        used for ROC AUC when available.
    X_train, y_train
        Training features and labels passed to ``model.fit``.
    X_test, y_test
        Hold-out features and labels the metrics are computed on.
    name : str, default ""
        Label printed in the report and stored under ``"Model"``.

    Returns
    -------
    dict
        Keys ``"Model"``, ``"Accuracy"``, ``"Precision"``, ``"Recall"``,
        ``"F1 Score"``, ``"ROC AUC"`` (``None`` if the model exposes no score
        output), ``"Confusion Matrix"`` and ``"Feature Importance"`` (``None``
        if the model exposes neither ``feature_importances_`` nor ``coef_``).
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Continuous scores for ROC AUC: prefer the second predict_proba column,
    # fall back to decision_function for models without probabilities.
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_prob = model.decision_function(X_test)
    else:
        y_prob = None

    acc = accuracy_score(y_test, y_pred)
    # zero_division=0 on every ratio metric: a model that never predicts the
    # positive class scores 0 without emitting a warning.
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    roc_auc = roc_auc_score(y_test, y_prob) if y_prob is not None else None
    con_mat = confusion_matrix(y_test, y_pred)

    print(f"Results: {name}")
    print("-" * 40)
    print(f"Accuracy      : {acc:.4f}")
    print(f"Precision     : {prec:.4f}")
    print(f"Recall        : {rec:.4f}")
    print(f"F1 Score      : {f1:.4f}")
    print(f"ROC AUC Score : {roc_auc:.4f}" if roc_auc is not None else "ROC AUC not available")
    print("\nConfusion Matrix:")
    print(con_mat)
    print("-" * 40)

    # Add feature importance if supported
    feature_importance = None
    if hasattr(model, "feature_importances_"):
        feature_importance = model.feature_importances_
    elif hasattr(model, "coef_"):
        try:
            # Absolute coefficients of the first row as a rough importance
            feature_importance = abs(model.coef_[0])
        except Exception:
            # Best effort only: an unusual coef_ layout just means "no
            # importance". (Exception, not a bare except, so that
            # KeyboardInterrupt/SystemExit still propagate.)
            feature_importance = None

    return {
        "Model": name,
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1 Score": f1,
        "ROC AUC": roc_auc,
        "Confusion Matrix": con_mat,
        "Feature Importance": feature_importance
    }

In [155]:
# Step 1: Split the data (80/20) for clean validation
from sklearn.model_selection import train_test_split

# Original (X, y): stratified hold-out split with a fixed seed.
# NOTE: this rebinds X_test. From here on X_test/y_test are the 20% VALIDATION
# split of the labelled training data, not the competition test features
# assigned to X_test in the earlier "Prepare Features and Target" cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Competition test features (no labels); used later for the submission files.
test = df_test_encoded.drop(columns=["Loan Status"])

# Over-sampled training split using SMOTE (fitted on X_train/y_train only,
# the validation split is left untouched)
smote = SMOTE(random_state=42)
X_smote_train, y_smote_train = smote.fit_resample(X_train, y_train)

# Under-sampled training split using NearMiss with its default settings
# (fitted on X_train/y_train only)
nearmiss = NearMiss()
X_nm_train, y_nm_train = nearmiss.fit_resample(X_train, y_train)

## Model 1. Logistic Regression

In [71]:
# Logistic regression with a raised iteration cap
lr = LogisticRegression(max_iter=1000)

# (report label, training features, training labels) for each sampling variant;
# every variant is scored on the same hold-out split.
lr_runs = [
    ("LogReg - Original", X_train, y_train),
    ("LogReg - SMOTE", X_smote_train, y_smote_train),
    ("LogReg - NearMiss", X_nm_train, y_nm_train),
]
result_lr_orig, result_lr_smote, result_lr_nm = [
    evaluate_model(lr, X_fit, y_fit, X_test, y_test, name=label)
    for label, X_fit, y_fit in lr_runs
]


## Model 2. Decision Tree Classifier

In [72]:
# Decision tree with a fixed seed
dt = DecisionTreeClassifier(random_state=42)

# (report label, training features, training labels) for each sampling variant;
# every variant is scored on the same hold-out split.
dt_runs = [
    ("DecisionTree - Original", X_train, y_train),
    ("DecisionTree - SMOTE", X_smote_train, y_smote_train),
    ("DecisionTree - NearMiss", X_nm_train, y_nm_train),
]
result_dt_orig, result_dt_smote, result_dt_nm = [
    evaluate_model(dt, X_fit, y_fit, X_test, y_test, name=label)
    for label, X_fit, y_fit in dt_runs
]

## Model 3. Random Forest Classifier

In [73]:
# Random forest with default settings and a fixed seed
rf = RandomForestClassifier(random_state=42)

# (report label, training features, training labels) for each sampling variant;
# every variant is scored on the same hold-out split.
rf_runs = [
    ("RandomForest - Original", X_train, y_train),
    ("RandomForest - SMOTE", X_smote_train, y_smote_train),
    ("RandomForest - NearMiss", X_nm_train, y_nm_train),
]
result_rf_orig, result_rf_smote, result_rf_nm = [
    evaluate_model(rf, X_fit, y_fit, X_test, y_test, name=label)
    for label, X_fit, y_fit in rf_runs
]

## Model 4: Support Vector Classifier

In [74]:
# Support vector classifier; probability=True so predict_proba is available
# for the ROC AUC computation
svc = SVC(probability=True, random_state=42)

# (report label, training features, training labels) for each sampling variant;
# every variant is scored on the same hold-out split.
svc_runs = [
    ("SVC - Original", X_train, y_train),
    ("SVC - SMOTE", X_smote_train, y_smote_train),
    ("SVC - NearMiss", X_nm_train, y_nm_train),
]
result_svc_orig, result_svc_smote, result_svc_nm = [
    evaluate_model(svc, X_fit, y_fit, X_test, y_test, name=label)
    for label, X_fit, y_fit in svc_runs
]

## Model 5: XGBoost

In [75]:
# Initialize XGBoost.
# The former `use_label_encoder=False` argument is dropped: it is deprecated
# and no longer used by current XGBoost releases, where passing it only
# triggers an "unused parameter" warning.
xgb = XGBClassifier(eval_metric="logloss", random_state=42)

# Evaluate on Original Data
result_xgb_orig = evaluate_model(xgb, X_train, y_train, X_test, y_test, name="XGBoost - Original")

# Evaluate on SMOTE-balanced Data
result_xgb_smote = evaluate_model(xgb, X_smote_train, y_smote_train, X_test, y_test, name="XGBoost - SMOTE")

# Evaluate on NearMiss-balanced Data
result_xgb_nm = evaluate_model(xgb, X_nm_train, y_nm_train, X_test, y_test, name="XGBoost - NearMiss")


In [76]:
# Collect every evaluation result (one dict per model / sampling combination)
# into a single comparison table, grouped by model family.
all_results = [
    result_lr_orig, result_lr_smote, result_lr_nm,
    result_dt_orig, result_dt_smote, result_dt_nm,
    result_rf_orig, result_rf_smote, result_rf_nm,
    result_svc_orig, result_svc_smote, result_svc_nm,
    result_xgb_orig, result_xgb_smote, result_xgb_nm,
]
model_metrices = pd.DataFrame(all_results)

In [77]:
model_metrices

In [78]:
# Plot the accuracy of the different models.
# Rank the models by Accuracy once and reuse the sorted frame for both axes.
acc_ranked = model_metrices.sort_values(by="Accuracy", ascending=True)

plt.figure(figsize=(10, 6))
bars = plt.barh(acc_ranked["Model"], acc_ranked["Accuracy"], color="dimgray")
plt.xlabel("Accuracy")
plt.title("Model Accuracy Comparison")
plt.grid(axis='x', linestyle="--", alpha=0.7)

# Write each bar's value just past its end, vertically centred on the bar
for bar in bars:
    bar_len = bar.get_width()
    bar_mid = bar.get_y() + bar.get_height() / 2
    plt.text(bar_len + 0.005, bar_mid, f"{bar_len:.2f}", va='center')

plt.tight_layout()
savefig("03_ModelAccuracyComparison.png", "../results")
plt.show()

In [80]:
# Plot the precision of the different models.
# Rank the models by Precision once and reuse the sorted frame for both axes.
prec_ranked = model_metrices.sort_values(by="Precision", ascending=True)

plt.figure(figsize=(10, 6))
bars = plt.barh(prec_ranked["Model"], prec_ranked["Precision"], color="dimgray")
plt.xlabel("Precision")
plt.title("Model Precision Comparison")
plt.grid(axis='x', linestyle="--", alpha=0.7)

# Write each bar's value just past its end, vertically centred on the bar
for bar in bars:
    bar_len = bar.get_width()
    bar_mid = bar.get_y() + bar.get_height() / 2
    plt.text(bar_len + 0.005, bar_mid, f"{bar_len:.2f}", va='center')

plt.tight_layout()
savefig("04_ModelPrecisionComparison.png", "../results")
plt.show()

In [81]:
# Plot the recall of the different models.
# Rank the models by Recall once and reuse the sorted frame for both axes.
recall_ranked = model_metrices.sort_values(by="Recall", ascending=True)

plt.figure(figsize=(10, 6))
bars = plt.barh(recall_ranked["Model"], recall_ranked["Recall"], color="dimgray")
plt.xlabel("Recall")
plt.title("Model Recall Comparison")
plt.grid(axis='x', linestyle="--", alpha=0.7)

# Write each bar's value just past its end, vertically centred on the bar
for bar in bars:
    bar_len = bar.get_width()
    bar_mid = bar.get_y() + bar.get_height() / 2
    plt.text(bar_len + 0.005, bar_mid, f"{bar_len:.2f}", va='center')

plt.tight_layout()
savefig("05_ModelRecallComparison.png", "../results")
plt.show()

In [83]:
# Plot the F1 score of the different models.
# Rank the models by F1 Score once and reuse the sorted frame for both axes.
f1_ranked = model_metrices.sort_values(by="F1 Score", ascending=True)

plt.figure(figsize=(10, 6))
bars = plt.barh(f1_ranked["Model"], f1_ranked["F1 Score"], color="dimgray")
plt.xlabel("F1 Score")
plt.title("Model F1 Score Comparison")
plt.grid(axis='x', linestyle="--", alpha=0.7)

# Write each bar's value just past its end, vertically centred on the bar
for bar in bars:
    bar_len = bar.get_width()
    bar_mid = bar.get_y() + bar.get_height() / 2
    plt.text(bar_len + 0.005, bar_mid, f"{bar_len:.2f}", va='center')

plt.tight_layout()
savefig("06_ModelF1ScoreComparison.png", "../results")
plt.show()

In [84]:
# Plot the ROC AUC of the different models.
# Rank the models by ROC AUC once and reuse the sorted frame for both axes.
auc_ranked = model_metrices.sort_values(by="ROC AUC", ascending=True)

plt.figure(figsize=(10, 6))
bars = plt.barh(auc_ranked["Model"], auc_ranked["ROC AUC"], color="dimgray")
plt.xlabel("ROC AUC")
plt.title("Model ROC AUC Comparison")
plt.grid(axis='x', linestyle="--", alpha=0.7)

# Write each bar's value just past its end, vertically centred on the bar
for bar in bars:
    bar_len = bar.get_width()
    bar_mid = bar.get_y() + bar.get_height() / 2
    plt.text(bar_len + 0.005, bar_mid, f"{bar_len:.2f}", va='center')

plt.tight_layout()
savefig("07_Model_ROC_AUC_Comparison.png", "../results")
plt.show()

In [88]:
# Keep only the result rows that carry a feature-importance vector
tree_models = model_metrices[model_metrices["Feature Importance"].notnull()].copy()

# One figure per remaining model, showing its N most important features
N = 10  # Top N features to display (change if needed)
feature_names = X_train.columns  # column order the models were trained on
for count, (idx, row) in enumerate(tree_models.iterrows(), start=1):
    importances = np.array(row["Feature Importance"])

    # Positions of the N largest importances, largest first
    top_idx = np.argsort(importances)[::-1][:N]
    top_features = feature_names[top_idx]
    top_importances = importances[top_idx]

    # Horizontal bars; the arrays are reversed so the largest value is drawn last
    plt.figure(figsize=(8, 5))
    plt.barh(top_features[::-1], top_importances[::-1])
    plt.title(f"Feature Importance: {row['Model']}")
    plt.xlabel("Importance Score")
    plt.tight_layout()
    plt.grid(axis='x', linestyle='--', alpha=0.5)

    # File name: running number + model label
    fig_name = f"{count}_{row['Model']}"
    savefig(fig_name, "../results")
    plt.show()

### With SMOTE

In [147]:
# Seeded stratified 5-fold splitter so the search is reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 'int' is not a valid max_features value (the option is an actual integer,
# a float, "sqrt", "log2" or None), so every candidate using it failed to fit.
# None (= consider all features) is used as the third option instead.
param_grid = {'max_depth': [3, 4, 5], 'max_features': [None, 'sqrt', 'log2']}

# Seeded forest; candidates are ranked by negative log loss and the best one
# is refitted on the full SMOTE training data.
# NOTE(review): the data was over-sampled BEFORE cross-validation, so
# synthetic points derived from a fold's training rows can land in its
# validation part - the CV scores are presumably optimistic.
modelgrid = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, refit=True, verbose=3, cv=cv, scoring='neg_log_loss')
modelgrid.fit(X_smote_train, y_smote_train)

In [148]:
modelgrid.best_params_

In [149]:
# Build the final forest from the grid-search winner rather than from
# hand-copied values, so it cannot drift from what the search actually found.
model_rf_cv = RandomForestClassifier(**modelgrid.best_params_, random_state=42)

In [150]:
model_rf_cv.fit(X_smote_train, y_smote_train)

## Model Evaluation (Log loss metric)

In [151]:
# Get the predicting probablities
predict_train = model_rf_cv.predict_proba(X_smote_train)

In [153]:
predict_val = model_rf_cv.predict_proba(X_test)

In [154]:
log_loss(y_smote_train, predict_train), log_loss(y_test, predict_val)

In [174]:
y_pred_cv_smt = model_rf_cv.predict(X_test)

In [175]:
# Hold-out metrics for the tuned SMOTE random forest
acc, prec, rec, f1 = (
    accuracy_score(y_test, y_pred_cv_smt),
    precision_score(y_test, y_pred_cv_smt, zero_division=0),
    recall_score(y_test, y_pred_cv_smt),
    f1_score(y_test, y_pred_cv_smt),
)
roc_auc = roc_auc_score(y_test, predict_val[:, 1])
con_mat = confusion_matrix(y_test, y_pred_cv_smt)

# Text report, assembled line by line and printed in one go
rule = "-" * 40
report_lines = [
    "Results:",
    rule,
    f"Accuracy      : {acc:.4f}",
    f"Precision     : {prec:.4f}",
    f"Recall        : {rec:.4f}",
    f"F1 Score      : {f1:.4f}",
    f"ROC AUC Score : {roc_auc:.4f}" if roc_auc is not None else "ROC AUC not available",
    "\nConfusion Matrix:",
]
print("\n".join(report_lines))
print(con_mat)
print(rule)

In [156]:
predict_test = model_rf_cv.predict_proba(test)

In [None]:
predict_test

# Generate Submission file

In [158]:
final_result = pd.DataFrame(predict_test)
final_result.head()

In [159]:
submission = pd.DataFrame(final_result[1])
submission

In [160]:
# Name the single probability column and keep six decimal places
submission.columns = ["Loan Status"]
submission["Loan Status"] = submission["Loan Status"].round(6)
submission

In [161]:
submission.to_csv("../results/HS_Submission_SMT.csv", index=False)

### With Nearmiss

In [162]:
from imblearn.pipeline import Pipeline

# Seeded stratified 5-fold splitter
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Under-sampler followed by the classifier, wrapped in an imblearn pipeline
sampler_step = ("nm", NearMiss())
forest_step = ("clf", RandomForestClassifier(random_state=42))
pipe = Pipeline(steps=[sampler_step, forest_step])

# Search space: the NearMiss variant plus the main random-forest settings
param_grid = dict(
    nm__version=[1, 2, 3],
    clf__n_estimators=[300, 500, 800],
    clf__max_depth=[None, 10, 20],
    clf__min_samples_split=[2, 5, 10],
    clf__min_samples_leaf=[1, 2, 4],
    clf__max_features=["sqrt", "log2"],
)

# Exhaustive search, ranked by recall on the positive class (label 1);
# the best candidate is refitted on the whole training split.
gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring=make_scorer(recall_score, pos_label=1),
    cv=cv,
    n_jobs=-1,
    refit=True,
    verbose=1,
    return_train_score=True,
)

In [163]:
gs.fit(X_train, y_train)

In [164]:
# Get the predicting probablities
predict_train_gs = gs.predict_proba(X_train)

In [165]:
predict_val_gs = gs.predict_proba(X_test)

In [166]:
log_loss(y_train, predict_train_gs), log_loss(y_test, predict_val_gs)

In [169]:
y_pred_gs = gs.predict(X_test)

In [171]:
y_test.shape

In [172]:
predict_val_gs.shape

In [173]:
# Hold-out metrics for the NearMiss + random forest grid-search winner
acc, prec, rec, f1 = (
    accuracy_score(y_test, y_pred_gs),
    precision_score(y_test, y_pred_gs, zero_division=0),
    recall_score(y_test, y_pred_gs),
    f1_score(y_test, y_pred_gs),
)
roc_auc = roc_auc_score(y_test, predict_val_gs[:, 1])
con_mat = confusion_matrix(y_test, y_pred_gs)

# Text report, assembled line by line and printed in one go
rule = "-" * 40
report_lines = [
    "Results:",
    rule,
    f"Accuracy      : {acc:.4f}",
    f"Precision     : {prec:.4f}",
    f"Recall        : {rec:.4f}",
    f"F1 Score      : {f1:.4f}",
    f"ROC AUC Score : {roc_auc:.4f}" if roc_auc is not None else "ROC AUC not available",
    "\nConfusion Matrix:",
]
print("\n".join(report_lines))
print(con_mat)
print(rule)

In [176]:
predict_test_gs = gs.predict_proba(test)

In [177]:
predict_test_gs

# Generate Submission file

In [178]:
final_result = pd.DataFrame(predict_test_gs)
final_result.head()

In [179]:
submission = pd.DataFrame(final_result[1])
submission

In [180]:
# Name the single probability column and keep six decimal places
submission.columns = ["Loan Status"]
submission["Loan Status"] = submission["Loan Status"].round(6)
submission

In [181]:
submission.to_csv("../results/HS_Submission_NM_GS.csv", index=False)

In [144]:
from scipy.stats import randint

# Seeded stratified 5-fold splitter
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Sampler + classifier pipeline; the sampler step itself is part of the
# search space below and can be switched off via "passthrough".
sampler_step = ("nm", NearMiss())
forest_step = ("clf", RandomForestClassifier(random_state=42, n_jobs=-1))
pipe = Pipeline(steps=[sampler_step, forest_step])

# Search space: lists are sampled uniformly, randint(...) entries are drawn
# from the given integer range.
param_dist = dict(
    # sampler choice, including the no-sampler path
    nm=[NearMiss(version=1), NearMiss(version=3), "passthrough"],
    # random-forest settings
    clf__n_estimators=randint(120, 320),
    clf__max_depth=[None, 10, 20],
    clf__min_samples_split=randint(2, 10),
    clf__min_samples_leaf=randint(1, 6),
    clf__max_features=["sqrt", "log2"],
    # class weighting, to compare against the no-sampler path
    clf__class_weight=[None, "balanced"],
)

# 48 sampled candidates, ranked by recall on the positive class (label 1);
# the best candidate is refitted on the whole training split.
rs = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=48,
    scoring=make_scorer(recall_score, pos_label=1),
    cv=cv,
    n_jobs=-1,
    refit=True,
    random_state=42,
    verbose=1,
    return_train_score=True,
)


In [146]:
rs.fit(X_train, y_train)

In [182]:
# Get the predicting probablities
predict_train_rs = rs.predict_proba(X_train)

In [183]:
predict_val_rs = rs.predict_proba(X_test)

In [184]:
log_loss(y_train, predict_train_rs), log_loss(y_test, predict_val_rs)

In [185]:
# Hard predictions from the RANDOMIZED search (rs). The previous code called
# gs.predict here, so the metrics below mixed grid-search labels with
# randomized-search probabilities.
y_pred_rs = rs.predict(X_test)

In [186]:
# Hold-out metrics for the randomized-search winner
acc, prec, rec, f1 = (
    accuracy_score(y_test, y_pred_rs),
    precision_score(y_test, y_pred_rs, zero_division=0),
    recall_score(y_test, y_pred_rs),
    f1_score(y_test, y_pred_rs),
)
roc_auc = roc_auc_score(y_test, predict_val_rs[:, 1])
con_mat = confusion_matrix(y_test, y_pred_rs)

# Text report, assembled line by line and printed in one go
rule = "-" * 40
report_lines = [
    "Results:",
    rule,
    f"Accuracy      : {acc:.4f}",
    f"Precision     : {prec:.4f}",
    f"Recall        : {rec:.4f}",
    f"F1 Score      : {f1:.4f}",
    f"ROC AUC Score : {roc_auc:.4f}" if roc_auc is not None else "ROC AUC not available",
    "\nConfusion Matrix:",
]
print("\n".join(report_lines))
print(con_mat)
print(rule)

### Feature Importance Plot for Final Random Forest Model

In [134]:
# Extract trained model from GridSearchCV
# best_estimator_ is the candidate that the SMOTE grid search (modelgrid,
# refit=True) refitted on the SMOTE training data. It is a different object
# from model_rf_cv, which was used for the metrics and the submission file.
final_rf = modelgrid.best_estimator_
final_rf

In [135]:
# Get feature importances
importances = final_rf.feature_importances_
feature_names = X.columns  # Make sure this matches the columns used in training

In [136]:
# Create a DataFrame
importances_df = pd.DataFrame({
    "Feature": feature_names,
    "Importance": importances
}).sort_values(by="Importance", ascending=False).reset_index(drop=True)

In [137]:
# Keep the top_n (here 10) most important features; importances_df is already
# sorted by importance in descending order.
top_n = 10
top_features = importances_df.head(top_n)

In [138]:
# Horizontal bar chart of the selected top features of the random forest
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x="Importance", y="Feature", data=top_features, palette="viridis", ax=ax)
ax.set_title(f"Top {top_n} Important Features - Random Forest")
ax.set_xlabel("Importance Score")
ax.set_ylabel("Feature")
plt.tight_layout()
savefig("13_ImportantFeatures_RF.png", "../results")
plt.show()