In [60]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings("ignore")

In [62]:
df = pd.read_csv("cs-training.csv", index_col=0)

In [63]:
df.rename(columns={"SeriousDlqin2yrs": "Target"}, inplace=True)

In [64]:
imputer = SimpleImputer(strategy='median')
df[['MonthlyIncome', 'NumberOfDependents']] = imputer.fit_transform(df[['MonthlyIncome', 'NumberOfDependents']])

In [65]:
df['DebtToIncomeRatio'] = df['RevolvingUtilizationOfUnsecuredLines'] / (df['MonthlyIncome'] + 1)

In [66]:
df['EstimatedMonthlyDebt'] = df['DebtRatio'] * df['MonthlyIncome']

In [67]:
df['TotalLatePayments'] = (
    df['NumberOfTime30-59DaysPastDueNotWorse'] +
    df['NumberOfTime60-89DaysPastDueNotWorse'] +
    df['NumberOfTimes90DaysLate']
)

In [68]:
df.drop(['NumberOfTime30-59DaysPastDueNotWorse',
         'NumberOfTime60-89DaysPastDueNotWorse',
         'NumberOfTimes90DaysLate'], axis=1, inplace=True)

In [69]:
X = df.drop("Target", axis=1)
y = df["Target"]

In [70]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [72]:
# Impute *all* numeric columns with median
imputer = SimpleImputer(strategy='median')
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Apply SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

In [73]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

In [74]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [75]:
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

In [76]:
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

In [78]:
print(f"\nModel: {name}")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("AUC-ROC Score:", roc_auc_score(y_test, y_proba))


Model: XGBoost
Confusion Matrix:
 [[22348   863]
 [ 2052 21461]]
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.96      0.94     23211
           1       0.96      0.91      0.94     23513

    accuracy                           0.94     46724
   macro avg       0.94      0.94      0.94     46724
weighted avg       0.94      0.94      0.94     46724

AUC-ROC Score: 0.9823025859360737
