In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from imblearn.under_sampling import RandomUnderSampler
from sklearn.base import BaseEstimator, ClassifierMixin

class XGBClassifierWrapper(BaseEstimator, ClassifierMixin):
    def __init__(self, **params):
        self.xgb = XGBClassifier(**params)

    def fit(self, X, y, eval_set=None, **kwargs):
        if eval_set:
            self.xgb.fit(X, y, eval_set=eval_set, **kwargs)
        else:
            self.xgb.fit(X, y)
        return self

    def predict(self, X):
        return self.xgb.predict(X)

    def predict_proba(self, X):
        return self.xgb.predict_proba(X)

    def score(self, X, y):
        return self.xgb.score(X, y)

# Load the dataset
data = pd.read_csv('/content/loan_data.csv')

# Check the class distribution
print("Original Class Distribution:")
print(data['loan_status'].value_counts(normalize=True))

# Split the dataset into features and target
X = data.drop('loan_status', axis=1)
y = data['loan_status']

# Identify categorical columns
categorical_columns = [col for col in X.columns if X[col].dtype == 'object']

# Create a preprocessor to handle categorical variables (without dropping the first category)
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)],
    remainder='passthrough')

# Split into training and testing sets with stratification for balanced splits
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12345, stratify=y)

# Fit the preprocessor and transform the data
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# ========== Train XGBoost on the Original Data ==========
xgb_model_orig = XGBClassifierWrapper(
    random_state=12345,
    use_label_encoder=False,
    eval_metric='logloss',
    n_estimators=500,
    early_stopping_rounds=10
)

# Cross-validation on the original dataset
cv_scores_orig = cross_val_score(xgb_model_orig, X_train_transformed, y_train, cv=3, scoring='accuracy')
print(f"Cross-validation accuracy (Original Data, CV=3): {cv_scores_orig.mean():.4f}")

# Fit the model on the original dataset
xgb_model_orig.fit(X_train_transformed, y_train, eval_set=[(X_test_transformed, y_test)], verbose=True)

# Predict using the original dataset
y_train_pred_orig = xgb_model_orig.predict(X_train_transformed)
y_test_pred_orig = xgb_model_orig.predict(X_test_transformed)

# ========== Apply RandomUnderSampler ==========
rus = RandomUnderSampler(random_state=12345, sampling_strategy='auto')
X_train_resampled, y_train_resampled = rus.fit_resample(X_train_transformed, y_train)

# Check the new class distribution
print("\nNew Class Distribution after Undersampling:")
print(pd.Series(y_train_resampled).value_counts(normalize=True))

# ========== Train XGBoost on the Undersampled Data ==========
xgb_model_resampled = XGBClassifierWrapper(
    random_state=12345,
    use_label_encoder=False,
    eval_metric='logloss',
    n_estimators=500,
    early_stopping_rounds=10
)

# Cross-validation on the undersampled dataset
cv_scores_resampled = cross_val_score(xgb_model_resampled, X_train_resampled, y_train_resampled, cv=3, scoring='accuracy')
print(f"Cross-validation accuracy (Resampled Data, CV=3): {cv_scores_resampled.mean():.4f}")

# Fit the model on the undersampled dataset
xgb_model_resampled.fit(X_train_resampled, y_train_resampled, eval_set=[(X_test_transformed, y_test)], verbose=True)

# Predict using the undersampled dataset
y_train_pred_resampled = xgb_model_resampled.predict(X_train_resampled)
y_test_pred_resampled = xgb_model_resampled.predict(X_test_transformed)

# ========== Function to Compute Performance Metrics ==========
def calculate_metrics(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, pos_label=1)
    recall = recall_score(y_true, y_pred, pos_label=1)
    f1 = f1_score(y_true, y_pred, pos_label=1)
    conf_mat = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = conf_mat.ravel()
    specificity = tn / (tn + fp)
    return accuracy, precision, recall, f1, specificity

# Compute metrics for Original Data
train_metrics_orig = calculate_metrics(y_train, y_train_pred_orig)
test_metrics_orig = calculate_metrics(y_test, y_test_pred_orig)

# Compute metrics for Undersampled Data
train_metrics_resampled = calculate_metrics(y_train_resampled, y_train_pred_resampled)
test_metrics_resampled = calculate_metrics(y_test, y_test_pred_resampled)

# ========== Display Results ==========
results_df = pd.DataFrame({
    "Metric": ["Accuracy", "Precision", "Recall", "F1-Score", "Specificity"],
    "Train (Original)": train_metrics_orig,
    "Test (Original)": test_metrics_orig,
    "Train (Resampled)": train_metrics_resampled,
    "Test (Resampled)": test_metrics_resampled
})

print("\n================== Performance Comparison ==================\n")
print(results_df)

# Display number of boosting rounds used
print(f"\nNumber of boosting rounds used (Original Data): {xgb_model_orig.xgb.best_iteration}")
print(f"Number of boosting rounds used (Resampled Data): {xgb_model_resampled.xgb.best_iteration}")




Original Class Distribution:
loan_status
0    0.777778
1    0.222222
Name: proportion, dtype: float64
Cross-validation accuracy (Original Data, CV=3): 0.9329
[0]	validation_0-logloss:0.38579


Parameters: { "use_label_encoder" } are not used.



[1]	validation_0-logloss:0.31831
[2]	validation_0-logloss:0.27645
[3]	validation_0-logloss:0.24853
[4]	validation_0-logloss:0.22859
[5]	validation_0-logloss:0.21442
[6]	validation_0-logloss:0.20378
[7]	validation_0-logloss:0.19601
[8]	validation_0-logloss:0.18909
[9]	validation_0-logloss:0.18468
[10]	validation_0-logloss:0.18028
[11]	validation_0-logloss:0.17766
[12]	validation_0-logloss:0.17541
[13]	validation_0-logloss:0.17350
[14]	validation_0-logloss:0.17215
[15]	validation_0-logloss:0.17088
[16]	validation_0-logloss:0.16920
[17]	validation_0-logloss:0.16860
[18]	validation_0-logloss:0.16706
[19]	validation_0-logloss:0.16655
[20]	validation_0-logloss:0.16583
[21]	validation_0-logloss:0.16536
[22]	validation_0-logloss:0.16421
[23]	validation_0-logloss:0.16385
[24]	validation_0-logloss:0.16297
[25]	validation_0-logloss:0.16251
[26]	validation_0-logloss:0.16247
[27]	validation_0-logloss:0.16118
[28]	validation_0-logloss:0.16098
[29]	validation_0-logloss:0.15995
[30]	validation_0-loglo

Parameters: { "use_label_encoder" } are not used.



[15]	validation_0-logloss:0.21851
[16]	validation_0-logloss:0.21773
[17]	validation_0-logloss:0.21652
[18]	validation_0-logloss:0.21447
[19]	validation_0-logloss:0.21357
[20]	validation_0-logloss:0.21168
[21]	validation_0-logloss:0.21105
[22]	validation_0-logloss:0.21108
[23]	validation_0-logloss:0.21097
[24]	validation_0-logloss:0.21074
[25]	validation_0-logloss:0.21082
[26]	validation_0-logloss:0.21015
[27]	validation_0-logloss:0.20750
[28]	validation_0-logloss:0.20670
[29]	validation_0-logloss:0.20675
[30]	validation_0-logloss:0.20717
[31]	validation_0-logloss:0.20670
[32]	validation_0-logloss:0.20593
[33]	validation_0-logloss:0.20572
[34]	validation_0-logloss:0.20576
[35]	validation_0-logloss:0.20565
[36]	validation_0-logloss:0.20518
[37]	validation_0-logloss:0.20518
[38]	validation_0-logloss:0.20534
[39]	validation_0-logloss:0.20522
[40]	validation_0-logloss:0.20567
[41]	validation_0-logloss:0.20492
[42]	validation_0-logloss:0.20483
[43]	validation_0-logloss:0.20490
[44]	validatio

In [None]:
# Compute confusion matrices for Original Data
conf_mat_orig_train = confusion_matrix(y_train, y_train_pred_orig)
conf_mat_orig_test = confusion_matrix(y_test, y_test_pred_orig)

# Compute confusion matrices for Undersampled Data
conf_mat_resampled_train = confusion_matrix(y_train_resampled, y_train_pred_resampled)
conf_mat_resampled_test = confusion_matrix(y_test, y_test_pred_resampled)

# Print Confusion Matrices
print("\nConfusion Matrix for Original Data (Train):")
print(conf_mat_orig_train)

print("\nConfusion Matrix for Original Data (Test):")
print(conf_mat_orig_test)

print("\nConfusion Matrix for Resampled Data (Train):")
print(conf_mat_resampled_train)

print("\nConfusion Matrix for Resampled Data (Test):")
print(conf_mat_resampled_test)




Confusion Matrix for Original Data (Train):
[[27595   405]
 [ 1294  6706]]

Confusion Matrix for Original Data (Test):
[[6816  184]
 [ 439 1561]]

Confusion Matrix for Resampled Data (Train):
[[7433  567]
 [ 341 7659]]

Confusion Matrix for Resampled Data (Test):
[[6205  795]
 [ 137 1863]]


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import RFECV
from imblearn.under_sampling import RandomUnderSampler
import joblib

# Load the dataset
data = pd.read_csv('/content/loan_data.csv')

# Split the dataset into features and target
X = data.drop('loan_status', axis=1)
y = data['loan_status']

# Identify categorical columns
categorical_columns = [col for col in X.columns if X[col].dtype == 'object']

# Create a preprocessor to handle categorical variables
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)],
    remainder='passthrough')

# Split into training and testing sets with stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12345, stratify=y)

# Fit the preprocessor and transform the data
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Apply RandomUnderSampler for undersampling
rus = RandomUnderSampler(random_state=12345, sampling_strategy='auto')
X_train_resampled, y_train_resampled = rus.fit_resample(X_train_transformed, y_train)

# Print class distribution after undersampling
print("Class distribution after undersampling:")
print(pd.Series(y_train_resampled).value_counts())

# Define a function to train and evaluate the model
def train_and_evaluate(X_train_data, y_train_data, X_test_data, y_test_data):
    # Initialize the base XGBClassifier
    xgb_base = XGBClassifier(random_state=12345, use_label_encoder=False, eval_metric='logloss')

    # Use RFECV for feature selection
    rfecv = RFECV(estimator=xgb_base, step=1, cv=3, scoring='accuracy', n_jobs=-1)
    X_train_selected = rfecv.fit_transform(X_train_data, y_train_data)
    X_test_selected = rfecv.transform(X_test_data)

    # Get feature names after one-hot encoding
    feature_names = preprocessor.get_feature_names_out()

    # Filter selected features using RFECV support mask
    selected_features = np.array(feature_names)[rfecv.support_]

    print("\nSelected Features:")
    print(selected_features)

    # Print the number of selected features
    print(f"\nOptimal number of features: {rfecv.n_features_}")

    # Define the parameter grid for XGBoost
    param_grid = {
        'n_estimators': [100, 200, 500],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }

    # Initialize GridSearchCV with feature-selected data
    grid_search = GridSearchCV(
        estimator=XGBClassifier(random_state=12345, use_label_encoder=False, eval_metric='logloss'),
        param_grid=param_grid,
        cv=3,
        scoring='accuracy',
        n_jobs=-1
    )

    grid_search.fit(X_train_selected, y_train_data)

    # Print the best parameters
    print("Best parameters from GridSearchCV:", grid_search.best_params_)

    # Train the final model with the best parameters
    xgb_model = XGBClassifier(
        n_estimators=grid_search.best_params_['n_estimators'],
        max_depth=grid_search.best_params_['max_depth'],
        learning_rate=grid_search.best_params_['learning_rate'],
        subsample=grid_search.best_params_['subsample'],
        colsample_bytree=grid_search.best_params_['colsample_bytree'],
        random_state=12345,
        use_label_encoder=False,
        eval_metric='logloss'
    )

    xgb_model.fit(X_train_selected, y_train_data)

    # Predict on training and test sets
    y_train_pred = xgb_model.predict(X_train_selected)
    y_test_pred = xgb_model.predict(X_test_selected)

    # Compute and print training and test metrics
    def compute_metrics(y_true, y_pred):
        cm = confusion_matrix(y_true, y_pred)
        tn, fp, fn, tp = cm.ravel()

        accuracy = (tp + tn) / (tp + tn + fp + fn)
        recall = tp / (tp + fn)  # Sensitivity (True Positive Rate)
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        specificity = tn / (tn + fp)  # True Negative Rate

        return accuracy, recall, precision, specificity

    # Compute and print training metrics
    train_accuracy, train_recall, train_precision, train_specificity = compute_metrics(y_train_data, y_train_pred)
    print("\nTraining Set Metrics:")
    print(f"Accuracy: {train_accuracy:.4f}")
    print(f"Recall (Sensitivity): {train_recall:.4f}")
    print(f"Precision: {train_precision:.4f}")
    print(f"Specificity: {train_specificity:.4f}")

    # Compute and print test metrics
    test_accuracy, test_recall, test_precision, test_specificity = compute_metrics(y_test_data, y_test_pred)
    print("\nTest Set Metrics:")
    print(f"Accuracy: {test_accuracy:.4f}")
    print(f"Recall (Sensitivity): {test_recall:.4f}")
    print(f"Precision: {test_precision:.4f}")
    print(f"Specificity: {test_specificity:.4f}")

    # Print classification report
    print("\nClassification Report (Training Set):")
    print(classification_report(y_train_data, y_train_pred))

    print("\nClassification Report (Test Set):")
    print(classification_report(y_test_data, y_test_pred))

    # Print confusion matrices for both training and test sets
    print("\nConfusion Matrix (Training Set):")
    print(confusion_matrix(y_train_data, y_train_pred))

    print("\nConfusion Matrix (Test Set):")
    print(confusion_matrix(y_test_data, y_test_pred))

    return xgb_model

# Train and evaluate on the original (imbalanced) data
print("\n--- Results for Original (Imbalanced) Class Distribution ---")
xgb_model_imbalanced = train_and_evaluate(X_train_transformed, y_train, X_test_transformed, y_test)

# Train and evaluate on the undersampled data
print("\n--- Results for Undersampled Class Distribution ---")
xgb_model_undersampled = train_and_evaluate(X_train_resampled, y_train_resampled, X_test_transformed, y_test)

# Save both models
joblib.dump(xgb_model_imbalanced, '/content/xgboost_model_imbalanced.pkl')
joblib.dump(xgb_model_undersampled, '/content/xgboost_model_undersampled.pkl')