In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold

# Load the dataset
data = pd.read_csv('/content/loan_data.csv')

# Check the class distribution
print("Class distribution (Original Data):")
print(data['loan_status'].value_counts(normalize=True))

# Split the dataset into features and target
X = data.drop('loan_status', axis=1)
y = data['loan_status']

# Identify categorical columns
categorical_columns = [col for col in X.columns if X[col].dtype == 'object']

# Create a preprocessor to handle categorical variables
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)],
    remainder='passthrough'
)

# Split into training and testing sets with stratification for balanced splits
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12345, stratify=y)

# Fit the preprocessor and transform the data
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Apply RandomUnderSampler for undersampling (imbalanced vs undersampled comparison)
rus = RandomUnderSampler(random_state=12345, sampling_strategy='auto')
X_train_resampled, y_train_resampled = rus.fit_resample(X_train_transformed, y_train)

# Initialize Decision Tree model
dt_model = DecisionTreeClassifier(random_state=12345)

# Define the parameter grid with added hyperparameters
param_grid = {
    'max_depth': [5, 10, 15, 20, 30],  # Extended depth options
    'min_samples_split': [2, 5, 10, 20, 50],  # Test higher values
    'min_samples_leaf': [1, 2, 4, 10, 20],    # Test larger values
    'criterion': ['gini'],  # Test both criteria
    'min_impurity_decrease': [0.0, 0.01, 0.05],  # Test values for impurity decrease
    'max_leaf_nodes': [None, 10, 50, 100],  # Test limits on leaf nodes
    'class_weight': [None, 'balanced']  # Add class balancing options
}

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=dt_model,
    param_grid=param_grid,
    cv=3,  # 3-fold cross-validation
    scoring='accuracy',  # Metric to optimize
    n_jobs=-1  # Use all available CPU cores
)

# Perform Grid Search
grid_search.fit(X_train_resampled, y_train_resampled)

# Print the best parameters from GridSearchCV
print("Best parameters from GridSearchCV:", grid_search.best_params_)

# For Imbalanced Dataset
grid_search_imbalanced = GridSearchCV(
    estimator=dt_model,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1
)
grid_search_imbalanced.fit(X_train_transformed, y_train)

# Print results for Imbalanced Data
print("Best parameters for Imbalanced Data:")
print(grid_search_imbalanced.best_params_)

# Perform RFECV (Recursive Feature Elimination with Cross-Validation) for feature selection
rfecv = RFECV(estimator=dt_model, step=1, cv=StratifiedKFold(3), scoring='accuracy')
rfecv.fit(X_train_transformed, y_train)

# Print the optimal number of features
print("Optimal number of features: ", rfecv.n_features_)

# Print the selected features (original data, based on RFECV selection)
selected_features = [col for col, selected in zip(X.columns, rfecv.support_) if selected]
print("\nSelected Features (Original Dataset - Imbalanced Data):")
print(selected_features)

# One-Hot Encoded Features for Imbalanced Dataset
one_hot_encoded_features_imbalanced = [col for col in preprocessor.get_feature_names_out() if 'cat' in col]
print("\nSelected Features (One-Hot Encoded + Original) for Imbalanced Dataset:")
print(one_hot_encoded_features_imbalanced)

# Now, let's evaluate both the imbalanced and undersampled datasets:
# 1. Imbalanced model with all features (using RFECV selection)
print("\nTraining Decision Tree on Imbalanced Data (No Resampling)...")
dt_model.fit(X_train_transformed, y_train)
y_train_pred_imbalanced = dt_model.predict(X_train_transformed)
y_test_pred_imbalanced = dt_model.predict(X_test_transformed)

# Print confusion matrix and classification report for imbalanced data
print("Training Set Confusion Matrix (Imbalanced):")
print(confusion_matrix(y_train, y_train_pred_imbalanced))
print("Test Set Confusion Matrix (Imbalanced):")
print(confusion_matrix(y_test, y_test_pred_imbalanced))
print("\nClassification Report (Imbalanced - Training Set):")
print(classification_report(y_train, y_train_pred_imbalanced))
print("\nClassification Report (Imbalanced - Test Set):")
print(classification_report(y_test, y_test_pred_imbalanced))

# 2. Undersampled model with selected features from RFECV
print("\nTraining Decision Tree on Undersampled Data...")
# Use the selected features for undersampled data
X_train_resampled_selected = X_train_resampled[:, rfecv.support_]
dt_model.fit(X_train_resampled_selected, y_train_resampled)
y_train_pred_undersampled = dt_model.predict(X_train_resampled_selected)
y_test_pred_undersampled = dt_model.predict(X_test_transformed[:, rfecv.support_])

# Print confusion matrix and classification report for undersampled data
print("Training Set Confusion Matrix (Undersampled):")
print(confusion_matrix(y_train_resampled, y_train_pred_undersampled))
print("Test Set Confusion Matrix (Undersampled):")
print(confusion_matrix(y_test, y_test_pred_undersampled))
print("\nClassification Report (Undersampled - Training Set):")
print(classification_report(y_train_resampled, y_train_pred_undersampled))
print("\nClassification Report (Undersampled - Test Set):")
print(classification_report(y_test, y_test_pred_undersampled))

# Print the selected features for the undersampled dataset
selected_features_resampled = [col for col, selected in zip(X.columns, rfecv.support_) if selected]
print("\nSelected Features (Undersampled Dataset):")
print(selected_features_resampled)

# One-Hot Encoded Features for Undersampled Dataset
one_hot_encoded_features_undersampled = [col for col in preprocessor.get_feature_names_out() if 'cat' in col]
print("\nSelected Features (One-Hot Encoded + Original) for Undersampled Dataset:")
print(one_hot_encoded_features_undersampled)

Class distribution (Original Data):
loan_status
0    0.777778
1    0.222222
Name: proportion, dtype: float64
Best parameters from GridSearchCV: {'class_weight': None, 'criterion': 'gini', 'max_depth': 20, 'max_leaf_nodes': 100, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 4, 'min_samples_split': 10}
Best parameters for Imbalanced Data:
{'class_weight': None, 'criterion': 'gini', 'max_depth': 10, 'max_leaf_nodes': 100, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 4, 'min_samples_split': 2}
Optimal number of features:  27

Selected Features (Original Dataset - Imbalanced Data):
['person_age', 'person_gender', 'person_education', 'person_income', 'person_emp_exp', 'person_home_ownership', 'loan_amnt', 'loan_intent', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length', 'credit_score', 'previous_loan_defaults_on_file']

Selected Features (One-Hot Encoded + Original) for Imbalanced Dataset:
['cat__person_gender_female', 'cat__person_gender_male', 'cat__person_educa

In [4]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix

# Function to calculate and print evaluation metrics
def evaluate_model(y_true, y_pred, dataset_type):
    cm = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = cm.ravel()  # Extract true negatives, false positives, false negatives, and true positives

    accuracy = accuracy_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred, pos_label=1)  # Recall (Sensitivity)
    specificity = tn / (tn + fp)  # Specificity (TNR)
    precision = precision_score(y_true, y_pred, pos_label=1)  # Precision

    print(f"\nMetrics for {dataset_type} Test Set:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Recall (Sensitivity): {recall:.4f}")
    print(f"Specificity: {specificity:.4f}")
    print(f"Precision: {precision:.4f}")

# Evaluate for Imbalanced Test Set
evaluate_model(y_test, y_test_pred_imbalanced, "Imbalanced")

# Evaluate for Undersampled Test Set
evaluate_model(y_test, y_test_pred_undersampled, "Undersampled")




Metrics for Imbalanced Test Set:
Accuracy: 0.9007
Recall (Sensitivity): 0.7720
Specificity: 0.9374
Precision: 0.7790

Metrics for Undersampled Test Set:
Accuracy: 0.8661
Recall (Sensitivity): 0.8600
Specificity: 0.8679
Precision: 0.6503
