## Loan Approval Prediction

### Import dataset

In [68]:
# import standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.offline import plot
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [69]:
# import dataset and view it
df = pd.read_csv('loan_dataset.csv')
df.head()

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
0,0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0
1,1,22,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0
2,2,29,28800,OWN,8.0,PERSONAL,A,6000,8.9,0.21,N,10,0
3,3,30,70000,RENT,14.0,VENTURE,B,12000,11.11,0.17,N,5,0
4,4,22,60000,RENT,2.0,MEDICAL,A,6000,6.92,0.1,N,3,0


### Modify data (adopted from EDA)

In [70]:
# drop outliers
df = df[df['person_age'] != 123]
df = df[df['person_emp_length'] != 123]
df = df[df['person_income'] <= 1000000]

In [71]:
# drop columns due to their high correlation with other columns
# loan_grade with interest rate (0.94)
# person_age with credit_history_length (0.88)

df.drop(['loan_grade', 'person_age'], axis=1, inplace=True)

In [72]:
# Map 'cb_person_default_on_file' (Y or N) to numerical values
df['cb_person_default_on_file'] = df['cb_person_default_on_file'].map({'N': 0, 'Y': 1})

In [73]:
from sklearn.preprocessing import OneHotEncoder

# One-Hot Encoding for loan_intent and person_home_ownership
one_hot_encoder = OneHotEncoder(sparse_output=False) # Keep all categories
# one_hot_encoder = OneHotEncoder(sparse_output=False, drop='first')  # Drop first category to avoid multicollinearity
columns_to_encode = ['loan_intent', 'person_home_ownership']
encoded_features = one_hot_encoder.fit_transform(df[columns_to_encode])

# Create a DataFrame for the encoded features
encoded_df = pd.DataFrame(encoded_features, columns=one_hot_encoder.get_feature_names_out(columns_to_encode), index=df.index)

# Drop the original columns and join the encoded columns
df_encoded = df.drop(columns=columns_to_encode).join(encoded_df)

print("DataFrame after One-Hot Encoding:")
df_encoded.head()

DataFrame after One-Hot Encoding:


Unnamed: 0,id,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,loan_intent_DEBTCONSOLIDATION,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,person_home_ownership_MORTGAGE,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT
0,0,35000,0.0,6000,11.49,0.17,0,14,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,56000,6.0,4000,13.35,0.07,0,2,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,2,28800,8.0,6000,8.9,0.21,0,10,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,3,70000,14.0,12000,11.11,0.17,0,5,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,4,60000,2.0,6000,6.92,0.1,0,3,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [74]:
data = df_encoded.copy()

In [75]:
# import the necessary libraries for modeling

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import SMOTE
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier

## Model functions

In [76]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score, roc_auc_score

# Initialize an empty DataFrame with the specified headers
metrics_df = pd.DataFrame(columns=['Model/Metric', 'Accuracy', 'Precision', 'Recall', 'F1-score', 'AUC'])

def prepare_data(data):
    # Define features and target
    X = data.drop(columns=['id', 'loan_status'])
    y = data['loan_status']

    # Split into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    return X_train, X_test, y_train, y_test

def evaluate_model(y_test, y_pred, y_pred_proba, model_name):
    # Evaluate the model
    accuracy = round(accuracy_score(y_test, y_pred), 2)
    precision = round(precision_score(y_test, y_pred, average='binary'), 2)
    recall = round(recall_score(y_test, y_pred, average='binary'), 2)
    f1 = round(f1_score(y_test, y_pred, average='binary'), 2)
    auc = round(roc_auc_score(y_test, y_pred_proba), 2)
    
    # print(f"Accuracy: {accuracy}")
    # print(f"Precision: {precision}")
    # print(f"Recall: {recall}")
    # print(f"F1-Score: {f1}")
    # print(f"AUC: {auc}")
    # class_report = classification_report(y_test, y_pred)
    # print("Classification Report:")
    # print(class_report)
    # conf_matrix = confusion_matrix(y_test, y_pred)
    # plt.figure(figsize=(4, 3))
    # sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['0', '1'], yticklabels=['0', '1'])
    # plt.xlabel('Predicted')
    # plt.ylabel('True')
    # plt.title('Confusion Matrix')
    # plt.show()
    
    # Add metrics to the DataFrame
    new_row = pd.DataFrame({
        'Model/Metric': [model_name],
        'Accuracy': [accuracy],
        'Precision': [precision],
        'Recall': [recall],
        'F1-score': [f1],
        'AUC': [auc]
    })
    
    global metrics_df
    # Ensure new_row does not contain empty or all-NA columns
    new_row = new_row.dropna(axis=1, how='all')
    # Ensure metrics_df does not contain empty or all-NA columns before concatenation
    metrics_df.dropna(axis=1, how='all', inplace=True)
    # Append the new row to the DataFrame
    metrics_df = pd.concat([metrics_df, new_row], ignore_index=True)

### Logistic Regression

In [77]:
X_train, X_test, y_train, y_test = prepare_data(data)

model = LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000, solver='liblinear')
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]
evaluate_model(y_test, y_pred, y_pred_proba, 'Logistic Regression')


### KNN (K-Nearest Neighbors)

In [None]:
X_train, X_test, y_train, y_test = prepare_data(data)

model = KNeighborsClassifier(
    n_neighbors=5,      # Number of neighbors to use
    weights='uniform',  # Use uniform weights
    algorithm='auto',   # Choose algorithm used to compute nearest neighbors
    n_jobs=-1           # Use all available processors
)   

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]
evaluate_model(y_test, y_pred, y_pred_proba, 'KNN')

### ADABoost

In [79]:
X_train, X_test, y_train, y_test = prepare_data(data)

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

base_estimator = DecisionTreeClassifier(max_depth=1, class_weight='balanced', random_state=42)

model = AdaBoostClassifier(
    estimator=base_estimator,
    n_estimators=50,
    algorithm='SAMME',
    learning_rate=0.1,
    random_state=42
)

model.fit(X_train_resampled, y_train_resampled)
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]
evaluate_model(y_test, y_pred, y_pred_proba, 'ADABoost')

### XGBoost

In [80]:
# XGBoost with different weights

def create_xgb_classifier(multiplication_factor=1.0):
    X_train, X_test, y_train, y_test = prepare_data(data)
    neg, pos = y_train.value_counts() # Calculate class imbalance ratio
    scale_pos_weight = neg / pos   # Increase weight for the minority class
    adjusted_weight = scale_pos_weight * multiplication_factor

    model = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        # use_label_encoder=False,
        scale_pos_weight=adjusted_weight,  # Balances classes by giving more weight to the minority class
        max_depth=6,
        learning_rate=0.1,
        n_estimators=200,
        random_state=42
    )

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    evaluate_model(y_test, y_pred, y_pred_proba, f"XGBoost (neg/pos * {multiplication_factor})")

create_xgb_classifier(1.0)
# create_xgb_classifier(1.5)
create_xgb_classifier(2.0)
# create_xgb_classifier(2.5)
create_xgb_classifier(3.0)

### Random forest classifier

In [81]:
# No Grid Search CV and class_weight = balanced
X_train, X_test, y_train, y_test = prepare_data(data)

model = RandomForestClassifier(
    n_estimators=200,  # Number of trees
    max_depth=5,      # Limit depth to prevent overfitting
    random_state=42,
    class_weight='balanced'  # Handles class imbalance
)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]
evaluate_model(y_test, y_pred, y_pred_proba, 'Random Forest')

In [82]:
# Grid Search CV and class_weight = ['balanced', 'balanced_subsample']

from sklearn.model_selection import GridSearchCV

def create_rf_classifier_with_cv(cv=2):
    X_train, X_test, y_train, y_test = prepare_data(data)
    model = RandomForestClassifier(random_state=42, class_weight='balanced')

    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 5, 7],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'class_weight': ['balanced', 'balanced_subsample']
    }

    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=cv, scoring='f1', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_
    print(f"Best Parameters: {best_params}")

    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    y_pred_proba = best_model.predict_proba(X_test)[:, 1]
    evaluate_model(y_test, y_pred, y_pred_proba, f"Random Forest (cv = {cv})")


create_rf_classifier_with_cv(2)
create_rf_classifier_with_cv(3)
create_rf_classifier_with_cv(4)

Best Parameters: {'class_weight': 'balanced', 'max_depth': 7, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Best Parameters: {'class_weight': 'balanced', 'max_depth': 7, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
Best Parameters: {'class_weight': 'balanced_subsample', 'max_depth': 7, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}


In [83]:
# No Grid Search CV and class_weight = {0: 1, 1: neg / pos * multiplication_factor}

def create_rf_classifier_with_weights(multiplication_factor=1.0):
    X_train, X_test, y_train, y_test = prepare_data(data)

    neg, pos = y_train.value_counts()
    scale_pos_weight = neg / pos
    class_weights = {0: 1, 1: scale_pos_weight * multiplication_factor}

    model = RandomForestClassifier(
        n_estimators=200,  # Number of trees
        max_depth=5,      # Limit depth to prevent overfitting
        random_state=42,
        class_weight=class_weights  # Custom weights to handle imbalance
    )

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    evaluate_model(y_test, y_pred, y_pred_proba, f"Random Forest (neg/pos * {multiplication_factor})")

create_rf_classifier_with_weights(1.0)
create_rf_classifier_with_weights(2.0)
create_rf_classifier_with_weights(3.0)

In [84]:
# Grid Search CV and class_weight = {0: 1, 1: neg / pos * multiplication_factor}

def create_rf_classifier_with_cv_and_weights(cv=2, multiplication_factor=1.0):
    
    X_train, X_test, y_train, y_test = prepare_data(data)

    neg, pos = y_train.value_counts()
    scale_pos_weight = neg / pos
    class_weights = {0: 1, 1: scale_pos_weight * multiplication_factor}

    model = RandomForestClassifier(random_state=42, class_weight='balanced')

    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 5, 7],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'class_weight': [class_weights]
    }

    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=cv, scoring='f1', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_
    print(f"Best Parameters: {best_params}")

    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    y_pred_proba = best_model.predict_proba(X_test)[:, 1]
    evaluate_model(y_test, y_pred, y_pred_proba, f"Random Forest (cv = {cv}) and (neg/pos * {multiplication_factor})")

create_rf_classifier_with_cv_and_weights(2, 1.0)
create_rf_classifier_with_cv_and_weights(2, 1.5)
create_rf_classifier_with_cv_and_weights(2, 2.0)

create_rf_classifier_with_cv_and_weights(3, 1.0)
create_rf_classifier_with_cv_and_weights(3, 1.5)
create_rf_classifier_with_cv_and_weights(3, 2.0)

Best Parameters: {'class_weight': {0: 1, 1: 6.023356789938614}, 'max_depth': 7, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Best Parameters: {'class_weight': {0: 1, 1: 9.03503518490792}, 'max_depth': 7, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Best Parameters: {'class_weight': {0: 1, 1: 12.046713579877228}, 'max_depth': 7, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 100}
Best Parameters: {'class_weight': {0: 1, 1: 6.023356789938614}, 'max_depth': 7, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best Parameters: {'class_weight': {0: 1, 1: 9.03503518490792}, 'max_depth': 7, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 100}
Best Parameters: {'class_weight': {0: 1, 1: 12.046713579877228}, 'max_depth': 7, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}


In [85]:
# Print metrics df
metrics_df

Unnamed: 0,Model/Metric,Accuracy,Precision,Recall,F1-score,AUC
0,Logistic Regression,0.73,0.32,0.83,0.47,0.86
1,KNN,0.89,0.67,0.49,0.57,0.84
2,ADABoost,0.89,0.61,0.69,0.65,0.88
3,XGBoost (neg/pos * 1.0),0.92,0.7,0.83,0.76,0.95
4,XGBoost (neg/pos * 2.0),0.89,0.56,0.88,0.69,0.95
5,XGBoost (neg/pos * 3.0),0.85,0.49,0.91,0.64,0.95
6,Random Forest,0.9,0.62,0.78,0.69,0.92
7,Random Forest (cv = 2),0.91,0.64,0.79,0.71,0.93
8,Random Forest (cv = 3),0.91,0.64,0.79,0.71,0.93
9,Random Forest (cv = 4),0.91,0.65,0.78,0.71,0.93


In [86]:
metrics_df.to_csv('model_metrics.csv', index=False)