In [None]:
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
import tomllib
from pathlib import Path
from pandas import DataFrame
from common.utils import load_dataset, optimize_memory
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import joblib
import os
import xgboost as xgb

## Exploratory Data Analysis

In [None]:
# Seaborn's current colour palette (sns.color_palette() called with no arguments).
# NOTE(review): `color` is not referenced by any of the cells visible here — confirm
# it is used further down before keeping it.
color = sns.color_palette()

### Training dataframe overview

In [None]:
# Load the training data through the project helper `load_dataset` and preview
# the first rows.
# NOTE(review): the meaning of `index=False` is defined by common.utils.load_dataset,
# which is not visible here — presumably no column is promoted to the index; confirm.
train_df: DataFrame = load_dataset("loan-approval-prediction-clone", index=False)
train_df.head()

In [None]:
# Print the frame's structural summary (columns, non-null counts, dtypes, memory).
train_df.info()

In [None]:
# Descriptive statistics, using pandas' default column selection for describe().
train_df.describe()

In [None]:
# Per-column dtypes as loaded, shown before the optimize_memory call that follows.
train_df.dtypes

In [None]:
# Run the project helper `optimize_memory` on the training frame.
# The return value is not assigned (it is only shown as the cell output), so this
# cell changes `train_df` only if the helper modifies its argument in place.
# NOTE(review): confirm that against common.utils.optimize_memory; if it returns a
# new frame instead, the result is currently discarded.
optimize_memory(train_df, deep=True)

In [None]:
# Per-column dtypes again, after the optimize_memory call, for comparison with the
# earlier listing.
train_df.dtypes

In [None]:
# Number of rows per `person_home_ownership` category.
# The column is bound to `x` explicitly: in seaborn >= 0.12 the only positional
# parameter of countplot is `data`, so passing a bare Series positionally does
# not select it as the variable to count.
sns.countplot(data=train_df, x='person_home_ownership')

In [None]:
# Number of rows per `loan_intent` category.
# The column is bound to `x` explicitly: in seaborn >= 0.12 the only positional
# parameter of countplot is `data`, so passing a bare Series positionally does
# not select it as the variable to count.
sns.countplot(data=train_df, x='loan_intent')

In [None]:
# Number of rows per `loan_grade` category.
# The column is bound to `x` explicitly: in seaborn >= 0.12 the only positional
# parameter of countplot is `data`, so passing a bare Series positionally does
# not select it as the variable to count.
sns.countplot(data=train_df, x='loan_grade')

In [None]:
# Number of rows per `cb_person_default_on_file` category.
# The column is bound to `x` explicitly: in seaborn >= 0.12 the only positional
# parameter of countplot is `data`, so passing a bare Series positionally does
# not select it as the variable to count.
sns.countplot(data=train_df, x='cb_person_default_on_file')

In [None]:
# Columns to expand into indicator (dummy) variables.
categorical_cols = [
    'person_home_ownership',
    'loan_intent',
    'loan_grade',
    'cb_person_default_on_file',
]

# One-hot encode the listed columns; drop_first=True leaves out the first level
# of each so the remaining indicators are not perfectly collinear.
# NOTE: this rebinds `train_df`, so re-running the cell without reloading the
# data will fail because the listed columns no longer exist.
train_df = pd.get_dummies(train_df, columns=categorical_cols, drop_first=True)

# Dummy column names embed the category values; normalise everything to lower case.
train_df.columns = list(map(str.lower, train_df.columns))

train_df.head()

In [None]:
# Histogram of the `person_age` column with a KDE curve overlaid.
ax = sns.histplot(train_df['person_age'], kde=True)
ax.set_xlabel('Person Age', fontsize=12)
ax.set_ylabel('Number of Occurrences', fontsize=12)
plt.xticks(rotation='vertical')  # vertical tick labels on the x axis
plt.show()

In [None]:
# Histogram of the `person_income` column with a KDE curve overlaid.
ax = sns.histplot(train_df['person_income'], kde=True)
ax.set_xlabel('Person Income', fontsize=12)
ax.set_ylabel('Number of Occurrences', fontsize=12)
plt.xticks(rotation='vertical')  # vertical tick labels on the x axis
plt.show()

In [None]:
# Histogram of the `person_emp_length` column with a KDE curve overlaid.
ax = sns.histplot(train_df['person_emp_length'], kde=True)
ax.set_xlabel('Person Employment length', fontsize=12)
ax.set_ylabel('Number of Occurrences', fontsize=12)
plt.xticks(rotation='vertical')  # vertical tick labels on the x axis
plt.show()

In [None]:
# Histogram of the `loan_amnt` column with a KDE curve overlaid.
# The x-axis label now names the plotted column; it previously carried the
# employment-length label copied from the preceding cell.
sns.histplot(train_df['loan_amnt'], kde=True)
plt.xticks(rotation='vertical')
plt.xlabel('Loan Amount', fontsize=12)
plt.ylabel('Number of Occurrences', fontsize=12)
plt.show()

In [None]:
# Histogram of the `loan_int_rate` column with a KDE curve overlaid.
# The x-axis label now names the plotted column; it previously read
# 'Person Income', copied from an earlier cell.
sns.histplot(train_df['loan_int_rate'], kde=True)
plt.xticks(rotation='vertical')
plt.xlabel('Loan Interest Rate', fontsize=12)
plt.ylabel('Number of Occurrences', fontsize=12)
plt.show()

In [None]:
# Histogram of the `loan_percent_income` column with a KDE curve overlaid.
# The x-axis label now names the plotted column; it previously read
# 'Person Income', copied from an earlier cell.
sns.histplot(train_df['loan_percent_income'], kde=True)
plt.xticks(rotation='vertical')
plt.xlabel('Loan Percent Income', fontsize=12)
plt.ylabel('Number of Occurrences', fontsize=12)
plt.show()

In [None]:
# Histogram of the `cb_person_cred_hist_length` column with a KDE curve overlaid.
# The x-axis label now names the plotted column; it previously read
# 'Person Income', copied from an earlier cell.
sns.histplot(train_df['cb_person_cred_hist_length'], kde=True)
plt.xticks(rotation='vertical')
plt.xlabel('Credit History Length', fontsize=12)
plt.ylabel('Number of Occurrences', fontsize=12)
plt.show()

In [None]:
# Histogram of the `loan_status` column (the column used as the model target
# later in the notebook), with a KDE curve overlaid.
# The x-axis label now names the plotted column; it previously read
# 'Person Income', copied from an earlier cell.
sns.histplot(train_df['loan_status'], kde=True)
plt.xticks(rotation='vertical')
plt.xlabel('Loan Status', fontsize=12)
plt.ylabel('Number of Occurrences', fontsize=12)
plt.show()

In [None]:
# Seed NumPy's legacy global random generator so NumPy-based randomness is
# repeatable. The split and the booster further down pass their own seeds
# explicitly (random_state=42 and 'seed': 42).
np.random.seed(42)

In [None]:
# Report how many values are missing in each column before filling them.
missing_per_column = train_df.isna().sum()
print(missing_per_column)

# Backward fill: each gap takes the next valid value below it in the same column.
# NOTE(review): rows here do not appear to be ordered, so the donor value is
# arbitrary, and gaps in trailing rows stay NaN — confirm this is the intended
# imputation strategy.
train_df = train_df.bfill()

# Separate the target from the predictors.
y = train_df['loan_status']
X = train_df.drop(columns=['loan_status'])

# Encode any remaining non-numeric predictors.
# NOTE(review): the listed categorical columns were already expanded earlier,
# so this call may be a no-op — verify.
X = pd.get_dummies(X)
X.head()

In [None]:
# Preview the first rows of the target series.
y.head()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Wrap both splits in XGBoost's DMatrix container, attaching the labels.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

# Booster hyper-parameters.
params = dict(
    objective='binary:logistic',  # binary classification
    eval_metric='logloss',        # logarithmic loss
    max_depth=6,                  # maximum tree depth
    eta=0.5,                      # learning rate
    subsample=0.8,                # row sampling
    colsample_bytree=0.8,         # feature sampling
    seed=42,
)

# Fit the booster for a fixed number of boosting rounds.
num_rounds = 100
bst = xgb.train(params, dtrain, num_boost_round=num_rounds)

In [None]:
# Score the held-out split with the trained booster.
y_pred_prob = bst.predict(dtest)

# Threshold the scores at 0.5 to obtain 0/1 labels.
y_pred = np.greater(y_pred_prob, 0.5).astype(int)

# Share of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy:.4f}')

In [None]:
# Persist the trained booster with joblib and read it back from the SAME path.
# The save and load calls previously used different paths
# ("models/xgb_binary_model.pkl" vs "xgb_binary_model.pkl"), so the load either
# failed or picked up an unrelated file.
model_path = Path("models") / "xgb_binary_model.pkl"

# joblib.dump does not create missing parent directories.
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(bst, model_path)
print(f"Model saved as {model_path}")

# Load the model back.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
loaded_model = joblib.load(model_path)
print("Model loaded successfully")

# Predict with the reloaded model on the same held-out DMatrix.
y_pred_prob = loaded_model.predict(dtest)

# Convert probabilities to binary labels.
y_pred = (y_pred_prob > 0.5).astype(int)

# Accuracy of the reloaded model, to compare with the accuracy before saving.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy after loading: {accuracy:.4f}')

In [None]:
# Load a frame into `test_df` through the project helper and preview it.
# NOTE(review): this passes the same dataset name and arguments as the
# `train_df` load above, so as written `test_df` presumably duplicates the
# training data — confirm whether a separate test dataset name was intended.
test_df: DataFrame = load_dataset("loan-approval-prediction-clone", index=False)
test_df.head()