<a href="https://colab.research.google.com/github/jerryorajekwe/Predicting-Loan-Default-Risk-with-Machine-Learning-Models/blob/main/loandefaultworkings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive so files stored there can be read by
# path. The google.colab import only resolves inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Suppress warnings
# NOTE(review): 'ignore' with no category/module filter silences every warning
# for the rest of the session, not just cosmetic ones — consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

# Libraries for data manipulation and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Libraries for preprocessing and imputation
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

# Libraries for model selection and evaluation
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, accuracy_score

# Machine learning models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

# For handling imbalanced data
from imblearn.over_sampling import SMOTE

# Additional utilities
from collections import Counter

In [3]:
# Read the full loan table from the mounted Drive folder.
# low_memory=False makes pandas parse the file in one pass instead of chunks.
loan_data = pd.read_csv(
    '/content/drive/MyDrive/loan.csv',
    low_memory=False,
)

In [4]:
# Work on a 30% random subset of the rows; the fixed seed keeps the draw
# reproducible across runs.
loan_data_sample = loan_data.sample(
    frac=0.3,
    random_state=42,
)

In [5]:
# Save the sampled dataset
# The relative path writes into the kernel's current working directory;
# index=False leaves the original row labels out of the file.
loan_data_sample.to_csv('loan_data_sample.csv', index=False)

In [6]:
# Display information about the sampled dataset
# info() prints the index range, column count, dtype counts and memory usage.
print("Sampled Dataset Information:")
loan_data_sample.info()

Sampled Dataset Information:
<class 'pandas.core.frame.DataFrame'>
Index: 678200 entries, 1758049 to 2038627
Columns: 145 entries, id to settlement_term
dtypes: float64(105), int64(4), object(36)
memory usage: 755.4+ MB


In [7]:
# Label the output, then let the notebook render the first five rows.
print("First 5 Rows of the Dataset:")
loan_data_sample.head(n=5)

First 5 Rows of the Dataset:


Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
1758049,,,35000,35000,35000.0,36 months,12.12,1164.51,B,B3,...,,,Cash,N,,,,,,
686533,,,30000,30000,30000.0,60 months,10.75,648.54,B,B4,...,,,Cash,N,,,,,,
900721,,,15000,15000,15000.0,36 months,7.49,466.53,A,A4,...,,,Cash,N,,,,,,
1727912,,,24000,24000,24000.0,60 months,21.15,651.31,E,E2,...,,,Cash,N,,,,,,
539691,,,14400,14400,14400.0,36 months,8.59,455.18,A,A5,...,,,Cash,N,,,,,,


In [8]:
# Summary statistics for the numeric columns, rounded to two decimals
# for readability.
print("Table 1: Descriptive Statistics:")
loan_data_sample.describe().round(decimals=2)

Table 1: Descriptive Statistics:


Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,installment,annual_inc,url,dti,...,deferral_term,hardship_amount,hardship_length,hardship_dpd,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,settlement_amount,settlement_percentage,settlement_term
count,0.0,0.0,678200.0,678200.0,678200.0,678200.0,678200.0,678199.0,0.0,677684.0,...,3120.0,3120.0,3120.0,3120.0,2484.0,3120.0,3120.0,9997.0,9997.0,9997.0
mean,,,15063.79,15058.51,15040.42,13.1,446.3,77946.48,,18.85,...,3.0,151.33,3.0,13.77,444.48,11385.3,194.32,5028.51,47.83,13.09
std,,,9189.32,9187.32,9191.01,4.83,267.19,74649.5,,14.53,...,0.0,125.54,0.0,9.75,366.31,7421.83,201.58,3699.03,7.13,8.07
min,,,500.0,500.0,0.0,5.31,15.69,0.0,,-1.0,...,3.0,1.61,3.0,0.0,10.17,193.98,0.01,107.0,0.45,0.0
25%,,,8000.0,8000.0,8000.0,9.49,251.98,46000.0,,11.91,...,3.0,57.76,3.0,5.0,171.97,5531.34,43.69,2240.75,45.0,6.0
50%,,,13000.0,13000.0,12875.0,12.62,378.59,65000.0,,17.85,...,3.0,116.4,3.0,15.0,344.52,9919.18,128.99,4179.0,45.0,14.0
75%,,,20000.0,20000.0,20000.0,15.99,593.82,93000.0,,24.5,...,3.0,208.74,3.0,23.0,609.23,15698.72,286.32,6831.0,50.0,18.0
max,,,40000.0,40000.0,40000.0,30.99,1717.63,10999200.0,,999.0,...,3.0,893.63,3.0,31.0,2680.89,40149.35,1275.36,30000.0,184.36,50.0


In [9]:
# Collect every column name of the sampled frame into a plain list and print it
features = list(loan_data_sample.columns)
print(features)

['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'annual_inc', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'url', 'desc', 'purpose', 'title', 'zip_code', 'addr_state', 'dti', 'delinq_2yrs', 'earliest_cr_line', 'inq_last_6mths', 'mths_since_last_delinq', 'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'initial_list_status', 'out_prncp', 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d', 'last_credit_pull_d', 'collections_12_mths_ex_med', 'mths_since_last_major_derog', 'policy_code', 'application_type', 'annual_inc_joint', 'dti_joint', 'verification_status_joint', 'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m', 'open_act_il', 'open_il_12m', 'open_i

In [10]:
# List of columns to drop
# NOTE(review): these look like identifiers, free-text/location fields and
# hardship/settlement bookkeeping — confirm the rationale for each with the
# data dictionary.
columns_to_drop = [
    'id', 'member_id', 'url', 'desc', 'title', 'zip_code',
    'addr_state', 'pymnt_plan', 'policy_code', 'hardship_flag',
    'debt_settlement_flag', 'debt_settlement_flag_date',
    'settlement_status', 'settlement_date', 'settlement_amount',
    'settlement_percentage', 'settlement_term'
]

# Drop the columns.
# errors='ignore' keeps this cell re-runnable: loan_data_sample is reassigned
# here, so on a second execution the labels are already gone and a plain
# drop() would raise KeyError.
loan_data_sample = loan_data_sample.drop(columns=columns_to_drop, errors='ignore')

# Save the column-pruned dataset (missing values have not been handled yet
# at this point in the notebook).
loan_data_sample.to_csv('loan_data_cleaned.csv', index=False)

In [None]:
# Display information about the cleaned dataset
# Same info() summary as before, now run after the column drop.
print("Cleaned Dataset Information:")
loan_data_sample.info()

In [None]:
# Count of missing entries per column (rendered as the cell's output)
loan_data_sample.isna().sum()

In [None]:
# Remove every column whose share of missing values exceeds the threshold
threshold = 0.5

# Fraction of null entries per column, indexed by column name
missing_fraction = loan_data_sample.isnull().mean()
columns_to_drop = loan_data_sample.columns[missing_fraction > threshold]

loan_data_sample = loan_data_sample.drop(columns=columns_to_drop)

print(f"Dropped columns: {list(columns_to_drop)}")

In [None]:
# Impute numerical columns: each float64/int64 column gets its own median
num_cols = loan_data_sample.select_dtypes(include=['float64', 'int64']).columns
column_medians = loan_data_sample[num_cols].median()
loan_data_sample[num_cols] = loan_data_sample[num_cols].fillna(column_medians)

In [None]:
# Impute categorical (object-dtype) columns with an explicit placeholder label
cat_cols = loan_data_sample.select_dtypes(include='object').columns
loan_data_sample[cat_cols] = loan_data_sample[cat_cols].fillna(value='Unknown')

In [None]:
# Save the dataset after handling missing values
# This writes to the same 'loan_data_cleaned.csv' path used right after the
# column drop, so the earlier file is overwritten with the imputed frame.
loan_data_sample.to_csv('loan_data_cleaned.csv', index=False)

print("Missing values handled and dataset saved!")

In [None]:
# List of numerical variables to analyze
num_vars = ['loan_amnt', 'int_rate', 'annual_inc', 'dti']

# Size the grid from the number of variables instead of hard-coding 2x2, so
# adding or removing an entry in num_vars cannot index past the grid.
n_cols = 2
n_rows = max(1, -(-len(num_vars) // n_cols))  # ceiling division, at least one row

# Explicit figure/axes objects instead of the implicit pyplot "current axes";
# squeeze=False guarantees a 2-D array of axes whatever the grid shape.
# Four variables give a 12x8 inch figure.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 4 * n_rows), squeeze=False)
flat_axes = axes.ravel()

# One histogram (with KDE overlay) per variable, numbered from Fig 1
for i, (ax, var) in enumerate(zip(flat_axes, num_vars), 1):
    sns.histplot(loan_data_sample[var], kde=True, bins=30, color='blue', ax=ax)
    ax.set_title(f'Fig {i}: Distribution of {var}')
    ax.set_xlabel(var)
    ax.set_ylabel('Frequency')

# Hide any panels left over when len(num_vars) does not fill the grid
for ax in flat_axes[len(num_vars):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()
