# Credit Risk Prediction

## 1. Introduction

## 2. Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from IPython.display import display


## 3. Data

In [2]:
# Location of the raw credit-risk CSV.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact location exists.
raw_csv_path = r"C:\Users\User\Desktop\Projects\credit_risk_modelling\data\credit_risk_dataset.csv"

# Read the CSV into the frame every later cell works from.
credit_data_df = pd.read_csv(raw_csv_path)

# Preview the first rows to sanity-check the parse.
credit_data_df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


## 4. EDA

### Columns and Rows

In [3]:
# Dimensions of the raw frame as a (rows, columns) tuple.
credit_data_df.shape

(32581, 12)

In [5]:
# Column labels of the raw frame.
credit_data_df.columns

Index(['person_age', 'person_income', 'person_home_ownership',
       'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt',
       'loan_int_rate', 'loan_status', 'loan_percent_income',
       'cb_person_default_on_file', 'cb_person_cred_hist_length'],
      dtype='object')

The target variable is `loan_status`, which takes one of two values:
* 0 if there is no default on the loan
* 1 if there is a default

In [7]:
# Class balance of the target: absolute counts and shares in percent.
loan_status_col = credit_data_df['loan_status']

loan_status_counts = loan_status_col.value_counts()
loan_status_percentages = loan_status_col.value_counts(normalize=True) * 100

# Raw Series output: counts first, then percentages.
print(loan_status_counts)
print(loan_status_percentages)

# The same percentages, one formatted line per class.
print("\nLoan Status Distribution:")
for status in loan_status_percentages.index:
    percent = loan_status_percentages.loc[status]
    print(f"Status {status}: {percent:.2f}%")

loan_status
0    25473
1     7108
Name: count, dtype: int64
loan_status
0    78.183604
1    21.816396
Name: proportion, dtype: float64

Loan Status Distribution:
Status 0: 78.18%
Status 1: 21.82%


### Split train/test sets

In [None]:
# Target column on one side, every other column as features on the other.
y = credit_data_df['loan_status']
X = credit_data_df.drop('loan_status', axis=1)

# Hold out 30% of the rows for testing. The split is stratified on the target
# so both partitions keep the same proportion of 0s and 1s, and the fixed
# random_state makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=2802, stratify=y
)

# Training features and target side by side in a single frame, for EDA.
# assign() returns a new frame, so X_train itself is left untouched.
train_credit_data_df = X_train.assign(loan_status=y_train)

# Quick size check of both partitions.
print(f"Train shape: {train_credit_data_df.shape}")
print(f"Test shape: {X_test.shape}")

Train shape: (22806, 12)
Test shape: (9775, 11)


### EDA on the training set (`train_credit_data_df`)

| Feature Name                | Description                      | Column Type                      |
|------------------------------|----------------------------------|----------------------------------|
| person_age                   | Age                              | Numeric |
| person_income                | Annual Income                   | Numeric |
| person_home_ownership        | Home ownership                   | Categorical |
| person_emp_length            | Employment length (in years)     | Numeric |
| loan_intent                  | Loan intent                      | Categorical |
| loan_grade                   | Loan grade                       | Categorical |
| loan_amnt                    | Loan amount                      | Numeric |
| loan_int_rate                | Interest rate                    | Numeric |
| **loan_status (TARGET)**     | **Loan status (0 = no default, 1 = default)** | **Categorical** |
| loan_percent_income          | Percent income                   |  Numeric |
| cb_person_default_on_file    | Historical default               | Categorical |
| cb_person_cred_hist_length   | Credit history length            | Numeric |


In [9]:
# Show the combined training frame (features plus loan_status); the rendered
# output is truncated to the leading and trailing rows.
train_credit_data_df

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
29121,50,900000,MORTGAGE,11.0,DEBTCONSOLIDATION,B,30000,12.69,0.03,N,15,0
18869,28,41000,RENT,10.0,DEBTCONSOLIDATION,C,1500,14.79,0.04,Y,6,0
22857,33,59000,MORTGAGE,6.0,HOMEIMPROVEMENT,B,5500,9.91,0.09,N,9,0
7068,24,51000,MORTGAGE,5.0,EDUCATION,B,8000,10.96,0.16,N,3,0
11397,22,54000,RENT,0.0,VENTURE,E,10000,15.95,0.19,N,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...
29557,37,30480,OWN,1.0,MEDICAL,B,1200,11.99,0.04,N,15,0
8937,24,50000,RENT,8.0,PERSONAL,C,8000,15.96,0.16,N,3,0
13170,25,89796,MORTGAGE,2.0,VENTURE,B,9000,11.11,0.10,N,4,0
23800,30,41000,RENT,7.0,VENTURE,A,9600,,0.23,N,9,0


In [19]:
# Per-column null summary of the training frame: absolute count of missing
# cells and their share of all rows in percent, worst columns first.
null_mask = train_credit_data_df.isnull()
missing_values = pd.DataFrame({
    'Missing Values': null_mask.sum(),
    'Missing Percentage (%)': null_mask.mean() * 100,
}).sort_values(by='Missing Percentage (%)', ascending=False)

# Render with a red gradient so the columns with the most gaps stand out.
display(missing_values.style.background_gradient(cmap='Reds'))

Unnamed: 0,Missing Values,Missing Percentage (%)
loan_int_rate,2202,9.655354
person_emp_length,637,2.793125
person_age,0,0.0
person_income,0,0.0
person_home_ownership,0,0.0
loan_intent,0,0.0
loan_grade,0,0.0
loan_amnt,0,0.0
loan_percent_income,0,0.0
cb_person_default_on_file,0,0.0


In [20]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the training frame.
train_credit_data_df.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,loan_status
count,22806.0,22806.0,22169.0,22806.0,20604.0,22806.0,22806.0,22806.0
mean,27.732088,66202.03,4.773513,9565.708585,10.996065,0.169968,5.810576,0.218144
std,6.300267,66386.23,4.166638,6302.955116,3.235314,0.106556,4.036588,0.412995
min,20.0,4000.0,0.0,500.0,5.42,0.0,2.0,0.0
25%,23.0,38400.0,2.0,5000.0,7.9,0.09,3.0,0.0
50%,26.0,55000.0,4.0,8000.0,10.99,0.15,4.0,0.0
75%,30.0,79200.0,7.0,12000.0,13.47,0.23,8.0,0.0
max,144.0,6000000.0,123.0,35000.0,23.22,0.78,30.0,1.0
