In [1]:
import pandas as pd 
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('loan_data.csv')
data.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1


In [3]:
data.isnull().sum()

person_age                        0
person_gender                     0
person_education                  0
person_income                     0
person_emp_exp                    0
person_home_ownership             0
loan_amnt                         0
loan_intent                       0
loan_int_rate                     0
loan_percent_income               0
cb_person_cred_hist_length        0
credit_score                      0
previous_loan_defaults_on_file    0
loan_status                       0
dtype: int64

In [4]:
encoder = LabelEncoder()
cat_col = list(data.select_dtypes(include='object').columns)
for col in cat_col:
    encoder.fit(data[col])
    data[col] = encoder.transform(data[col])

In [5]:
x = data.drop(columns=['loan_status'])
y = data['loan_status']

In [6]:
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size= 0.25, random_state=20, stratify=y)

scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [8]:
model = RandomForestClassifier(n_estimators= 5, random_state= 23, max_depth= 4)
model.fit(x_train,y_train)
train_preds = model.predict(x_train)
test_preds = model.predict(x_test)

print(f'train: {classification_report(y_train, train_preds)}')
print(f'Test: {classification_report(y_test, test_preds)}')

train:               precision    recall  f1-score   support

           0       0.88      0.98      0.93      8750
           1       0.90      0.55      0.68      2500

    accuracy                           0.89     11250
   macro avg       0.89      0.77      0.81     11250
weighted avg       0.89      0.89      0.88     11250

Test:               precision    recall  f1-score   support

           0       0.88      0.98      0.93     26250
           1       0.90      0.55      0.68      7500

    accuracy                           0.89     33750
   macro avg       0.89      0.77      0.81     33750
weighted avg       0.89      0.89      0.88     33750

