In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

In [2]:
credit_risk = pd.read_csv("credit_risk_dataset.csv")
credit_risk.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [3]:
credit_risk.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length
count,32581.0,32581.0,31686.0,32581.0,29465.0,32581.0,32581.0,32581.0
mean,27.7346,66074.85,4.789686,9589.371106,11.011695,0.218164,0.170203,5.804211
std,6.348078,61983.12,4.14263,6322.086646,3.240459,0.413006,0.106782,4.055001
min,20.0,4000.0,0.0,500.0,5.42,0.0,0.0,2.0
25%,23.0,38500.0,2.0,5000.0,7.9,0.0,0.09,3.0
50%,26.0,55000.0,4.0,8000.0,10.99,0.0,0.15,4.0
75%,30.0,79200.0,7.0,12200.0,13.47,0.0,0.23,8.0
max,144.0,6000000.0,123.0,35000.0,23.22,1.0,0.83,30.0


In [4]:
#What stands out from this data is the age (max should not be 144 its too much)
#income will not be mentioned here although perhaps per year should be something to look for and model for sure
#Person_emp_length no one employed for 123 years. Maybe the person who is 144 might have been. 
#Loan amount fine
#Loan interest rate, 23.22 is a lot. 

In [5]:
credit_risk_copy = credit_risk.copy()

In [6]:
credit_risk.pivot_table(index='person_age',values='person_income',aggfunc='count')

Unnamed: 0_level_0,person_income
person_age,Unnamed: 1_level_1
20,15
21,1229
22,3633
23,3889
24,3549
25,3037
26,2477
27,2138
28,1854
29,1687


In [7]:
credit_risk_age_rmvd = credit_risk[credit_risk['person_age']<81.3]
credit_risk_age_rmvd.reset_index(drop=True,inplace=True)
credit_risk_age_rmvd.head()
# This is as life expectancy in Canada is 81.3. So, to avoid risk, I do 
# not want to give loans to anyone above the life expectancy

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [8]:
credit_risk_age_rmvd.pivot_table(index='person_emp_length',columns='loan_status',values='person_income',aggfunc='count').reset_index().sort_values(by='person_emp_length',ascending=False)

loan_status,person_emp_length,0,1
35,123.0,1.0,1.0
34,41.0,1.0,
33,38.0,1.0,
32,34.0,,1.0
31,31.0,4.0,
30,30.0,1.0,1.0
29,29.0,,1.0
28,28.0,3.0,
27,27.0,4.0,1.0
26,26.0,5.0,1.0


In [9]:
credit_risk_age_emp_rmvd = credit_risk_age_rmvd[credit_risk_age_rmvd['person_emp_length']<81.3-18]
credit_risk_age_emp_rmvd.reset_index(drop=True, inplace=True)
#This is as the retirement age is 65 and people allowed employment at 18. so 65-18 is employment length I would 
#like to capture. I have also resetted the index.
credit_risk_age_emp_rmvd.pivot_table(index='person_emp_length',columns='loan_status',values='person_income',aggfunc='count').reset_index().sort_values(by='person_emp_length',ascending=False)

loan_status,person_emp_length,0,1
34,41.0,1.0,
33,38.0,1.0,
32,34.0,,1.0
31,31.0,4.0,
30,30.0,1.0,1.0
29,29.0,,1.0
28,28.0,3.0,
27,27.0,4.0,1.0
26,26.0,5.0,1.0
25,25.0,8.0,


In [10]:
credit_risk_age_emp_rmvd.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
1,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
2,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
3,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4
4,21,9900,OWN,2.0,VENTURE,A,2500,7.14,1,0.25,N,2


In [11]:
credit_risk_age_emp_rmvd.isnull().sum()


person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length                0
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3046
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64

In [85]:
#notice that off of this, loan_int_rate has many null values. I now need to fill in these null values with mean or median
cr_data = credit_risk_age_emp_rmvd.copy()
cr_data.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
1,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
2,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
3,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4
4,21,9900,OWN,2.0,VENTURE,A,2500,7.14,1,0.25,N,2


In [13]:
#notice that for the above data the loan interest rate, the mean(11.04) is very similar to the median (50% : 10.99)
#So, we can replace the null values with whichever one we want. 

#replacement of null value info is down below:
#If the data is normally distributed, inputing with the mean can be appropriate, as it represents the central 
#tendency of the data. Median: The median is more robust to outliers. If the data is skewed or contains significant
#outliers, the median may provide a better representation of the central tendency. Thus, I will choose to replace
#the null values with the median
cr_data.fillna({'loan_int_rate':cr_data['loan_int_rate'].median()},inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cr_data.fillna({'loan_int_rate':cr_data['loan_int_rate'].median()},inplace=True)


In [14]:
cr_data.isnull().sum()

person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_status                   0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64

In [15]:
cr_data.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length
count,31677.0,31677.0,31677.0,31677.0,31677.0,31677.0,31677.0,31677.0
mean,27.726805,66490.55,4.782271,9660.051457,11.035034,0.215456,0.169609,5.807968
std,6.194392,52769.68,4.034989,6334.535354,3.070238,0.411145,0.10627,4.056804
min,20.0,4000.0,0.0,500.0,5.42,0.0,0.0,2.0
25%,23.0,39396.0,2.0,5000.0,8.49,0.0,0.09,3.0
50%,26.0,56000.0,4.0,8000.0,10.99,0.0,0.15,4.0
75%,30.0,80000.0,7.0,12500.0,13.16,0.0,0.23,8.0
max,80.0,2039784.0,41.0,35000.0,23.22,1.0,0.83,30.0


In [16]:
cr_data.groupby('loan_status').count()['person_age']
#Data isn't really balanced right, so we need to see this forward. 

loan_status
0    24852
1     6825
Name: person_age, dtype: int64

In [17]:
cr_data.groupby('loan_intent').count()['loan_status']

loan_intent
DEBTCONSOLIDATION    5064
EDUCATION            6288
HOMEIMPROVEMENT      3510
MEDICAL              5896
PERSONAL             5366
VENTURE              5553
Name: loan_status, dtype: int64

In [18]:
cr_data.groupby('loan_grade').count()['loan_status']

loan_grade
A    10369
B    10183
C     6318
D     3555
E      952
F      236
G       64
Name: loan_status, dtype: int64

In [19]:
## I have decided to remove the loan grade as that itself is a measure of loan default prediction based on 
## creditworthiness etc.
cr_data_loan_grade_drop = cr_data.drop('loan_grade',axis=1)
cr_data_loan_grade_drop.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,21,9600,OWN,5.0,EDUCATION,1000,11.14,0,0.1,N,2
1,25,9600,MORTGAGE,1.0,MEDICAL,5500,12.87,1,0.57,N,3
2,23,65500,RENT,4.0,MEDICAL,35000,15.23,1,0.53,N,2
3,24,54400,RENT,8.0,MEDICAL,35000,14.27,1,0.55,Y,4
4,21,9900,OWN,2.0,VENTURE,2500,7.14,1,0.25,N,2


In [20]:
#Because we have removed the age and employment years, we need to sort out the index to make sure they match

In [86]:
#Have decided to remove person income as technically the only real valuable term here is loan_percent_income,
#I will even be removing loan amount here then based on the same logic.
cr_data_income_drop = cr_data_loan_grade_drop.drop('person_income', axis=1)
cr_data_income_drop.head()

Unnamed: 0,person_age,person_home_ownership,person_emp_length,loan_intent,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,21,OWN,5.0,EDUCATION,1000,11.14,0,0.1,N,2
1,25,MORTGAGE,1.0,MEDICAL,5500,12.87,1,0.57,N,3
2,23,RENT,4.0,MEDICAL,35000,15.23,1,0.53,N,2
3,24,RENT,8.0,MEDICAL,35000,14.27,1,0.55,Y,4
4,21,OWN,2.0,VENTURE,2500,7.14,1,0.25,N,2


In [22]:
cr_data_loan_amt_drop = cr_data_income_drop.drop('loan_amnt',axis=1)  
cr_data_loan_amt_drop_copy = cr_data_loan_amt_drop.copy()
cr_data_loan_amt_drop.head()

Unnamed: 0,person_age,person_home_ownership,person_emp_length,loan_intent,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,21,OWN,5.0,EDUCATION,11.14,0,0.1,N,2
1,25,MORTGAGE,1.0,MEDICAL,12.87,1,0.57,N,3
2,23,RENT,4.0,MEDICAL,15.23,1,0.53,N,2
3,24,RENT,8.0,MEDICAL,14.27,1,0.55,Y,4
4,21,OWN,2.0,VENTURE,7.14,1,0.25,N,2


In [23]:
cr_data_cat_treated = cr_data_loan_amt_drop_copy.copy()

In [24]:
# Now I want to convert these columns to numerical boolean values. Need to convert loan grade to the same
person_loan_grade = pd.get_dummies(cr_data_cat_treated['person_home_ownership']).astype(int)
person_loan_grade

Unnamed: 0,MORTGAGE,OTHER,OWN,RENT
0,0,0,1,0
1,1,0,0,0
2,0,0,0,1
3,0,0,0,1
4,0,0,1,0
...,...,...,...,...
31672,1,0,0,0
31673,1,0,0,0
31674,0,0,0,1
31675,1,0,0,0


In [25]:
person_loan_intent = pd.get_dummies(cr_data_cat_treated['loan_intent']).astype(int)
person_loan_intent.head()

Unnamed: 0,DEBTCONSOLIDATION,EDUCATION,HOMEIMPROVEMENT,MEDICAL,PERSONAL,VENTURE
0,0,1,0,0,0,0
1,0,0,0,1,0,0
2,0,0,0,1,0,0
3,0,0,0,1,0,0
4,0,0,0,0,0,1


In [87]:
cr_data_cat_treated['cb_person_default_on_file_binary'] = np.where(cr_data_cat_treated['cb_person_default_on_file']=='Y',1,0)
cr_data_cat_treated.head()

Unnamed: 0,person_age,person_home_ownership,person_emp_length,loan_intent,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,cb_person_default_on_file_binary
0,21,OWN,5.0,EDUCATION,11.14,0,0.1,N,2,0
1,25,MORTGAGE,1.0,MEDICAL,12.87,1,0.57,N,3,0
2,23,RENT,4.0,MEDICAL,15.23,1,0.53,N,2,0
3,24,RENT,8.0,MEDICAL,14.27,1,0.55,Y,4,1
4,21,OWN,2.0,VENTURE,7.14,1,0.25,N,2,0


In [27]:
## I need to standardise the difference between numerical values like person_income and person_age. These are
## 2 seperate values that need to be differentiated seperately. Mean is 0 SD is 1, it is standardised. So, I need
# remove the values that will not be scaled in order to scale the rest of them. Don't want to include boolean values
person_home_ownership = pd.get_dummies(cr_data_cat_treated['person_home_ownership']).astype(int)
loan_intent = pd.get_dummies(cr_data_cat_treated['loan_intent']).astype(int)
data_to_scale = cr_data_cat_treated.drop(['person_home_ownership','loan_intent','loan_status','cb_person_default_on_file','cb_person_default_on_file_binary'],axis=1)
data_to_scale.head()


Unnamed: 0,person_age,person_emp_length,loan_int_rate,loan_percent_income,cb_person_cred_hist_length
0,21,5.0,11.14,0.1,2
1,25,1.0,12.87,0.57,3
2,23,4.0,15.23,0.53,2
3,24,8.0,14.27,0.55,4
4,21,2.0,7.14,0.25,2


In [88]:
# (x-mean of x)/std of x --> formula for standardizing
# Many machine learning algorithms (e.g., gradient descent-based models like 
#(logistic regression, linear regression, neural networks) perform better when features are on a similar scale.
#Algorithms that rely on distance calculations 
#(KNN, SVM, k-means clustering) can be biased towards larger magnitude features if not scaled.

scaler = StandardScaler()
data_to_scale.columns

Index(['person_age', 'person_emp_length', 'loan_int_rate',
       'loan_percent_income', 'cb_person_cred_hist_length'],
      dtype='object')

In [29]:
scaled_data = scaler.fit_transform(data_to_scale)

In [30]:
scaled_df = pd.DataFrame(scaled_data,columns=['person_age', 'person_emp_length',
       'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length'])
scaled_df.head()


Unnamed: 0,person_age,person_emp_length,loan_int_rate,loan_percent_income,cb_person_cred_hist_length
0,-1.085968,0.053961,0.034189,-0.655028,-0.938677
1,-0.440212,-0.937383,0.597672,3.76773,-0.692173
2,-0.76309,-0.193875,1.366354,3.391325,-0.938677
3,-0.601651,0.797469,1.05367,3.579527,-0.44567
4,-1.085968,-0.689547,-1.268662,0.75649,-0.938677


In [31]:
scaled_df.describe()
#basically, everything has mean 1 std as 0

Unnamed: 0,person_age,person_emp_length,loan_int_rate,loan_percent_income,cb_person_cred_hist_length
count,31677.0,31677.0,31677.0,31677.0,31677.0
mean,1.722691e-16,5.742303e-17,7.945015e-16,-1.004903e-16,0.0
std,1.000016,1.000016,1.000016,1.000016,1.000016
min,-1.247407,-1.185219,-1.828888,-1.59604,-0.938677
25%,-0.7630901,-0.6895471,-0.8289499,-0.7491292,-0.692173
50%,-0.2787735,-0.193875,-0.01466802,-0.1845218,-0.44567
75%,0.3669821,0.5496332,0.6921286,0.568288,0.540343
max,8.438927,8.976059,3.968799,6.214362,5.963417


In [32]:
scaled_df.shape

(31677, 5)

In [33]:
scaled_data_combined = pd.concat([scaled_df,person_loan_intent,person_home_ownership],axis=1)
scaled_data_combined['cb_person_default_on_file']=cr_data_cat_treated['cb_person_default_on_file_binary']
scaled_data_combined['loan_status'] = cr_data_cat_treated['loan_status']
scaled_data_combined.head()

Unnamed: 0,person_age,person_emp_length,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,DEBTCONSOLIDATION,EDUCATION,HOMEIMPROVEMENT,MEDICAL,PERSONAL,VENTURE,MORTGAGE,OTHER,OWN,RENT,cb_person_default_on_file,loan_status
0,-1.085968,0.053961,0.034189,-0.655028,-0.938677,0,1,0,0,0,0,0,0,1,0,0,0
1,-0.440212,-0.937383,0.597672,3.76773,-0.692173,0,0,0,1,0,0,1,0,0,0,0,1
2,-0.76309,-0.193875,1.366354,3.391325,-0.938677,0,0,0,1,0,0,0,0,0,1,0,1
3,-0.601651,0.797469,1.05367,3.579527,-0.44567,0,0,0,1,0,0,0,0,0,1,1,1
4,-1.085968,-0.689547,-1.268662,0.75649,-0.938677,0,0,0,0,0,1,0,0,1,0,0,1


In [34]:
scaled_data_combined.shape

(31677, 17)

In [35]:
#Just want to take a count of number of defaults to payments in loan status
scaled_data_combined.groupby('loan_status').size()

loan_status
0    24852
1     6825
dtype: int64

In [36]:
#look at the ratio. You need to balance this data. Synthetically create data for the minority class. 
6825/(24852+6825)
target = scaled_data_combined['loan_status']
features = scaled_data_combined.drop('loan_status',axis=1)
features.head()

Unnamed: 0,person_age,person_emp_length,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,DEBTCONSOLIDATION,EDUCATION,HOMEIMPROVEMENT,MEDICAL,PERSONAL,VENTURE,MORTGAGE,OTHER,OWN,RENT,cb_person_default_on_file
0,-1.085968,0.053961,0.034189,-0.655028,-0.938677,0,1,0,0,0,0,0,0,1,0,0
1,-0.440212,-0.937383,0.597672,3.76773,-0.692173,0,0,0,1,0,0,1,0,0,0,0
2,-0.76309,-0.193875,1.366354,3.391325,-0.938677,0,0,0,1,0,0,0,0,0,1,0
3,-0.601651,0.797469,1.05367,3.579527,-0.44567,0,0,0,1,0,0,0,0,0,1,1
4,-1.085968,-0.689547,-1.268662,0.75649,-0.938677,0,0,0,0,0,1,0,0,1,0,0


In [37]:
#now we need to balance the data
!pip install imbalanced-learn

Collecting imbalanced-learn
  Using cached imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting sklearn-compat<1,>=0.1 (from imbalanced-learn)
  Using cached sklearn_compat-0.1.3-py3-none-any.whl.metadata (18 kB)
Using cached imbalanced_learn-0.13.0-py3-none-any.whl (238 kB)
Using cached sklearn_compat-0.1.3-py3-none-any.whl (18 kB)
Installing collected packages: sklearn-compat, imbalanced-learn
Successfully installed imbalanced-learn-0.13.0 sklearn-compat-0.1.3


In [38]:
from imblearn.over_sampling import SMOTE

In [39]:
smote = SMOTE()

In [40]:
balanced_features, balanced_target = smote.fit_resample(features,target)

In [41]:
balanced_features.shape

(49704, 16)

In [42]:
scaled_data_combined.groupby('loan_status').size()

loan_status
0    24852
1     6825
dtype: int64

In [43]:
balanced_target_df = pd.DataFrame({'target':balanced_target})
balanced_target_df.groupby('target').size()

target
0    24852
1    24852
dtype: int64

In [44]:
# Train models blud

In [45]:
pip install xgboost

Collecting xgboost
  Using cached xgboost-2.1.3-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Using cached nvidia_nccl_cu12-2.25.1-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.8 kB)
Using cached xgboost-2.1.3-py3-none-manylinux_2_28_x86_64.whl (153.9 MB)
Using cached nvidia_nccl_cu12-2.25.1-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (201.4 MB)
Installing collected packages: nvidia-nccl-cu12, xgboost
Successfully installed nvidia-nccl-cu12-2.25.1 xgboost-2.1.3
Note: you may need to restart the kernel to use updated packages.


In [46]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [47]:
x_train, x_test, y_train, y_test = train_test_split(balanced_features, balanced_target, test_size=0.20, random_state = 42)


In [48]:
logit = LogisticRegression()

In [49]:
logit.fit(x_train,y_train)

In [50]:
logit.score(x_train,y_train)

0.7735834821316299

In [51]:
logit_prediction = logit.predict(x_test)
logit_prediction

array([0, 1, 0, ..., 0, 0, 0])

In [52]:
print(classification_report(y_test,logit_prediction))
#model could be overfitted (trained on data so well that can't make accurate predictions)

              precision    recall  f1-score   support

           0       0.77      0.78      0.78      5019
           1       0.77      0.77      0.77      4922

    accuracy                           0.77      9941
   macro avg       0.77      0.77      0.77      9941
weighted avg       0.77      0.77      0.77      9941



In [53]:
print(logit.coef_)

[[-0.11736786 -0.10822256  0.88678986  0.93870902 -0.01067791 -3.13039338
  -3.87968866 -3.07553279 -3.2493709  -3.71292708 -4.20029317 -3.36465981
  -3.6724557  -4.95806042 -2.52816416  0.13644678]]


In [54]:
features_imp_logit = pd.DataFrame({'features':balanced_features.columns,'logit_imp':logit.coef_[0]})
features_imp_logit.sort_values(by='logit_imp',ascending=False)

Unnamed: 0,features,logit_imp
3,loan_percent_income,0.938709
2,loan_int_rate,0.88679
15,cb_person_default_on_file,0.136447
4,cb_person_cred_hist_length,-0.010678
1,person_emp_length,-0.108223
0,person_age,-0.117368
14,RENT,-2.528164
7,HOMEIMPROVEMENT,-3.075533
5,DEBTCONSOLIDATION,-3.130393
8,MEDICAL,-3.249371


In [55]:
rf = RandomForestClassifier()

In [56]:
rf.fit(x_train,y_train)

In [57]:
rf.score(x_train,y_train)

0.9997736589291553

In [58]:
rf_prediction = rf.predict(x_test)
rf_prediction 

array([0, 1, 0, ..., 1, 0, 1])

In [59]:
print(classification_report(y_test,rf_prediction))

              precision    recall  f1-score   support

           0       0.90      0.96      0.93      5019
           1       0.96      0.90      0.93      4922

    accuracy                           0.93      9941
   macro avg       0.93      0.93      0.93      9941
weighted avg       0.93      0.93      0.93      9941



In [60]:
rf.feature_importances_

array([0.10035856, 0.10442748, 0.24339317, 0.28450235, 0.09209301,
       0.01262816, 0.01118398, 0.01211743, 0.01269345, 0.0093597 ,
       0.01315095, 0.0254353 , 0.00077975, 0.01662014, 0.04137895,
       0.01987761])

In [61]:
features_imp_rf = pd.DataFrame({'features':balanced_features.columns,'rf_imp':rf.feature_importances_})
features_imp_rf.sort_values(by='rf_imp',ascending=False)

Unnamed: 0,features,rf_imp
3,loan_percent_income,0.284502
2,loan_int_rate,0.243393
1,person_emp_length,0.104427
0,person_age,0.100359
4,cb_person_cred_hist_length,0.092093
14,RENT,0.041379
11,MORTGAGE,0.025435
15,cb_person_default_on_file,0.019878
13,OWN,0.01662
10,VENTURE,0.013151


In [62]:
##XG boost model

In [63]:
xgb_model = XGBClassifier(tree_method = 'exact')

In [64]:
#model.fit(x,y.values.rave1())
xgb_model.fit(x_train,y_train.values.ravel())

In [65]:
xgb_model.score(x_train,y_train.values.ravel())

0.9508588386188165

In [66]:
xgb_model_prediction = xgb_model.predict(x_test)
xgb_model_prediction 

array([0, 1, 0, ..., 1, 0, 1])

In [67]:
print(classification_report(y_test,xgb_model_prediction))

              precision    recall  f1-score   support

           0       0.90      0.98      0.94      5019
           1       0.98      0.89      0.94      4922

    accuracy                           0.94      9941
   macro avg       0.94      0.94      0.94      9941
weighted avg       0.94      0.94      0.94      9941



In [68]:
features_imp_xgb_model = pd.DataFrame({'features':balanced_features.columns,'xgb_model_imp':xgb_model.feature_importances_})
features_imp_xgb_model.sort_values(by='xgb_model_imp',ascending=False)

Unnamed: 0,features,xgb_model_imp
14,RENT,0.172766
3,loan_percent_income,0.116229
13,OWN,0.107236
2,loan_int_rate,0.076359
10,VENTURE,0.074831
4,cb_person_cred_hist_length,0.071403
8,MEDICAL,0.06088
1,person_emp_length,0.059889
5,DEBTCONSOLIDATION,0.045847
7,HOMEIMPROVEMENT,0.044841


In [69]:
feature_imp = pd.concat([features_imp_xgb_model,features_imp_rf,features_imp_logit],axis=1)
feature_imp
#based on these, we are trying to figure out which features are the most important based on the regression values

Unnamed: 0,features,xgb_model_imp,features.1,rf_imp,features.2,logit_imp
0,person_age,0.042641,person_age,0.100359,person_age,-0.117368
1,person_emp_length,0.059889,person_emp_length,0.104427,person_emp_length,-0.108223
2,loan_int_rate,0.076359,loan_int_rate,0.243393,loan_int_rate,0.88679
3,loan_percent_income,0.116229,loan_percent_income,0.284502,loan_percent_income,0.938709
4,cb_person_cred_hist_length,0.071403,cb_person_cred_hist_length,0.092093,cb_person_cred_hist_length,-0.010678
5,DEBTCONSOLIDATION,0.045847,DEBTCONSOLIDATION,0.012628,DEBTCONSOLIDATION,-3.130393
6,EDUCATION,0.024249,EDUCATION,0.011184,EDUCATION,-3.879689
7,HOMEIMPROVEMENT,0.044841,HOMEIMPROVEMENT,0.012117,HOMEIMPROVEMENT,-3.075533
8,MEDICAL,0.06088,MEDICAL,0.012693,MEDICAL,-3.249371
9,PERSONAL,0.021263,PERSONAL,0.00936,PERSONAL,-3.712927


In [70]:
## Further exploration

In [71]:
xgb_prediction_df = pd.DataFrame({'test_indices_xgb':x_test.index,'xgb_prediction':xgb_model_prediction})
logit_prediction_df = pd.DataFrame({'test_indices_logit':x_test.index,'logit_prediction':logit_prediction})
rf_prediction_df = pd.DataFrame({'test_indices_rf':x_test.index,'rf_prediction':rf_prediction})
logit_prediction_df.head()

Unnamed: 0,test_indices_logit,logit_prediction
0,9060,0
1,44416,1
2,9592,0
3,41317,0
4,31374,1


In [72]:
merged_with_xgb = cr_data_loan_amt_drop_copy.merge(xgb_prediction_df,left_index=True, right_on='test_indices_xgb', how='left')
merged_with_xgb.head()

Unnamed: 0,person_age,person_home_ownership,person_emp_length,loan_intent,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,test_indices_xgb,xgb_prediction
,21,OWN,5.0,EDUCATION,11.14,0,0.1,N,2,0,
9843.0,25,MORTGAGE,1.0,MEDICAL,12.87,1,0.57,N,3,1,1.0
,23,RENT,4.0,MEDICAL,15.23,1,0.53,N,2,2,
,24,RENT,8.0,MEDICAL,14.27,1,0.55,Y,4,3,
478.0,21,OWN,2.0,VENTURE,7.14,1,0.25,N,2,4,0.0


In [73]:
merged_with_rf = merged_with_xgb.merge(rf_prediction_df,left_index=True, right_on='test_indices_rf', how='left')
merged_with_rf.head()

Unnamed: 0,person_age,person_home_ownership,person_emp_length,loan_intent,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,test_indices_xgb,xgb_prediction,test_indices_rf,rf_prediction
,21,OWN,5.0,EDUCATION,11.14,0,0.1,N,2,0,,,
,25,MORTGAGE,1.0,MEDICAL,12.87,1,0.57,N,3,1,1.0,9843.0,
,23,RENT,4.0,MEDICAL,15.23,1,0.53,N,2,2,,,
,24,RENT,8.0,MEDICAL,14.27,1,0.55,Y,4,3,,,
,21,OWN,2.0,VENTURE,7.14,1,0.25,N,2,4,0.0,478.0,


In [74]:
merged_with_final = merged_with_rf.merge(logit_prediction_df,left_index=True, right_on='test_indices_logit', how='left')
merged_with_final

Unnamed: 0,person_age,person_home_ownership,person_emp_length,loan_intent,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,test_indices_xgb,xgb_prediction,test_indices_rf,rf_prediction,test_indices_logit,logit_prediction
,21,OWN,5.0,EDUCATION,11.14,0,0.10,N,2,0,,,,,
,25,MORTGAGE,1.0,MEDICAL,12.87,1,0.57,N,3,1,1.0,9843.0,,,
,23,RENT,4.0,MEDICAL,15.23,1,0.53,N,2,2,,,,,
,24,RENT,8.0,MEDICAL,14.27,1,0.55,Y,4,3,,,,,
,21,OWN,2.0,VENTURE,7.14,1,0.25,N,2,4,0.0,478.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,57,MORTGAGE,1.0,PERSONAL,13.16,0,0.11,N,30,31672,,,,,
,54,MORTGAGE,4.0,PERSONAL,7.49,0,0.15,N,19,31673,,,,,
,65,RENT,3.0,HOMEIMPROVEMENT,10.99,1,0.46,N,28,31674,1.0,5857.0,0.0,9809.0,
,56,MORTGAGE,5.0,PERSONAL,11.48,0,0.10,N,26,31675,,,,,


In [75]:
merged_with_final.dropna(inplace=True)


In [76]:
merged_with_final.shape

(298, 15)

In [77]:
final_data_with_pred = merged_with_final.drop(['test_indices_xgb','test_indices_rf','test_indices_logit'],axis=1)
final_data_with_pred.head()

Unnamed: 0,person_age,person_home_ownership,person_emp_length,loan_intent,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,xgb_prediction,rf_prediction,logit_prediction
4222.0,23,MORTGAGE,8.0,PERSONAL,11.99,0,0.12,N,3,0.0,0.0,0.0
1140.0,25,RENT,9.0,PERSONAL,17.56,1,0.35,N,4,1.0,0.0,1.0
4430.0,22,RENT,5.0,DEBTCONSOLIDATION,12.42,1,0.69,N,3,1.0,0.0,0.0
3036.0,25,RENT,1.0,EDUCATION,11.36,1,0.38,N,4,1.0,0.0,1.0
8239.0,23,RENT,0.0,MEDICAL,7.49,0,0.24,N,2,0.0,1.0,0.0


In [78]:
final_data_with_pred.to_excel("pd_prediction_new_refined_xgbclear.xlsx")

In [79]:
print(classification_report(final_data_with_pred['loan_status'],final_data_with_pred['xgb_prediction']))

              precision    recall  f1-score   support

           0       0.91      0.98      0.94       241
           1       0.85      0.58      0.69        57

    accuracy                           0.90       298
   macro avg       0.88      0.78      0.81       298
weighted avg       0.90      0.90      0.89       298



In [80]:
print(classification_report(final_data_with_pred['loan_status'],final_data_with_pred['logit_prediction']))

              precision    recall  f1-score   support

           0       0.82      0.68      0.74       241
           1       0.21      0.37      0.27        57

    accuracy                           0.62       298
   macro avg       0.52      0.52      0.51       298
weighted avg       0.70      0.62      0.65       298



In [83]:
print(classification_report(final_data_with_pred['loan_status'],final_data_with_pred['rf_prediction']))

              precision    recall  f1-score   support

           0       0.81      0.82      0.82       241
           1       0.21      0.21      0.21        57

    accuracy                           0.70       298
   macro avg       0.51      0.51      0.51       298
weighted avg       0.70      0.70      0.70       298



In [81]:
confusion_matrix(final_data_with_pred['loan_status'],final_data_with_pred['xgb_prediction'])

array([[235,   6],
       [ 24,  33]])

In [82]:
###AUTOMATE PRE PROCESSING

In [84]:
##I completely forgot that when you run everything again, it changes the testing and training set both that can
##skew and change up the numbers a bit. 