In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [2]:
credit_risk = pd.read_csv("credit_risk_dataset.csv")
credit_risk.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [3]:
credit_risk.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length
count,32581.0,32581.0,31686.0,32581.0,29465.0,32581.0,32581.0,32581.0
mean,27.7346,66074.85,4.789686,9589.371106,11.011695,0.218164,0.170203,5.804211
std,6.348078,61983.12,4.14263,6322.086646,3.240459,0.413006,0.106782,4.055001
min,20.0,4000.0,0.0,500.0,5.42,0.0,0.0,2.0
25%,23.0,38500.0,2.0,5000.0,7.9,0.0,0.09,3.0
50%,26.0,55000.0,4.0,8000.0,10.99,0.0,0.15,4.0
75%,30.0,79200.0,7.0,12200.0,13.47,0.0,0.23,8.0
max,144.0,6000000.0,123.0,35000.0,23.22,1.0,0.83,30.0


In [4]:
#What stands out from this data is the age (max should not be 144 its too much)
#income will not be mentioned here although perhaps per year should be something to look for and model for sure
#Person_emp_length no one employed for 123 years. Maybe the person who is 144 might have been. 
#Loan amount fine
#Loan interest rate, 23.22 is a lot. 

In [5]:
credit_risk_copy = credit_risk.copy()

In [6]:
credit_risk.pivot_table(index='person_age',values='person_income',aggfunc='count')

Unnamed: 0_level_0,person_income
person_age,Unnamed: 1_level_1
20,15
21,1229
22,3633
23,3889
24,3549
25,3037
26,2477
27,2138
28,1854
29,1687


In [7]:
credit_risk_age_rmvd = credit_risk[credit_risk['person_age']<81.3]
credit_risk_age_rmvd.reset_index(drop=True,inplace=True)
credit_risk_age_rmvd.head()
# This is as life expectancy in Canada is 81.3. So, to avoid risk, I do 
# not want to give loans to anyone above the life expectancy

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [8]:
credit_risk_age_rmvd.pivot_table(index='person_emp_length',columns='loan_status',values='person_income',aggfunc='count').reset_index().sort_values(by='person_emp_length',ascending=False)

loan_status,person_emp_length,0,1
35,123.0,1.0,1.0
34,41.0,1.0,
33,38.0,1.0,
32,34.0,,1.0
31,31.0,4.0,
30,30.0,1.0,1.0
29,29.0,,1.0
28,28.0,3.0,
27,27.0,4.0,1.0
26,26.0,5.0,1.0


In [9]:
credit_risk_age_emp_rmvd = credit_risk_age_rmvd[credit_risk_age_rmvd['person_emp_length']<81.3-18]
credit_risk_age_emp_rmvd.reset_index(drop=True, inplace=True)
#This is as the retirement age is 65 and people allowed employment at 18. so 65-18 is employment length I would 
#like to capture. I have also resetted the index.
credit_risk_age_emp_rmvd.pivot_table(index='person_emp_length',columns='loan_status',values='person_income',aggfunc='count').reset_index().sort_values(by='person_emp_length',ascending=False)

loan_status,person_emp_length,0,1
34,41.0,1.0,
33,38.0,1.0,
32,34.0,,1.0
31,31.0,4.0,
30,30.0,1.0,1.0
29,29.0,,1.0
28,28.0,3.0,
27,27.0,4.0,1.0
26,26.0,5.0,1.0
25,25.0,8.0,


In [10]:
credit_risk_age_emp_rmvd.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
1,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
2,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
3,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4
4,21,9900,OWN,2.0,VENTURE,A,2500,7.14,1,0.25,N,2


In [11]:
credit_risk_age_emp_rmvd.isnull().sum()


person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length                0
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3046
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64

In [12]:
#notice that off of this, loan_int_rate has many null values. I now need to fill in these null values with mean or median
cr_data = credit_risk_age_emp_rmvd 
cr_data.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length
count,31677.0,31677.0,31677.0,31677.0,28631.0,31677.0,31677.0,31677.0
mean,27.726805,66490.55,4.782271,9660.051457,11.039825,0.215456,0.169609,5.807968
std,6.194392,52769.68,4.034989,6334.535354,3.229398,0.411145,0.10627,4.056804
min,20.0,4000.0,0.0,500.0,5.42,0.0,0.0,2.0
25%,23.0,39396.0,2.0,5000.0,7.9,0.0,0.09,3.0
50%,26.0,56000.0,4.0,8000.0,10.99,0.0,0.15,4.0
75%,30.0,80000.0,7.0,12500.0,13.48,0.0,0.23,8.0
max,80.0,2039784.0,41.0,35000.0,23.22,1.0,0.83,30.0


In [13]:
#notice that for the above data the loan interest rate, the mean(11.04) is very similar to the median (50% : 10.99)
#So, we can replace the null values with whichever one we want. 

#replacement of null value info is down below:
#If the data is normally distributed, inputing with the mean can be appropriate, as it represents the central 
#tendency of the data. Median: The median is more robust to outliers. If the data is skewed or contains significant
#outliers, the median may provide a better representation of the central tendency. Thus, I will choose to replace
#the null values with the median
cr_data.fillna({'loan_int_rate':cr_data['loan_int_rate'].median()},inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cr_data.fillna({'loan_int_rate':cr_data['loan_int_rate'].median()},inplace=True)


In [14]:
cr_data.isnull().sum()

person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_status                   0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64

In [15]:
cr_data.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length
count,31677.0,31677.0,31677.0,31677.0,31677.0,31677.0,31677.0,31677.0
mean,27.726805,66490.55,4.782271,9660.051457,11.035034,0.215456,0.169609,5.807968
std,6.194392,52769.68,4.034989,6334.535354,3.070238,0.411145,0.10627,4.056804
min,20.0,4000.0,0.0,500.0,5.42,0.0,0.0,2.0
25%,23.0,39396.0,2.0,5000.0,8.49,0.0,0.09,3.0
50%,26.0,56000.0,4.0,8000.0,10.99,0.0,0.15,4.0
75%,30.0,80000.0,7.0,12500.0,13.16,0.0,0.23,8.0
max,80.0,2039784.0,41.0,35000.0,23.22,1.0,0.83,30.0


In [16]:
cr_data.groupby('loan_status').count()['person_age']
#Data isn't really balanced right, so we need to see this forward. 

loan_status
0    24852
1     6825
Name: person_age, dtype: int64

In [17]:
cr_data.groupby('loan_intent').count()['loan_status']

loan_intent
DEBTCONSOLIDATION    5064
EDUCATION            6288
HOMEIMPROVEMENT      3510
MEDICAL              5896
PERSONAL             5366
VENTURE              5553
Name: loan_status, dtype: int64

In [18]:
cr_data.groupby('loan_grade').count()['loan_status']

loan_grade
A    10369
B    10183
C     6318
D     3555
E      952
F      236
G       64
Name: loan_status, dtype: int64

In [19]:
## I have decided not to remove the loan_grade as I do believe it is an important feature to measure default.
#Atleast through an individual understanding of credit default at different loan grades.

In [20]:
#Because we have removed the age and employment years, we need to sort out the index to make sure they match

In [21]:
cr_data_cat_treated = cr_data.copy()

In [22]:
# Now I want to convert these columns to numerical boolean values. Need to convert loan grade to the same
person_home_ownership = pd.get_dummies(cr_data_cat_treated['person_home_ownership']).astype(int)
person_loan_grade = pd.get_dummies(cr_data_cat_treated['loan_grade']).astype(int)
person_loan_grade

Unnamed: 0,A,B,C,D,E,F,G
0,0,1,0,0,0,0,0
1,0,0,1,0,0,0,0
2,0,0,1,0,0,0,0
3,0,0,1,0,0,0,0
4,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...
31672,0,0,1,0,0,0,0
31673,1,0,0,0,0,0,0
31674,0,1,0,0,0,0,0
31675,0,1,0,0,0,0,0


In [23]:
person_loan_intent = pd.get_dummies(cr_data_cat_treated['loan_intent']).astype(int)
person_loan_intent.head()

Unnamed: 0,DEBTCONSOLIDATION,EDUCATION,HOMEIMPROVEMENT,MEDICAL,PERSONAL,VENTURE
0,0,1,0,0,0,0
1,0,0,0,1,0,0
2,0,0,0,1,0,0
3,0,0,0,1,0,0
4,0,0,0,0,0,1


In [24]:
cr_data_cat_treated['cb_person_default_on_file_binary'] = np.where(cr_data_cat_treated['cb_person_default_on_file']=='Y',1,0)
cr_data_cat_treated.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,cb_person_default_on_file_binary
0,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2,0
1,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3,0
2,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2,0
3,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4,1
4,21,9900,OWN,2.0,VENTURE,A,2500,7.14,1,0.25,N,2,0


In [25]:
## I need to standardise the difference between numerical values like person_income and person_age. These are
## 2 seperate values that need to be differentiated seperately. Mean is 0 SD is 1, it is standardised. So, I need
# remove the values that will not be scaled in order to scale the rest of them. Don't want to include boolean values
data_to_scale = cr_data_cat_treated.drop(['person_home_ownership','loan_intent','loan_grade','loan_status','cb_person_default_on_file','cb_person_default_on_file_binary'],axis=1)
data_to_scale.head()


Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length
0,21,9600,5.0,1000,11.14,0.1,2
1,25,9600,1.0,5500,12.87,0.57,3
2,23,65500,4.0,35000,15.23,0.53,2
3,24,54400,8.0,35000,14.27,0.55,4
4,21,9900,2.0,2500,7.14,0.25,2


In [26]:
# (x-mean of x)/std of x --> formula for standardizing
scaler = StandardScaler()
data_to_scale.columns

Index(['person_age', 'person_income', 'person_emp_length', 'loan_amnt',
       'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length'],
      dtype='object')

In [27]:
scaled_data = scaler.fit_transform(data_to_scale)

In [28]:
scaled_df = pd.DataFrame(scaled_data,columns=['person_age', 'person_income', 'person_emp_length', 'loan_amnt',
       'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length'])
scaled_df.head()


Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length
0,-1.085968,-1.078109,0.053961,-1.367139,0.034189,-0.655028,-0.938677
1,-0.440212,-1.078109,-0.937383,-0.656736,0.597672,3.76773,-0.692173
2,-0.76309,-0.018771,-0.193875,4.000348,1.366354,3.391325,-0.938677
3,-0.601651,-0.229123,0.797469,4.000348,1.05367,3.579527,-0.44567
4,-1.085968,-1.072423,-0.689547,-1.130338,-1.268662,0.75649,-0.938677


In [29]:
scaled_df.describe()
#basically, everything has mean 1 std as 0

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length
count,31677.0,31677.0,31677.0,31677.0,31677.0,31677.0,31677.0
mean,1.722691e-16,-5.742303e-17,5.742303e-17,-1.435576e-17,7.945015e-16,-1.004903e-16,0.0
std,1.000016,1.000016,1.000016,1.000016,1.000016,1.000016,1.000016
min,-1.247407,-1.184232,-1.185219,-1.446072,-1.828888,-1.59604,-0.938677
25%,-0.7630901,-0.5134572,-0.6895471,-0.7356696,-0.8289499,-0.7491292,-0.692173
50%,-0.2787735,-0.1988019,-0.193875,-0.2620678,-0.01466802,-0.1845218,-0.44567
75%,0.3669821,0.2560119,0.5496332,0.4483349,0.6921286,0.568288,0.540343
max,8.438927,37.39505,8.976059,4.000348,3.968799,6.214362,5.963417


In [30]:
scaled_data_combined = pd.concat([scaled_df,person_loan_intent,person_home_ownership,person_loan_grade],axis=1)
scaled_data_combined['cb_person_default_on_file']=cr_data_cat_treated['cb_person_default_on_file_binary']
scaled_data_combined['loan_status'] = cr_data_cat_treated['loan_status']
scaled_data_combined.head()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,DEBTCONSOLIDATION,EDUCATION,HOMEIMPROVEMENT,...,RENT,A,B,C,D,E,F,G,cb_person_default_on_file,loan_status
0,-1.085968,-1.078109,0.053961,-1.367139,0.034189,-0.655028,-0.938677,0,1,0,...,0,0,1,0,0,0,0,0,0,0
1,-0.440212,-1.078109,-0.937383,-0.656736,0.597672,3.76773,-0.692173,0,0,0,...,0,0,0,1,0,0,0,0,0,1
2,-0.76309,-0.018771,-0.193875,4.000348,1.366354,3.391325,-0.938677,0,0,0,...,1,0,0,1,0,0,0,0,0,1
3,-0.601651,-0.229123,0.797469,4.000348,1.05367,3.579527,-0.44567,0,0,0,...,1,0,0,1,0,0,0,0,1,1
4,-1.085968,-1.072423,-0.689547,-1.130338,-1.268662,0.75649,-0.938677,0,0,0,...,0,1,0,0,0,0,0,0,0,1


In [31]:
#Just want to take a count of number of defaults to payments in loan status
scaled_data_combined.groupby('loan_status').count()['EDUCATION']

loan_status
0    24852
1     6825
Name: EDUCATION, dtype: int64

In [32]:
#look at the ratio. You need to balance this data. Synthetically create data for the minority class. 
6825/(24852+6825)
target = scaled_data_combined['loan_status']
features = scaled_data_combined.drop('loan_status',axis=1)
features.head()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,DEBTCONSOLIDATION,EDUCATION,HOMEIMPROVEMENT,...,OWN,RENT,A,B,C,D,E,F,G,cb_person_default_on_file
0,-1.085968,-1.078109,0.053961,-1.367139,0.034189,-0.655028,-0.938677,0,1,0,...,1,0,0,1,0,0,0,0,0,0
1,-0.440212,-1.078109,-0.937383,-0.656736,0.597672,3.76773,-0.692173,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,-0.76309,-0.018771,-0.193875,4.000348,1.366354,3.391325,-0.938677,0,0,0,...,0,1,0,0,1,0,0,0,0,0
3,-0.601651,-0.229123,0.797469,4.000348,1.05367,3.579527,-0.44567,0,0,0,...,0,1,0,0,1,0,0,0,0,1
4,-1.085968,-1.072423,-0.689547,-1.130338,-1.268662,0.75649,-0.938677,0,0,0,...,1,0,1,0,0,0,0,0,0,0


In [33]:
#now we need to balance the data
!pip install imbalanced-learn



In [34]:
from imblearn.over_sampling import SMOTE

In [35]:
smote = SMOTE()

In [36]:
balanced_features, balanced_target = smote.fit_resample(features,target)

In [37]:
balanced_features.shape
balanced_features.head()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,DEBTCONSOLIDATION,EDUCATION,HOMEIMPROVEMENT,...,OWN,RENT,A,B,C,D,E,F,G,cb_person_default_on_file
0,-1.085968,-1.078109,0.053961,-1.367139,0.034189,-0.655028,-0.938677,0,1,0,...,1,0,0,1,0,0,0,0,0,0
1,-0.440212,-1.078109,-0.937383,-0.656736,0.597672,3.76773,-0.692173,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,-0.76309,-0.018771,-0.193875,4.000348,1.366354,3.391325,-0.938677,0,0,0,...,0,1,0,0,1,0,0,0,0,0
3,-0.601651,-0.229123,0.797469,4.000348,1.05367,3.579527,-0.44567,0,0,0,...,0,1,0,0,1,0,0,0,0,1
4,-1.085968,-1.072423,-0.689547,-1.130338,-1.268662,0.75649,-0.938677,0,0,0,...,1,0,1,0,0,0,0,0,0,0


In [38]:
# Train models blud

In [39]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [40]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [41]:
x_train, x_test, y_train, y_test = train_test_split(balanced_features, balanced_target, test_size=0.20, random_state = 42)


In [42]:
logit = LogisticRegression()

In [43]:
logit.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [44]:
logit.score(x_train,y_train)

0.8266730377486609

In [45]:
logit_prediction = logit.predict(x_test)
logit_prediction

array([0, 1, 0, ..., 1, 0, 1])

In [46]:
print(classification_report(y_test,logit_prediction))
#model could be overfitted (trained on data so well that can't make accurate predictions)

              precision    recall  f1-score   support

           0       0.81      0.84      0.82      5019
           1       0.83      0.79      0.81      4922

    accuracy                           0.82      9941
   macro avg       0.82      0.82      0.82      9941
weighted avg       0.82      0.82      0.82      9941



In [47]:
print(logit.coef_)

[[-1.17006686e-01 -5.14140213e-03 -5.31304279e-02 -7.86256571e-01
   3.06092168e-01  1.47526348e+00 -6.81671390e-02 -4.30045660e+00
  -5.05987411e+00 -4.07148470e+00 -4.39208529e+00 -4.80592747e+00
  -5.42134142e+00 -3.94891422e+00 -4.30822400e+00 -6.07297429e+00
  -3.24204241e+00 -4.94138376e+00 -4.74734541e+00 -4.48385694e+00
  -2.39376430e+00 -2.51056424e+00 -2.40553174e+00  1.04865938e+00
  -2.42862687e-01]]


In [48]:
features_imp_logit = pd.DataFrame({'features':balanced_features.columns,'logit_imp':logit.coef_[0]})
features_imp_logit.sort_values(by='logit_imp',ascending=False)

Unnamed: 0,features,logit_imp
5,loan_percent_income,1.475263
23,G,1.048659
4,loan_int_rate,0.306092
1,person_income,-0.005141
2,person_emp_length,-0.05313
6,cb_person_cred_hist_length,-0.068167
0,person_age,-0.117007
24,cb_person_default_on_file,-0.242863
3,loan_amnt,-0.786257
20,D,-2.393764


In [49]:
rf = RandomForestClassifier()

In [50]:
rf.fit(x_train,y_train)

In [51]:
rf.score(x_train,y_train)

1.0

In [52]:
rf_prediction = rf.predict(x_test)
rf_prediction 

array([0, 1, 0, ..., 1, 0, 1])

In [53]:
print(classification_report(y_test,rf_prediction))

              precision    recall  f1-score   support

           0       0.92      0.98      0.95      5019
           1       0.98      0.91      0.94      4922

    accuracy                           0.95      9941
   macro avg       0.95      0.95      0.95      9941
weighted avg       0.95      0.95      0.95      9941



In [54]:
rf.feature_importances_

array([0.05257511, 0.12695172, 0.06695105, 0.07402783, 0.13240838,
       0.20602228, 0.05235078, 0.01131301, 0.01275841, 0.01422969,
       0.01224677, 0.01062916, 0.01466472, 0.02610118, 0.00045887,
       0.02167517, 0.03167242, 0.02673411, 0.01393532, 0.02242067,
       0.04771281, 0.00962527, 0.0018936 , 0.0006984 , 0.00994327])

In [55]:
features_imp_rf = pd.DataFrame({'features':balanced_features.columns,'rf_imp':rf.feature_importances_})
features_imp_rf.sort_values(by='rf_imp',ascending=False)

Unnamed: 0,features,rf_imp
5,loan_percent_income,0.206022
4,loan_int_rate,0.132408
1,person_income,0.126952
3,loan_amnt,0.074028
2,person_emp_length,0.066951
0,person_age,0.052575
6,cb_person_cred_hist_length,0.052351
20,D,0.047713
16,RENT,0.031672
17,A,0.026734


In [56]:
##XG boost model

In [57]:
xgb_model = XGBClassifier(tree_method = 'exact')

In [58]:
#model.fit(x,y.values.rave1())
xgb_model.fit(x_train,y_train.values.ravel())

In [59]:
xgb_model.score(x_train,y_train.values.ravel())

0.9690164223021401

In [60]:
xgb_model_prediction = xgb_model.predict(x_test)
xgb_model_prediction 

array([0, 1, 0, ..., 1, 0, 1])

In [61]:
print(classification_report(y_test,xgb_model_prediction))

              precision    recall  f1-score   support

           0       0.93      0.99      0.96      5019
           1       0.99      0.92      0.95      4922

    accuracy                           0.96      9941
   macro avg       0.96      0.96      0.96      9941
weighted avg       0.96      0.96      0.96      9941



In [62]:
features_imp_xgb_model = pd.DataFrame({'features':balanced_features.columns,'xgb_model_imp':xgb_model.feature_importances_})
features_imp_xgb_model.sort_values(by='xgb_model_imp',ascending=False)

Unnamed: 0,features,xgb_model_imp
19,C,0.106867
16,RENT,0.092014
5,loan_percent_income,0.087513
15,OWN,0.087358
12,VENTURE,0.059959
20,D,0.057972
7,DEBTCONSOLIDATION,0.057193
4,loan_int_rate,0.05191
6,cb_person_cred_hist_length,0.049107
17,A,0.047325


In [63]:
feature_imp = pd.concat([features_imp_xgb_model,features_imp_rf,features_imp_logit],axis=1)
feature_imp
#based on these, we are trying to figure out which features are the most important based on the regression values

Unnamed: 0,features,xgb_model_imp,features.1,rf_imp,features.2,logit_imp
0,person_age,0.021953,person_age,0.052575,person_age,-0.117007
1,person_income,0.024373,person_income,0.126952,person_income,-0.005141
2,person_emp_length,0.032048,person_emp_length,0.066951,person_emp_length,-0.05313
3,loan_amnt,0.005804,loan_amnt,0.074028,loan_amnt,-0.786257
4,loan_int_rate,0.05191,loan_int_rate,0.132408,loan_int_rate,0.306092
5,loan_percent_income,0.087513,loan_percent_income,0.206022,loan_percent_income,1.475263
6,cb_person_cred_hist_length,0.049107,cb_person_cred_hist_length,0.052351,cb_person_cred_hist_length,-0.068167
7,DEBTCONSOLIDATION,0.057193,DEBTCONSOLIDATION,0.011313,DEBTCONSOLIDATION,-4.300457
8,EDUCATION,0.02391,EDUCATION,0.012758,EDUCATION,-5.059874
9,HOMEIMPROVEMENT,0.041008,HOMEIMPROVEMENT,0.01423,HOMEIMPROVEMENT,-4.071485


In [64]:
## Further exploration

In [82]:
xgb_prediction_df = pd.DataFrame({'test_indices_xgb':x_test.index,'xgb_prediction':xgb_model_prediction})
logit_prediction_df = pd.DataFrame({'test_indices_logit':x_test.index,'logit_prediction':logit_prediction})
rf_prediction_df = pd.DataFrame({'test_indices_rf':x_test.index,'rf_prediction':rf_prediction})
logit_prediction_df.head()

Unnamed: 0,test_indices_logit,logit_prediction
0,9060,0
1,44416,1
2,9592,0
3,41317,1
4,31374,1


In [83]:
merged_with_xgb = credit_risk_copy.merge(xgb_prediction_df,left_index=True, right_on='test_indices_xgb', how='left')
merged_with_xgb.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,test_indices_xgb,xgb_prediction
,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3,0,
9843.0,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2,1,1.0
,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3,2,
,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2,3,
478.0,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4,4,1.0


In [84]:
merged_with_rf = merged_with_xgb.merge(rf_prediction_df,left_index=True, right_on='test_indices_rf', how='left')
merged_with_rf.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,test_indices_xgb,xgb_prediction,test_indices_rf,rf_prediction
,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3,0,,,
,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2,1,1.0,9843.0,
,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3,2,,,
,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2,3,,,
,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4,4,1.0,478.0,


In [87]:
merged_with_final = merged_with_rf.merge(logit_prediction_df,left_index=True, right_on='test_indices_logit', how='left')
merged_with_final

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,test_indices_xgb,xgb_prediction,test_indices_rf,rf_prediction,test_indices_logit,logit_prediction
,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3,0,,,,,
,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.10,N,2,1,1.0,9843.0,,,
,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3,2,,,,,
,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2,3,,,,,
,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4,4,1.0,478.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,57,53000,MORTGAGE,1.0,PERSONAL,C,5800,13.16,0,0.11,N,30,32576,,,,,
,54,120000,MORTGAGE,4.0,PERSONAL,A,17625,7.49,0,0.15,N,19,32577,,,,,
,65,76000,RENT,3.0,HOMEIMPROVEMENT,B,35000,10.99,1,0.46,N,28,32578,,,,,
,56,150000,MORTGAGE,5.0,PERSONAL,B,15000,11.48,0,0.10,N,26,32579,,,,,


In [96]:
merged_with_final.dropna(inplace=True)


In [98]:
merged_with_final.shape

(259, 18)

In [101]:
final_data_with_pred = merged_with_final.drop(['test_indices_xgb','test_indices_rf','test_indices_logit'],axis=1)
final_data_with_pred.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,xgb_prediction,rf_prediction,logit_prediction
4222.0,23,15000,MORTGAGE,1.0,MEDICAL,C,1750,12.68,0,0.12,Y,3,0.0,0.0,0.0
1140.0,24,160000,MORTGAGE,9.0,VENTURE,E,12000,16.77,0,0.07,N,4,1.0,0.0,0.0
4430.0,23,56000,RENT,8.0,MEDICAL,G,21600,21.21,1,0.39,Y,4,1.0,0.0,0.0
3036.0,23,49400,RENT,0.0,EDUCATION,A,20000,6.54,1,0.4,N,4,1.0,1.0,0.0
8239.0,25,80000,RENT,7.0,EDUCATION,B,20000,8.88,0,0.25,N,4,0.0,0.0,1.0


In [102]:
final_data_with_pred.to_excel("pd_prediction.xlsx")