In [1]:
# **Numerical Value Cleansing of Credit Scoring Data**

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
credit_risk = pd.read_csv(r"...")
credit_risk.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [4]:
credit_risk.shape

(32581, 12)

In [5]:
credit_risk.describe() # only numerical columns

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length
count,32581.0,32581.0,31686.0,32581.0,29465.0,32581.0,32581.0,32581.0
mean,27.7346,66074.85,4.789686,9589.371106,11.011695,0.218164,0.170203,5.804211
std,6.348078,61983.12,4.14263,6322.086646,3.240459,0.413006,0.106782,4.055001
min,20.0,4000.0,0.0,500.0,5.42,0.0,0.0,2.0
25%,23.0,38500.0,2.0,5000.0,7.9,0.0,0.09,3.0
50%,26.0,55000.0,4.0,8000.0,10.99,0.0,0.15,4.0
75%,30.0,79200.0,7.0,12200.0,13.47,0.0,0.23,8.0
max,144.0,6000000.0,123.0,35000.0,23.22,1.0,0.83,30.0


In [6]:
credit_risk.describe().round(2)

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length
count,32581.0,32581.0,31686.0,32581.0,29465.0,32581.0,32581.0,32581.0
mean,27.73,66074.85,4.79,9589.37,11.01,0.22,0.17,5.8
std,6.35,61983.12,4.14,6322.09,3.24,0.41,0.11,4.06
min,20.0,4000.0,0.0,500.0,5.42,0.0,0.0,2.0
25%,23.0,38500.0,2.0,5000.0,7.9,0.0,0.09,3.0
50%,26.0,55000.0,4.0,8000.0,10.99,0.0,0.15,4.0
75%,30.0,79200.0,7.0,12200.0,13.47,0.0,0.23,8.0
max,144.0,6000000.0,123.0,35000.0,23.22,1.0,0.83,30.0


In [7]:
credit_risk_copy = credit_risk.copy()

In [8]:
pivot_table = credit_risk.pivot_table(index = 'person_age', 
                                      values = 'person_income',
                                      aggfunc=['count']) # show unique ages with thier mean values and count
print(pivot_table.round(2))

                   count
           person_income
person_age              
20                    15
21                  1229
22                  3633
23                  3889
24                  3549
25                  3037
26                  2477
27                  2138
28                  1854
29                  1687
30                  1316
31                  1142
32                   964
33                   856
34                   709
35                   620
36                   548
37                   478
38                   373
39                   302
40                   271
41                   241
42                   188
43                   164
44                   141
45                   108
46                    94
47                    94
48                    75
49                    49
50                    52
51                    39
52                    36
53                    30
54                    24
55                    20
56                    15


In [9]:
pivot_table2 = credit_risk.pivot_table(index = 'person_age', 
                                      columns = 'loan_status',
                                      values = 'person_income',
                                      aggfunc=['count']).reset_index().sort_values(by = 'person_age', ascending = 0)
print(pivot_table2.round(2))

            person_age   count       
loan_status                  0      1
57                 144     3.0    NaN
56                 123     2.0    NaN
55                  94     1.0    NaN
54                  84     1.0    NaN
53                  80     1.0    NaN
52                  78     1.0    NaN
51                  76     1.0    NaN
50                  73     3.0    NaN
49                  70     5.0    2.0
48                  69     5.0    NaN
47                  67     1.0    NaN
46                  66     6.0    3.0
45                  65     6.0    3.0
44                  64     6.0    1.0
43                  63     2.0    1.0
42                  62     4.0    3.0
41                  61     5.0    4.0
40                  60     8.0    7.0
39                  59     5.0    NaN
38                  58    16.0    3.0
37                  57    11.0    4.0
36                  56    14.0    1.0
35                  55    13.0    7.0
34                  54    17.0    7.0
33          

In [10]:
cr_age_removed = credit_risk[credit_risk['person_age']<66] # from the bank's side, they want to play low risk. That's why no credit above 65
cr_age_removed.shape

(32546, 12)

In [11]:
cr_age_removed.pivot_table(index = 'person_emp_length', 
                           columns = 'loan_status',
                          values = 'person_income',
                          aggfunc = 'count').reset_index().sort_values(by = 'person_emp_length', ascending = 0)

loan_status,person_emp_length,0,1
34,123.0,1.0,1.0
33,38.0,1.0,
32,34.0,,1.0
31,31.0,4.0,
30,30.0,1.0,1.0
29,29.0,,1.0
28,28.0,3.0,
27,27.0,4.0,1.0
26,26.0,5.0,1.0
25,25.0,8.0,


In [12]:
cr_person_emp_length_removed = cr_age_removed[cr_age_removed['person_emp_length']<48] # if person starts working at 18, till 65: 47 years it is maximum in our case
cr_person_emp_length_removed.reset_index(drop = True, inplace = True)
cr_person_emp_length_removed.shape #31650


(31650, 12)

In [13]:
cr_person_emp_length_removed.describe().round(2)

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length
count,31650.0,31650.0,31650.0,31650.0,28607.0,31650.0,31650.0,31650.0
mean,27.69,66488.64,4.78,9659.31,11.04,0.22,0.17,5.79
std,6.07,52764.11,4.03,6333.02,3.23,0.41,0.11,4.02
min,20.0,4000.0,0.0,500.0,5.42,0.0,0.0,2.0
25%,23.0,39396.0,2.0,5000.0,7.9,0.0,0.09,3.0
50%,26.0,56000.0,4.0,8000.0,10.99,0.0,0.15,4.0
75%,30.0,80000.0,7.0,12500.0,13.48,0.0,0.23,8.0
max,65.0,2039784.0,38.0,35000.0,23.22,1.0,0.83,30.0


In [14]:
cr_person_emp_length_removed.isnull().sum()

person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length                0
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3043
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64

In [15]:
(3043/31650)*100  #  The percentage of the missin loan_int_rate values. At that point I decided to drop the null values

9.614533965244867

In [16]:
cr_loan_int_rate_removed = cr_person_emp_length_removed.dropna(subset=['loan_int_rate'])
cr_loan_int_rate_removed.shape

(28607, 12)

In [17]:
cr_loan_int_rate_removed.isnull().sum()

person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_status                   0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64

In [18]:
cr_loan_int_rate_removed.describe().round(2)
cr_loan_int_rate_removed.reset_index(drop = True, inplace = True)

In [19]:
cr_data = cr_loan_int_rate_removed.copy()

In [20]:
cr_data.groupby('loan_status').count()['person_age']

loan_status
0    22409
1     6198
Name: person_age, dtype: int64

In [21]:
round(((6198/(22409+6198))*100),2)

21.67

In [22]:
cr_data.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
1,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
2,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
3,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4
4,21,9900,OWN,2.0,VENTURE,A,2500,7.14,1,0.25,N,2


# __**Categorical Features Cleansing of Credit Scoring Data**__

In [24]:
from sklearn.preprocessing import StandardScaler

In [25]:
cr_data.groupby('person_home_ownership').count()['person_age']

person_home_ownership
MORTGAGE    11791
OTHER          94
OWN          2192
RENT        14530
Name: person_age, dtype: int64

In [26]:
cr_data.groupby('loan_intent').count()['person_age']

loan_intent
DEBTCONSOLIDATION    4563
EDUCATION            5701
HOMEIMPROVEMENT      3198
MEDICAL              5278
PERSONAL             4868
VENTURE              4999
Name: person_age, dtype: int64

In [27]:
cr_data.groupby('loan_grade').count()['person_age']

loan_grade
A    9391
B    9143
C    5694
D    3242
E     869
F     209
G      59
Name: person_age, dtype: int64

In [28]:
# filter loan_grade value = 'G' 
filtered_data_loan_grade_g = cr_data[cr_data['loan_grade'] == 'G']

# display loan_status 
filtered_data_loan_grade_g.groupby('loan_status').count()['person_age']


loan_status
0     1
1    58
Name: person_age, dtype: int64

In [29]:
# filter loan_grade value = 'F' 
filtered_data_loan_grade_g = cr_data[cr_data['loan_grade'] == 'F']

# display loan_status 
filtered_data_loan_grade_g.groupby('loan_status').count()['person_age']


loan_status
0     63
1    146
Name: person_age, dtype: int64

In [30]:
# filter loan_grade value = 'E' 
filtered_data_loan_grade_g = cr_data[cr_data['loan_grade'] == 'E']

# display loan_status 
filtered_data_loan_grade_g.groupby('loan_status').count()['person_age']


loan_status
0    308
1    561
Name: person_age, dtype: int64

In [31]:
# filter loan_grade value = 'D' 
filtered_data_loan_grade_g = cr_data[cr_data['loan_grade'] == 'D']

# display loan_status 
filtered_data_loan_grade_g.groupby('loan_status').count()['person_age']


loan_status
0    1322
1    1920
Name: person_age, dtype: int64

In [32]:
# filter loan_grade value = 'C' 
filtered_data_loan_grade_g = cr_data[cr_data['loan_grade'] == 'C']

# display loan_status 
filtered_data_loan_grade_g.groupby('loan_status').count()['person_age']


loan_status
0    4537
1    1157
Name: person_age, dtype: int64

In [33]:
# filter loan_grade value = 'B' 
filtered_data_loan_grade_g = cr_data[cr_data['loan_grade'] == 'B']

# display loan_status 
filtered_data_loan_grade_g.groupby('loan_status').count()['person_age']


loan_status
0    7690
1    1453
Name: person_age, dtype: int64

In [34]:
# filter loan_grade value = 'A' 
filtered_data_loan_grade_g = cr_data[cr_data['loan_grade'] == 'A']

# display loan_status 
filtered_data_loan_grade_g.groupby('loan_status').count()['person_age']


loan_status
0    8488
1     903
Name: person_age, dtype: int64

In [35]:
cr_data.reset_index(drop = True, inplace = True)
cr_data.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
1,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
2,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
3,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4
4,21,9900,OWN,2.0,VENTURE,A,2500,7.14,1,0.25,N,2


In [36]:
cr_data.shape

(28607, 12)

In [37]:
cr_data_categorical_fix = cr_data.copy()

In [38]:
cr_data_categorical_fix.groupby('person_home_ownership').count()['person_age'] # 4 category

person_home_ownership
MORTGAGE    11791
OTHER          94
OWN          2192
RENT        14530
Name: person_age, dtype: int64

In [39]:
cr_data_categorical_fix.groupby('loan_intent').count()['person_age'] # 6 category

loan_intent
DEBTCONSOLIDATION    4563
EDUCATION            5701
HOMEIMPROVEMENT      3198
MEDICAL              5278
PERSONAL             4868
VENTURE              4999
Name: person_age, dtype: int64

In [40]:
cr_data_categorical_fix.groupby('loan_grade').count()['person_age'] # 7 category

loan_grade
A    9391
B    9143
C    5694
D    3242
E     869
F     209
G      59
Name: person_age, dtype: int64

In [41]:
cr_data_categorical_fix.groupby('cb_person_default_on_file').count()['person_age'] # 2 category

cb_person_default_on_file
N    23511
Y     5096
Name: person_age, dtype: int64

In [42]:
person_home_ownership = pd.get_dummies(cr_data_categorical_fix['person_home_ownership'], drop_first = True).astype(int)
person_home_ownership.head

<bound method NDFrame.head of        OTHER  OWN  RENT
0          0    1     0
1          0    0     0
2          0    0     1
3          0    0     1
4          0    1     0
...      ...  ...   ...
28602      0    0     1
28603      0    0     0
28604      0    0     0
28605      0    0     1
28606      0    0     0

[28607 rows x 3 columns]>

In [43]:
loan_intent = pd.get_dummies(cr_data_categorical_fix['loan_intent'], drop_first = True).astype(int)
loan_intent.head

<bound method NDFrame.head of        EDUCATION  HOMEIMPROVEMENT  MEDICAL  PERSONAL  VENTURE
0              1                0        0         0        0
1              0                0        1         0        0
2              0                0        1         0        0
3              0                0        1         0        0
4              0                0        0         0        1
...          ...              ...      ...       ...      ...
28602          1                0        0         0        0
28603          0                0        0         1        0
28604          0                0        0         1        0
28605          0                1        0         0        0
28606          0                0        0         1        0

[28607 rows x 5 columns]>

In [44]:
loan_grade = pd.get_dummies(cr_data_categorical_fix['loan_grade'], drop_first = True).astype(int)
loan_grade.head

<bound method NDFrame.head of        B  C  D  E  F  G
0      1  0  0  0  0  0
1      0  1  0  0  0  0
2      0  1  0  0  0  0
3      0  1  0  0  0  0
4      0  0  0  0  0  0
...   .. .. .. .. .. ..
28602  1  0  0  0  0  0
28603  0  1  0  0  0  0
28604  0  0  0  0  0  0
28605  1  0  0  0  0  0
28606  1  0  0  0  0  0

[28607 rows x 6 columns]>

In [45]:
cr_data_categorical_fix['cb_person_default_on_file'] = np.where(cr_data_categorical_fix['cb_person_default_on_file']=='Y', 1, 0)

In [46]:
cb_person_default_on_file = cr_data_categorical_fix[['cb_person_default_on_file']]
cb_person_default_on_file.head()

Unnamed: 0,cb_person_default_on_file
0,0
1,0
2,0
3,1
4,0


In [47]:
loan_status = cr_data_categorical_fix[['loan_status']]
loan_status.head()

Unnamed: 0,loan_status
0,0
1,1
2,1
3,1
4,1


In [48]:
cr_data_categorical_fix.head() 

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,0,2
1,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,0,3
2,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,0,2
3,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,1,4
4,21,9900,OWN,2.0,VENTURE,A,2500,7.14,1,0.25,0,2


In [49]:
data_to_scale = cr_data_categorical_fix.drop(['person_home_ownership', 'loan_intent', 'loan_status', 'loan_grade', 'cb_person_default_on_file'], axis = 1)

In [50]:
data_to_scale.head()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length
0,21,9600,5.0,1000,11.14,0.1,2
1,25,9600,1.0,5500,12.87,0.57,3
2,23,65500,4.0,35000,15.23,0.53,2
3,24,54400,8.0,35000,14.27,0.55,4
4,21,9900,2.0,2500,7.14,0.25,2



(x-mean of x)/(std of x) is the standardization formula which we use on above dataset

the process of making mean 0.0 and standard deviation 1.0


In [52]:
scaler = StandardScaler()

In [53]:
scaled_data = scaler.fit_transform(data_to_scale)
scaled_data #it is an nd array

array([[-1.10440773, -1.10258763,  0.05493167, ...,  0.03095671,
        -0.65332136, -0.94419717],
       [-0.44259478, -1.10258763, -0.9380732 , ...,  0.56671006,
         3.76588223, -0.69423512],
       [-0.77350125, -0.01796151, -0.19331955, ...,  1.29756433,
         3.3897798 , -0.94419717],
       ...,
       [ 4.35554908,  1.03950044, -0.19331955, ..., -1.09938994,
        -0.18319332,  3.30515756],
       [ 6.17553468,  0.18576969, -0.44157077, ..., -0.01549589,
         2.73160054,  5.55481594],
       [ 4.68645555,  1.62158959,  0.05493167, ...,  0.13624927,
        -0.65332136,  5.05489186]])

In [54]:
data_to_scale.columns

Index(['person_age', 'person_income', 'person_emp_length', 'loan_amnt',
       'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length'],
      dtype='object')

In [55]:
scaled_df = pd.DataFrame(scaled_data,columns = data_to_scale.columns)
scaled_df.head()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length
0,-1.104408,-1.102588,0.054932,-1.368046,0.030957,-0.653321,-0.944197
1,-0.442595,-1.102588,-0.938073,-0.656773,0.56671,3.765882,-0.694235
2,-0.773501,-0.017962,-0.19332,4.00602,1.297564,3.38978,-0.944197
3,-0.608048,-0.233334,0.799685,4.00602,1.000268,3.577831,-0.444273
4,-1.104408,-1.096767,-0.689822,-1.130955,-1.207779,0.757063,-0.944197


In [56]:
round(np.mean(scaled_df.person_income), 2) # Successful standardization

0.0

In [57]:
round(np.std(scaled_df.person_income), 2) # Successful standardization

1.0

In [58]:
scaled_data_combined = pd.concat([scaled_df, person_home_ownership, loan_grade, loan_intent, loan_status, cb_person_default_on_file], axis = 1)
scaled_data_combined.head()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,OTHER,OWN,RENT,...,E,F,G,EDUCATION,HOMEIMPROVEMENT,MEDICAL,PERSONAL,VENTURE,loan_status,cb_person_default_on_file
0,-1.104408,-1.102588,0.054932,-1.368046,0.030957,-0.653321,-0.944197,0,1,0,...,0,0,0,1,0,0,0,0,0,0
1,-0.442595,-1.102588,-0.938073,-0.656773,0.56671,3.765882,-0.694235,0,0,0,...,0,0,0,0,0,1,0,0,1,0
2,-0.773501,-0.017962,-0.19332,4.00602,1.297564,3.38978,-0.944197,0,0,1,...,0,0,0,0,0,1,0,0,1,0
3,-0.608048,-0.233334,0.799685,4.00602,1.000268,3.577831,-0.444273,0,0,1,...,0,0,0,0,0,1,0,0,1,1
4,-1.104408,-1.096767,-0.689822,-1.130955,-1.207779,0.757063,-0.944197,0,1,0,...,0,0,0,0,0,0,0,1,1,0


In [59]:
scaled_data_combined.groupby('loan_status').count()['person_age']

loan_status
0    22409
1     6198
Name: person_age, dtype: int64

In [60]:
round((6198/(6198+22409) * 100),2) # that percentage shows the percentage of defaulters

21.67

# __SMOTE - Synthetic Minority Over-Sampling Technique__

In [62]:
from imblearn.over_sampling import SMOTE

In [63]:
smote = SMOTE()

In [64]:
target = scaled_data_combined['loan_status']
target.head()

0    0
1    1
2    1
3    1
4    1
Name: loan_status, dtype: int64

In [65]:
features = scaled_data_combined.drop('loan_status', axis = 1)
features.head()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,OTHER,OWN,RENT,...,D,E,F,G,EDUCATION,HOMEIMPROVEMENT,MEDICAL,PERSONAL,VENTURE,cb_person_default_on_file
0,-1.104408,-1.102588,0.054932,-1.368046,0.030957,-0.653321,-0.944197,0,1,0,...,0,0,0,0,1,0,0,0,0,0
1,-0.442595,-1.102588,-0.938073,-0.656773,0.56671,3.765882,-0.694235,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,-0.773501,-0.017962,-0.19332,4.00602,1.297564,3.38978,-0.944197,0,0,1,...,0,0,0,0,0,0,1,0,0,0
3,-0.608048,-0.233334,0.799685,4.00602,1.000268,3.577831,-0.444273,0,0,1,...,0,0,0,0,0,0,1,0,0,1
4,-1.104408,-1.096767,-0.689822,-1.130955,-1.207779,0.757063,-0.944197,0,1,0,...,0,0,0,0,0,0,0,0,1,0


In [66]:
balanced_features, balanced_target = smote.fit_resample(features, target)

In [67]:
balanced_target.shape

(44818,)

In [68]:
balanced_features.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,OTHER,OWN,RENT,...,D,E,F,G,EDUCATION,HOMEIMPROVEMENT,MEDICAL,PERSONAL,VENTURE,cb_person_default_on_file
count,44818.0,44818.0,44818.0,44818.0,44818.0,44818.0,44818.0,44818.0,44818.0,44818.0,...,44818.0,44818.0,44818.0,44818.0,44818.0,44818.0,44818.0,44818.0,44818.0,44818.0
mean,-0.022543,-0.118722,-0.065346,0.070155,0.231643,0.250582,-0.014572,0.00232,0.053394,0.57941,...,0.18323,0.049266,0.011179,0.003034,0.173412,0.107278,0.190816,0.153309,0.145455,0.213084
std,0.993693,0.91043,0.974106,1.037172,1.04015,1.130407,0.997565,0.048116,0.22482,0.493659,...,0.386859,0.216425,0.105137,0.055003,0.378608,0.30947,0.392949,0.360289,0.352563,0.409491
min,-1.269861,-1.211244,-1.186324,-1.447077,-1.740436,-1.593577,-0.944197,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.773501,-0.609752,-0.77366,-0.735803,-0.650348,-0.653321,-0.694235,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,-0.294479,-0.315208,-0.205843,-0.174688,0.244639,0.004858,-0.444273,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.384671,0.108158,0.51037,0.674603,1.055463,1.03914,0.555575,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,6.175535,38.289015,8.247222,4.00602,3.77194,6.210548,6.05474,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [69]:
balanced_target_df = pd.DataFrame({'target':balanced_target}) #we balanced
balanced_target_df.groupby('target').size()

target
0    22409
1    22409
dtype: int64

# __Model Training__

In [71]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [72]:
x_train, x_test, y_train, y_test = train_test_split(balanced_features, balanced_target, test_size = 0.20, random_state = 42)

In [73]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(35854, 22)
(8964, 22)
(35854,)
(8964,)


In [74]:
logit = LogisticRegression()

In [75]:
logit.fit(x_train, y_train)

In [76]:
logit.score(x_train, y_train)

0.8009984938918949

In [77]:
logit_prediction = logit.predict(x_test)
logit_prediction

array([1, 1, 0, ..., 0, 1, 1], dtype=int64)

In [78]:
print(classification_report(y_test, logit_prediction))

              precision    recall  f1-score   support

           0       0.79      0.83      0.81      4476
           1       0.82      0.78      0.80      4488

    accuracy                           0.81      8964
   macro avg       0.81      0.81      0.81      8964
weighted avg       0.81      0.81      0.81      8964



In [79]:
logit.coef_[0]

array([-0.09834357,  0.04994098, -0.04444516, -0.74707692,  1.05533842,
        1.45917053,  0.04156531, -0.81043773, -2.19878957,  0.43775688,
       -0.74174855, -1.07263618,  0.52997291,  0.18469393, -0.028795  ,
        1.01518619, -1.28007451, -0.40532886, -0.67294454, -1.07191739,
       -1.67927393, -0.15218086])

In [80]:
features_imp_logit = pd.DataFrame({'features': balanced_features.columns, 'logit_imp': logit.coef_[0]})
features_imp_logit.sort_values(by = 'logit_imp', ascending = False)

Unnamed: 0,features,logit_imp
5,loan_percent_income,1.459171
4,loan_int_rate,1.055338
15,G,1.015186
12,D,0.529973
9,RENT,0.437757
13,E,0.184694
1,person_income,0.049941
6,cb_person_cred_hist_length,0.041565
14,F,-0.028795
2,person_emp_length,-0.044445


# Random Forest

In [82]:
rf = RandomForestClassifier()

In [83]:
rf.fit(x_train, y_train)

In [84]:
rf.score(x_train, y_train)

1.0

In [85]:
rf_prediction = rf.predict(x_test)
rf_prediction

array([1, 1, 0, ..., 0, 0, 0], dtype=int64)

In [86]:
print(classification_report(y_test, rf_prediction))

              precision    recall  f1-score   support

           0       0.91      0.97      0.94      4476
           1       0.97      0.91      0.94      4488

    accuracy                           0.94      8964
   macro avg       0.94      0.94      0.94      8964
weighted avg       0.94      0.94      0.94      8964



In [87]:
rf.feature_importances_

array([0.05797383, 0.1373872 , 0.07016795, 0.0734469 , 0.1564238 ,
       0.20049132, 0.05571611, 0.00033809, 0.02011869, 0.04700247,
       0.00752652, 0.01734201, 0.05544056, 0.01280808, 0.00261109,
       0.00073235, 0.01499351, 0.01623995, 0.00846176, 0.01293409,
       0.01807809, 0.01376561])

In [88]:
features_imp_rf = pd.DataFrame({'features': balanced_features.columns, 'rf_imp': rf.feature_importances_})
features_imp_rf.head()

Unnamed: 0,features,rf_imp
0,person_age,0.057974
1,person_income,0.137387
2,person_emp_length,0.070168
3,loan_amnt,0.073447
4,loan_int_rate,0.156424


In [89]:
features_imp_rf.sort_values(by = 'rf_imp', ascending = False)

Unnamed: 0,features,rf_imp
5,loan_percent_income,0.200491
4,loan_int_rate,0.156424
1,person_income,0.137387
3,loan_amnt,0.073447
2,person_emp_length,0.070168
0,person_age,0.057974
6,cb_person_cred_hist_length,0.055716
12,D,0.055441
9,RENT,0.047002
8,OWN,0.020119


# XGBoost 

In [91]:
xgb_model = XGBClassifier(tree_method = 'exact')

In [92]:
# model.fit(x,y.values.ravel())
xgb_model.fit(x_train, y_train.values.ravel())

In [93]:
xgb_model.score(x_train, y_train.values.ravel())

0.9701009650248229

In [94]:
xgb_prediction = xgb_model.predict(x_test)
xgb_prediction

array([1, 1, 0, ..., 0, 0, 0])

In [95]:
print(classification_report(y_test, xgb_prediction))

              precision    recall  f1-score   support

           0       0.92      0.99      0.95      4476
           1       0.99      0.92      0.95      4488

    accuracy                           0.95      8964
   macro avg       0.95      0.95      0.95      8964
weighted avg       0.95      0.95      0.95      8964



In [96]:
features_imp_xgb = pd.DataFrame({'features':balanced_features.columns, 'xgb_imp':xgb_model.feature_importances_})
features_imp_xgb.sort_values(by = 'xgb_imp', ascending = False)

Unnamed: 0,features,xgb_imp
8,OWN,0.121933
5,loan_percent_income,0.103629
20,VENTURE,0.095305
11,C,0.090777
9,RENT,0.08534
4,loan_int_rate,0.071124
16,EDUCATION,0.061964
19,PERSONAL,0.05431
17,HOMEIMPROVEMENT,0.054123
6,cb_person_cred_hist_length,0.052769


In [97]:
features_imp = pd.concat([features_imp_logit, features_imp_rf, features_imp_xgb], axis = 1)
features_imp.round(2)

Unnamed: 0,features,logit_imp,features.1,rf_imp,features.2,xgb_imp
0,person_age,-0.1,person_age,0.06,person_age,0.03
1,person_income,0.05,person_income,0.14,person_income,0.03
2,person_emp_length,-0.04,person_emp_length,0.07,person_emp_length,0.04
3,loan_amnt,-0.75,loan_amnt,0.07,loan_amnt,0.01
4,loan_int_rate,1.06,loan_int_rate,0.16,loan_int_rate,0.07
5,loan_percent_income,1.46,loan_percent_income,0.2,loan_percent_income,0.1
6,cb_person_cred_hist_length,0.04,cb_person_cred_hist_length,0.06,cb_person_cred_hist_length,0.05
7,OTHER,-0.81,OTHER,0.0,OTHER,0.01
8,OWN,-2.2,OWN,0.02,OWN,0.12
9,RENT,0.44,RENT,0.05,RENT,0.09


In [98]:
xgb_prediction_df = pd.DataFrame({'test_indices_xgb':x_test.index, 'xgb_pred':xgb_prediction})
logit_prediction_df = pd.DataFrame({'test_indices_logit':x_test.index, 'logit_pred':logit_prediction})
rf_prediction_df = pd.DataFrame({'test_indices_rf':x_test.index, 'rf_pred':rf_prediction})


In [99]:
cr_data

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.10,N,2
1,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
2,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
3,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4
4,21,9900,OWN,2.0,VENTURE,A,2500,7.14,1,0.25,N,2
...,...,...,...,...,...,...,...,...,...,...,...,...
28602,52,64500,RENT,0.0,EDUCATION,B,5000,11.26,0,0.08,N,20
28603,57,53000,MORTGAGE,1.0,PERSONAL,C,5800,13.16,0,0.11,N,30
28604,54,120000,MORTGAGE,4.0,PERSONAL,A,17625,7.49,0,0.15,N,19
28605,65,76000,RENT,3.0,HOMEIMPROVEMENT,B,35000,10.99,1,0.46,N,28


In [100]:
cr_data_copy = cr_data.copy()

In [101]:
merged_xgb = cr_data_copy.merge(xgb_prediction_df, left_index = True, right_on = 'test_indices_xgb', how = 'left')
merged_xgb.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,test_indices_xgb,xgb_pred
,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2,0,
5212.0,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3,1,1.0
,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2,2,
,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4,3,
3918.0,21,9900,OWN,2.0,VENTURE,A,2500,7.14,1,0.25,N,2,4,1.0


In [102]:
merged_rf = merged_xgb.merge(rf_prediction_df, left_index = True, right_on = 'test_indices_rf', how = 'left')
merged_rf.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,test_indices_xgb,xgb_pred,test_indices_rf,rf_pred
,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2,0,,,
,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3,1,1.0,5212.0,
,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2,2,,,
,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4,3,,,
,21,9900,OWN,2.0,VENTURE,A,2500,7.14,1,0.25,N,2,4,1.0,3918.0,


In [103]:
merged_final = merged_rf.merge(logit_prediction_df, left_index = True, right_on = 'test_indices_logit', how = 'left')
merged_final.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,test_indices_xgb,xgb_pred,test_indices_rf,rf_pred,test_indices_logit,logit_pred
,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2,0,,,,,
,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3,1,1.0,5212.0,,,
,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2,2,,,,,
,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4,3,,,,,
,21,9900,OWN,2.0,VENTURE,A,2500,7.14,1,0.25,N,2,4,1.0,3918.0,,,


In [104]:
merged_final.shape

(28607, 18)

In [105]:
merged_final.dropna(inplace = True) #all the data with prediction

In [106]:
merged_final.shape

(254, 18)

In [107]:
final_with_prediction = merged_final.drop(['test_indices_xgb', 'test_indices_rf', 'test_indices_logit'], axis = 1)
final_with_prediction.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,xgb_pred,rf_pred,logit_pred
3014.0,23,92004,RENT,6.0,PERSONAL,C,30000,15.23,1,0.33,Y,3,1.0,1.0,0.0
8865.0,23,151200,RENT,7.0,DEBTCONSOLIDATION,B,28000,11.11,0,0.19,N,2,0.0,0.0,1.0
1754.0,22,14400,OWN,2.0,MEDICAL,B,4000,10.99,1,0.28,N,3,1.0,1.0,0.0
3710.0,24,85000,RENT,5.0,MEDICAL,B,25000,10.62,0,0.29,N,4,0.0,1.0,1.0
8162.0,23,16000,OWN,7.0,MEDICAL,A,6000,7.9,1,0.38,N,2,1.0,0.0,0.0


In [108]:
final_with_prediction.to_csv(r'...', index = False)