In [1]:
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Data Description

In [2]:
# Load the data dictionary; the cell below reads its 'LoanStatNew' and
# 'Description' columns to print one line per documented field.
desc_df = pd.read_csv('dataset/LCDataDictionary.csv')

In [3]:
# Print every documented field as "<name>  :  <description>", in file order.
for stat_name, stat_desc in zip(desc_df['LoanStatNew'], desc_df['Description']):
    print(stat_name, ' : ', stat_desc)

acc_now_delinq  :  The number of accounts on which the borrower is now delinquent.
acc_open_past_24mths  :  Number of trades opened in past 24 months.
addr_state  :  The state provided by the borrower in the loan application
all_util  :  Balance to credit limit on all trades
annual_inc  :  The self-reported annual income provided by the borrower during registration.
annual_inc_joint  :  The combined self-reported annual income provided by the co-borrowers during registration
application_type  :  Indicates whether the loan is an individual application or a joint application with two co-borrowers
avg_cur_bal  :  Average current balance of all accounts
bc_open_to_buy  :  Total open to buy on revolving bankcards.
bc_util  :  Ratio of total current balance to high credit/credit limit for all bankcard accounts.
chargeoff_within_12_mths  :  Number of charge-offs within 12 months
collection_recovery_fee  :  post charge off collection fee
collections_12_mths_ex_med  :  Number of collections in 

### Loading Dataset

In [4]:
# Load the raw loan records.
# skiprows=1 discards the first line of the file, so the second line supplies
# the column names (presumably the first line is a non-data banner — TODO confirm).
# low_memory=False parses the file in one pass instead of in chunks, which
# avoids mixed-dtype inference within a column.
loan_df = pd.read_csv('dataset/lending_club_loans.csv', skiprows=1, low_memory=False)

In [5]:
loan_df.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,10.65%,162.87,B,B2,...,,,,,0.0,0.0,,,,
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,15.27%,59.83,C,C4,...,,,,,0.0,0.0,,,,
2,1077175,1313524.0,2400.0,2400.0,2400.0,36 months,15.96%,84.33,C,C5,...,,,,,0.0,0.0,,,,
3,1076863,1277178.0,10000.0,10000.0,10000.0,36 months,13.49%,339.31,C,C1,...,,,,,0.0,0.0,,,,
4,1075358,1311748.0,3000.0,3000.0,3000.0,60 months,12.69%,67.79,B,B5,...,,,,,0.0,0.0,,,,


In [252]:
loan_df.shape

(42538, 115)

# Data Preparation

### Missing Values

In [492]:
# Work on a copy so the missing-value handling below never alters loan_df.
clean_df = loan_df.copy()

In [493]:
clean_df.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,10.65%,162.87,B,B2,...,,,,,0.0,0.0,,,,
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,15.27%,59.83,C,C4,...,,,,,0.0,0.0,,,,
2,1077175,1313524.0,2400.0,2400.0,2400.0,36 months,15.96%,84.33,C,C5,...,,,,,0.0,0.0,,,,
3,1076863,1277178.0,10000.0,10000.0,10000.0,36 months,13.49%,339.31,C,C1,...,,,,,0.0,0.0,,,,
4,1075358,1311748.0,3000.0,3000.0,3000.0,60 months,12.69%,67.79,B,B5,...,,,,,0.0,0.0,,,,


In [494]:
# Print the number of null values in each column, one count per line,
# in column order.

for missing_count in clean_df.isna().sum():
    print(missing_count)

0
3
3
3
3
3
3
3
3
3
2629
1115
3
7
3
3
3
3
3
13296
3
16
3
3
3
32
32
3
3
32
26929
38887
32
32
3
93
32
3
3
3
3
3
3
3
3
3
3
86
3
39242
7
3
3
148
42538
3
3
42538
42538
42538
32
42538
42538
42538
42538
42538
42538
42538
42538
42538
42538
42538
42538
42538
42538
42538
42538
42538
42538
42538
42538
42538
148
32
42538
42538
42538
42538
42538
42538
42538
42538
42538
42538
42538
42538
42538
42538
42538
42538
42538
42538
42538
42538
42538
42538
42538
42538
42538
1368
108
42538
42538
42538
42538


In [495]:
# Columns in which more than 38000 values are null.

null_cols = [col for col in clean_df if clean_df[col].isna().sum() > 38000]

In [496]:
len(null_cols)

56

In [497]:
clean_df[null_cols]

Unnamed: 0,mths_since_last_record,next_pymnt_d,mths_since_last_major_derog,annual_inc_joint,dti_joint,verification_status_joint,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,...,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,Oct-2016,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42533,,Jul-2010,,,,,,,,,...,,,,,,,,,,
42534,,Jul-2010,,,,,,,,,...,,,,,,,,,,
42535,,Jul-2010,,,,,,,,,...,,,,,,,,,,
42536,,,,,,,,,,,...,,,,,,,,,,


In [498]:
# Remove the mostly-empty columns collected in null_cols.
clean_df = clean_df.drop(columns=null_cols)

In [499]:
clean_df.shape

(42538, 59)

In [500]:
clean_df.isna().sum()

id                                0
member_id                         3
loan_amnt                         3
funded_amnt                       3
funded_amnt_inv                   3
term                              3
int_rate                          3
installment                       3
grade                             3
sub_grade                         3
emp_title                      2629
emp_length                     1115
home_ownership                    3
annual_inc                        7
verification_status               3
issue_d                           3
loan_status                       3
pymnt_plan                        3
url                               3
desc                          13296
purpose                           3
title                            16
zip_code                          3
addr_state                        3
dti                               3
delinq_2yrs                      32
earliest_cr_line                 32
fico_range_low              

In [501]:
clean_df['emp_title'].value_counts()

US Army                                         139
Bank of America                                 115
IBM                                              72
Kaiser Permanente                                61
AT&T                                             61
                                               ... 
North Pacific Dermatology                         1
Lowes                                             1
Deutsch inc                                       1
Richard Wolf Medical Instruments Corporation      1
ice cold air                                      1
Name: emp_title, Length: 30658, dtype: int64

In [502]:
clean_df['emp_title'].isna().sum()

2629

In [503]:
# Missing employer titles become the placeholder 'Other'.
# The filled column is assigned back rather than using fillna(inplace=True) on
# the selection: that chained in-place form is deprecated and, under pandas
# Copy-on-Write, only changes a temporary and leaves clean_df untouched.
clean_df['emp_title'] = clean_df['emp_title'].fillna('Other')

In [504]:
clean_df['emp_title'].value_counts()

Other                                           2630
US Army                                          139
Bank of America                                  115
IBM                                               72
Kaiser Permanente                                 61
                                                ... 
North Pacific Dermatology                          1
Lowes                                              1
Deutsch inc                                        1
Richard Wolf Medical Instruments Corporation       1
ice cold air                                       1
Name: emp_title, Length: 30658, dtype: int64

In [505]:
clean_df['emp_length'].value_counts()

10+ years    9369
< 1 year     5062
2 years      4743
3 years      4364
4 years      3649
1 year       3595
5 years      3458
6 years      2375
7 years      1875
8 years      1592
9 years      1341
Name: emp_length, dtype: int64

In [506]:
clean_df['emp_length'].isna().sum()

1115

In [507]:
# Missing employment lengths are placed in the '< 1 year' bucket.
# NOTE(review): this treats unknown tenure as the shortest tenure — confirm
# that is the intended imputation.
# Assigned back instead of fillna(inplace=True) on the selection, which is
# deprecated and does not update clean_df under pandas Copy-on-Write.
clean_df['emp_length'] = clean_df['emp_length'].fillna('< 1 year')

In [508]:
clean_df['emp_length'].value_counts()

10+ years    9369
< 1 year     6177
2 years      4743
3 years      4364
4 years      3649
1 year       3595
5 years      3458
6 years      2375
7 years      1875
8 years      1592
9 years      1341
Name: emp_length, dtype: int64

In [509]:
clean_df['desc']

0          Borrower added on 12/22/11 > I need to upgra...
1          Borrower added on 12/22/11 > I plan to use t...
2                                                      NaN
3          Borrower added on 12/21/11 > to pay for prop...
4          Borrower added on 12/21/11 > I plan on combi...
                               ...                        
42533    I need to pay $2,100 for fixing my Volvo :)  A...
42534    Hi,   I'm buying  a used car. Anybody on faceb...
42535    I need to make several improvements around the...
42536                                                  NaN
42537                                                  NaN
Name: desc, Length: 42538, dtype: object

In [510]:
clean_df['desc'].isna().sum()

13296

In [511]:
# Loans without a borrower description get the marker text 'not filled'.
# Assigned back instead of fillna(inplace=True) on the selection, which is
# deprecated and does not update clean_df under pandas Copy-on-Write.
clean_df['desc'] = clean_df['desc'].fillna('not filled')

In [512]:
clean_df.isna().sum()

id                                0
member_id                         3
loan_amnt                         3
funded_amnt                       3
funded_amnt_inv                   3
term                              3
int_rate                          3
installment                       3
grade                             3
sub_grade                         3
emp_title                         0
emp_length                        0
home_ownership                    3
annual_inc                        7
verification_status               3
issue_d                           3
loan_status                       3
pymnt_plan                        3
url                               3
desc                              0
purpose                           3
title                            16
zip_code                          3
addr_state                        3
dti                               3
delinq_2yrs                      32
earliest_cr_line                 32
fico_range_low              

In [513]:
clean_df['mths_since_last_delinq'].isna().sum()

26929

In [514]:
clean_df['mths_since_last_delinq'].isna().sum()/clean_df.shape[0]

0.6330575015280455

In [515]:
# Discard mths_since_last_delinq (about 63% of its values are null, see above).
clean_df = clean_df.drop(columns=['mths_since_last_delinq'])

In [516]:
clean_df.isna().sum()

id                               0
member_id                        3
loan_amnt                        3
funded_amnt                      3
funded_amnt_inv                  3
term                             3
int_rate                         3
installment                      3
grade                            3
sub_grade                        3
emp_title                        0
emp_length                       0
home_ownership                   3
annual_inc                       7
verification_status              3
issue_d                          3
loan_status                      3
pymnt_plan                       3
url                              3
desc                             0
purpose                          3
title                           16
zip_code                         3
addr_state                       3
dti                              3
delinq_2yrs                     32
earliest_cr_line                32
fico_range_low                   3
fico_range_high     

In [517]:
clean_df['pub_rec_bankruptcies'].value_counts()

0.0    39316
1.0     1846
2.0        8
Name: pub_rec_bankruptcies, dtype: int64

In [518]:
# Fill missing bankruptcy counts with 0, the dominant value in the column.
# Assigned back instead of fillna(inplace=True) on the selection, which is
# deprecated and does not update clean_df under pandas Copy-on-Write.
clean_df['pub_rec_bankruptcies'] = clean_df['pub_rec_bankruptcies'].fillna(0)

In [519]:
clean_df.isna().sum()

id                              0
member_id                       3
loan_amnt                       3
funded_amnt                     3
funded_amnt_inv                 3
term                            3
int_rate                        3
installment                     3
grade                           3
sub_grade                       3
emp_title                       0
emp_length                      0
home_ownership                  3
annual_inc                      7
verification_status             3
issue_d                         3
loan_status                     3
pymnt_plan                      3
url                             3
desc                            0
purpose                         3
title                          16
zip_code                        3
addr_state                      3
dti                             3
delinq_2yrs                    32
earliest_cr_line               32
fico_range_low                  3
fico_range_high                 3
inq_last_6mths

In [520]:
clean_df['collections_12_mths_ex_med'].value_counts()

0.0    42390
Name: collections_12_mths_ex_med, dtype: int64

In [521]:
# Every non-null value in this column is 0, so missing entries are filled with 0.
# Assigned back instead of fillna(inplace=True) on the selection, which is
# deprecated and does not update clean_df under pandas Copy-on-Write.
clean_df['collections_12_mths_ex_med'] = clean_df['collections_12_mths_ex_med'].fillna(0)

In [522]:
clean_df['chargeoff_within_12_mths'].isna().sum()

148

In [523]:
clean_df['chargeoff_within_12_mths'].value_counts()

0.0    42390
Name: chargeoff_within_12_mths, dtype: int64

In [524]:
# Every non-null value in this column is 0, so missing entries are filled with 0.
# Assigned back instead of fillna(inplace=True) on the selection, which is
# deprecated and does not update clean_df under pandas Copy-on-Write.
clean_df['chargeoff_within_12_mths'] = clean_df['chargeoff_within_12_mths'].fillna(0)

In [525]:
clean_df['tax_liens'].value_counts()

0.0    42429
1.0        1
Name: tax_liens, dtype: int64

In [526]:
# Fill missing tax-lien counts with 0, the value of all but one known record.
# Assigned back instead of fillna(inplace=True) on the selection, which is
# deprecated and does not update clean_df under pandas Copy-on-Write.
clean_df['tax_liens'] = clean_df['tax_liens'].fillna(0)

In [527]:
clean_df.isna().sum()

id                             0
member_id                      3
loan_amnt                      3
funded_amnt                    3
funded_amnt_inv                3
term                           3
int_rate                       3
installment                    3
grade                          3
sub_grade                      3
emp_title                      0
emp_length                     0
home_ownership                 3
annual_inc                     7
verification_status            3
issue_d                        3
loan_status                    3
pymnt_plan                     3
url                            3
desc                           0
purpose                        3
title                         16
zip_code                       3
addr_state                     3
dti                            3
delinq_2yrs                   32
earliest_cr_line              32
fico_range_low                 3
fico_range_high                3
inq_last_6mths                32
open_acc  

In [528]:
clean_df['last_pymnt_d'].value_counts()

Mar-2013    1070
Dec-2014     949
May-2013     943
Feb-2013     906
Mar-2012     893
            ... 
Jun-2008      20
Mar-2008      18
Jan-2008      11
Feb-2008       8
Dec-2007       2
Name: last_pymnt_d, Length: 106, dtype: int64

In [529]:
# Discard rows lacking either the last payment date or the last credit-pull date.
clean_df = clean_df.dropna(subset=['last_pymnt_d', 'last_credit_pull_d'])

In [530]:
clean_df.shape

(42448, 58)

In [531]:
clean_df.isna().sum()

id                             0
member_id                      0
loan_amnt                      0
funded_amnt                    0
funded_amnt_inv                0
term                           0
int_rate                       0
installment                    0
grade                          0
sub_grade                      0
emp_title                      0
emp_length                     0
home_ownership                 0
annual_inc                     4
verification_status            0
issue_d                        0
loan_status                    0
pymnt_plan                     0
url                            0
desc                           0
purpose                        0
title                         13
zip_code                       0
addr_state                     0
dti                            0
delinq_2yrs                   28
earliest_cr_line              28
fico_range_low                 0
fico_range_high                0
inq_last_6mths                28
open_acc  

In [532]:
clean_df['title'].value_counts()

Debt Consolidation                       2256
Debt Consolidation Loan                  1756
Personal Loan                             706
Consolidation                             546
debt consolidation                        532
                                         ... 
Get Rid of High Interest Credit Cards       1
Need surgery/ finish paying off debt        1
paying off  credit card debt                1
CAGloan                                     1
Upgrade Apt                                 1
Name: title, Length: 21223, dtype: int64

In [533]:
# Missing loan titles become the placeholder 'Other'.
# Assigned back instead of fillna(inplace=True) on the selection, which is
# deprecated and does not update clean_df under pandas Copy-on-Write.
clean_df['title'] = clean_df['title'].fillna('Other')

In [534]:
clean_df.isna().sum()

id                             0
member_id                      0
loan_amnt                      0
funded_amnt                    0
funded_amnt_inv                0
term                           0
int_rate                       0
installment                    0
grade                          0
sub_grade                      0
emp_title                      0
emp_length                     0
home_ownership                 0
annual_inc                     4
verification_status            0
issue_d                        0
loan_status                    0
pymnt_plan                     0
url                            0
desc                           0
purpose                        0
title                          0
zip_code                       0
addr_state                     0
dti                            0
delinq_2yrs                   28
earliest_cr_line              28
fico_range_low                 0
fico_range_high                0
inq_last_6mths                28
open_acc  

In [535]:
clean_df['delinq_2yrs'].isna().sum()

28

In [536]:
clean_df['delinq_2yrs'].value_counts()

0.0     37697
1.0      3588
2.0       769
3.0       242
4.0        71
5.0        27
6.0        13
7.0         6
8.0         3
11.0        2
9.0         1
13.0        1
Name: delinq_2yrs, dtype: int64

In [537]:
# Fill missing delinquency counts with 0, by far the most frequent value.
# Assigned back instead of fillna(inplace=True) on the selection, which is
# deprecated and does not update clean_df under pandas Copy-on-Write.
clean_df['delinq_2yrs'] = clean_df['delinq_2yrs'].fillna(0)

In [538]:
clean_df['earliest_cr_line']

0        Jan-1985
1        Apr-1999
2        Nov-2001
3        Feb-1996
4        Jan-1996
           ...   
42531         NaN
42532         NaN
42533         NaN
42534         NaN
42535         NaN
Name: earliest_cr_line, Length: 42448, dtype: object

In [539]:
# Discard the rows that have no earliest credit line date.
clean_df = clean_df.dropna(subset=['earliest_cr_line'])

In [540]:
clean_df.isna().sum()

id                             0
member_id                      0
loan_amnt                      0
funded_amnt                    0
funded_amnt_inv                0
term                           0
int_rate                       0
installment                    0
grade                          0
sub_grade                      0
emp_title                      0
emp_length                     0
home_ownership                 0
annual_inc                     0
verification_status            0
issue_d                        0
loan_status                    0
pymnt_plan                     0
url                            0
desc                           0
purpose                        0
title                          0
zip_code                       0
addr_state                     0
dti                            0
delinq_2yrs                    0
earliest_cr_line               0
fico_range_low                 0
fico_range_high                0
inq_last_6mths                 0
open_acc  

In [541]:
clean_df['revol_util'].value_counts()

0%        1068
40.7%       65
63%         63
0.2%        63
66.6%       62
          ... 
100.6%       1
104.2%       1
60.69%       1
27.81%       1
49.63%       1
Name: revol_util, Length: 1119, dtype: int64

In [542]:
# Missing revolving utilisation is filled with '0%', the most frequent value.
# NOTE(review): this records unknown utilisation as zero utilisation — confirm
# that is the intended imputation.
# Assigned back instead of fillna(inplace=True) on the selection, which is
# deprecated and does not update clean_df under pandas Copy-on-Write.
clean_df['revol_util'] = clean_df['revol_util'].fillna('0%')

In [543]:
clean_df.isna().sum()

id                            0
member_id                     0
loan_amnt                     0
funded_amnt                   0
funded_amnt_inv               0
term                          0
int_rate                      0
installment                   0
grade                         0
sub_grade                     0
emp_title                     0
emp_length                    0
home_ownership                0
annual_inc                    0
verification_status           0
issue_d                       0
loan_status                   0
pymnt_plan                    0
url                           0
desc                          0
purpose                       0
title                         0
zip_code                      0
addr_state                    0
dti                           0
delinq_2yrs                   0
earliest_cr_line              0
fico_range_low                0
fico_range_high               0
inq_last_6mths                0
open_acc                      0
pub_rec 

In [544]:
# Renumber the rows 0..n-1 after the row drops; the old index is not kept.
clean_df = clean_df.reset_index(drop=True)

In [545]:
clean_df.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,10.65%,162.87,B,B2,...,744.0,740.0,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,15.27%,59.83,C,C4,...,499.0,0.0,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0
2,1077175,1313524.0,2400.0,2400.0,2400.0,36 months,15.96%,84.33,C,C5,...,719.0,715.0,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0
3,1076863,1277178.0,10000.0,10000.0,10000.0,36 months,13.49%,339.31,C,C1,...,604.0,600.0,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0
4,1075358,1311748.0,3000.0,3000.0,3000.0,60 months,12.69%,67.79,B,B5,...,694.0,690.0,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0


## Data Cleaning

In [546]:
# Start the type-cleaning stage from a copy so clean_df keeps the
# missing-value-handled state produced above.
new_clean_df = clean_df.copy()

In [547]:
new_clean_df.shape

(42420, 58)

In [548]:
new_clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42420 entries, 0 to 42419
Data columns (total 58 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          42420 non-null  object 
 1   member_id                   42420 non-null  float64
 2   loan_amnt                   42420 non-null  float64
 3   funded_amnt                 42420 non-null  float64
 4   funded_amnt_inv             42420 non-null  float64
 5   term                        42420 non-null  object 
 6   int_rate                    42420 non-null  object 
 7   installment                 42420 non-null  float64
 8   grade                       42420 non-null  object 
 9   sub_grade                   42420 non-null  object 
 10  emp_title                   42420 non-null  object 
 11  emp_length                  42420 non-null  object 
 12  home_ownership              42420 non-null  object 
 13  annual_inc                  424

### member_id

In [549]:
# member_id was read as float64; casting to int drops the trailing '.0' and the
# final object cast stores it as a non-numeric column (presumably so it is
# treated as an identifier rather than a quantity — confirm).
new_clean_df['member_id'] = new_clean_df['member_id'].astype('int').astype('object')

In [550]:
new_clean_df.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599,5000.0,5000.0,4975.0,36 months,10.65%,162.87,B,B2,...,744.0,740.0,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0
1,1077430,1314167,2500.0,2500.0,2500.0,60 months,15.27%,59.83,C,C4,...,499.0,0.0,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0
2,1077175,1313524,2400.0,2400.0,2400.0,36 months,15.96%,84.33,C,C5,...,719.0,715.0,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0
3,1076863,1277178,10000.0,10000.0,10000.0,36 months,13.49%,339.31,C,C1,...,604.0,600.0,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0
4,1075358,1311748,3000.0,3000.0,3000.0,60 months,12.69%,67.79,B,B5,...,694.0,690.0,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0


### term

In [551]:
new_clean_df['term'].value_counts()

 36 months    31437
 60 months    10983
Name: term, dtype: int64

In [552]:
# Convert the two term labels to their length in months as int64.
# map + an explicit cast is used instead of replace(inplace=True) on the
# selection: the chained in-place call does not update new_clean_df under
# pandas Copy-on-Write, and relying on replace() to downcast object -> int is
# deprecated. Any unexpected label becomes NaN and makes the cast fail loudly.
new_clean_df['term'] = new_clean_df['term'].map({' 36 months':36, ' 60 months':60}).astype('int64')

In [553]:
# Store the numeric term under a name that states its unit; the original
# 'term' column is dropped in the next cell.
new_clean_df['term_months'] = new_clean_df['term'].copy()

In [554]:
# 'term' has been copied to 'term_months', so the original column is removed.
new_clean_df = new_clean_df.drop(columns=['term'])

In [555]:
new_clean_df['term_months'].dtype

dtype('int64')

In [556]:
new_clean_df.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,installment,grade,sub_grade,emp_title,...,last_fico_range_low,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens,term_months
0,1077501,1296599,5000.0,5000.0,4975.0,10.65%,162.87,B,B2,Other,...,740.0,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0,36
1,1077430,1314167,2500.0,2500.0,2500.0,15.27%,59.83,C,C4,Ryder,...,0.0,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0,60
2,1077175,1313524,2400.0,2400.0,2400.0,15.96%,84.33,C,C5,Other,...,715.0,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0,36
3,1076863,1277178,10000.0,10000.0,10000.0,13.49%,339.31,C,C1,AIR RESOURCES BOARD,...,600.0,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0,36
4,1075358,1311748,3000.0,3000.0,3000.0,12.69%,67.79,B,B5,University Medical Group,...,690.0,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0,60


### int_rate

In [557]:
def nums(x):
    """Return the first number embedded in the string ``x``, as a string.

    Used to strip units and symbols from values such as ``'10.65%'`` or
    ``'10+ years'`` before the column is converted to a numeric dtype.

    Parameters
    ----------
    x : str
        Text expected to contain at least one digit.

    Returns
    -------
    str
        The first run of digits with its optional decimal part
        (e.g. ``'10.65'``). Conversion to a number is left to the caller.

    Raises
    ------
    ValueError
        If ``x`` contains no number.
    """
    # Anchor the match on digits so a stray '.' (as in "approx. 5") can never
    # be returned as the "number"; a leading-dot form such as ".5" still matches.
    match = re.search(r'\d+(?:\.\d*)?|\.\d+', x)
    if match is None:
        raise ValueError(f'no numeric value found in {x!r}')
    return match.group(0)

In [558]:
# Strip the '%' sign: nums() returns the numeric part of each rate as a string.
new_clean_df['int_rate_%'] = new_clean_df['int_rate'].apply(nums)

In [559]:
# Convert the extracted digit strings to a numeric dtype.
new_clean_df['int_rate_%'] = pd.to_numeric(new_clean_df['int_rate_%'])

In [560]:
new_clean_df['int_rate_%'].dtype

dtype('float64')

In [561]:
# The textual rate is superseded by the numeric 'int_rate_%' column.
new_clean_df = new_clean_df.drop(columns=['int_rate'])

In [562]:
new_clean_df.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,installment,grade,sub_grade,emp_title,emp_length,...,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens,term_months,int_rate_%
0,1077501,1296599,5000.0,5000.0,4975.0,162.87,B,B2,Other,10+ years,...,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0,36,10.65
1,1077430,1314167,2500.0,2500.0,2500.0,59.83,C,C4,Ryder,< 1 year,...,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0,60,15.27
2,1077175,1313524,2400.0,2400.0,2400.0,84.33,C,C5,Other,10+ years,...,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0,36,15.96
3,1076863,1277178,10000.0,10000.0,10000.0,339.31,C,C1,AIR RESOURCES BOARD,10+ years,...,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0,36,13.49
4,1075358,1311748,3000.0,3000.0,3000.0,67.79,B,B5,University Medical Group,1 year,...,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0,60,12.69


### emp_length

In [563]:
new_clean_df['emp_length'].value_counts()

10+ years    9356
< 1 year     6139
2 years      4734
3 years      4355
4 years      3641
1 year       3578
5 years      3448
6 years      2371
7 years      1874
8 years      1588
9 years      1336
Name: emp_length, dtype: int64

In [564]:
# '< 1 year' contains the digit 1, so it is rewritten to '0' first; otherwise
# the digit extraction in the next cell would report it as 1 year.
new_clean_df['emp_length_year'] = new_clean_df['emp_length'].replace({'< 1 year':'0'})

In [565]:
# Keep only the leading number of each label ('10+ years' -> '10').
new_clean_df['emp_length_year'] = new_clean_df['emp_length_year'].apply(nums)

In [566]:
# Convert the extracted digit strings to integers.
new_clean_df['emp_length_year'] = new_clean_df['emp_length_year'].astype('int64')

### revol_util

In [567]:
new_clean_df['revol_util']

0        83.7%
1         9.4%
2        98.5%
3          21%
4        53.9%
         ...  
42415       0%
42416      85%
42417     2.2%
42418      66%
42419    63.5%
Name: revol_util, Length: 42420, dtype: object

In [569]:
# Strip the '%' sign: nums() returns the numeric part of each value as a string.
# (The column key was missing its closing quote and bracket, which made this
# cell a SyntaxError.)
new_clean_df['revol_util_%'] = new_clean_df['revol_util'].apply(nums)

In [570]:
# Convert the extracted digit strings to floats.
new_clean_df['revol_util_%'] = new_clean_df['revol_util_%'].astype('float64')

In [571]:
# Both textual columns are superseded by their numeric counterparts
# ('emp_length_year' and 'revol_util_%').
new_clean_df = new_clean_df.drop(columns=['emp_length', 'revol_util'])

In [576]:
new_clean_df.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,installment,grade,sub_grade,emp_title,home_ownership,...,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens,term_months,int_rate_%,emp_length_year,revol_util_%
0,1077501,1296599,5000.0,5000.0,4975.0,162.87,B,B2,Other,RENT,...,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0,36,10.65,10,83.7
1,1077430,1314167,2500.0,2500.0,2500.0,59.83,C,C4,Ryder,RENT,...,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0,60,15.27,0,9.4
2,1077175,1313524,2400.0,2400.0,2400.0,84.33,C,C5,Other,RENT,...,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0,36,15.96,10,98.5
3,1076863,1277178,10000.0,10000.0,10000.0,339.31,C,C1,AIR RESOURCES BOARD,RENT,...,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0,36,13.49,10,21.0
4,1075358,1311748,3000.0,3000.0,3000.0,67.79,B,B5,University Medical Group,RENT,...,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0,60,12.69,1,53.9


### issue_d

In [631]:
new_clean_df['issue_d']

0        Dec-2011
1        Dec-2011
2        Dec-2011
3        Dec-2011
4        Dec-2011
           ...   
42415    Jul-2007
42416    Jul-2007
42417    Jul-2007
42418    Jun-2007
42419    Jun-2007
Name: issue_d, Length: 42420, dtype: object

In [632]:
# Parse the 'Mon-YYYY' strings (e.g. 'Dec-2011') with an explicit format; each
# value resolves to the first day of its month.
# astype('datetime64') without a unit is rejected by pandas >= 2.0, and an
# explicit format avoids per-value format guessing.
# '%b' expects English month abbreviations under the default C/English locale.
new_clean_df['issue_d'] = pd.to_datetime(new_clean_df['issue_d'], format='%b-%Y')

In [634]:
new_clean_df['issue_d']

0       2011-12-01
1       2011-12-01
2       2011-12-01
3       2011-12-01
4       2011-12-01
           ...    
42415   2007-07-01
42416   2007-07-01
42417   2007-07-01
42418   2007-06-01
42419   2007-06-01
Name: issue_d, Length: 42420, dtype: datetime64[ns]

### earliest_cr_line

In [642]:
new_clean_df['earliest_cr_line']

0        Jan-1985
1        Apr-1999
2        Nov-2001
3        Feb-1996
4        Jan-1996
           ...   
42415    Dec-2006
42416    Sep-1999
42417    Mar-1984
42418    Jan-1996
42419    Jul-2004
Name: earliest_cr_line, Length: 42420, dtype: object

In [643]:
# Parse the 'Mon-YYYY' strings (e.g. 'Jan-1985') with an explicit format; each
# value resolves to the first day of its month.
# astype('datetime64') without a unit is rejected by pandas >= 2.0, and an
# explicit format avoids per-value format guessing.
new_clean_df['earliest_cr_line'] = pd.to_datetime(new_clean_df['earliest_cr_line'], format='%b-%Y')

In [644]:
new_clean_df['earliest_cr_line']

0       1985-01-01
1       1999-04-01
2       2001-11-01
3       1996-02-01
4       1996-01-01
           ...    
42415   2006-12-01
42416   1999-09-01
42417   1984-03-01
42418   1996-01-01
42419   2004-07-01
Name: earliest_cr_line, Length: 42420, dtype: datetime64[ns]

### total_pymnt

In [665]:
new_clean_df['total_pymnt']

0         5863.16
1         1008.71
2         3005.67
3        12231.89
4         3784.49
           ...   
42415     6486.77
42416    12622.32
42417     2227.02
42418     7029.87
42419     5084.72
Name: total_pymnt, Length: 42420, dtype: float64

In [663]:
# Round total payments to 2 decimal places for the whole column at once.
new_clean_df['total_pymnt'] = new_clean_df['total_pymnt'].round(2)

In [666]:
new_clean_df['total_pymnt']

0         5863.16
1         1008.71
2         3005.67
3        12231.89
4         3784.49
           ...   
42415     6486.77
42416    12622.32
42417     2227.02
42418     7029.87
42419     5084.72
Name: total_pymnt, Length: 42420, dtype: float64

### last_pymnt_d

In [683]:
new_clean_df['last_pymnt_d']

0        Jan-2015
1        Apr-2013
2        Jun-2014
3        Jan-2015
4        Sep-2016
           ...   
42415    Feb-2010
42416    Aug-2010
42417    Jul-2010
42418    Jun-2010
42419    Jun-2010
Name: last_pymnt_d, Length: 42420, dtype: object

In [684]:
# Parse the 'Mon-YYYY' strings (e.g. 'Jan-2015') with an explicit format; each
# value resolves to the first day of its month.
# astype('datetime64') without a unit is rejected by pandas >= 2.0, and an
# explicit format avoids per-value format guessing.
new_clean_df['last_pymnt_d'] = pd.to_datetime(new_clean_df['last_pymnt_d'], format='%b-%Y')

In [685]:
new_clean_df['last_pymnt_d']

0       2015-01-01
1       2013-04-01
2       2014-06-01
3       2015-01-01
4       2016-09-01
           ...    
42415   2010-02-01
42416   2010-08-01
42417   2010-07-01
42418   2010-06-01
42419   2010-06-01
Name: last_pymnt_d, Length: 42420, dtype: datetime64[ns]

In [686]:
new_clean_df['last_pymnt_amnt']

0         171.62
1         119.66
2         649.91
3         357.48
4          67.79
          ...   
42415    1056.94
42416    1605.65
42417      63.59
42418     197.36
42419     143.28
Name: last_pymnt_amnt, Length: 42420, dtype: float64

### last_credit_pull_d

In [668]:
new_clean_df['last_credit_pull_d']

0        Sep-2016
1        Sep-2016
2        Sep-2016
3        Apr-2016
4        Sep-2016
           ...   
42415    Jul-2013
42416    Aug-2010
42417    Jun-2010
42418    Oct-2014
42419    Feb-2015
Name: last_credit_pull_d, Length: 42420, dtype: object

In [670]:
# Parse the 'Mon-YYYY' strings (e.g. 'Sep-2016') with an explicit format; each
# value resolves to the first day of its month.
# astype('datetime64') without a unit is rejected by pandas >= 2.0, and an
# explicit format avoids per-value format guessing.
new_clean_df['last_credit_pull_d'] = pd.to_datetime(new_clean_df['last_credit_pull_d'], format='%b-%Y')

In [671]:
new_clean_df['last_credit_pull_d']

0       2016-09-01
1       2016-09-01
2       2016-09-01
3       2016-04-01
4       2016-09-01
           ...    
42415   2013-07-01
42416   2010-08-01
42417   2010-06-01
42418   2014-10-01
42419   2015-02-01
Name: last_credit_pull_d, Length: 42420, dtype: datetime64[ns]

In [697]:
new_clean_df.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,installment,grade,sub_grade,emp_title,home_ownership,...,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens,term_months,int_rate_%,emp_length_year,revol_util_%
0,1077501,1296599,5000.0,5000.0,4975.0,162.87,B,B2,Other,RENT,...,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0,36,10.65,10,83.7
1,1077430,1314167,2500.0,2500.0,2500.0,59.83,C,C4,Ryder,RENT,...,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0,60,15.27,0,9.4
2,1077175,1313524,2400.0,2400.0,2400.0,84.33,C,C5,Other,RENT,...,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0,36,15.96,10,98.5
3,1076863,1277178,10000.0,10000.0,10000.0,339.31,C,C1,AIR RESOURCES BOARD,RENT,...,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0,36,13.49,10,21.0
4,1075358,1311748,3000.0,3000.0,3000.0,67.79,B,B5,University Medical Group,RENT,...,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0,60,12.69,1,53.9


In [698]:
new_clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42420 entries, 0 to 42419
Data columns (total 58 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   id                          42420 non-null  object        
 1   member_id                   42420 non-null  object        
 2   loan_amnt                   42420 non-null  float64       
 3   funded_amnt                 42420 non-null  float64       
 4   funded_amnt_inv             42420 non-null  float64       
 5   installment                 42420 non-null  float64       
 6   grade                       42420 non-null  object        
 7   sub_grade                   42420 non-null  object        
 8   emp_title                   42420 non-null  object        
 9   home_ownership              42420 non-null  object        
 10  annual_inc                  42420 non-null  float64       
 11  verification_status         42420 non-null  object    

In [699]:
new_clean_df.describe()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,installment,annual_inc,dti,delinq_2yrs,fico_range_low,fico_range_high,inq_last_6mths,...,policy_code,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens,term_months,int_rate_%,emp_length_year,revol_util_%
count,42420.0,42420.0,42420.0,42420.0,42420.0,42420.0,42420.0,42420.0,42420.0,42420.0,...,42420.0,42420.0,42420.0,42420.0,42420.0,42420.0,42420.0,42420.0,42420.0,42420.0
mean,11100.622348,10831.808699,10152.617744,322.899562,69169.92,13.380188,0.152263,713.073432,717.073432,1.079939,...,1.0,9.4e-05,0.0,0.143329,0.043729,2.4e-05,42.213861,12.163146,4.798373,49.049401
std,7409.234126,7144.78181,7129.781409,208.855377,64134.22,6.724445,0.5119,36.181801,36.181801,1.5264,...,0.0,0.00971,0.0,29.389325,0.205415,0.004855,10.513002,3.708374,3.607687,28.400152
min,500.0,500.0,0.0,15.67,1896.0,0.0,0.0,610.0,614.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,36.0,5.42,0.0,0.0
25%,5200.0,5075.0,4950.0,165.74,40000.0,8.21,0.0,685.0,689.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,36.0,9.63,2.0,25.6
50%,9775.0,9600.0,8500.0,278.15,59000.0,13.48,0.0,710.0,714.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,36.0,11.99,4.0,49.6
75%,15000.0,15000.0,14000.0,428.5925,82500.0,18.69,0.0,740.0,744.0,2.0,...,1.0,0.0,0.0,0.0,0.0,0.0,60.0,14.72,9.0,72.6
max,35000.0,35000.0,35000.0,1305.19,6000000.0,29.99,13.0,825.0,829.0,33.0,...,1.0,1.0,0.0,6053.0,2.0,1.0,60.0,24.59,10.0,119.0
