In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Data Description

In [2]:
# Load the LendingClub data dictionary; later cells read its
# 'LoanStatNew' and 'Description' columns.
desc_df = pd.read_csv(filepath_or_buffer='dataset/LCDataDictionary.csv')

In [3]:
# Print every "<column name>  :  <description>" pair from the data dictionary.
# Iterating the two columns directly avoids chained indexing
# (`desc_df.loc[i][...]`), which built a full row Series twice per iteration
# and only worked when the index was the default 0..n-1 RangeIndex.
for column_name, description in zip(desc_df['LoanStatNew'], desc_df['Description']):
    print(column_name, ' : ', description)

acc_now_delinq  :  The number of accounts on which the borrower is now delinquent.
acc_open_past_24mths  :  Number of trades opened in past 24 months.
addr_state  :  The state provided by the borrower in the loan application
all_util  :  Balance to credit limit on all trades
annual_inc  :  The self-reported annual income provided by the borrower during registration.
annual_inc_joint  :  The combined self-reported annual income provided by the co-borrowers during registration
application_type  :  Indicates whether the loan is an individual application or a joint application with two co-borrowers
avg_cur_bal  :  Average current balance of all accounts
bc_open_to_buy  :  Total open to buy on revolving bankcards.
bc_util  :  Ratio of total current balance to high credit/credit limit for all bankcard accounts.
chargeoff_within_12_mths  :  Number of charge-offs within 12 months
collection_recovery_fee  :  post charge off collection fee
collections_12_mths_ex_med  :  Number of collections in 

### Loading Dataset

In [4]:
# Load the raw loan records.
loan_df = pd.read_csv(
    'dataset/lending_club_loans.csv',
    # Skip the first physical line so the second line becomes the header
    # (presumably a non-data notice line precedes the header -- TODO confirm).
    skiprows=1,
    # Parse the whole file in one pass instead of in chunks, so each column
    # gets a single consistently inferred dtype.
    low_memory=False,
)

In [5]:
# Preview the first five raw rows.
loan_df.head(n=5)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,10.65%,162.87,B,B2,...,,,,,0.0,0.0,,,,
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,15.27%,59.83,C,C4,...,,,,,0.0,0.0,,,,
2,1077175,1313524.0,2400.0,2400.0,2400.0,36 months,15.96%,84.33,C,C5,...,,,,,0.0,0.0,,,,
3,1076863,1277178.0,10000.0,10000.0,10000.0,36 months,13.49%,339.31,C,C1,...,,,,,0.0,0.0,,,,
4,1075358,1311748.0,3000.0,3000.0,3000.0,60 months,12.69%,67.79,B,B5,...,,,,,0.0,0.0,,,,


In [6]:
# (number of rows, number of columns) of the raw frame.
loan_df.shape

(42538, 115)

# Data Preparation

### Missing Values

In [59]:
# Work on an independent deep copy so the raw `loan_df` stays untouched
# by the cleaning steps that follow.
clean_df = loan_df.copy(deep=True)

In [60]:
# Preview the first five rows of the working copy.
clean_df.head(n=5)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,10.65%,162.87,B,B2,...,,,,,0.0,0.0,,,,
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,15.27%,59.83,C,C4,...,,,,,0.0,0.0,,,,
2,1077175,1313524.0,2400.0,2400.0,2400.0,36 months,15.96%,84.33,C,C5,...,,,,,0.0,0.0,,,,
3,1076863,1277178.0,10000.0,10000.0,10000.0,36 months,13.49%,339.31,C,C1,...,,,,,0.0,0.0,,,,
4,1075358,1311748.0,3000.0,3000.0,3000.0,60 months,12.69%,67.79,B,B5,...,,,,,0.0,0.0,,,,


In [61]:
# Number of missing values in each column (summed down the rows).
clean_df.isna().sum(axis=0)

id                                0
member_id                         3
loan_amnt                         3
funded_amnt                       3
funded_amnt_inv                   3
                              ...  
tax_liens                       108
tot_hi_cred_lim               42538
total_bal_ex_mort             42538
total_bc_limit                42538
total_il_high_credit_limit    42538
Length: 115, dtype: int64

In [62]:
# Columns with more than this many missing values (the raw frame has
# roughly 42.5k rows) are treated as effectively empty.
NULL_COUNT_THRESHOLD = 38000

# Count missing values for every column in a single vectorised pass, then keep
# the names of the columns above the threshold, in their original order.
null_counts = clean_df.isna().sum()
null_cols = null_counts[null_counts > NULL_COUNT_THRESHOLD].index.tolist()

In [63]:
# How many columns were flagged as mostly missing.
len(null_cols)

56

In [64]:
# Look at the flagged columns before removing them.
clean_df.loc[:, null_cols]

Unnamed: 0,mths_since_last_record,next_pymnt_d,mths_since_last_major_derog,annual_inc_joint,dti_joint,verification_status_joint,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,...,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,Oct-2016,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42533,,Jul-2010,,,,,,,,,...,,,,,,,,,,
42534,,Jul-2010,,,,,,,,,...,,,,,,,,,,
42535,,Jul-2010,,,,,,,,,...,,,,,,,,,,
42536,,,,,,,,,,,...,,,,,,,,,,


In [65]:
# Remove the mostly-empty columns.
# Rebinding instead of `inplace=True` keeps the step explicit, and
# `errors='ignore'` lets the cell be re-run after the columns are already gone
# (the in-place version raised KeyError on a second run). `axis=1` was
# redundant alongside `columns=` and is omitted.
clean_df = clean_df.drop(columns=null_cols, errors='ignore')

In [66]:
# Preview the frame once the mostly-empty columns are removed.
clean_df.head(n=5)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,10.65%,162.87,B,B2,...,744.0,740.0,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,15.27%,59.83,C,C4,...,499.0,0.0,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0
2,1077175,1313524.0,2400.0,2400.0,2400.0,36 months,15.96%,84.33,C,C5,...,719.0,715.0,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0
3,1076863,1277178.0,10000.0,10000.0,10000.0,36 months,13.49%,339.31,C,C1,...,604.0,600.0,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0
4,1075358,1311748.0,3000.0,3000.0,3000.0,60 months,12.69%,67.79,B,B5,...,694.0,690.0,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0


In [67]:
# Re-count missing values per column for the columns that remain.
clean_df.isna().sum(axis=0)

id                                0
member_id                         3
loan_amnt                         3
funded_amnt                       3
funded_amnt_inv                   3
term                              3
int_rate                          3
installment                       3
grade                             3
sub_grade                         3
emp_title                      2629
emp_length                     1115
home_ownership                    3
annual_inc                        7
verification_status               3
issue_d                           3
loan_status                       3
pymnt_plan                        3
url                               3
desc                          13296
purpose                           3
title                            16
zip_code                          3
addr_state                        3
dti                               3
delinq_2yrs                      32
earliest_cr_line                 32
fico_range_low              

In [68]:
# Inspect a handful of columns side by side to decide how to treat their
# missing values.
clean_df.loc[
    :,
    ['emp_title', 'emp_length', 'desc', 'mths_since_last_delinq', 'pub_rec_bankruptcies'],
]

Unnamed: 0,emp_title,emp_length,desc,mths_since_last_delinq,pub_rec_bankruptcies
0,,10+ years,Borrower added on 12/22/11 > I need to upgra...,,0.0
1,Ryder,< 1 year,Borrower added on 12/22/11 > I plan to use t...,,0.0
2,,10+ years,,,0.0
3,AIR RESOURCES BOARD,10+ years,Borrower added on 12/21/11 > to pay for prop...,35.0,0.0
4,University Medical Group,1 year,Borrower added on 12/21/11 > I plan on combi...,38.0,0.0
...,...,...,...,...,...
42533,,< 1 year,"I need to pay $2,100 for fixing my Volvo :) A...",,
42534,,< 1 year,"Hi, I'm buying a used car. Anybody on faceb...",,
42535,Homemaker,10+ years,I need to make several improvements around the...,,
42536,,,,,


In [69]:
# Frequency of each employer title (missing values are not counted).
clean_df['emp_title'].value_counts(dropna=True)

US Army                  139
Bank of America          115
IBM                       72
AT&T                      61
Kaiser Permanente         61
                        ... 
encore rehabilitation      1
Insight Companies Inc      1
Maryland                   1
Lowe's Companies           1
yellow cab com.            1
Name: emp_title, Length: 30658, dtype: int64

In [70]:
# Frequency of each employment-length category (missing values are not counted).
clean_df['emp_length'].value_counts(dropna=True)

10+ years    9369
< 1 year     5062
2 years      4743
3 years      4364
4 years      3649
1 year       3595
5 years      3458
6 years      2375
7 years      1875
8 years      1592
9 years      1341
Name: emp_length, dtype: int64

In [71]:
# Free-text loan description column.
clean_df.loc[:, 'desc']

0          Borrower added on 12/22/11 > I need to upgra...
1          Borrower added on 12/22/11 > I plan to use t...
2                                                      NaN
3          Borrower added on 12/21/11 > to pay for prop...
4          Borrower added on 12/21/11 > I plan on combi...
                               ...                        
42533    I need to pay $2,100 for fixing my Volvo :)  A...
42534    Hi,   I'm buying  a used car. Anybody on faceb...
42535    I need to make several improvements around the...
42536                                                  NaN
42537                                                  NaN
Name: desc, Length: 42538, dtype: object

In [72]:
# Share of rows in which `mths_since_last_delinq` is missing.
clean_df['mths_since_last_delinq'].isna().sum() / len(clean_df)

0.6330575015280455

In [73]:
# Distribution of `pub_rec_bankruptcies` values (missing values are not counted).
clean_df['pub_rec_bankruptcies'].value_counts(dropna=True)

0.0    39316
1.0     1846
2.0        8
Name: pub_rec_bankruptcies, dtype: int64

In [74]:
# Frequency of each borrower-supplied loan title (missing values are not counted).
clean_df['title'].value_counts(dropna=True)

Debt Consolidation                 2259
Debt Consolidation Loan            1760
Personal Loan                       708
Consolidation                       547
debt consolidation                  532
                                   ... 
Consolidation2011                     1
Relief for a single parent loan       1
Pay debts                             1
Denise                                1
solidgold                             1
Name: title, Length: 21264, dtype: int64

In [75]:
# Discard rows where `emp_length` is missing.
# Rebinding the result (rather than `inplace=True`) avoids mutating the frame
# behind earlier cell outputs; `axis=0` is the default and is omitted.
clean_df = clean_df.dropna(subset=['emp_length'])

In [76]:
# Missing values per column after removing rows without `emp_length`.
clean_df.isna().sum(axis=0)

id                                0
member_id                         0
loan_amnt                         0
funded_amnt                       0
funded_amnt_inv                   0
term                              0
int_rate                          0
installment                       0
grade                             0
sub_grade                         0
emp_title                      1571
emp_length                        0
home_ownership                    0
annual_inc                        4
verification_status               0
issue_d                           0
loan_status                       0
pymnt_plan                        0
url                               0
desc                          12834
purpose                           0
title                            12
zip_code                          0
addr_state                        0
dti                               0
delinq_2yrs                      29
earliest_cr_line                 29
fico_range_low              

In [77]:
# Look at the `url` column values.
clean_df.loc[:, 'url']

0        https://lendingclub.com/browse/loanDetail.acti...
1        https://lendingclub.com/browse/loanDetail.acti...
2        https://lendingclub.com/browse/loanDetail.acti...
3        https://lendingclub.com/browse/loanDetail.acti...
4        https://lendingclub.com/browse/loanDetail.acti...
                               ...                        
42531    https://lendingclub.com/browse/loanDetail.acti...
42532    https://lendingclub.com/browse/loanDetail.acti...
42533    https://lendingclub.com/browse/loanDetail.acti...
42534    https://lendingclub.com/browse/loanDetail.acti...
42535    https://lendingclub.com/browse/loanDetail.acti...
Name: url, Length: 41423, dtype: object

In [78]:
# Drop three columns inspected above: the `emp_title` and `desc` text columns
# and the `mths_since_last_delinq` column.
# Rebinding with `errors='ignore'` makes the cell safe to re-run (the in-place
# version raised KeyError once the columns were gone); `axis=1` was redundant
# alongside `columns=` and is omitted.
clean_df = clean_df.drop(
    columns=['emp_title', 'desc', 'mths_since_last_delinq'],
    errors='ignore',
)

In [79]:
# Missing values per column after removing the three columns above.
clean_df.isna().sum(axis=0)

id                               0
member_id                        0
loan_amnt                        0
funded_amnt                      0
funded_amnt_inv                  0
term                             0
int_rate                         0
installment                      0
grade                            0
sub_grade                        0
emp_length                       0
home_ownership                   0
annual_inc                       4
verification_status              0
issue_d                          0
loan_status                      0
pymnt_plan                       0
url                              0
purpose                          0
title                           12
zip_code                         0
addr_state                       0
dti                              0
delinq_2yrs                     29
earliest_cr_line                29
fico_range_low                   0
fico_range_high                  0
inq_last_6mths                  29
open_acc            

In [80]:
# Drop every row that still has a missing value in any remaining column.
# Rebinding the result (rather than `inplace=True`) avoids mutating the frame
# behind earlier cell outputs; `axis=0` is the default and is omitted.
# The original row labels are kept (the index is not reset).
clean_df = clean_df.dropna()

In [81]:
# Verify that no missing values remain in any column.
clean_df.isna().sum(axis=0)

id                            0
member_id                     0
loan_amnt                     0
funded_amnt                   0
funded_amnt_inv               0
term                          0
int_rate                      0
installment                   0
grade                         0
sub_grade                     0
emp_length                    0
home_ownership                0
annual_inc                    0
verification_status           0
issue_d                       0
loan_status                   0
pymnt_plan                    0
url                           0
purpose                       0
title                         0
zip_code                      0
addr_state                    0
dti                           0
delinq_2yrs                   0
earliest_cr_line              0
fico_range_low                0
fico_range_high               0
inq_last_6mths                0
open_acc                      0
pub_rec                       0
revol_bal                     0
revol_ut

In [82]:
# (rows, columns) remaining after the missing-value handling above.
clean_df.shape

(39913, 56)

In [83]:
# Display the cleaned frame (the notebook truncates the rendering).
# NOTE(review): consider `clean_df.head()` to keep the saved output small.
clean_df

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.000000,36 months,10.65%,162.87,B,B2,...,744.0,740.0,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0
1,1077430,1314167.0,2500.0,2500.0,2500.000000,60 months,15.27%,59.83,C,C4,...,499.0,0.0,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0
2,1077175,1313524.0,2400.0,2400.0,2400.000000,36 months,15.96%,84.33,C,C5,...,719.0,715.0,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0
3,1076863,1277178.0,10000.0,10000.0,10000.000000,36 months,13.49%,339.31,C,C1,...,604.0,600.0,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0
4,1075358,1311748.0,3000.0,3000.0,3000.000000,60 months,12.69%,67.79,B,B5,...,694.0,690.0,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42313,150070,150059.0,10000.0,10000.0,149.994976,36 months,10.91%,326.97,C,C3,...,569.0,565.0,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,1.0,0.0
42369,137757,137499.0,10475.0,10475.0,3174.995847,36 months,17.54%,376.29,G,G4,...,724.0,720.0,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0
42370,137749,137410.0,10475.0,10475.0,2925.000000,36 months,14.38%,359.95,E,E4,...,539.0,535.0,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0
42371,136976,136970.0,20425.0,8500.0,1849.998917,36 months,15.96%,298.67,F,F4,...,609.0,605.0,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,1.0,0.0
