In [12]:
#Import some modules
%matplotlib inline
import numpy as np
import pandas as pd
import utils as ut


#Load in the data
#The four LoanStats exports share one layout, so read them with a single helper
LOAN_FILES = ['LoanStats3a.csv', 'LoanStats3b.csv', 'LoanStats3c.csv', 'LoanStats3d.csv']
DATE_COLS = ['issue_d', 'earliest_cr_line', 'last_pymnt_d', 'last_credit_pull_d']


def read_loan_stats(path):
    """Read one LoanStats CSV export into a DataFrame.

    The first line and the last two lines of the file are skipped
    (presumably a banner and trailing summary lines -- confirm against the
    raw files). The columns in DATE_COLS are parsed as dates.

    engine='python' is requested explicitly: skipfooter is not supported by
    the C parser, so pandas would otherwise emit a ParserWarning and fall
    back to the Python engine anyway.
    """
    #NOTE: infer_datetime_format only speeds up date parsing; it is deprecated in pandas >= 2.0
    return pd.read_csv(path, skiprows=1, skipfooter=2, parse_dates=DATE_COLS,
                       infer_datetime_format=True, engine='python')


#The per-file frames live only inside the list passed to concat, so there are
#no large temporaries left to delete afterwards
loans = pd.concat([read_loan_stats(path) for path in LOAN_FILES], ignore_index=True)

In [13]:
#Start the clean-up by looking at the outcome column:
#share of loans in each loan status
loans['loan_status'].value_counts(normalize=True)


Current                                                0.652293
Fully Paid                                             0.261470
Charged Off                                            0.057966
Late (31-120 days)                                     0.014763
In Grace Period                                        0.006518
Late (16-30 days)                                      0.002723
Does not meet the credit policy. Status:Fully Paid     0.002621
Does not meet the credit policy. Status:Charged Off    0.001005
Default                                                0.000634
Does not meet the credit policy. Status:Current        0.000005
dtype: float64

In [14]:
#Fold the "does not meet the credit policy" variants into their plain statuses.
#'Default' is assigned to 'Charged Off' as well; they seem equivalent from the lender's perspective.
#The replacement is restricted to loan_status: a DataFrame-wide replace would also
#rewrite any cell in any other column that happens to equal one of these strings.
STATUS_MAP = {
    'Does not meet the credit policy. Status:Fully Paid': 'Fully Paid',
    'Does not meet the credit policy. Status:Charged Off': 'Charged Off',
    'Default': 'Charged Off',
}
loans['loan_status'] = loans['loan_status'].replace(STATUS_MAP)

loans.loan_status.value_counts(normalize = True)

Current                                            0.652293
Fully Paid                                         0.264091
Charged Off                                        0.059605
Late (31-120 days)                                 0.014763
In Grace Period                                    0.006518
Late (16-30 days)                                  0.002723
Does not meet the credit policy. Status:Current    0.000005
dtype: float64

In [15]:
#Keep only the loans with a final outcome ('Fully Paid' or 'Charged Off');
#restricting to these two categories simplifies the analysis
loans = loans[loans.loan_status.isin(['Fully Paid', 'Charged Off'])]

In [16]:
#List the available columns to choose features from
loans.columns

Index([u'id', u'member_id', u'loan_amnt', u'funded_amnt', u'funded_amnt_inv',
       u'term', u'int_rate', u'installment', u'grade', u'sub_grade',
       u'emp_title', u'emp_length', u'home_ownership', u'annual_inc',
       u'verification_status', u'issue_d', u'loan_status', u'pymnt_plan',
       u'url', u'desc', u'purpose', u'title', u'zip_code', u'addr_state',
       u'dti', u'delinq_2yrs', u'earliest_cr_line', u'inq_last_6mths',
       u'mths_since_last_delinq', u'mths_since_last_record', u'open_acc',
       u'pub_rec', u'revol_bal', u'revol_util', u'total_acc',
       u'initial_list_status', u'out_prncp', u'out_prncp_inv', u'total_pymnt',
       u'total_pymnt_inv', u'total_rec_prncp', u'total_rec_int',
       u'total_rec_late_fee', u'recoveries', u'collection_recovery_fee',
       u'last_pymnt_d', u'last_pymnt_amnt', u'next_pymnt_d',
       u'last_credit_pull_d', u'collections_12_mths_ex_med',
       u'mths_since_last_major_derog', u'policy_code', u'application_type',
       u'annu

This section computes inferential statistics on the various features of this data set, beginning with the numeric fields that have little to no missing data and do not require any processing.

In [17]:
#Annual income: median income within each outcome group
inc_paid_med = loans.loc[loans.loan_status == 'Fully Paid', 'annual_inc'].median()
inc_chrg_med = loans.loc[loans.loan_status == 'Charged Off', 'annual_inc'].median()

#Show both medians side by side
loans.groupby('loan_status')['annual_inc'].median()

loan_status
Charged Off    56000
Fully Paid     64000
Name: annual_inc, dtype: float64

In [19]:
#Look at the effect size for the incomes:
#ratio of the Fully Paid median income to the Charged Off median income
inc_paid_med / inc_chrg_med
#Modest effect size

1.1428571428571428

In [20]:
#Bootstrap the difference in median income between the two outcomes to see whether it is significant
#NOTE(review): 99.5 / 0.5 are presumably the upper and lower percentiles of the bootstrap distribution -- confirm in utils
prct = ut.med_diff_bootstrap(loans.loc[loans.loan_status == 'Fully Paid', 'annual_inc'],
                             loans.loc[loans.loan_status == 'Charged Off', 'annual_inc'],
                             99.5, 0.5)
prct
#Reliable

[7024.9849999999997, 8773.0099999999984]

In [21]:
#Debt-to-income (dti): median dti within each outcome group
dti_paid_med = loans.loc[loans.loan_status == 'Fully Paid', 'dti'].median()
dti_chrg_med = loans.loc[loans.loan_status == 'Charged Off', 'dti'].median()

#Show both medians side by side
loans.groupby('loan_status')['dti'].median()

loan_status
Charged Off    18.27
Fully Paid     15.69
Name: dti, dtype: float64

In [22]:
#Look at the effect size for the dtis:
#ratio of the Charged Off median dti to the Fully Paid median dti
dti_chrg_med / dti_paid_med
#Modest effect size

1.1644359464627152

In [23]:
#Bootstrap the difference in median dti between the two outcomes to see whether it is significant
prct = ut.med_diff_bootstrap(loans.loc[loans.loan_status == 'Charged Off', 'dti'],
                             loans.loc[loans.loan_status == 'Fully Paid', 'dti'],
                             99.5, 0.5)
prct
#Reliable

[2.4400000000000013, 2.7000000000000011]

In [24]:
#Delinquencies in the last two years: mean count within each outcome group
dlq_paid_mn = loans.loc[loans.loan_status == 'Fully Paid', 'delinq_2yrs'].mean()
dlq_chrg_mn = loans.loc[loans.loan_status == 'Charged Off', 'delinq_2yrs'].mean()

#Show both means side by side
loans.groupby('loan_status')['delinq_2yrs'].mean()

loan_status
Charged Off    0.27490
Fully Paid     0.24146
Name: delinq_2yrs, dtype: float64

In [26]:
#Look at the effect size for the delinquencies:
#ratio of the Charged Off mean to the Fully Paid mean
dlq_chrg_mn / dlq_paid_mn
#Modest effect size

1.1384884893215346

In [27]:
#Bootstrap the difference in mean delinquencies between the two outcomes to see whether it is significant
prct = ut.mean_diff_bootstrap(loans.loc[loans.loan_status == 'Charged Off', 'delinq_2yrs'],
                              loans.loc[loans.loan_status == 'Fully Paid', 'delinq_2yrs'],
                              99.5, 0.5)
prct
#Reliable

[0.022954457309100756, 0.043943470247482859]

In [28]:
#Inquiries in the last six months: mean count within each outcome group
inq_paid_mn = loans.loc[loans.loan_status == 'Fully Paid', 'inq_last_6mths'].mean()
inq_chrg_mn = loans.loc[loans.loan_status == 'Charged Off', 'inq_last_6mths'].mean()

#Show both means side by side
loans.groupby('loan_status')['inq_last_6mths'].mean()

loan_status
Charged Off    1.047217
Fully Paid     0.858365
Name: inq_last_6mths, dtype: float64

In [30]:
#Look at the effect size for the inquiries:
#ratio of the Charged Off mean to the Fully Paid mean
inq_chrg_mn / inq_paid_mn
#Moderate effect

1.2200132638223962

In [31]:
#Bootstrap the difference in mean inquiries between the two outcomes to see whether it is significant
prct = ut.mean_diff_bootstrap(loans.loc[loans.loan_status == 'Charged Off', 'inq_last_6mths'],
                              loans.loc[loans.loan_status == 'Fully Paid', 'inq_last_6mths'],
                              99.5, 0.5)
prct
#Reliable

[0.17240098985823846, 0.2063343733412828]

In [32]:
#Open accounts: mean number of open accounts within each outcome group
opn_paid_mn = loans.loc[loans.loan_status == 'Fully Paid', 'open_acc'].mean()
opn_chrg_mn = loans.loc[loans.loan_status == 'Charged Off', 'open_acc'].mean()

#Show both means side by side
loans.groupby('loan_status')['open_acc'].mean()

loan_status
Charged Off    11.008180
Fully Paid     10.875742
Name: open_acc, dtype: float64

In [33]:
#Look at the effect size for the number of open accounts:
#ratio of the Fully Paid mean to the Charged Off mean (note: the direction is
#reversed relative to the other count features, so a value below 1 means the
#Charged Off group has more open accounts)
opn_paid_mn / opn_chrg_mn
#Small effect

0.98796918047848403

In [34]:
#Bootstrap the difference in mean open accounts between the two outcomes to see whether it is significant
prct = ut.mean_diff_bootstrap(loans.loc[loans.loan_status == 'Fully Paid', 'open_acc'],
                              loans.loc[loans.loan_status == 'Charged Off', 'open_acc'],
                              99.5, 0.5)
prct
#The effect looks reliable, but it is small enough to be suspect

[-0.19623503540697443, -0.066461859835128423]

In [48]:
#Derogatory public records: mean count within each outcome group
pub_paid_mn = loans.loc[loans.loan_status == 'Fully Paid', 'pub_rec'].mean()
pub_chrg_mn = loans.loc[loans.loan_status == 'Charged Off', 'pub_rec'].mean()

#Show both means side by side
loans.groupby('loan_status')['pub_rec'].mean()


loan_status
Charged Off    0.143180
Fully Paid     0.139283
Name: pub_rec, dtype: float64

In [49]:
#Look at the effect size for the number of derogatory public records:
#ratio of the Charged Off mean to the Fully Paid mean
pub_chrg_mn / pub_paid_mn
#Small effect

1.0279786874048025

In [37]:
#Bootstrap the difference in mean public records between the two outcomes to see whether it is significant
prct = ut.mean_diff_bootstrap(loans.loc[loans.loan_status == 'Charged Off', 'pub_rec'],
                              loans.loc[loans.loan_status == 'Fully Paid', 'pub_rec'],
                              99.5, 0.5)
prct
#Not reliable

[-0.0018740016053750276, 0.009818523648975357]

In [38]:
#Revolving balance: median balance within each outcome group
rev_bal_paid_med = loans.loc[loans.loan_status == 'Fully Paid', 'revol_bal'].median()
rev_bal_chrg_med = loans.loc[loans.loan_status == 'Charged Off', 'revol_bal'].median()

#Show both medians side by side
loans.groupby('loan_status')['revol_bal'].median()

loan_status
Charged Off    11301.5
Fully Paid     10805.0
Name: revol_bal, dtype: float64

In [39]:
#Effect size for the revolving balance:
#ratio of the Charged Off median to the Fully Paid median
rev_bal_chrg_med / rev_bal_paid_med
#Small effect

1.0459509486348912

In [40]:
#Bootstrap the difference in median revolving balance between the two outcomes
prct = ut.med_diff_bootstrap(loans.loc[loans.loan_status == 'Charged Off', 'revol_bal'],
                             loans.loc[loans.loan_status == 'Fully Paid', 'revol_bal'],
                             99.5, 0.5)
prct
#Reliable

[352.0, 647.0]

In [42]:
#Total number of accounts: mean within each outcome group
tot_acc_paid_mn = loans.loc[loans.loan_status == 'Fully Paid', 'total_acc'].mean()
tot_acc_chrg_mn = loans.loc[loans.loan_status == 'Charged Off', 'total_acc'].mean()

#Show both means side by side
loans.groupby('loan_status')['total_acc'].mean()

loan_status
Charged Off    24.141296
Fully Paid     25.138142
Name: total_acc, dtype: float64

In [43]:
#Effect size for the total number of accounts:
#ratio of the Fully Paid mean to the Charged Off mean
tot_acc_paid_mn  / tot_acc_chrg_mn 
#Small effect

1.041292169210503

In [44]:
#Bootstrap the difference in mean total accounts between the two outcomes
prct = ut.mean_diff_bootstrap(loans.loc[loans.loan_status == 'Fully Paid', 'total_acc'],
                              loans.loc[loans.loan_status == 'Charged Off', 'total_acc'],
                              99.5, 0.5)
prct
#Appears reliable, though the effect is not very big

[0.8410510951780743, 1.148117810622356]

In [45]:
#Collections in the last 12 months (excluding medical): mean within each outcome group
col_paid_mn = loans.loc[loans.loan_status == 'Fully Paid', 'collections_12_mths_ex_med'].mean()
col_chrg_mn = loans.loc[loans.loan_status == 'Charged Off', 'collections_12_mths_ex_med'].mean()

#Show both means side by side
loans.groupby('loan_status')['collections_12_mths_ex_med'].mean()

#This field appears to have some missing data

loan_status
Charged Off    0.007741
Fully Paid     0.006107
Name: collections_12_mths_ex_med, dtype: float64

In [46]:
#Effect size for collections:
#ratio of the Charged Off mean to the Fully Paid mean
col_chrg_mn  / col_paid_mn
#Moderate effect, but missing data?

1.2675053795187776

In [47]:
#Bootstrap the difference in mean collections between the two outcomes
prct = ut.mean_diff_bootstrap(loans.loc[loans.loan_status == 'Charged Off', 'collections_12_mths_ex_med'],
                              loans.loc[loans.loan_status == 'Fully Paid', 'collections_12_mths_ex_med'],
                              99.5, 0.5)
prct
#The effect is reliable

[0.00044366518691515099, 0.002854148549792756]

In [50]:
#Accounts currently delinquent: mean count within each outcome group
dlq_acc_paid_mn = loans.loc[loans.loan_status == 'Fully Paid', 'acc_now_delinq'].mean()
dlq_acc_chrg_mn = loans.loc[loans.loan_status == 'Charged Off', 'acc_now_delinq'].mean()

#Show both means side by side
loans.groupby('loan_status')['acc_now_delinq'].mean()

loan_status
Charged Off    0.003879
Fully Paid     0.002827
Name: acc_now_delinq, dtype: float64

In [51]:
#Effect size for accounts now delinquent:
#ratio of the Charged Off mean to the Fully Paid mean
dlq_acc_chrg_mn / dlq_acc_paid_mn

1.3722424705392382

In [52]:
#The effect-size ratio for acc_now_delinq is large only because the event is so rare
#(both group means are well under 0.01), so on its own it is not meaningful
#Bootstrap the difference in mean currently-delinquent accounts between the two outcomes
prct = ut.mean_diff_bootstrap(loans[loans.loan_status == 'Charged Off'].acc_now_delinq,
                              loans[loans.loan_status == 'Fully Paid'].acc_now_delinq, 99.5, 0.5)
prct
#NOTE(review): both bounds printed for this cell are positive, which is the criterion
#used to call the other features "reliable"; the difference is nevertheless too small
#and the event too rare to be practically useful

[0.00017702472440258958, 0.0019903687465753829]