In [6]:
#Import some modules
%matplotlib inline
import numpy as np
import pandas as pd
import utils as ut


#Load in the data
#skiprows drops the first line of the file and skipfooter drops the last two lines
#(presumably a banner line and trailing summary lines in the export -- confirm against the raw file).
#skipfooter is not supported by pandas' default C parser; pandas would silently fall back to the
#python engine and emit a ParserWarning, so request that engine explicitly.
loans = pd.read_csv('LoanStats3a.csv', skiprows = 1,
                    parse_dates = ['issue_d', 'earliest_cr_line', 'last_pymnt_d', 'last_credit_pull_d'],
                    skipfooter = 2, engine = 'python')

#df1 = pd.read_csv('LoanStats3a.csv', skiprows = 1, 
#                  parse_dates = ['issue_d', 'earliest_cr_line', 'last_pymnt_d', 'last_credit_pull_d'],
#                 skipfooter = 2)
#df2 = pd.read_csv('LoanStats3b.csv', skiprows = 1, 
#                  parse_dates = ['issue_d', 'earliest_cr_line', 'last_pymnt_d', 'last_credit_pull_d'],
#                 skipfooter = 2, infer_datetime_format=True)
#df3 = pd.read_csv('LoanStats3c.csv', skiprows = 1, 
#                  parse_dates = ['issue_d', 'earliest_cr_line', 'last_pymnt_d', 'last_credit_pull_d'],
#                 skipfooter = 2, infer_datetime_format=True)
#df4 = pd.read_csv('LoanStats3d.csv', skiprows = 1, 
#                  parse_dates = ['issue_d', 'earliest_cr_line', 'last_pymnt_d', 'last_credit_pull_d'],
#                 skipfooter = 2, infer_datetime_format=True)

#loans = pd.concat([df1, df2, df3, df4])

#del df1
#del df2
#del df3
#del df4

In [7]:
#Data cleanup starts here
#Share of loans in each loan status (normalize = True returns fractions instead of counts)
loans['loan_status'].value_counts(normalize = True)


Fully Paid                                             0.753150
Charged Off                                            0.130830
Current                                                0.048900
Does not meet the credit policy. Status:Fully Paid     0.046643
Does not meet the credit policy. Status:Charged Off    0.017891
Late (31-120 days)                                     0.001552
In Grace Period                                        0.000729
Late (16-30 days)                                      0.000165
Does not meet the credit policy. Status:Current        0.000094
Default                                                0.000024
dtype: float64

In [8]:
#Collapse the status variants into the two core outcomes.
#The replacement is applied to the loan_status column only: a frame-wide replace would also
#rewrite any other column that happens to hold one of these exact strings (e.g. 'Default').
status_map = {
    'Does not meet the credit policy. Status:Fully Paid': 'Fully Paid',
    'Does not meet the credit policy. Status:Charged Off': 'Charged Off',
    #Assign default to charged off? They seem equivalent from the lender's perspective
    'Default': 'Charged Off',
}
loans['loan_status'] = loans['loan_status'].replace(status_map)

#Re-check the status shares after recoding
loans.loan_status.value_counts(normalize = True)

Fully Paid                                         0.799793
Charged Off                                        0.148745
Current                                            0.048900
Late (31-120 days)                                 0.001552
In Grace Period                                    0.000729
Late (16-30 days)                                  0.000165
Does not meet the credit policy. Status:Current    0.000094
dtype: float64

In [9]:
#Keep only the 'Fully Paid' and 'Charged Off' loans;
#restricting to these two categories simplifies the analysis
loans = loans[loans.loan_status.isin(['Fully Paid', 'Charged Off'])]

In [10]:
#List the column names of the loans frame to see which features are available
loans.columns

Index([u'id', u'member_id', u'loan_amnt', u'funded_amnt', u'funded_amnt_inv',
       u'term', u'int_rate', u'installment', u'grade', u'sub_grade',
       u'emp_title', u'emp_length', u'home_ownership', u'annual_inc',
       u'verification_status', u'issue_d', u'loan_status', u'pymnt_plan',
       u'url', u'desc', u'purpose', u'title', u'zip_code', u'addr_state',
       u'dti', u'delinq_2yrs', u'earliest_cr_line', u'inq_last_6mths',
       u'mths_since_last_delinq', u'mths_since_last_record', u'open_acc',
       u'pub_rec', u'revol_bal', u'revol_util', u'total_acc',
       u'initial_list_status', u'out_prncp', u'out_prncp_inv', u'total_pymnt',
       u'total_pymnt_inv', u'total_rec_prncp', u'total_rec_int',
       u'total_rec_late_fee', u'recoveries', u'collection_recovery_fee',
       u'last_pymnt_d', u'last_pymnt_amnt', u'next_pymnt_d',
       u'last_credit_pull_d', u'collections_12_mths_ex_med',
       u'mths_since_last_major_derog', u'policy_code', u'application_type',
       u'annu

This section computes inferential statistics for the various features of this data set, beginning with the numeric fields that have little to no missing data and do not require any processing.

In [11]:
#Annual income by loan outcome
#Pull the annual_inc values for the fully paid and the charged off loans
inc_paid = loans.loc[loans.loan_status == 'Fully Paid', 'annual_inc']
inc_chrg = loans.loc[loans.loan_status == 'Charged Off', 'annual_inc']

#Median income of each group
inc_paid_med = inc_paid.median()
inc_chrg_med = inc_chrg.median()

#Display the per-status medians side by side
loans.groupby('loan_status')['annual_inc'].median()

loan_status
Charged Off    53000
Fully Paid     60000
Name: annual_inc, dtype: float64

In [12]:
#Look at the effect size for the incomes:
#ratio of the fully paid median income to the charged off median income
inc_paid_med / inc_chrg_med
#Modest effect size (a ratio of 1 would mean no difference between the groups)

1.1320754716981132

In [13]:
#Bootstrap the median incomes for the two categories and see if that effect is significant
#NOTE(review): 99.5 and 0.5 are presumably the percentile bounds taken from the bootstrap
#distribution (i.e. a 99% interval) -- confirm against ut.med_diff_bootstrap
#NOTE(review): no random seed is set in the visible cells, so the bounds may differ between runs -- confirm
prct = ut.med_diff_bootstrap(inc_paid, inc_chrg, 99.5, 0.5)
prct
#Reliable: both returned bounds are above zero in the recorded output

[5336.4275999999954, 8000.0]

In [14]:
#dti by loan outcome
#Pull the dti values for the fully paid and the charged off loans
dti_paid = loans.loc[loans.loan_status == 'Fully Paid', 'dti']
dti_chrg = loans.loc[loans.loan_status == 'Charged Off', 'dti']

#Median dti of each group
dti_paid_med = dti_paid.median()
dti_chrg_med = dti_chrg.median()

#Display the per-status medians side by side
loans.groupby('loan_status')['dti'].median()

loan_status
Charged Off    14.33
Fully Paid     13.22
Name: dti, dtype: float64

In [15]:
#Look at the effect size for the dtis:
#ratio of the charged off median dti to the fully paid median dti
dti_chrg_med / dti_paid_med
#Modest effect size (a ratio of 1 would mean no difference between the groups)

1.083963691376702

In [16]:
#Bootstrap the dti for the two categories and see if that effect is significant
#NOTE(review): 99.5 and 0.5 are presumably the percentile bounds taken from the bootstrap
#distribution (i.e. a 99% interval) -- confirm against ut.med_diff_bootstrap
prct = ut.med_diff_bootstrap(dti_chrg, dti_paid, 99.5, 0.5)
prct
#Reliable: both returned bounds are above zero in the recorded output

[0.76497499999999885, 1.4199999999999999]

In [17]:
#Delinquencies in the last two years by loan outcome
#Pull the delinq_2yrs values for the fully paid and the charged off loans
dlq_paid = loans.loc[loans.loan_status == 'Fully Paid', 'delinq_2yrs']
dlq_chrg = loans.loc[loans.loan_status == 'Charged Off', 'delinq_2yrs']

#Mean number of delinquencies in each group
dlq_paid_mn = dlq_paid.mean()
dlq_chrg_mn = dlq_chrg.mean()

#Display the per-status means side by side
loans.groupby('loan_status')['delinq_2yrs'].mean()

loan_status
Charged Off    0.175838
Fully Paid     0.148762
Name: delinq_2yrs, dtype: float64

In [18]:
#Look at the effect size for the delinquencies:
#ratio of the charged off mean to the fully paid mean
dlq_chrg_mn / dlq_paid_mn
#Moderate effect size (a ratio of 1 would mean no difference between the groups)

1.1820129711672762

In [19]:
#Bootstrap the delinquencies for the two categories and see if that effect is significant
#NOTE(review): 99.5 and 0.5 are presumably the percentile bounds taken from the bootstrap
#distribution (i.e. a 99% interval) -- confirm against ut.mean_diff_bootstrap
prct = ut.mean_diff_bootstrap(dlq_chrg, dlq_paid, 99.5, 0.5)
prct
#Reliable: both returned bounds are above zero in the recorded output

[0.0080716763949312681, 0.046258959640160108]

In [20]:
#Inquiries in the last six months by loan outcome
#Pull the inq_last_6mths values for the fully paid and the charged off loans
inq_paid = loans.loc[loans.loan_status == 'Fully Paid', 'inq_last_6mths']
inq_chrg = loans.loc[loans.loan_status == 'Charged Off', 'inq_last_6mths']

#Mean number of inquiries in each group
inq_paid_mn = inq_paid.mean()
inq_chrg_mn = inq_chrg.mean()

#Display the per-status means side by side
loans.groupby('loan_status')['inq_last_6mths'].mean()

loan_status
Charged Off    1.493833
Fully Paid     1.022151
Name: inq_last_6mths, dtype: float64

In [21]:
#Look at the effect size for the inquiries:
#ratio of the charged off mean to the fully paid mean
inq_chrg_mn / inq_paid_mn
#Big effect (a ratio of 1 would mean no difference between the groups)

1.4614602579371656

In [22]:
#Bootstrap the inquiries for the two categories and see if that effect is significant
#NOTE(review): 99.5 and 0.5 are presumably the percentile bounds taken from the bootstrap
#distribution (i.e. a 99% interval) -- confirm against ut.mean_diff_bootstrap
prct = ut.mean_diff_bootstrap(inq_chrg, inq_paid, 99.5, 0.5)
prct
#Reliable: both returned bounds are above zero in the recorded output

[0.4058066120894856, 0.53957746454902555]

In [24]:
#Number of open accounts by loan outcome
#Pull the open_acc values for the fully paid and the charged off loans
opn_paid = loans.loc[loans.loan_status == 'Fully Paid', 'open_acc']
opn_chrg = loans.loc[loans.loan_status == 'Charged Off', 'open_acc']

#Mean number of open accounts in each group
opn_paid_mn = opn_paid.mean()
opn_chrg_mn = opn_chrg.mean()

#Display the per-status means side by side
loans.groupby('loan_status')['open_acc'].mean()

loan_status
Charged Off    9.273877
Fully Paid     9.334706
Name: open_acc, dtype: float64

In [26]:
#Look at the effect size for the number of open accounts:
#ratio of the fully paid mean to the charged off mean
opn_paid_mn / opn_chrg_mn
#Practically no effect (ratio is very close to 1), bootstrap just to be sure

1.0065591585737803

In [27]:
#Bootstrap the open accounts for the two categories and see if that effect is significant
#NOTE(review): 99.5 and 0.5 are presumably the percentile bounds taken from the bootstrap
#distribution (i.e. a 99% interval) -- confirm against ut.mean_diff_bootstrap
prct = ut.mean_diff_bootstrap(opn_paid, opn_chrg, 99.5, 0.5)
prct
#No reliable effect: the returned bounds straddle zero in the recorded output,
#so a difference between the groups cannot be distinguished from none

[-0.099661985993591953, 0.21901887695575831]

In [28]:
#Derogatory public records by loan outcome
#Pull the pub_rec values for the fully paid and the charged off loans
pub_paid = loans.loc[loans.loan_status == 'Fully Paid', 'pub_rec']
pub_chrg = loans.loc[loans.loan_status == 'Charged Off', 'pub_rec']

#Mean number of derogatory public records in each group
pub_paid_mn = pub_paid.mean()
pub_chrg_mn = pub_chrg.mean()

#Display the per-status means side by side
loans.groupby('loan_status')['pub_rec'].mean()

#Derogatory public records are rare in both groups (means are well below 1)

loan_status
Charged Off    0.089026
Fully Paid     0.052892
Name: pub_rec, dtype: float64

In [29]:
#Look at the effect size for the number of derogatory public records:
#ratio of the charged off mean to the fully paid mean
pub_chrg_mn / pub_paid_mn
#Public records are rare, but the effect seems to be large

1.6831743964585009

In [31]:
#Bootstrap the public records for the two categories and see if that effect is significant
#NOTE(review): 99.5 and 0.5 are presumably the percentile bounds taken from the bootstrap
#distribution (i.e. a 99% interval) -- confirm against ut.mean_diff_bootstrap
prct = ut.mean_diff_bootstrap(pub_chrg, pub_paid, 99.5, 0.5)
prct
#It's also a reliable effect: both returned bounds are above zero in the recorded output

[0.026273587764811382, 0.046282762427432088]

In [37]:
#Revolving balance by loan outcome
#Pull the revol_bal values for the fully paid and the charged off loans
rev_bal_paid = loans.loc[loans.loan_status == 'Fully Paid', 'revol_bal']
rev_bal_chrg = loans.loc[loans.loan_status == 'Charged Off', 'revol_bal']

#Median revolving balance of each group
rev_bal_paid_med = rev_bal_paid.median()
rev_bal_chrg_med = rev_bal_chrg.median()

#Display the per-status medians side by side
loans.groupby('loan_status')['revol_bal'].median()

loan_status
Charged Off    9026.0
Fully Paid     8636.5
Name: revol_bal, dtype: float64

In [38]:
#Effect size for the revolving balance:
#ratio of the charged off median to the fully paid median
rev_bal_chrg_med / rev_bal_paid_med
#Small effect (ratio is close to 1)

1.0450992879059804

In [39]:
#Bootstrap the revolving balance medians for the two categories
#NOTE(review): 99.5 and 0.5 are presumably the percentile bounds taken from the bootstrap
#distribution (i.e. a 99% interval) -- confirm against ut.med_diff_bootstrap
prct = ut.med_diff_bootstrap(rev_bal_chrg, rev_bal_paid, 99.5, 0.5)
prct
#Not reliable: the returned bounds straddle zero in the recorded output

[-12.004999999999995, 863.0199999999968]

In [41]:
#Total number of accounts by loan outcome
#Pull the total_acc values for the fully paid and the charged off loans
tot_acc_paid = loans.loc[loans.loan_status == 'Fully Paid', 'total_acc']
tot_acc_chrg = loans.loc[loans.loan_status == 'Charged Off', 'total_acc']

#Mean total number of accounts in each group
tot_acc_paid_mn = tot_acc_paid.mean()
tot_acc_chrg_mn = tot_acc_chrg.mean()

#Display the per-status means side by side
loans.groupby('loan_status')['total_acc'].mean()

loan_status
Charged Off    21.503637
Fully Paid     22.186356
Name: total_acc, dtype: float64

In [42]:
#Effect size for the total number of accounts:
#ratio of the fully paid mean to the charged off mean
tot_acc_paid_mn  / tot_acc_chrg_mn 
#Small effect (ratio is close to 1)

1.0317490236251465

In [43]:
#Bootstrap the total account means for the two categories
#NOTE(review): 99.5 and 0.5 are presumably the percentile bounds taken from the bootstrap
#distribution (i.e. a 99% interval) -- confirm against ut.mean_diff_bootstrap
prct = ut.mean_diff_bootstrap(tot_acc_paid, tot_acc_chrg, 99.5, 0.5)
prct
#Seems to be reliable (both returned bounds are above zero), but not a very big effect

[0.24742419353047915, 1.0967098645985496]

In [46]:
#Collections in the last 12 months, excluding medical collections, by loan outcome
col_paid = loans.loc[loans.loan_status == 'Fully Paid', 'collections_12_mths_ex_med']
col_chrg = loans.loc[loans.loan_status == 'Charged Off', 'collections_12_mths_ex_med']

#Mean number of collections in each group
col_paid_mn = col_paid.mean()
col_chrg_mn = col_chrg.mean()

#Display the per-status means side by side
loans.groupby('loan_status')['collections_12_mths_ex_med'].mean()

#Both means are 0, so the field carries no signal here
#(presumably every recorded value is zero rather than the field being empty -- confirm)

loan_status
Charged Off    0
Fully Paid     0
Name: collections_12_mths_ex_med, dtype: float64

In [47]:
#Accounts currently delinquent by loan outcome
dlq_acc_paid = loans.loc[loans.loan_status == 'Fully Paid', 'acc_now_delinq']
dlq_acc_chrg = loans.loc[loans.loan_status == 'Charged Off', 'acc_now_delinq']

#Mean number of delinquent accounts in each group
dlq_acc_paid_mn = dlq_acc_paid.mean()
dlq_acc_chrg_mn = dlq_acc_chrg.mean()

#Display the per-status means side by side
loans.groupby('loan_status')['acc_now_delinq'].mean()
#Delinquent accounts look extremely rare, too rare to be a useful feature

loan_status
Charged Off    0.000000
Fully Paid     0.000118
Name: acc_now_delinq, dtype: float64

In [49]:
#Effect size is skipped: the field is too rare for a ratio to be meaningful
#(the charged off mean is 0 in the recorded output)
#Bootstrap the delinquent account means for the two categories
#NOTE(review): 99.5 and 0.5 are presumably the percentile bounds taken from the bootstrap
#distribution (i.e. a 99% interval) -- confirm against ut.mean_diff_bootstrap
prct = ut.mean_diff_bootstrap(dlq_acc_paid, dlq_acc_chrg, 99.5, 0.5)
prct
#Also not reliable: the lower returned bound is 0 in the recorded output

[0.0, 0.0002941522964703062]