In [1]:
#Import some modules
%matplotlib inline
import numpy as np
import pandas as pd


#Load in the data
#The first line of the file and its last two lines are skipped (skiprows / skipfooter).
#skipfooter is not supported by pandas' default C parser, so the python engine is
#requested explicitly rather than relying on the automatic fallback and its ParserWarning.
loans = pd.read_csv('LoanStats3a.csv', skiprows = 1,
                    parse_dates = ['issue_d', 'earliest_cr_line', 'last_pymnt_d', 'last_credit_pull_d'],
                    skipfooter = 2, engine = 'python')

#df1 = pd.read_csv('LoanStats3a.csv', skiprows = 1, 
#                  parse_dates = ['issue_d', 'earliest_cr_line', 'last_pymnt_d', 'last_credit_pull_d'],
#                 skipfooter = 2)
#df2 = pd.read_csv('LoanStats3b.csv', skiprows = 1, 
#                  parse_dates = ['issue_d', 'earliest_cr_line', 'last_pymnt_d', 'last_credit_pull_d'],
#                 skipfooter = 2, infer_datetime_format=True)
#df3 = pd.read_csv('LoanStats3c.csv', skiprows = 1, 
#                  parse_dates = ['issue_d', 'earliest_cr_line', 'last_pymnt_d', 'last_credit_pull_d'],
#                 skipfooter = 2, infer_datetime_format=True)
#df4 = pd.read_csv('LoanStats3d.csv', skiprows = 1, 
#                  parse_dates = ['issue_d', 'earliest_cr_line', 'last_pymnt_d', 'last_credit_pull_d'],
#                 skipfooter = 2, infer_datetime_format=True)

#loans = pd.concat([df1, df2, df3, df4])

#del df1
#del df2
#del df3
#del df4



In [2]:
#Start with some light data cleaning
#Share of loans in each status category (fractions rather than raw counts)
loans['loan_status'].value_counts(normalize=True)


Fully Paid                                             0.753150
Charged Off                                            0.130830
Current                                                0.048900
Does not meet the credit policy. Status:Fully Paid     0.046643
Does not meet the credit policy. Status:Charged Off    0.017891
Late (31-120 days)                                     0.001552
In Grace Period                                        0.000729
Late (16-30 days)                                      0.000165
Does not meet the credit policy. Status:Current        0.000094
Default                                                0.000024
dtype: float64

In [4]:
#Fold the less common statuses into the two outcomes of interest.
#Only the loan_status column is recoded: a frame-wide replace would also rewrite any
#other column that happens to contain one of these exact strings.
status_map = {
    'Does not meet the credit policy. Status:Fully Paid': 'Fully Paid',
    'Does not meet the credit policy. Status:Charged Off': 'Charged Off',
    #Assign default to charged off? They seem equivalent from the lender's perspective
    'Default': 'Charged Off',
}
loans['loan_status'] = loans['loan_status'].replace(status_map)

#Status shares after the recoding
loans.loan_status.value_counts(normalize = True)

Fully Paid                                         0.799793
Charged Off                                        0.148745
Current                                            0.048900
Late (31-120 days)                                 0.001552
In Grace Period                                    0.000729
Late (16-30 days)                                  0.000165
Does not meet the credit policy. Status:Current    0.000094
dtype: float64

In [5]:
#Keep only the loans whose status is 'Fully Paid' or 'Charged Off'
#Restricting to these two categories keeps the analysis simple
loans = loans[loans.loan_status.isin(['Fully Paid', 'Charged Off'])]

In [16]:
#List the column labels of the loans table
loans.columns

Index([u'id', u'member_id', u'loan_amnt', u'funded_amnt', u'funded_amnt_inv',
       u'term', u'int_rate', u'installment', u'grade', u'sub_grade',
       u'emp_title', u'emp_length', u'home_ownership', u'annual_inc',
       u'verification_status', u'issue_d', u'loan_status', u'pymnt_plan',
       u'url', u'desc', u'purpose', u'title', u'zip_code', u'addr_state',
       u'dti', u'delinq_2yrs', u'earliest_cr_line', u'inq_last_6mths',
       u'mths_since_last_delinq', u'mths_since_last_record', u'open_acc',
       u'pub_rec', u'revol_bal', u'revol_util', u'total_acc',
       u'initial_list_status', u'out_prncp', u'out_prncp_inv', u'total_pymnt',
       u'total_pymnt_inv', u'total_rec_prncp', u'total_rec_int',
       u'total_rec_late_fee', u'recoveries', u'collection_recovery_fee',
       u'last_pymnt_d', u'last_pymnt_amnt', u'next_pymnt_d',
       u'last_credit_pull_d', u'collections_12_mths_ex_med',
       u'mths_since_last_major_derog', u'policy_code', u'application_type',
       u'annu

In [21]:
#Examine how annual income relates to loan status
#Split annual income into the fully paid and charged off groups
inc_paid = loans.loc[loans.loan_status == 'Fully Paid', 'annual_inc']
inc_chrg = loans.loc[loans.loan_status == 'Charged Off', 'annual_inc']

#Median income within each group
inc_paid_med, inc_chrg_med = inc_paid.median(), inc_chrg.median()

#Per-status medians, displayed as the cell output
loans.groupby('loan_status')['annual_inc'].median()

loan_status
Charged Off    53000
Fully Paid     60000
Name: annual_inc, dtype: float64

In [22]:
#Effect size for the incomes: ratio of the fully paid median to the charged off median
inc_paid_med / inc_chrg_med

1.1320754716981132

In [23]:
#Bootstrap the median incomes for the two categories and see if that effect is significant
#Seed the global numpy generator so the interval below is reproducible on a fresh run
np.random.seed(0)
#Number of bootstrap replicates
r = 10000
#Resample each group with replacement by position (.iloc). Looking rows up by index
#label instead would return extra rows whenever the index contains duplicate labels.
inc_paid_boot = [inc_paid.iloc[np.random.randint(0, len(inc_paid), len(inc_paid))].median() for i in range(r)]
inc_chrg_boot = [inc_chrg.iloc[np.random.randint(0, len(inc_chrg), len(inc_chrg))].median() for i in range(r)]
#Difference of medians (fully paid minus charged off) for every replicate
diff = np.array(inc_paid_boot) - np.array(inc_chrg_boot)
#Bounds of the central 99% of the bootstrap differences
prct = np.percentile(diff, [0.5, 99.5])
prct

[5136.4475000000002, 8000.0]

In [24]:
#Examine how dti relates to loan status
#Split dti into the fully paid and charged off groups
dti_paid = loans.loc[loans.loan_status == 'Fully Paid', 'dti']
dti_chrg = loans.loc[loans.loan_status == 'Charged Off', 'dti']

#Median dti within each group
dti_paid_med, dti_chrg_med = dti_paid.median(), dti_chrg.median()

#Per-status medians, displayed as the cell output
loans.groupby('loan_status')['dti'].median()

loan_status
Charged Off    14.33
Fully Paid     13.22
Name: dti, dtype: float64

In [26]:
#Effect size for the dtis: ratio of the charged off median to the fully paid median
dti_chrg_med / dti_paid_med

1.083963691376702

In [28]:
#Bootstrap the dti for the two categories and see if that effect is significant
#Seed the global numpy generator so the interval below is reproducible on a fresh run
np.random.seed(0)
#Number of bootstrap replicates
r = 10000
#Resample each group with replacement by position (.iloc). Looking rows up by index
#label instead would return extra rows whenever the index contains duplicate labels.
dti_paid_boot = [dti_paid.iloc[np.random.randint(0, len(dti_paid), len(dti_paid))].median() for i in range(r)]
dti_chrg_boot = [dti_chrg.iloc[np.random.randint(0, len(dti_chrg), len(dti_chrg))].median() for i in range(r)]
#Difference of medians (charged off minus fully paid) for every replicate
diff = np.array(dti_chrg_boot) - np.array(dti_paid_boot)
#Bounds of the central 99% of the bootstrap differences
prct = np.percentile(diff, [0.5, 99.5])
prct

[0.76497500000000063, 1.4199999999999999]

In [31]:
#Examine how delinquencies in the last two years relate to loan status
#Split the delinquency counts into the fully paid and charged off groups
dlq_paid = loans.loc[loans.loan_status == 'Fully Paid', 'delinq_2yrs']
dlq_chrg = loans.loc[loans.loan_status == 'Charged Off', 'delinq_2yrs']

#Mean number of delinquencies within each group
dlq_paid_mn, dlq_chrg_mn = dlq_paid.mean(), dlq_chrg.mean()

#Per-status means, displayed as the cell output
loans.groupby('loan_status')['delinq_2yrs'].mean()

loan_status
Charged Off    0.175838
Fully Paid     0.148762
Name: delinq_2yrs, dtype: float64

In [32]:
#Effect size for the delinquencies: ratio of the charged off mean to the fully paid mean
dlq_chrg_mn / dlq_paid_mn

1.1820129711672762

In [37]:
#Bootstrap the delinquencies for the two categories and see if that effect is significant
#Seed the global numpy generator so the interval below is reproducible on a fresh run
np.random.seed(0)
#Number of bootstrap replicates
r = 10000
#Resample each group with replacement by position (.iloc). Looking rows up by index
#label instead would return extra rows whenever the index contains duplicate labels.
dlq_paid_boot = [dlq_paid.iloc[np.random.randint(0, len(dlq_paid), len(dlq_paid))].mean() for i in range(r)]
dlq_chrg_boot = [dlq_chrg.iloc[np.random.randint(0, len(dlq_chrg), len(dlq_chrg))].mean() for i in range(r)]
#Difference of means (charged off minus fully paid) for every replicate
diff = np.array(dlq_chrg_boot) - np.array(dlq_paid_boot)
#Bounds of the central 99% of the bootstrap differences
prct = np.percentile(diff, [0.5, 99.5])
prct

[0.0085294559882389907, 0.046512669526453311]

In [35]:
#Examine how inquiries in the last six months relate to loan status
#Split the inquiry counts into the fully paid and charged off groups
inq_paid = loans.loc[loans.loan_status == 'Fully Paid', 'inq_last_6mths']
inq_chrg = loans.loc[loans.loan_status == 'Charged Off', 'inq_last_6mths']

#Mean number of inquiries within each group
inq_paid_mn, inq_chrg_mn = inq_paid.mean(), inq_chrg.mean()

#Per-status means, displayed as the cell output
loans.groupby('loan_status')['inq_last_6mths'].mean()

loan_status
Charged Off    1.493833
Fully Paid     1.022151
Name: inq_last_6mths, dtype: float64

In [36]:
#Effect size for the inquiries: ratio of the charged off mean to the fully paid mean
inq_chrg_mn / inq_paid_mn

1.4614602579371656

In [38]:
#Bootstrap the inquiries for the two categories and see if that effect is significant
#Seed the global numpy generator so the interval below is reproducible on a fresh run
np.random.seed(0)
#Number of bootstrap replicates
r = 10000
#Resample each group with replacement by position (.iloc). Looking rows up by index
#label instead would return extra rows whenever the index contains duplicate labels.
inq_paid_boot = [inq_paid.iloc[np.random.randint(0, len(inq_paid), len(inq_paid))].mean() for i in range(r)]
inq_chrg_boot = [inq_chrg.iloc[np.random.randint(0, len(inq_chrg), len(inq_chrg))].mean() for i in range(r)]
#Difference of means (charged off minus fully paid) for every replicate
diff = np.array(inq_chrg_boot) - np.array(inq_paid_boot)
#Bounds of the central 99% of the bootstrap differences
prct = np.percentile(diff, [0.5, 99.5])
prct

[0.40614719557108542, 0.53920507414845242]