In [1]:
# imports

import pandas as pd
import re


In [13]:
# functions

def drop_sparse_columns(df):
    column_names = df.columns.values
    for name in column_names:
        try:
            if df[name].isnull().sum() > 100000:
                df.drop(labels=name, axis=1, inplace=True)
        except ValueError:
            pass
    return df    


def filter_set_status(df):
    def set_status(s):
        if s == 'Fully Paid':
            return 'Paid'
        if s == 'Charged Off':
            return 'Not Paid'
        if s == 'Default':
            return 'Not Paid'
        if s == 'Does not meet the credit policy. Status:Fully Paid':
            return 'Paid'
        if s == 'Does not meet the credit policy. Status:Charged Off':
            return 'Paid'
       
    df = df[(df['loan_status'] == 'Fully Paid') | (df['loan_status'] == 'Charged Off') |
            (df['loan_status'] == 'Default')]  
    
    df['loan_status'] = df['loan_status'].apply(set_status)  
    return df


In [3]:
# columns 19 and 55 have mixed data types so we force them to strings for now
loan_df = pd.read_csv('loan.csv', dtype={19:'str', 55:'str'})

## General Info



In [4]:
print "Number of observations: %s" % loan_df.shape[0]
print "Number of columns: %s" % loan_df.shape[1]

Number of observations: 887379
Number of columns: 74


## Response Variable

This variable is `loan_status`. The possible values of this variable are

In [5]:
loan_df['loan_status'].unique()

array(['Fully Paid', 'Charged Off', 'Current', 'Default',
       'Late (31-120 days)', 'In Grace Period', 'Late (16-30 days)',
       'Does not meet the credit policy. Status:Fully Paid',
       'Does not meet the credit policy. Status:Charged Off', 'Issued'], dtype=object)

It is probably prudent to simplify these categories. First let's define them (https://help.lendingclub.com/hc/en-us/articles/215488038-What-do-the-different-Note-statuses-mean-):

- Fully Paid : Fully Paid
- Charged Off : Loan past due for at least 121 days but not more than 151 days.
- Current : All payments made on time to date
- Default : Loan past due for 121 days.
- Late (31-120 days) : 
- In Grace Period : This is late 1-15 days I think.
- Late (16-30 days) : 
- Does not meet the credit policy. Status:Fully Paid : From what I can tell, LC policy changed since loan origination.
- Does not meet the credit policy. Status:Charged Off : But status is still correct.
- Issued : Loan has been issued.

Now, the important thing to be able to predict is which loans will default. Perhaps we can even assign a probability of default to the prediction. It seems we will have to filter some of the data since we don't know whether someone who is 1-120 days late will actually default. We also don't know if someone who is current will actually make all payments. It makes sense to map as follows:

- Fully Paid => Paid
- Charged off, Default => Not Paid

For the columns that begin with "Does not meet..." we will use the status. We may also want to include Late 31-120 days in the default category. We should perhaps try it both ways and compare our accuracy.



In [6]:
loan_df =  filter_set_status(loan_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


## Cleaning

In [7]:
loan_df.shape

(254190, 74)

In [12]:
loan_df.columns.values

array(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title',
       'emp_length', 'home_ownership', 'annual_inc', 'verification_status',
       'issue_d', 'loan_status', 'desc', 'purpose', 'zip_code',
       'addr_state', 'dti', 'delinq_2yrs', 'earliest_cr_line',
       'inq_last_6mths', 'mths_since_last_delinq',
       'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal',
       'revol_util', 'total_acc', 'initial_list_status',
       'last_credit_pull_d', 'collections_12_mths_ex_med',
       'mths_since_last_major_derog', 'policy_code', 'application_type',
       'annual_inc_joint', 'dti_joint', 'verification_status_joint',
       'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
       'open_il_6m', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
       'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m',
       'max_bal_bc', 'all_util', 'total_rev_hi_lim', 'inq_fi',
      

In [11]:
# There are some columns we should drop
loan_df.drop(labels=['collection_recovery_fee',
                    'last_pymnt_amnt',
                    'last_pymnt_d',
                    'next_pymnt_d',
                    'out_prncp',
                    'out_prncp_inv',
                    'pymnt_plan',
                    'recoveries',
                    'term',
                    'title',
                    'total_pymnt',
                    'total_pymnt_inv',
                    'total_rec_int',
                    'total_rec_late_fee',
                    'total_rec_prncp',
                    'url'], axis=1, inplace=True)

loan_df = drop_sparse_columns(loan_df)

In [25]:
loan_df.iloc[:, 0:12].describe(include='all')

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,installment,grade,sub_grade,emp_title
count,254190.0,254190.0,254190.0,254190.0,254190.0,254190.0,254190.0,254190,254190,240183
unique,,,,,,,,7,35,133191
top,,,,,,,,B,B3,Manager
freq,,,,,,,,76263,18106,1959
mean,12340670.0,13661690.0,13570.590897,13530.210374,13437.613928,13.776014,418.273195,,,
std,14797260.0,15764940.0,8133.386869,8111.02087,8129.476448,4.404088,244.912615,,,
min,54734.0,70699.0,500.0,500.0,0.0,5.32,15.69,,,
25%,1462514.0,1709671.0,7250.0,7200.0,7200.0,10.74,239.56,,,
50%,6321818.0,7660992.0,12000.0,12000.0,11975.0,13.53,365.23,,,
75%,16552080.0,18585340.0,18250.0,18125.0,18000.0,16.55,547.55,,,


In [29]:
loan_df.iloc[:, 13:25].describe(include='all')

Unnamed: 0,verification_status,issue_d,loan_status,purpose,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,open_acc,pub_rec
count,254190,254190,254190,254190,254190,254190,254190.0,254190.0,254190,254190.0,254190.0,254190.0
unique,3,103,2,14,886,51,,,645,,,
top,Verified,Oct-2014,Paid,debt_consolidation,945xx,CA,,,Oct-2000,,,
freq,93375,8808,207723,149153,3303,43321,,,2119,,,
mean,,,,,,,16.560534,0.250533,,0.852508,10.944227,0.143794
std,,,,,,,7.79439,0.74239,,1.067088,4.893242,0.436923
min,,,,,,,0.0,0.0,,0.0,0.0,0.0
25%,,,,,,,10.77,0.0,,0.0,7.0,0.0
50%,,,,,,,16.22,0.0,,1.0,10.0,0.0
75%,,,,,,,22.01,0.0,,1.0,14.0,0.0


In [28]:
loan_df.iloc[:, 26:].describe(include='all')

Unnamed: 0,revol_util,total_acc,initial_list_status,last_credit_pull_d,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,tot_coll_amt,tot_cur_bal,total_rev_hi_lim
count,253990.0,254190.0,254190,254169,254134.0,254190.0,254190,254190.0,190464.0,190464.0,190464.0
unique,,,2,102,,,2,,,,
top,,,f,Jan-2016,,,INDIVIDUAL,,,,
freq,,,182747,103213,,,254189,,,,
mean,54.330606,25.037291,,,0.006772,1.0,,0.003124,203.3825,138160.5,29694.85
std,24.785064,11.749435,,,0.089273,0.0,,0.060602,21035.5,152328.4,29499.8
min,0.0,2.0,,,0.0,1.0,,0.0,0.0,0.0,0.0
25%,36.3,16.0,,,0.0,1.0,,0.0,0.0,28355.75,13300.0
50%,55.8,23.0,,,0.0,1.0,,0.0,0.0,80760.5,22300.0
75%,73.9,32.0,,,0.0,1.0,,0.0,0.0,207990.5,36800.0


In [27]:
loan_df.shape

(254190, 37)

So we have removed columns with more than 100,000 missing values. We have also removed columns for extraneous reasons. Now of the columns left some comments:

- Employee title has 240,183 values of which 133,191 are Manager
- revol_util has only 253,990 values
- last_credit_pull_d has only 254,196 values
- collections_12_mths_ex_med has only 