In [32]:
import pandas as ps
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, roc_auc_score
from pprint import PrettyPrinter
import math
import random
np.set_printoptions(suppress=True)
pp = PrettyPrinter().pprint

In [33]:
loans = ps.read_csv('/Users/devinjackson/Documents/data/lending-club-loan-data/loan.csv')

In [4]:
drop_cols = ['sub_grade', 'emp_length', 'emp_title', 'issue_d', 'url', 'desc', 'title', 'purpose', 'zip_code', 'addr_state', 
'earliest_cr_line', 'last_pymnt_d', 'next_pymnt_d', 'last_credit_pull_d', 'application_type']
loans = loans.drop(drop_cols, axis=1)
print loans.columns

Index([u'id', u'member_id', u'loan_amnt', u'funded_amnt', u'funded_amnt_inv',
       u'term', u'int_rate', u'installment', u'grade', u'home_ownership',
       u'annual_inc', u'verification_status', u'loan_status', u'pymnt_plan',
       u'dti', u'delinq_2yrs', u'inq_last_6mths', u'mths_since_last_delinq',
       u'mths_since_last_record', u'open_acc', u'pub_rec', u'revol_bal',
       u'revol_util', u'total_acc', u'initial_list_status', u'out_prncp',
       u'out_prncp_inv', u'total_pymnt', u'total_pymnt_inv',
       u'total_rec_prncp', u'total_rec_int', u'total_rec_late_fee',
       u'recoveries', u'collection_recovery_fee', u'last_pymnt_amnt',
       u'collections_12_mths_ex_med', u'mths_since_last_major_derog',
       u'policy_code', u'annual_inc_joint', u'dti_joint',
       u'verification_status_joint', u'acc_now_delinq', u'tot_coll_amt',
       u'tot_cur_bal', u'open_acc_6m', u'open_il_6m', u'open_il_12m',
       u'open_il_24m', u'mths_since_rcnt_il', u'total_bal_il', u'il_util',
  

In [5]:
loans = loans[(loans['verification_status'] =='Not Verified') | (loans['verification_status'] =='Verified')
            | (loans['verification_status'] =='Source Verified')]
loans = loans[(loans['term'] ==' 36 months') | (loans['term'] ==' 60 months')]
loans = loans[(loans['initial_list_status'] =='f') | (loans['initial_list_status'] =='w')]
loans = loans[(loans['pymnt_plan'] =='n') | (loans['pymnt_plan'] =='y')]
loans = loans[(loans['loan_status'] =='Fully Paid') | (loans['loan_status'] =='Default')]
loans = loans[(loans['home_ownership'] =='RENT') | (loans['home_ownership'] =='OWN') | (loans['home_ownership'] =='NONE')
            | (loans['home_ownership'] =='MORTGAGE') | (loans['home_ownership'] =='OTHER')]
loans['term'] = loans['term'].map( {' 36 months': 0, ' 60 months': 1} ).astype(int)
loans['loan_status'] = loans['loan_status'].map( {'Fully Paid': 0, 'Default': 1} ).astype(int)
loans['grade'] = loans['grade'].map( {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6} ).astype(int)
loans['home_ownership'] = loans['home_ownership'].map( {'RENT': 0, 'OWN': 1,'MORTGAGE': 2, 'OTHER': 3,
                                                         'NONE': 4} ).astype(int)
loans['verification_status'] = loans['verification_status'].map( {'Verified': 0, 
 'Source Verified': 1,'Not Verified': 2} ).astype(int)
loans['pymnt_plan'] = loans['pymnt_plan'].map( {'n': 0, 'y': 1} ).astype(int)
loans['initial_list_status'] = loans['initial_list_status'].map( {'f': 0, 'w': 1} ).astype(int)
if 'verification_status_joint' in loans.columns:
    loans = loans.drop(['verification_status_joint'],axis=1)
loans = loans.drop(['total_rec_int'], axis=1)

In [6]:
np.random.seed(seed=1234)
random_idx = np.random.rand(loans.shape[0])

In [7]:
print random_idx
train = loans [ random_idx < 0.7 ]
test = loans[ random_idx >= 0.7 ]

[ 0.19151945  0.62210877  0.43772774 ...,  0.95466902  0.02534225
  0.73835725]


In [8]:
cols = list(train.columns)
cols.remove('loan_status')
train = train.fillna(0.0)
test = test.fillna(0.0)
forest = RandomForestClassifier(n_estimators = 100)
t_cols = cols
t_cols.append('loan_status')
forest = forest.fit(train[cols], train['loan_status'])
output = forest.predict(test[t_cols])

In [9]:
print roc_auc_score(test['loan_status'], output)
#print r2_score(test['int_rate'].values, output)

1.0


In [24]:
print forest.feature_importances_
important_cols = []
#print pp(zip(forest.feature_importances_, train.columns))
for level, col in zip(forest.feature_importances_, train.columns):
    if level > 0.0:
        print col, level
        important_cols.append(col)

[ 0.00584636  0.00438578  0.00405439  0.00728705  0.00585628  0.00076764
  0.00132878  0.00480416  0.00080348  0.00007236  0.0003598   0.00016939
  0.          0.00057921  0.00006126  0.00013524  0.00020099  0.00008846
  0.00054704  0.0000594   0.00021253  0.00037235  0.00033807  0.00012325
  0.25573407  0.26007531  0.01854297  0.01259424  0.03574999  0.0004924   0.
  0.          0.01542848  0.00002145  0.00005484  0.          0.          0.
  0.          0.00005401  0.0010031   0.          0.          0.          0.
  0.          0.          0.          0.00003171  0.          0.
  0.00001715  0.00104617  0.          0.          0.          0.3607008 ]


In [38]:
imp = zip(forest.feature_importances_, train.columns)
print sorted(imp, reverse=True)[:9]

[(0.36070080414565064, 'inq_last_12m'), (0.26007531389070754, 'out_prncp'), (0.25573407274462662, 'initial_list_status'), (0.035749988371634824, 'total_pymnt_inv'), (0.018542966844773068, 'out_prncp_inv'), (0.015428477863096722, 'collection_recovery_fee'), (0.012594239386881699, 'total_pymnt'), (0.0072870472725063843, 'funded_amnt'), (0.0058562799645157492, 'funded_amnt_inv')]
