In [1]:
# Import pandas 
import pandas as pd

In [2]:
# Read in lending club data 
lcdata = pd.read_csv("/tmp/lcdata/lc-2015-loans-2017-05-17T13-54.csv", low_memory=False)

# Print info on lending club data
print(lcdata.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 421095 entries, 0 to 421094
Columns: 133 entries, id to adjusted_dti
dtypes: float64(64), int64(46), object(23)
memory usage: 427.3+ MB
None


In [3]:
print(lcdata.head())

lcdata.dtypes

         id  member_id  loan_amnt  funded_amnt  funded_amnt_inv        term  \
0  66624733        NaN      18000        18000          18000.0   60 months   
1  68367011        NaN      21000        21000          21000.0   60 months   
2  68615044        NaN      16000        16000          16000.0   60 months   
3  68476697        NaN      15700        15700          15700.0   60 months   
4  68516838        NaN      23850        23850          23850.0   60 months   

   int_rate  installment grade sub_grade     ...      sec_app_open_il_6m  \
0     19.48       471.70     E        E2     ...                     NaN   
1     13.99       488.53     C        C4     ...                     NaN   
2     13.99       372.21     C        C4     ...                     NaN   
3     16.59       386.74     D        D2     ...                     NaN   
4     17.27       596.21     D        D3     ...                     NaN   

  sec_app_num_rev_accts sec_app_chargeoff_within_12_mths  \
0       

id                                       int64
member_id                              float64
loan_amnt                                int64
funded_amnt                              int64
funded_amnt_inv                        float64
term                                    object
int_rate                               float64
installment                            float64
grade                                   object
sub_grade                               object
emp_title                               object
emp_length                              object
home_ownership                          object
annual_inc                             float64
verification_status                     object
issue_d                                 object
loan_status                             object
pymnt_plan                              object
url                                     object
desc                                    object
purpose                                 object
title        

In [4]:
# Groups of columns we want to use in our model
applicant_numeric=['annual_inc','dti','age_earliest_cr','loan_amnt', 'installment']
applicant_text=['emp_title','title']
applicant_categorical=['application_type', 'emp_length', 'home_ownership', 'addr_state', 'term']
credit_numeric=['acc_now_delinq','acc_open_past_24mths','avg_cur_bal','bc_open_to_buy','bc_util','delinq_2yrs','delinq_amnt','fico_range_high','fico_range_low','last_fico_range_high','last_fico_range_low','open_acc','pub_rec','revol_util','revol_bal','tot_coll_amt','tot_cur_bal','total_acc','total_rev_hi_lim','num_accts_ever_120_pd','num_actv_bc_tl','num_actv_rev_tl','num_bc_sats','num_bc_tl','num_il_tl', 'num_rev_tl_bal_gt_0','pct_tl_nvr_dlq','percent_bc_gt_75','tot_hi_cred_lim','total_bal_ex_mort','total_bc_limit','total_il_high_credit_limit','total_rev_hi_lim','all_util', 'loan_to_income', 'installment_pct_inc','il_util','il_util_ex_mort','total_bal_il','total_cu_tl']


In [5]:
# X is only the columns we want to use from the source data
X=lcdata.loc[:, applicant_numeric + applicant_text + applicant_categorical + credit_numeric]

def encode_categorical(x, categorical_cols):
    label_data = None
    cat_data = None
    for col in categorical_cols:
        # use get_dummies() to do one hot encoding of categorical column
        x = x.merge(pd.get_dummies(x[col]), left_index=True, right_index=True)
        
        # drop the original categorical column
        x.drop(col, axis=1, inplace=True)
        
    print(x.shape)
    return x

X = encode_categorical(X, applicant_categorical)

print(X.sample())

(421095, 116)
       annual_inc    dti  age_earliest_cr  loan_amnt  installment emp_title  \
74745     48000.0  31.83             5768       8000       240.92   Teacher   

                    title  acc_now_delinq  acc_open_past_24mths  avg_cur_bal  \
74745  Debt consolidation               0                     3         1732   

          ...      TX  UT  VA  VT  WA  WI  WV  WY   36 months   60 months  
74745     ...       0   0   1   0   0   0   0   0           1           0  

[1 rows x 116 columns]


In [6]:
# Specify the target labels and flatten the array 
y=pd.get_dummies(lcdata.grade)
print(y.sample())

        A  B  C  D  E  F  G
286509  0  0  0  1  0  0  0


In [12]:
# Import `train_test_split` from `sklearn.model_selection`
from sklearn.model_selection import train_test_split

# Split the data up in train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

print(X_train)


        annual_inc    dti  age_earliest_cr  loan_amnt  installment  \
345902     40000.0   6.84             5768      13500       293.46   
226292     72000.0  33.26             9725      12000       275.07   
93777     160000.0  22.23            16604      24000       750.86   
86618      70000.0  21.31             7229      18375       497.01   
28629      70018.0  26.83             4611      30000       676.32   
229439    120000.0   8.46             7168      11200       256.74   
157154     81000.0  21.78            10059      23000       519.68   
419543     50000.0  19.20             3150      10175       357.68   
30722      36000.0   7.37             6438       6000       183.19   
185667    120000.0  12.73             7017      25000       874.12   
121059     78000.0  20.95             9847      15750       543.29   
280555     98000.0  18.47             5556      30000       652.13   
108466     51000.0   9.55             6407       9000       290.37   
124166     21000.0  

In [11]:
# Import `Sequential` from `keras.models`
from keras.models import Sequential

# Import `Dense` from `keras.layers`
from keras.layers import Dense

print(y.shape[1])

def baseline_model():
    # create model
    model = Sequential()
    # input layer
    model.add(Dense(100, input_shape=X, activation='relu'))
    
    # hidden layer
    model.add(Dense(100, input_dim=X.shape[1], activation='relu'))
    
    # output layer
    model.add(Dense(y.shape[1], activation='softmax'))
    
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

model = baseline_model()
model.fit(X_train, y_train,epochs=20, batch_size=1, verbose=1)

7


ValueError: invalid literal for int() with base 10: 'annual_inc'