In [None]:
import pandas as pd
from keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_curve, precision_score, recall_score, f1_score
import numpy as np
from keras.layers import Dense, Dropout
from keras.constraints import maxnorm
from keras.optimizers import Adam
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from keras.models import load_model
from sklearn.metrics import accuracy_score
import sys

In [2]:
# Load the Lending Club CSV into a DataFrame (the path is relative to the
# notebook's working directory).
lending_club_df = pd.read_csv("./data/final_lending_club.csv")

### Data Model
```
 'loan_amnt',
 'funded_amnt',
 'int_rate',
 'installment',
 'emp_length',
 'annual_inc',
 'zip_code',
 'dti',
 'delinq_2yrs',
 'inq_last_6mths',
 'mths_since_last_delinq',
 'mths_since_last_record',
 'open_acc',
 'pub_rec',
 'revol_bal',
 'revol_util',
 'total_acc',
 'collections_12_mths_ex_med',
 'mths_since_last_major_derog',
 'acc_now_delinq',
 'tot_coll_amt',
 'tot_cur_bal',
 'open_acc_6m',
 'open_il_6m',
 'open_il_12m',
 'open_il_24m',
 'total_bal_il',
 'il_util',
 'max_bal_bc',
 'all_util',
 'total_rev_hi_lim',
 'inq_last_12m',
 'acc_open_past_24mths',
 'avg_cur_bal',
 'bc_util',
 'chargeoff_within_12_mths',
 'delinq_amnt',
 'mo_sin_old_il_acct',
 'mo_sin_old_rev_tl_op',
 'mo_sin_rcnt_rev_tl_op',
 'mo_sin_rcnt_tl',
 'mort_acc',
 'mths_since_recent_bc',
 'mths_since_recent_bc_dlq',
 'mths_since_recent_inq',
 'mths_since_recent_revol_delinq',
 'num_accts_ever_120_pd',
 'num_actv_bc_tl',
 'num_actv_rev_tl',
 'num_bc_sats',
 'num_bc_tl',
 'num_il_tl',
 'num_op_rev_tl',
 'num_rev_accts',
 'num_rev_tl_bal_gt_0',
 'num_sats',
 'num_tl_120dpd_2m',
 'num_tl_30dpd',
 'num_tl_90g_dpd_24m',
 'num_tl_op_past_12m',
 'pct_tl_nvr_dlq',
 'percent_bc_gt_75',
 'pub_rec_bankruptcies',
 'tax_liens',
 'tot_hi_cred_lim',
 'total_bal_ex_mort',
 'total_bc_limit',
 'total_il_high_credit_limit',
 'month',
 'term_ 36 months',
 'term_ 60 months',
 'grade_A',
 'grade_B',
 'grade_C',
 'grade_D',
 'grade_E',
 'grade_F',
 'grade_G',
 'home_ownership_ANY',
 'home_ownership_MORTGAGE',
 'home_ownership_NONE',
 'home_ownership_OTHER',
 'home_ownership_OWN',
 'home_ownership_RENT',
 'purpose_car',
 'purpose_credit_card',
 'purpose_debt_consolidation',
 'purpose_educational',
 'purpose_home_improvement',
 'purpose_house',
 'purpose_major_purchase',
 'purpose_medical',
 'purpose_moving',
 'purpose_other',
 'purpose_renewable_energy',
 'purpose_small_business',
 'purpose_vacation',
 'purpose_wedding',
 'addr_state_AK',
 'addr_state_AL',
 'addr_state_AR',
 'addr_state_AZ',
 'addr_state_CA',
 'addr_state_CO',
 'addr_state_CT',
 'addr_state_DC',
 'addr_state_DE',
 'addr_state_FL',
 'addr_state_GA',
 'addr_state_HI',
 'addr_state_IA',
 'addr_state_ID',
 'addr_state_IL',
 'addr_state_IN',
 'addr_state_KS',
 'addr_state_KY',
 'addr_state_LA',
 'addr_state_MA',
 'addr_state_MD',
 'addr_state_ME',
 'addr_state_MI',
 'addr_state_MN',
 'addr_state_MO',
 'addr_state_MS',
 'addr_state_MT',
 'addr_state_NC',
 'addr_state_ND',
 'addr_state_NE',
 'addr_state_NH',
 'addr_state_NJ',
 'addr_state_NM',
 'addr_state_NV',
 'addr_state_NY',
 'addr_state_OH',
 'addr_state_OK',
 'addr_state_OR',
 'addr_state_PA',
 'addr_state_RI',
 'addr_state_SC',
 'addr_state_SD',
 'addr_state_TN',
 'addr_state_TX',
 'addr_state_UT',
 'addr_state_VA',
 'addr_state_VT',
 'addr_state_WA',
 'addr_state_WI',
 'addr_state_WV',
 'addr_state_WY'
```

### Make sure the training DataFrame's columns are in the same order as the data model (plus the `repaid` target)

In [3]:
# Select, by explicit name, the columns used for training. The list follows the
# data-model order documented above, with the 'repaid' column added (it is
# pulled out as the prediction target in a later cell).
lending_club_df_training = lending_club_df[['loan_amnt',
 'funded_amnt',
 'int_rate',
 'installment',
 'emp_length',
 'annual_inc',
 'zip_code',
 'dti',
 # 'repaid' is not part of the data model above; it is the label column.
                                            'repaid',
 'delinq_2yrs',
 'inq_last_6mths',
 'mths_since_last_delinq',
 'mths_since_last_record',
 'open_acc',
 'pub_rec',
 'revol_bal',
 'revol_util',
 'total_acc',
 'collections_12_mths_ex_med',
 'mths_since_last_major_derog',
 'acc_now_delinq',
 'tot_coll_amt',
 'tot_cur_bal',
 'open_acc_6m',
 'open_il_6m',
 'open_il_12m',
 'open_il_24m',
 'total_bal_il',
 'il_util',
 'max_bal_bc',
 'all_util',
 'total_rev_hi_lim',
 'inq_last_12m',
 'acc_open_past_24mths',
 'avg_cur_bal',
 'bc_util',
 'chargeoff_within_12_mths',
 'delinq_amnt',
 'mo_sin_old_il_acct',
 'mo_sin_old_rev_tl_op',
 'mo_sin_rcnt_rev_tl_op',
 'mo_sin_rcnt_tl',
 'mort_acc',
 'mths_since_recent_bc',
 'mths_since_recent_bc_dlq',
 'mths_since_recent_inq',
 'mths_since_recent_revol_delinq',
 'num_accts_ever_120_pd',
 'num_actv_bc_tl',
 'num_actv_rev_tl',
 'num_bc_sats',
 'num_bc_tl',
 'num_il_tl',
 'num_op_rev_tl',
 'num_rev_accts',
 'num_rev_tl_bal_gt_0',
 'num_sats',
 'num_tl_120dpd_2m',
 'num_tl_30dpd',
 'num_tl_90g_dpd_24m',
 'num_tl_op_past_12m',
 'pct_tl_nvr_dlq',
 'percent_bc_gt_75',
 'pub_rec_bankruptcies',
 'tax_liens',
 'tot_hi_cred_lim',
 'total_bal_ex_mort',
 'total_bc_limit',
 'total_il_high_credit_limit',
 'month',
 # One-hot encoded categorical columns follow (term, grade, home ownership,
 # purpose, state), judging by their name prefixes.
 'term_ 36 months',
 'term_ 60 months',
 'grade_A',
 'grade_B',
 'grade_C',
 'grade_D',
 'grade_E',
 'grade_F',
 'grade_G',
 'home_ownership_ANY',
 'home_ownership_MORTGAGE',
 'home_ownership_NONE',
 'home_ownership_OTHER',
 'home_ownership_OWN',
 'home_ownership_RENT',
 'purpose_car',
 'purpose_credit_card',
 'purpose_debt_consolidation',
 'purpose_educational',
 'purpose_home_improvement',
 'purpose_house',
 'purpose_major_purchase',
 'purpose_medical',
 'purpose_moving',
 'purpose_other',
 'purpose_renewable_energy',
 'purpose_small_business',
 'purpose_vacation',
 'purpose_wedding',
 'addr_state_AK',
 'addr_state_AL',
 'addr_state_AR',
 'addr_state_AZ',
 'addr_state_CA',
 'addr_state_CO',
 'addr_state_CT',
 'addr_state_DC',
 'addr_state_DE',
 'addr_state_FL',
 'addr_state_GA',
 'addr_state_HI',
 'addr_state_IA',
 'addr_state_ID',
 'addr_state_IL',
 'addr_state_IN',
 'addr_state_KS',
 'addr_state_KY',
 'addr_state_LA',
 'addr_state_MA',
 'addr_state_MD',
 'addr_state_ME',
 'addr_state_MI',
 'addr_state_MN',
 'addr_state_MO',
 'addr_state_MS',
 'addr_state_MT',
 'addr_state_NC',
 'addr_state_ND',
 'addr_state_NE',
 'addr_state_NH',
 'addr_state_NJ',
 'addr_state_NM',
 'addr_state_NV',
 'addr_state_NY',
 'addr_state_OH',
 'addr_state_OK',
 'addr_state_OR',
 'addr_state_PA',
 'addr_state_RI',
 'addr_state_SC',
 'addr_state_SD',
 'addr_state_TN',
 'addr_state_TX',
 'addr_state_UT',
 'addr_state_VA',
 'addr_state_VT',
 'addr_state_WA',
 'addr_state_WI',
 'addr_state_WV',
 'addr_state_WY']]

## Train Test Split

In [4]:
# Target vector: the 'repaid' column of the training frame.
y_col = lending_club_df_training['repaid']

In [32]:
# Show every column label of the raw frame except the 'repaid' target.
not_target = lending_club_df.columns != 'repaid'
lending_club_df.columns[not_target]

Index([u'loan_amnt', u'funded_amnt', u'int_rate', u'installment',
       u'emp_length', u'annual_inc', u'zip_code', u'dti', u'delinq_2yrs',
       u'inq_last_6mths',
       ...
       u'addr_state_SD', u'addr_state_TN', u'addr_state_TX', u'addr_state_UT',
       u'addr_state_VA', u'addr_state_VT', u'addr_state_WA', u'addr_state_WI',
       u'addr_state_WV', u'addr_state_WY'],
      dtype='object', length=149)

In [33]:
# Feature matrix: every column of the training frame except the 'repaid' target.
# The features are derived from the training frame itself rather than from the
# raw frame's column Index, so the explicit column order chosen when
# lending_club_df_training was built is preserved, and the result does not
# depend on the raw CSV having exactly the same set of columns.
x_cols = lending_club_df_training.drop('repaid', axis=1)

- Standardize columns

In [35]:
# Standardizer created with scikit-learn's default settings; it is fitted on the
# feature matrix in the following cell.
scaler = StandardScaler()

In [36]:
# Fit the scaler on the feature columns, then return their standardized values.
x_scaled = scaler.fit(x_cols).transform(x_cols)

- train test split

In [37]:
# Hold out 5% of the rows for testing.
# The split is done on the *unscaled* features and the scaler is then fitted on
# the training rows only, so no test-set statistics leak into the standardization
# the model is trained on.
# random_state makes the split reproducible across kernel restarts; stratify
# keeps the class ratio of y_col the same in the train and test parts.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    x_cols, y_col, test_size=.05, random_state=42, stratify=y_col)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)  # reuse the training mean/std; never re-fit on test data

# Build a Keras model

In [38]:
def build_model(input_dim=149):
    """Build and compile a Keras MLP binary classifier.

    Architecture: five ReLU hidden layers of 100, 300, 500, 300 and 100 units,
    each with a max-norm(4) kernel constraint. The first four are followed by
    dropout (rates .2, .25, .3, .2); the last hidden layer has no dropout.
    The output layer is a single sigmoid unit.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. Defaults to 149, the width that was
        previously hard-coded, so existing zero-argument callers (including
        ``KerasClassifier(build_fn=build_model)``) behave as before.

    Returns
    -------
    keras.models.Sequential
        Model compiled with a default ``Adam`` optimizer, binary cross-entropy
        loss and the accuracy metric.
    """
    # (units, dropout rate) per hidden layer; None means "no dropout after it".
    hidden_layers = [(100, .2), (300, .25), (500, .3), (300, .2), (100, None)]

    model = Sequential()
    for position, (units, dropout_rate) in enumerate(hidden_layers):
        # Only the first layer needs to be told the input width.
        first_layer_kwargs = {'input_shape': (input_dim,)} if position == 0 else {}
        model.add(Dense(units=units, activation='relu',
                        kernel_constraint=maxnorm(4), **first_layer_kwargs))
        if dropout_rate is not None:
            model.add(Dropout(dropout_rate))
    model.add(Dense(units=1, activation='sigmoid'))

    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    return model
    
        

In [39]:
# Instantiate the compiled network returned by build_model().
m = build_model()

In [40]:
# Print the layer-by-layer summary (output shapes and parameter counts).
m.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_13 (Dense)             (None, 100)               15000     
_________________________________________________________________
dropout_9 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_14 (Dense)             (None, 300)               30300     
_________________________________________________________________
dropout_10 (Dropout)         (None, 300)               0         
_________________________________________________________________
dense_15 (Dense)             (None, 500)               150500    
_________________________________________________________________
dropout_11 (Dropout)         (None, 500)               0         
_________________________________________________________________
dense_16 (Dense)             (None, 300)               150300    
__________

- Train the model

In [41]:
# Training hyper-parameters: number of epochs and mini-batch size.
epo, batch_size_in = 5, 64

In [42]:
# Train the network; 5% of the training rows are set aside for per-epoch validation.
history = m.fit(
    x=X_train,
    y=y_train,
    batch_size=batch_size_in,
    epochs=epo,
    validation_split=.05,
)

Train on 1200507 samples, validate on 63185 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [43]:
# Dump each metric recorded during training together with its per-epoch values.
for metric_name, per_epoch_values in history.history.items():
    print(metric_name, per_epoch_values)

('acc', [0.75166991945763839, 0.75896183862243694, 0.76028627904809554, 0.76072026235562273, 0.76132917175796755])
('loss', [0.51521353016437998, 0.5058070183706298, 0.50409075120996716, 0.50333969502593545, 0.50277405786718954])
('val_acc', [0.7600221571632132, 0.76263353644627097, 0.76219039329426319, 0.76261770990516142, 0.76226952599981046])
('val_loss', [0.507030595362993, 0.50091415637560333, 0.50116134368232834, 0.50088657361660949, 0.50688658172668999])


In [44]:
# Training accuracy after the final epoch. 'acc' is the key this Keras version
# records (see the printed history); NOTE(review): other Keras versions may name
# it 'accuracy' — confirm when upgrading.
history.history['acc'][-1]

0.76132917175796755

## Test predictions

In [45]:
# Model outputs for the held-out test rows. The network ends in a single sigmoid
# unit, so these are presumably probabilities of shape (n_samples, 1) — TODO confirm.
predictions = m.predict(X_test)

In [48]:
# Held-out accuracy: round the model outputs to 0/1 labels and compare them with
# the true labels (about 76% in the recorded run).
accuracy_score(y_test, np.round(predictions))


0.76008479800333784

In [None]:
# NOTE(review): looks like a leftover debug/sanity-check cell; nothing later in
# the notebook uses it, so it can probably be removed.
print('HI')

## Save / load model

In [None]:
# get final accuracies
train_acc=history.history['acc'][-1]
val_acc=history.history['val_acc'][-1]
# Save the model
m.save('models/mlp_model_{}-epochs_{}-batchsize_{}-train_acc_{}-val_acc.h5'.format(epo, batch_size_in, train_acc, val_acc)) # save weigts to h5py
# serialize model to JSON
model_json=m.to_json()
with open('models/mlp_model_{}-epochs_{}-batchsize_{}-train_acc_{}-val_acc.h5'.format(epo, batch_size_in, train_acc, val_acc),  "w") as json_file:
    json_file.write(model_json)

# Extra: cross-validated baseline with a scikit-learn pipeline

In [None]:
# Baseline via 3-fold stratified cross-validation. Standardization lives inside
# the pipeline, so it is re-fitted on the training part of every fold.
estimators = [
    ('standardize', StandardScaler()),
    ('mlp', KerasClassifier(build_fn=build_model, epochs=2, batch_size=32, verbose=1)),
]
pipeline = Pipeline(estimators)
kfold = StratifiedKFold(n_splits=3, shuffle=True)
results = cross_val_score(pipeline, x_cols, y_col, cv=kfold)
# Mean and standard deviation of the per-fold scores, as percentages.
mean_pct = results.mean()*100
std_pct = results.std()*100
print("Baseline: %.2f%% (%.2f%%)" % (mean_pct, std_pct))