In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the training set and report its (rows, columns) shape.
train_path = "input/train_indessa.csv"
data = pd.read_csv(train_path)
print("Shape: ", data.shape)

Shape:  (532428, 45)


## Preprocessing

In [3]:
# Fill missing values without changing column dtypes: numeric columns get the
# number 0, every other column gets the string "0". Filling all columns with
# the string "0" would turn numeric columns that contain NaN into mixed
# float/str object columns.
numeric_cols = set(data.select_dtypes(include="number").columns)
fill_values = {col: (0 if col in numeric_cols else "0") for col in data.columns}
data = data.fillna(value=fill_values)

In [4]:
# Store the total_rev_hi_lim column as integers.
data = data.astype({'total_rev_hi_lim': int})

In [5]:
# Values (36months, 60months) into int.
# The pattern is a raw string so that \d reaches the regex engine instead of
# being parsed as an invalid string escape. astype(int) raises if any value
# has no digits, so no NaN back-fill is needed after the conversion.
data['term'] = data['term'].str.extract(r'(\d+)', expand=False).astype(int)

In [6]:
# Values (126th weeks, ...) into int.
# Raw-string pattern avoids the invalid "\d" string escape. Entries without
# any digits come back from extract as NaN and are mapped to week 0; only
# this column is filled rather than the whole frame.
last_week_digits = data['last_week_pay'].str.extract(r'(\d+)', expand=False)
data['last_week_pay'] = last_week_digits.fillna("0").astype(int)

In [7]:
# Derived feature: last_week_pay as a percentage of (term / 12 * 52 + 1),
# truncated to an integer and placed as the first column of the frame.
term_in_weeks = data['term'] / 12 * 52 + 1
completion_pct = (data['last_week_pay'] / term_in_weeks) * 100
data.insert(0, 'payment_completion', completion_pct.astype(int))

In [8]:
# Drop irrelevant features and text features.
# The columns are named through the `columns=` keyword because pandas 2.x no
# longer accepts the axis as a positional argument of DataFrame.drop; the
# result is reassigned instead of using inplace=True.
drop_cols = ['member_id', 'batch_enrolled', 'desc', 'title', 'emp_title']
data = data.drop(columns=drop_cols)

In [9]:
# Replace each categorical column by integer codes. The fitted encoder of
# every column is kept in `le`, keyed by column name, and an extra 'other'
# entry is appended to its classes_ after fitting.
from sklearn.preprocessing import LabelEncoder

cat_cols = [
    'grade', 'sub_grade', 'emp_length', 'home_ownership',
    'verification_status', 'pymnt_plan', 'purpose', 'initial_list_status',
    'application_type', 'verification_status_joint', 'zip_code', 'addr_state',
]
le = {}

for col in cat_cols:
    encoder = LabelEncoder()
    data[col] = encoder.fit_transform(data[col])
    encoder.classes_ = np.append(encoder.classes_, 'other')
    le[col] = encoder

    print('Encoded: ', col)

Encoded:  grade
Encoded:  sub_grade
Encoded:  emp_length
Encoded:  home_ownership
Encoded:  verification_status
Encoded:  pymnt_plan
Encoded:  purpose
Encoded:  initial_list_status
Encoded:  application_type
Encoded:  verification_status_joint
Encoded:  zip_code
Encoded:  addr_state


## Split Training Set

In [151]:
# Feature columns fed to the network.
major = ['tot_cur_bal', 'last_week_pay', 'total_rev_hi_lim', 'int_rate', 'payment_completion']

data = data.dropna()

# Only rows with a non-negative loan_status are used.
labelled_rows = data[data['loan_status'] >= 0]

# Ordered 80/20 split. The cut-off is computed on the rows that are actually
# used, so the ratio still holds if dropna or the loan_status filter removes
# rows.
split = int(len(labelled_rows) * 0.8)

# iloc[:, :-1] drops the last column before the `major` features are selected.
feature_frame = labelled_rows.iloc[:, :-1][major]
X_train = feature_frame.iloc[:split]
X_test = feature_frame.iloc[split:]

In [152]:
# Sanity check: (rows, columns) of the training feature frame.
X_train.shape

(425942, 5)

In [176]:
# One-hot encode the target: the last column of the rows whose loan_status is
# non-negative, then cut it at the same `split` index as the features.
from sklearn.preprocessing import OneHotEncoder

oht_target = OneHotEncoder()
target_column = data[data['loan_status'] >= 0].iloc[:, -1:]
encoded_target = oht_target.fit_transform(target_column)
Y = pd.DataFrame(encoded_target.todense())
Y_train, Y_test = Y[:split], Y[split:]

In [120]:
# Raw (not one-hot) labels taken from the last column. They are deliberately
# not named Y_train / Y_test: when the notebook is run top to bottom, binding
# those names here would replace the one-hot targets built in the previous
# cell, which the two-unit softmax model is trained and evaluated on.
Y_train_labels = data[data['loan_status'] >= 0].iloc[:split, -1:]
Y_test_labels = data[data['loan_status'] >= 0].iloc[split:, -1:]

In [177]:
# Sanity check: (rows, columns) of the training target frame.
Y_train.shape

(425942, 2)

## ANN with Keras

In [124]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers.advanced_activations import PReLU

In [197]:
# Start from an empty Sequential model; the layers are added in the next cell.
model = Sequential()

In [198]:
# Fully connected classifier: 5 input features -> five hidden layers of 40
# ReLU units each -> 2-unit softmax output.
# Note: this cell appends to the existing `model`, so re-running it without
# re-creating the model stacks the layers a second time.

# First hidden layer declares the input width.
model.add(Dense(units=40, input_dim=5))
model.add(Activation('relu'))

# Remaining four hidden layers are identical.
for _ in range(4):
    model.add(Dense(units=40))
    model.add(Activation('relu'))

# Output layer: one probability per class.
model.add(Dense(units=2))
model.add(Activation('softmax'))

model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [199]:
# Alternative single-output architecture (sigmoid + binary cross-entropy) kept
# as a string literal: nothing in it is executed, the cell only displays the
# text as its output.
'''model.add(Dense(units=20, input_dim=5))
model.add(Activation('relu'))
model.add(Dense(units=20))
model.add(Activation('relu'))
model.add(Dense(units=20))
model.add(Activation('relu'))
model.add(Dense(units=1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])'''

"model.add(Dense(units=20, input_dim=5))\nmodel.add(Activation('relu'))\nmodel.add(Dense(units=20))\nmodel.add(Activation('relu'))\nmodel.add(Dense(units=20))\nmodel.add(Activation('relu'))\nmodel.add(Dense(units=1))\nmodel.add(Activation('sigmoid'))\n\nmodel.compile(loss='binary_crossentropy',\n              optimizer='rmsprop',\n              metrics=['accuracy'])"

In [200]:
# Train for 10 epochs with mini-batches of 32 rows.
model.fit(x=X_train.values, y=Y_train.values, batch_size=32, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fc522215048>

In [203]:
# Score the held-out rows with the compiled loss and metrics.
loss_and_metrics = model.evaluate(x=X_test.values, y=Y_test.values, batch_size=128)



In [204]:
# Display the evaluation result; the order of the values follows
# model.metrics_names.
# NOTE(review): the recorded run shows loss ~11.43 and accuracy ~0.25 on the
# held-out rows, which looks poor for a two-class problem — worth investigating
# (the input features are not scaled before training).
loss_and_metrics

[11.434294727257303, 0.25058693162326001]

In [205]:
# Names of the values returned by model.evaluate, in order.
model.metrics_names

['loss', 'acc']

## Prediction

In [206]:
# Load the submission set and keep a copy of its member ids for the output file.
test_path = "input/test_indessa.csv"
data_test = pd.read_csv(test_path)
rows = data_test['member_id'].copy()

In [207]:
# Fill missing values without changing column dtypes: numeric columns get the
# number 0, every other column gets the string "0". Filling all columns with
# the string "0" would turn numeric columns that contain NaN into mixed
# float/str object columns.
numeric_cols_test = set(data_test.select_dtypes(include="number").columns)
fill_values_test = {
    col: (0 if col in numeric_cols_test else "0") for col in data_test.columns
}
data_test = data_test.fillna(value=fill_values_test)

In [208]:
# Store the total_rev_hi_lim column as integers.
data_test = data_test.astype({'total_rev_hi_lim': int})

In [211]:
# Values (36months, 60months) into int.
# The pattern is a raw string so that \d reaches the regex engine instead of
# being parsed as an invalid string escape. astype(int) raises if any value
# has no digits, so no NaN back-fill is needed after the conversion.
data_test['term'] = data_test['term'].str.extract(r'(\d+)', expand=False).astype(int)

In [212]:
# Values (126th weeks, ...) into int.
# Raw-string pattern avoids the invalid "\d" string escape. Entries without
# any digits come back from extract as NaN and are mapped to week 0; only
# this column is filled rather than the whole frame.
last_week_digits_test = data_test['last_week_pay'].str.extract(r'(\d+)', expand=False)
data_test['last_week_pay'] = last_week_digits_test.fillna("0").astype(int)

In [213]:
# Derived feature: last_week_pay as a percentage of (term / 12 * 52 + 1),
# truncated to an integer and placed as the first column of the frame.
term_in_weeks_test = data_test['term'] / 12 * 52 + 1
completion_pct_test = (data_test['last_week_pay'] / term_in_weeks_test) * 100
data_test.insert(0, 'payment_completion', completion_pct_test.astype(int))

In [220]:
# Class probabilities for the submission rows, one column per output unit.
# `predict` on a softmax-output model already returns these probabilities;
# `Sequential.predict_proba` is not available in current Keras releases.
pred_test = model.predict(data_test[major].values)



In [232]:
# Submission frame: one row per member id with column 1 of the model output
# as loan_status, rounded to two decimals.
pred_frame = pd.DataFrame({'member_id': rows, 'loan_status': pred_test[:, 1]})
pred_frame = pred_frame.round(2)
# NOTE(review): this cell used to end with a bare `saturate` token (a NameError
# at run time) and an unassigned `pred_frame.apply(lambda x: 1)` whose result
# was discarded. Both are removed here.
# TODO: if the probabilities were meant to be saturated (clipped or
# thresholded), do that explicitly on pred_frame['loan_status'].

In [233]:
# Write the submission file with member_id first and loan_status second.
submission = pred_frame[['member_id', 'loan_status']]
submission.to_csv('submission.csv', index=False, float_format='%g')

In [234]:
# Peek at the first five values in column 0 of the model output.
pred_test[:5,0]

array([ 0.,  0.,  0.,  0.,  0.], dtype=float32)