In [117]:
import re
import os

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import datetime as dt

from sklearn import model_selection

from sklearn.metrics import confusion_matrix

%matplotlib inline 

In [118]:
# all accepted loans from lendingclub
all_accept_df = pd.read_csv('./archive/accepted_2007_to_2018q4.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [119]:
# convert columns to numeric

all_accept_df['term'] = pd.to_numeric(all_accept_df['term'].apply(lambda x: re.findall(r'\d+', str(x))).str[0])

# convert columns to date

all_accept_df['issue_d'] = pd.to_datetime(all_accept_df['issue_d'])
all_accept_df['last_pymnt_d'] = pd.to_datetime(all_accept_df['last_pymnt_d'])

In [120]:
def AddMonthsToDate(dates, months):
    updatedDates = []
    
    for i in range(0, len(dates)):
        toAddMnths = pd.to_numeric(months.iat[i]) - 3
        updatedDates.append(dates.iat[i] + pd.DateOffset(months=toAddMnths))
        
    return updatedDates

In [121]:
all_accept_df = all_accept_df[(all_accept_df['loan_status'] == 'Fully Paid') | (all_accept_df['loan_status'] == 'Does not meet the credit policy. Status:Fully Paid')]

print(len(all_accept_df['issue_d']))

updatedDates = AddMonthsToDate(all_accept_df['issue_d'], all_accept_df['term'])

print(len(updatedDates))

all_accept_df['loan_prepaid'] = all_accept_df['last_pymnt_d'] < updatedDates
all_accept_df = all_accept_df.drop('loan_status', axis=1)

#all_accept_df['loan_prepaid'] = ['Prepaid' if x==1 else 'Fully Paid' for x in all_accept_df['loan_prepaid']]

1078739
1078739


In [122]:
all_accept_df['loan_prepaid'].value_counts()

True     805910
False    272829
Name: loan_prepaid, dtype: int64

In [123]:
def ConvertDateTimeToOrdinal(d):
    if d is pd.NaT:
        return 0
    else:
        return d.toordinal()

In [124]:
# convert columns to datetime (toordinal)

dateCols = pd.Series(
    ['issue_d',
    'earliest_cr_line',
    'last_pymnt_d',
    'next_pymnt_d',
    'last_credit_pull_d',
    'debt_settlement_flag_date',
    'settlement_date',
    'hardship_start_date',
    'hardship_end_date',
    'payment_plan_start_date',
    'sec_app_earliest_cr_line'])

for col in dateCols:    
    all_accept_df[col] = pd.to_datetime(all_accept_df[col]).apply(ConvertDateTimeToOrdinal)

In [125]:
# drop columns to be ignore for now

all_accept_df = all_accept_df.drop([
    'id',
    'member_id',
    'emp_title',
    'emp_length',
    'home_ownership',
    'url',
    'desc',
    'title',
    'zip_code',
    'addr_state',
    'initial_list_status',
    'verification_status_joint',  
    'hardship_type',
    'hardship_reason',
    'disbursement_method'
], axis=1)

In [126]:
# convert column to numeric

all_accept_df['term'] = pd.to_numeric(all_accept_df['term'].apply(lambda x: re.findall(r'\d+', str(x))).str[0])
all_accept_df['deferral_term'] = pd.to_numeric(all_accept_df['deferral_term'])

In [127]:
# Dummify

colsToBeDummified = pd.Series(
    ['grade',
    'sub_grade',
    'verification_status',
    'purpose',
    'pymnt_plan',
    'application_type',
    'hardship_flag',
    'hardship_status',
    'hardship_loan_status',
    'debt_settlement_flag',
    'settlement_status'])

for col in colsToBeDummified:    
    all_accept_df = all_accept_df.join(pd.get_dummies(all_accept_df[col], drop_first=True, prefix=col))
    all_accept_df = all_accept_df.drop(col, axis=1)

In [128]:
# handle na

all_accept_df.fillna(0, inplace=True)

In [130]:
from sklearn.model_selection import train_test_split

all_accept_train_df, all_accept_test_df, all_accept_train_target, all_accept_test_target = \
            train_test_split(all_accept_df, all_accept_df['loan_prepaid'], test_size=0.2, random_state=100)

In [131]:
# make sure all columns are numeric

for col in all_accept_train_df.columns:
    all_accept_train_df[col] = pd.to_numeric(all_accept_train_df[col])
    all_accept_test_df[col] = pd.to_numeric(all_accept_test_df[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [132]:
all_accept_train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 862991 entries, 735086 to 297867
Columns: 188 entries, loan_amnt to settlement_status_BROKEN
dtypes: bool(1), float64(112), int64(12), uint8(63)
memory usage: 875.7 MB


In [133]:
all_accept_test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 215748 entries, 187093 to 991606
Columns: 188 entries, loan_amnt to settlement_status_BROKEN
dtypes: bool(1), float64(112), int64(12), uint8(63)
memory usage: 218.9 MB


## - Feature Selection

In [134]:
features = [
    'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term',
    'int_rate', 'grade_B', 'grade_C',
    'grade_D', 'grade_E', 'grade_F', 'grade_G',
    'sub_grade_A2', 'sub_grade_A3', 'sub_grade_A4', 'sub_grade_A5', 
    'sub_grade_B2', 'sub_grade_B3', 'sub_grade_B4', 'sub_grade_B5',
    'sub_grade_C1', 'sub_grade_C2', 'sub_grade_C3', 'sub_grade_C4',
    'sub_grade_C5', 'sub_grade_D1', 'sub_grade_D2', 'sub_grade_D3',
    'sub_grade_D4', 'sub_grade_D5', 'sub_grade_E1', 'sub_grade_E2',
    'sub_grade_E3', 'sub_grade_E4', 'sub_grade_E5', 'sub_grade_F1',
    'sub_grade_F2', 'sub_grade_F3', 'sub_grade_F4', 'sub_grade_F5',
    'sub_grade_G1', 'sub_grade_G2', 'sub_grade_G3', 'sub_grade_G4',
    'sub_grade_G5', 'sub_grade_B1','purpose_credit_card', 'purpose_debt_consolidation',
    'purpose_home_improvement', 'purpose_medical', 'purpose_other', 
    'purpose_small_business', 'annual_inc',
    'issue_d', 'dti', 'delinq_2yrs', 'fico_range_low', 'fico_range_high',
    'open_acc', 'total_acc', 'total_pymnt',
    'total_pymnt_inv', 'last_pymnt_d', 'last_pymnt_amnt',
    'annual_inc_joint', 'dti_joint', 'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal',
    'tot_hi_cred_lim', 'total_bal_ex_mort', 'total_bc_limit', 'total_il_high_credit_limit',
    'revol_bal_joint', 'sec_app_fico_range_low', 'sec_app_fico_range_high',
]

In [135]:
mask = all_accept_train_df.columns.isin(features)

kBestColumns = all_accept_train_df.columns[mask]
otherColumnsToBeCombined = all_accept_train_df.columns[~mask]

len(kBestColumns) + len(otherColumnsToBeCombined)

188

## - PCA on non-kBest columns

In [136]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

from sklearn.decomposition import PCA
pca = PCA(n_components = 0.95)

### PCA - Modify train data set

In [137]:
all_accept_train_otherColsDf = pd.DataFrame(all_accept_train_df[otherColumnsToBeCombined])

data_rescaled = scaler.fit_transform(all_accept_train_otherColsDf)

pca.fit(data_rescaled)

reduced = pca.transform(data_rescaled)
reduced = pd.DataFrame(reduced)

reduced.fillna(0, inplace=True)

reduced.shape

(862991, 20)

In [138]:
pd.DataFrame(all_accept_train_df[kBestColumns]).shape

(862991, 75)

In [139]:
kBestColsDf = pd.DataFrame(all_accept_train_df[kBestColumns])

all_accept_train_df = \
    pd.concat([kBestColsDf, reduced.reindex(kBestColsDf.index)], axis=1)

all_accept_train_df.shape

(862991, 95)

### PCA - Modify test data set

In [140]:
all_accept_test_otherColsDf = pd.DataFrame(all_accept_test_df[otherColumnsToBeCombined])

reduced = pca.transform(all_accept_test_otherColsDf)
reduced = pd.DataFrame(reduced)

reduced.fillna(0, inplace=True)

reduced.shape

(215748, 20)

In [141]:
kBestColsDf = pd.DataFrame(all_accept_test_df[kBestColumns])

all_accept_test_df = \
    pd.concat([kBestColsDf, reduced.reindex(kBestColsDf.index)], axis=1)

all_accept_test_df.shape

(215748, 95)

## Shrink data set

In [142]:
#n_range_train = range(1, all_accept_train_df.shape[0])
n_range_train = range(1, 20000)

all_accept_train_df_r = pd.DataFrame(all_accept_train_df.iloc[n_range_train])
all_accept_train_target_r = all_accept_train_target.iloc[n_range_train] 

all_accept_train_df_r.fillna(0, inplace=True)

In [143]:
#n_range_test = range(1, all_accept_test_df.shape[0])
n_range_test = range(1, 2000)

all_accept_test_df_r = pd.DataFrame(all_accept_test_df.iloc[n_range_test])
all_accept_test_target_r = all_accept_test_target.iloc[n_range_test] 

all_accept_test_df_r.fillna(0, inplace=True)

## Global variables

In [144]:
random_state = 100
#n_jobs = 6
n_jobs = 2
cv = 5
accuracy = 'accuracy'

## Decision Trees

In [145]:
from sklearn import tree

tree_model = tree.DecisionTreeClassifier()

grid_para_tree = [{
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": range(1, 10),
    "min_samples_split": np.linspace(start=2, stop=30, num=15, dtype=int)
}]

tree_model.set_params(random_state=random_state)

grid_search_tree = model_selection.GridSearchCV(tree_model, grid_para_tree,
                                                cv=cv, scoring=accuracy,
                                                n_jobs=n_jobs)

In [146]:
%time grid_search_tree.fit(all_accept_train_df_r, all_accept_train_target_r)

CPU times: user 5.2 s, sys: 1.48 s, total: 6.69 s
Wall time: 3min 3s


GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=100,
                                              splitter='best'),
             iid='deprecated', n_jobs=2,
             param_grid=[{'criterion': ['gini', 'entropy'],
                      

In [147]:
grid_search_tree.score(all_accept_train_df_r, all_accept_train_target_r)

0.9897994899744987

In [148]:
grid_search_tree.score(all_accept_test_df_r, all_accept_test_target_r)

0.9874937468734367

In [149]:
confusion_matrix(grid_search_tree.predict(all_accept_train_df_r), all_accept_train_target_r)

array([[ 4953,   120],
       [   84, 14842]])

In [150]:
confusion_matrix(grid_search_tree.predict(all_accept_test_df_r), all_accept_test_target_r)

array([[ 491,   18],
       [   7, 1483]])

## XGBoost

In [151]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

xgb_c = xgb.XGBClassifier(
    objective= 'binary:logistic',
    nthread=n_jobs,
    seed=random_state
)

parameters = {
    'max_depth': range (5, 7, 10),
    'n_estimators': range(100, 150, 200),
    'learning_rate': [0.25, 0.2, 0.1]
}

grid_search_xgb = model_selection.GridSearchCV(
    estimator=xgb_c,
    param_grid=parameters,
    scoring = accuracy,
    n_jobs = n_jobs,
    cv = cv
)

In [152]:
%time grid_search_xgb.fit(all_accept_train_df_r, all_accept_train_target_r)

CPU times: user 12.9 s, sys: 88.1 ms, total: 13 s
Wall time: 52.2 s


GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=2, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=100, silent=None,
                                     subsample=1, verbosity=1),
             iid='deprecated', n_jobs=2,
             param_grid={'learning_rate': [0.25, 0.2, 0.1],
                         'max_depth': range(5, 7, 10),
                         'n_estimators': range(100, 150, 200)},
     

In [153]:
grid_search_xgb.score(all_accept_train_df_r, all_accept_train_target_r)

1.0

In [154]:
grid_search_xgb.score(all_accept_test_df_r, all_accept_test_target_r)

0.9909954977488744

In [155]:
confusion_matrix(grid_search_xgb.predict(all_accept_train_df_r), all_accept_train_target_r)

array([[ 5037,     0],
       [    0, 14962]])

In [156]:
confusion_matrix(grid_search_xgb.predict(all_accept_test_df_r), all_accept_test_target_r)

array([[ 492,   12],
       [   6, 1489]])

## Neural Network

In [157]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

import sklearn
import time
import scipy

from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras import initializers

In [158]:
class One_layer_batch(object):
    kernel_initializer = initializers.TruncatedNormal(mean=0.0,stddev=0.1)
    bias_initializer   = initializers.Constant(0.1)    
    def __init__(self, size_hidden=2):
        self.__w1 = None
        self.__b1 = None
        self.__w2 = None
        self.__b1 = None
        self.__size_hidden = size_hidden
        
    def __initialize_model(self, x_train, y_train, rate):
        num_observations= x_train.shape[0]
        num_features   = x_train.shape[1]
        num_labels     = y_train.shape[1]
        size_hidden    = self.__size_hidden
        
        x = tf.placeholder(tf.float32, shape=[None, num_features])
        y = tf.placeholder(tf.float32, shape=[None, num_labels])
        
        d_layer1 = Dense(input_shape=(num_features,), units=size_hidden, activation=tf.nn.sigmoid, kernel_initializer = self.kernel_initializer, bias_initializer= self.bias_initializer)
        hidden   = d_layer1(x)
        
        hidden_drop = tf.nn.dropout(hidden, rate=rate)
        
        d_layer2 = Dense(input_shape=(size_hidden,), units=num_labels, kernel_initializer = self.kernel_initializer, bias_initializer= self.bias_initializer)
        y_lin   = d_layer2(hidden_drop)

        cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=y_lin, labels=y))
        
        train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)
        return train_step, x, y, d_layer1.weights[0], d_layer1.weights[1], d_layer2.weights[0], d_layer2.weights[1]
        
    def fit(self, x_train, y_train, rate = 0.25, batch_size = 100, steps = 10000):
        train_step, x, y, w_1, b_1, w_2, b_2 = self.__initialize_model(x_train, y_train, rate)
        
        num_observations = x_train.shape[0]
        tf.global_variables_initializer().run()
        
        for i in range(steps):
            locs    = sklearn.utils.random.sample_without_replacement(num_observations, batch_size)
            x_batch = x_train.values[locs,:]
            y_batch = y_train[locs,:]
            train_step.run(feed_dict={x: x_batch, y: y_batch})
            
        self.__w1 = w_1.eval()
        self.__b1 = b_1.eval()
        self.__w2 = w_2.eval()
        self.__b2 = b_2.eval()
    
    def __sigmoid(self, x):
        return scipy.special.expit(x)
        
    def predict(self, x_train):
        w1 = self.__w1
        b1 = self.__b1
        w2 = self.__w2
        b2 = self.__b2
        y_lin = np.dot(self.__sigmoid(np.dot(x_train, w1)+b1), w2)+b2
        return tf.argmax(y_lin, 1).eval()        
    
    @property
    def w1(self):
        return self.__w1
    
    @property
    def b1(self):
        return self.__b1
    
    @property
    def w2(self):
        return self.__w2
    
    @property
    def b2(self):
        return self.__b2

In [159]:
start = time.time()
sess = tf.InteractiveSession()
tf.set_random_seed(random_state)

all_accept_train_target_r_dum = pd.get_dummies(pd.Series(all_accept_train_target_r)).values
all_accept_test_target_r_dum = pd.get_dummies(pd.Series(all_accept_test_target_r)).values

nn = One_layer_batch(size_hidden=40)
nn.fit(all_accept_train_df_r, all_accept_train_target_r_dum, batch_size = 100, steps = 50000)

predict = nn.predict(all_accept_train_df_r)
print ('Training Accuracy: ', np.mean(predict == np.argmax(all_accept_train_target_r_dum, 1)))

predict = nn.predict(all_accept_test_df_r)
print('Test Accuracy: ', np.mean(predict == np.argmax(all_accept_test_target_r_dum, 1)))

print('Time elapsed: ', time.time()-start)

sess.close()

Training Accuracy:  0.7481374068703435
Test Accuracy:  0.7508754377188595
Time elapsed:  233.304438829422
