In [2]:
import re
import os

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import datetime as dt

from sklearn import model_selection

from sklearn.metrics import confusion_matrix

%matplotlib inline 

In [3]:
# all accepted loans from lendingclub
all_accept_df = pd.read_csv('./archive/accepted_2007_to_2018q4.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [4]:
# convert columns to numeric

all_accept_df['term'] = pd.to_numeric(all_accept_df['term'].apply(lambda x: re.findall(r'\d+', str(x))).str[0])

# convert columns to date

all_accept_df['issue_d'] = pd.to_datetime(all_accept_df['issue_d'])
all_accept_df['last_pymnt_d'] = pd.to_datetime(all_accept_df['last_pymnt_d'])

In [5]:
def AddMonthsToDate(dates, months):
    updatedDates = []
    
    for i in range(0, len(dates)):
        toAddMnths = pd.to_numeric(months.iat[i]) - 3
        updatedDates.append(dates.iat[i] + pd.DateOffset(months=toAddMnths))
        
    return updatedDates

In [6]:
all_accept_df = all_accept_df.dropna(subset=['issue_d', 'term'])

print(len(all_accept_df['issue_d']))

updatedDates = AddMonthsToDate(all_accept_df['issue_d'], all_accept_df['term'])

print(len(updatedDates))

all_accept_df['loan_prepaid'] = (
        ((all_accept_df['loan_status'] == 'Fully Paid') | (all_accept_df['loan_status'] == 'Does not meet the credit policy. Status:Fully Paid')) & 
            (all_accept_df['last_pymnt_d'] < updatedDates))


all_accept_df = all_accept_df.drop('loan_status', axis=1)

2260668
2260668


In [7]:
all_accept_df['loan_prepaid'].value_counts()

False    1454758
True      805910
Name: loan_prepaid, dtype: int64

In [8]:
def ConvertDateTimeToOrdinal(d):
    if d is pd.NaT:
        return 0
    else:
        return d.toordinal()

In [9]:
# convert columns to datetime (toordinal)

dateCols = pd.Series(
    ['issue_d',
    'earliest_cr_line',
    'last_pymnt_d',
    'next_pymnt_d',
    'last_credit_pull_d',
    'debt_settlement_flag_date',
    'settlement_date',
    'hardship_start_date',
    'hardship_end_date',
    'payment_plan_start_date',
    'sec_app_earliest_cr_line'])

for col in dateCols:    
    all_accept_df[col] = pd.to_datetime(all_accept_df[col]).apply(ConvertDateTimeToOrdinal)

In [10]:
# drop columns to be ignore for now

all_accept_df = all_accept_df.drop([
    'id',
    'member_id',
    'emp_title',
    'emp_length',
    'home_ownership',
    'url',
    'desc',
    'title',
    'zip_code',
    'addr_state',
    'initial_list_status',
    'verification_status_joint',  
    'hardship_type',
    'hardship_reason',
    'disbursement_method'
], axis=1)

In [11]:
# convert column to numeric

all_accept_df['term'] = pd.to_numeric(all_accept_df['term'].apply(lambda x: re.findall(r'\d+', str(x))).str[0])
all_accept_df['deferral_term'] = pd.to_numeric(all_accept_df['deferral_term'])

In [12]:
# Dummify

colsToBeDummified = pd.Series(
    ['grade',
    'sub_grade',
    'verification_status',
    'purpose',
    'pymnt_plan',
    'application_type',
    'hardship_flag',
    'hardship_status',
    'hardship_loan_status',
    'debt_settlement_flag',
    'settlement_status'])

for col in colsToBeDummified:    
    all_accept_df = all_accept_df.join(pd.get_dummies(all_accept_df[col], drop_first=True, prefix=col))
    all_accept_df = all_accept_df.drop(col, axis=1)

In [13]:
# handle na

all_accept_df.fillna(0, inplace=True)

In [14]:
from sklearn.model_selection import train_test_split

all_accept_train_df, all_accept_test_df, all_accept_train_target, all_accept_test_target = \
            train_test_split(all_accept_df, all_accept_df['loan_prepaid'], test_size=0.2, random_state=100)

In [15]:
# make sure all columns are numeric

for col in all_accept_train_df.columns:
    all_accept_train_df[col] = pd.to_numeric(all_accept_train_df[col])
    all_accept_test_df[col] = pd.to_numeric(all_accept_test_df[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_accept_train_df[col] = pd.to_numeric(all_accept_train_df[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_accept_test_df[col] = pd.to_numeric(all_accept_test_df[col])


In [16]:
all_accept_train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1808534 entries, 1497595 to 1873441
Columns: 192 entries, loan_amnt to settlement_status_COMPLETE
dtypes: bool(1), float64(112), int64(12), uint8(67)
memory usage: 1.8 GB


In [17]:
all_accept_test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 452134 entries, 1532366 to 110876
Columns: 192 entries, loan_amnt to settlement_status_COMPLETE
dtypes: bool(1), float64(112), int64(12), uint8(67)
memory usage: 460.5 MB


## - Feature Selection

In [18]:
features = [
    'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term',
    'int_rate', 'grade_B', 'grade_C',
    'grade_D', 'grade_E', 'grade_F', 'grade_G',
    'sub_grade_A2', 'sub_grade_A3', 'sub_grade_A4', 'sub_grade_A5', 
    'sub_grade_B2', 'sub_grade_B3', 'sub_grade_B4', 'sub_grade_B5',
    'sub_grade_C1', 'sub_grade_C2', 'sub_grade_C3', 'sub_grade_C4',
    'sub_grade_C5', 'sub_grade_D1', 'sub_grade_D2', 'sub_grade_D3',
    'sub_grade_D4', 'sub_grade_D5', 'sub_grade_E1', 'sub_grade_E2',
    'sub_grade_E3', 'sub_grade_E4', 'sub_grade_E5', 'sub_grade_F1',
    'sub_grade_F2', 'sub_grade_F3', 'sub_grade_F4', 'sub_grade_F5',
    'sub_grade_G1', 'sub_grade_G2', 'sub_grade_G3', 'sub_grade_G4',
    'sub_grade_G5', 'sub_grade_B1','purpose_credit_card', 'purpose_debt_consolidation',
    'purpose_home_improvement', 'purpose_medical', 'purpose_other', 
    'purpose_small_business', 'annual_inc',
    'issue_d', 'dti', 'delinq_2yrs', 'fico_range_low', 'fico_range_high',
    'open_acc', 'total_acc', 'total_pymnt',
    'total_pymnt_inv', 'last_pymnt_d', 'last_pymnt_amnt',
    'annual_inc_joint', 'dti_joint', 'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal',
    'tot_hi_cred_lim', 'total_bal_ex_mort', 'total_bc_limit', 'total_il_high_credit_limit',
    'revol_bal_joint', 'sec_app_fico_range_low', 'sec_app_fico_range_high',
]

In [19]:
mask = all_accept_train_df.columns.isin(features)

kBestColumns = all_accept_train_df.columns[mask]
otherColumnsToBeCombined = all_accept_train_df.columns[~mask]

len(kBestColumns) + len(otherColumnsToBeCombined)

192

## - PCA on non-kBest columns

In [20]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

from sklearn.decomposition import PCA
pca = PCA(n_components = 0.95)

### PCA - Modify train data set

In [21]:
all_accept_train_otherColsDf = pd.DataFrame(all_accept_train_df[otherColumnsToBeCombined])

data_rescaled = scaler.fit_transform(all_accept_train_otherColsDf)

pca.fit(data_rescaled)

reduced = pca.transform(data_rescaled)
reduced = pd.DataFrame(reduced)

reduced.fillna(0, inplace=True)

reduced.shape

(1808534, 21)

In [22]:
pd.DataFrame(all_accept_train_df[kBestColumns]).shape

(1808534, 75)

In [23]:
kBestColsDf = pd.DataFrame(all_accept_train_df[kBestColumns])

all_accept_train_df = \
    pd.concat([kBestColsDf, reduced.reindex(kBestColsDf.index)], axis=1)

all_accept_train_df.shape

(1808534, 96)

### PCA - Modify test data set

In [24]:
all_accept_test_otherColsDf = pd.DataFrame(all_accept_test_df[otherColumnsToBeCombined])

reduced = pca.transform(all_accept_test_otherColsDf)
reduced = pd.DataFrame(reduced)

reduced.fillna(0, inplace=True)

reduced.shape

(452134, 21)

In [25]:
kBestColsDf = pd.DataFrame(all_accept_test_df[kBestColumns])

all_accept_test_df = \
    pd.concat([kBestColsDf, reduced.reindex(kBestColsDf.index)], axis=1)

all_accept_test_df.shape

(452134, 96)

## Shrink data set

In [26]:
n_range_train = range(1, all_accept_train_df.shape[0])
#n_range_train = range(1, 20000)

all_accept_train_df_r = pd.DataFrame(all_accept_train_df.iloc[n_range_train])
all_accept_train_target_r = all_accept_train_target.iloc[n_range_train] 

all_accept_train_df_r.fillna(0, inplace=True)

In [27]:
n_range_test = range(1, all_accept_test_df.shape[0])
#n_range_test = range(1, 2000)

all_accept_test_df_r = pd.DataFrame(all_accept_test_df.iloc[n_range_test])
all_accept_test_target_r = all_accept_test_target.iloc[n_range_test] 

all_accept_test_df_r.fillna(0, inplace=True)

## Global variables

In [28]:
random_state = 100
n_jobs = 6
#n_jobs = 2
cv = 5
accuracy = 'accuracy'

## Decision Trees

In [33]:
from sklearn import tree

tree_model = tree.DecisionTreeClassifier()

grid_para_tree = [{
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": range(1, 10),
    "min_samples_split": np.linspace(start=2, stop=30, num=15, dtype=int)
}]

tree_model.set_params(random_state=random_state)

grid_search_tree = model_selection.GridSearchCV(tree_model, grid_para_tree,
                                                cv=cv, scoring=accuracy,
                                                n_jobs=n_jobs)

In [34]:
%time grid_search_tree.fit(all_accept_train_df_r, all_accept_train_target_r)

Wall time: 13h 12min 50s


GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=100), n_jobs=6,
             param_grid=[{'criterion': ['gini', 'entropy'],
                          'min_samples_leaf': range(1, 10),
                          'min_samples_split': array([ 2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30])}],
             scoring='accuracy')

In [35]:
grid_search_tree.score(all_accept_train_df_r, all_accept_train_target_r)

0.9941007435308065

In [36]:
grid_search_tree.score(all_accept_test_df_r, all_accept_test_target_r)

0.9879172721301033

In [37]:
confusion_matrix(grid_search_tree.predict(all_accept_train_df_r), all_accept_train_target_r)

array([[1157910,    4863],
       [   5806,  639954]], dtype=int64)

In [38]:
confusion_matrix(grid_search_tree.predict(all_accept_test_df_r), all_accept_test_target_r)

array([[288190,   2613],
       [  2850, 158480]], dtype=int64)

## XGBoost

In [39]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

xgb_c = xgb.XGBClassifier(
    objective= 'binary:logistic',
    nthread=n_jobs,
    seed=random_state
)

parameters = {
    'max_depth': range (5, 7, 10),
    'n_estimators': range(100, 150, 200),
    'learning_rate': [0.25, 0.2, 0.1]
}

grid_search_xgb = model_selection.GridSearchCV(
    estimator=xgb_c,
    param_grid=parameters,
    scoring = accuracy,
    n_jobs = n_jobs,
    cv = cv
)

In [40]:
%time grid_search_xgb.fit(all_accept_train_df_r, all_accept_train_target_r)



Wall time: 47min 26s


GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None, nthread=6,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, seed=100,
                                     subsample=No

In [41]:
grid_search_xgb.score(all_accept_train_df_r, all_accept_train_target_r)

0.9935251388832828

In [42]:
grid_search_xgb.score(all_accept_test_df_r, all_accept_test_target_r)

0.9929202248011094

In [43]:
confusion_matrix(grid_search_xgb.predict(all_accept_train_df_r), all_accept_train_target_r)

array([[1158122,    6116],
       [   5594,  638701]], dtype=int64)

In [44]:
confusion_matrix(grid_search_xgb.predict(all_accept_test_df_r), all_accept_test_target_r)

array([[289465,   1626],
       [  1575, 159467]], dtype=int64)

## Neural Network

In [29]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

import sklearn
import time
import scipy

from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras import initializers

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation,Dropout

model = Sequential()

In [30]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
all_accept_train_df_r_scaled = scaler.fit_transform(all_accept_train_df_r)

In [31]:
start = time.time()
tf.set_random_seed(random_state)

num_observations = all_accept_train_df_r_scaled.shape[0]
num_features     = all_accept_train_df_r_scaled.shape[1]

kernel_initializer = initializers.TruncatedNormal(mean=0.0,stddev=0.1)
bias_initializer   = initializers.Constant(0.1)
    
model.add(Dense(num_features*2, input_dim=num_features, input_shape=(num_features,),
                activation=tf.nn.relu, kernel_initializer=kernel_initializer, bias_initializer=bias_initializer))
model.add(Dropout(0.5))

model.add(Dense(num_features, activation=tf.nn.relu))
model.add(Dropout(0.3))

model.add(Dense(num_features/2, activation=tf.nn.relu))
model.add(Dropout(0.2))

model.add(Dense(units=1, activation=tf.nn.sigmoid))

bce = tf.keras.losses.BinaryCrossentropy()
sgd = tf.keras.optimizers.SGD(learning_rate=0.1)

model.compile(loss=bce, optimizer=sgd, metrics=[accuracy])

model.fit(all_accept_train_df_r_scaled, all_accept_train_target_r.values, epochs=1000, batch_size=100, verbose=0)

print('Time elapsed: ', time.time()-start)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Time elapsed:  39688.28752946854


In [32]:
predict = model.predict_classes(all_accept_train_df_r_scaled)

print ('Training Accuracy: ', np.mean(predict == np.argmax(all_accept_train_target_r, 0)))

confusion_matrix(predict, all_accept_train_target_r)



Training Accuracy:  0.8895426721784375


array([[1156501,    9711],
       [   7215,  635106]], dtype=int64)

In [33]:
scaler = StandardScaler()
all_accept_test_df_r_scaled = scaler.fit_transform(all_accept_test_df_r)

predict = model.predict_classes(all_accept_test_df_r_scaled)
print('Test Accuracy: ', np.mean(predict == np.argmax(all_accept_test_target_r, 0)))

confusion_matrix(predict, all_accept_test_target_r)



Test Accuracy:  0.8746838327864588


array([[289053,   2311],
       [  1987, 158782]], dtype=int64)