In [1]:
import re
import os

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import datetime as dt

from sklearn import model_selection

from sklearn.metrics import confusion_matrix

from sklearn.metrics import roc_auc_score

%matplotlib inline 

## Global variables

In [2]:
random_state = 100
n_jobs = 6
#n_jobs = 2
cv = 5
accuracy = 'accuracy'

### Data handling

In [3]:
# all accepted loans from lendingclub
all_accept_df = pd.read_csv('./archive/accepted_2007_to_2018q4.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [4]:
# drop invalid loan_status to 'Unknown'

all_accept_df.loc[pd.isnull(all_accept_df['loan_status']),'loan_status'] = 'Unknown'

all_accept_df.loan_status.unique()

array(['Fully Paid', 'Current', 'Charged Off', 'In Grace Period',
       'Late (31-120 days)', 'Late (16-30 days)', 'Default', 'Unknown',
       'Does not meet the credit policy. Status:Fully Paid',
       'Does not meet the credit policy. Status:Charged Off'],
      dtype=object)

In [5]:
loan_status = all_accept_df['loan_status']

loan_status[(loan_status=='Fully Paid') | (loan_status=='Current') |
            (loan_status=='Does not meet the credit policy. Status:Fully Paid') |
            (loan_status=='In Grace Period')] = 1

loan_status[loan_status != 1] = 0

all_accept_df['loan_status'] = pd.to_numeric(loan_status)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  loan_status[(loan_status=='Fully Paid') | (loan_status=='Current') |
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  loan_status[loan_status != 1] = 0


In [6]:
def ConvertDateTimeToOrdinal(d):
    if d is pd.NaT:
        return 0
    else:
        return d.toordinal()

In [7]:
# filter bad loan ids

all_accept_df = all_accept_df[~all_accept_df['id'].str.contains("Total amount funded in policy code", na=False)]
all_accept_df = all_accept_df[~all_accept_df['id'].str.contains("Loans that do not meet the credit policy", na=False)]

In [8]:
# convert columns to datetime (toordinal)

dateCols = pd.Series(
    ['issue_d',
    'earliest_cr_line',
    'last_pymnt_d',
    'next_pymnt_d',
    'last_credit_pull_d',
    'debt_settlement_flag_date',
    'settlement_date',
    'hardship_start_date',
    'hardship_end_date',
    'payment_plan_start_date',
    'sec_app_earliest_cr_line'])

for col in dateCols:    
    all_accept_df[col] = pd.to_datetime(all_accept_df[col]).apply(ConvertDateTimeToOrdinal)

In [9]:
# drop columns to be ignore for now

all_accept_df = all_accept_df.drop([
    'member_id',
    'emp_title',
    'emp_length',
    'home_ownership',
    'url',
    'desc',
    'title',
    'zip_code',
    'addr_state',
    'initial_list_status',
    'verification_status_joint',  
    'hardship_type',
    'hardship_reason',
    'disbursement_method'
], axis=1)

In [10]:
# convert column to numeric

all_accept_df['term'] = pd.to_numeric(all_accept_df['term'].apply(lambda x: re.findall(r'\d+', str(x))).str[0])
all_accept_df['deferral_term'] = pd.to_numeric(all_accept_df['deferral_term'])

In [11]:
# Dummify

colsToBeDummified = pd.Series(
    ['grade',
    'sub_grade',
    'verification_status',
    'purpose',
    'pymnt_plan',
    'application_type',
    'hardship_flag',
    'hardship_status',
    'hardship_loan_status',
    'debt_settlement_flag',
    'settlement_status'])

for col in colsToBeDummified:    
    all_accept_df = all_accept_df.join(pd.get_dummies(all_accept_df[col], drop_first=True, prefix=col))
    all_accept_df = all_accept_df.drop(col, axis=1)

In [12]:
# handle na

all_accept_df.fillna(0, inplace=True)

### Resample data

In [13]:
from sklearn.utils import resample

df_majority = all_accept_df[all_accept_df['loan_status'] == 1]
df_minority = all_accept_df[all_accept_df['loan_status'] == 0]

df_minority_upsampled = resample(df_minority, 
                                 replace=True,     
                                 n_samples=df_majority.shape[0], 
                                 random_state=random_state)

all_accept_df = pd.concat([df_majority, df_minority_upsampled])
 
all_accept_df['loan_status'].value_counts()

1    1965492
0    1965492
Name: loan_status, dtype: int64

### Train test split

In [14]:
from sklearn.model_selection import train_test_split

all_accept_train_df, all_accept_test_df, all_accept_train_target, all_accept_test_target = \
            train_test_split(all_accept_df, all_accept_df['loan_status'], test_size=0.2, random_state=100)

In [15]:
# make sure all columns are numeric

for col in all_accept_train_df.columns:
    all_accept_train_df[col] = pd.to_numeric(all_accept_train_df[col])
    all_accept_test_df[col] = pd.to_numeric(all_accept_test_df[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_accept_train_df[col] = pd.to_numeric(all_accept_train_df[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_accept_test_df[col] = pd.to_numeric(all_accept_test_df[col])


In [16]:
all_accept_train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3144787 entries, 142817 to 2152783
Columns: 193 entries, id to settlement_status_COMPLETE
dtypes: float64(112), int64(14), uint8(67)
memory usage: 3.2 GB


In [17]:
all_accept_test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 786197 entries, 18800 to 435839
Columns: 193 entries, id to settlement_status_COMPLETE
dtypes: float64(112), int64(14), uint8(67)
memory usage: 812.0 MB


In [18]:
all_accept_ids = pd.concat([all_accept_train_df['id'], all_accept_test_df['id']])

## - Feature Selection

In [19]:
features = [
    'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'installment',
    'int_rate', 'grade_B', 'grade_C',
    'grade_D', 'grade_E', 'grade_F', 'grade_G',
    'sub_grade_A2', 'sub_grade_A3', 'sub_grade_A4', 'sub_grade_A5', 
    'sub_grade_B2', 'sub_grade_B3', 'sub_grade_B4', 'sub_grade_B5',
    'sub_grade_C1', 'sub_grade_C2', 'sub_grade_C3', 'sub_grade_C4',
    'sub_grade_C5', 'sub_grade_D1', 'sub_grade_D2', 'sub_grade_D3',
    'sub_grade_D4', 'sub_grade_D5', 'sub_grade_E1', 'sub_grade_E2',
    'sub_grade_E3', 'sub_grade_E4', 'sub_grade_E5', 'sub_grade_F1',
    'sub_grade_F2', 'sub_grade_F3', 'sub_grade_F4', 'sub_grade_F5',
    'sub_grade_G1', 'sub_grade_G2', 'sub_grade_G3', 'sub_grade_G4',
    'sub_grade_G5', 'sub_grade_B1','purpose_credit_card', 'purpose_debt_consolidation',
    'purpose_home_improvement', 'purpose_medical', 'purpose_other', 
    'purpose_small_business', 'annual_inc', 'dti',
    'delinq_2yrs', 'fico_range_low', 'fico_range_high',
    'open_acc', 'total_acc', 'annual_inc_joint', 'dti_joint',
    'inq_last_6mths', 'pub_rec', 'revol_bal', 'revol_util',
    'revol_bal_joint', 'sec_app_fico_range_low', 'sec_app_fico_range_high'
]

In [20]:
mask = all_accept_train_df.columns.isin(features)

kBestColumns = all_accept_train_df.columns[mask]

len(kBestColumns)

68

In [21]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

In [22]:
all_accept_train_df = all_accept_train_df[kBestColumns]
all_accept_train_df = pd.DataFrame(scaler.fit_transform(all_accept_train_df), columns=features)
all_accept_train_df.fillna(0, inplace=True)

all_accept_train_df.shape

(3144787, 68)

In [23]:
all_accept_test_df = all_accept_test_df[kBestColumns]
all_accept_test_df = pd.DataFrame(scaler.fit_transform(all_accept_test_df), columns=features)
all_accept_test_df.fillna(0, inplace=True)

all_accept_test_df.shape

(786197, 68)

## Shrink data set

In [24]:
n_range_train = range(1, all_accept_train_df.shape[0])
#n_range_train = range(1, 200000)

all_accept_train_df_r = pd.DataFrame(all_accept_train_df.iloc[n_range_train])
all_accept_train_target_r = all_accept_train_target.iloc[n_range_train] 

all_accept_train_df_r.fillna(0, inplace=True)

In [25]:
n_range_test = range(1, all_accept_test_df.shape[0])
#n_range_test = range(1, 20000)

all_accept_test_df_r = pd.DataFrame(all_accept_test_df.iloc[n_range_test])
all_accept_test_target_r = all_accept_test_target.iloc[n_range_test] 

all_accept_test_df_r.fillna(0, inplace=True)

## XGBoost

In [38]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

xgb_c = xgb.XGBClassifier(
    objective= 'binary:logistic',
    nthread=n_jobs,
    seed=random_state,
    scale_pos_weight=0.149
)

parameters = {
    'max_depth': [7, 10, 12, 15],
    'n_estimators': [200, 300, 400, 500],
    'learning_rate': [0.25, 0.2, 0.1, 0.01]
}

grid_search_xgb = model_selection.GridSearchCV(
    estimator=xgb_c,
    param_grid=parameters,
    scoring = accuracy,
    n_jobs = n_jobs,
    cv = cv
)

In [None]:
%time grid_search_xgb.fit(all_accept_train_df_r, all_accept_train_target_r)

In [None]:
grid_search_xgb.score(all_accept_train_df_r, all_accept_train_target_r)

In [None]:
roc_auc_score(grid_search_xgb.predict(all_accept_train_df_r), all_accept_train_target_r)

In [None]:
grid_search_xgb.score(all_accept_test_df_r, all_accept_test_target_r)

In [None]:
roc_auc_score(grid_search_xgb.predict(all_accept_test_df_r), all_accept_test_target_r)

In [None]:
confusion_matrix(grid_search_xgb.predict(all_accept_train_df_r), all_accept_train_target_r)

In [None]:
confusion_matrix(grid_search_xgb.predict(all_accept_test_df_r), all_accept_test_target_r)

In [None]:
grid_search_xgb.best_estimator_.save_model('xgb_best_model')

In [None]:
all_accept_xgb_df = pd.concat([all_accept_train_df, all_accept_test_df])

probablities = grid_search_xgb.predict(all_accept_xgb_df)

all_accept_xgb_df = pd.DataFrame(np.column_stack((all_accept_ids, probablities)), columns=['id', 'predicted_val'])
all_accept_xgb_df.to_csv('all_accept_xgb_df.csv', index=False)                                        

In [None]:
all_accept_xgb_df.predicted_val.value_counts()

## Neural Network

In [468]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

import sklearn
import time
import scipy

from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras import initializers

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation,Dropout

model = Sequential()

In [469]:
#from sklearn.preprocessing import StandardScaler

#scaler = StandardScaler()
all_accept_train_df_r_scaled = all_accept_train_df_r #scaler.fit_transform(all_accept_train_df_r)

In [470]:
start = time.time()
tf.set_random_seed(random_state)

num_observations = all_accept_train_df_r_scaled.shape[0]
num_features     = all_accept_train_df_r_scaled.shape[1]

kernel_initializer = initializers.TruncatedNormal(mean=0.0,stddev=0.1)
bias_initializer   = initializers.Constant(0.1)
    
model.add(Dense(num_features*2, input_dim=num_features, input_shape=(num_features,),
                activation=tf.nn.relu, kernel_initializer=kernel_initializer, bias_initializer=bias_initializer))
model.add(Dropout(0.5))

model.add(Dense(num_features, activation=tf.nn.relu))
model.add(Dropout(0.3))

model.add(Dense(num_features/2, activation=tf.nn.relu))
model.add(Dropout(0.2))

model.add(Dense(units=1, activation=tf.nn.sigmoid))

bce = tf.keras.losses.BinaryCrossentropy()
sgd = tf.keras.optimizers.SGD(learning_rate=0.1)

model.compile(loss=bce, optimizer=sgd, metrics=[accuracy])

model.fit(all_accept_train_df_r_scaled, all_accept_train_target_r.values, epochs=1000, batch_size=100, verbose=0)

print('Time elapsed: ', time.time()-start)

Time elapsed:  62426.27232670784


In [471]:
predict = model.predict_classes(all_accept_train_df_r_scaled)

print ('Training Accuracy: ', np.mean(predict[:,0] == all_accept_train_target_r.values))
print ('ROC AUC Score: ', roc_auc_score(predict[:,0], all_accept_train_target_r.values))

confusion_matrix(predict, all_accept_train_target_r)



Training Accuracy:  0.6746026597676281
ROC AUC Score:  0.674908664931631


array([[1093845,  544576],
       [ 478729, 1027636]], dtype=int64)

In [472]:
#scaler = StandardScaler()
all_accept_test_df_r_scaled = all_accept_test_df_r # scaler.fit_transform(all_accept_test_df_r)

predict = model.predict_classes(all_accept_test_df_r_scaled)
print('Test Accuracy: ', np.mean(predict[:,0] == all_accept_test_target_r.values))
print ('ROC AUC Score: ', roc_auc_score(predict[:,0], all_accept_test_target_r.values))

confusion_matrix(predict, all_accept_test_target_r)



Test Accuracy:  0.6707653053437056
ROC AUC Score:  0.6713685106759334


array([[275117, 141042],
       [117801, 252236]], dtype=int64)

In [473]:
model.save('keras_model')

In [474]:
all_accept_keras_df = pd.concat([all_accept_train_df, all_accept_test_df])

all_accept_keras_df = scaler.fit_transform(all_accept_keras_df)

probablities = model.predict_classes(all_accept_keras_df)

all_accept_keras_df = pd.DataFrame(np.column_stack((all_accept_ids, probablities)), columns=['id', 'predicted_val'])
all_accept_keras_df.to_csv('all_accept_keras_df.csv', index=False) 

