# Loan Club: Machine Learning Capstone Notebook

In [1]:
# Load packages
import pandas as pd
import numpy as np
import re
from time import strptime  # format data columns
import warnings
import math

from matplotlib import pyplot as plt
import matplotlib.ticker as ticker
%matplotlib inline
import seaborn as sns
import plotly.express as px

warnings.filterwarnings("ignore")  # ignore warnings throughout notebook
pd.set_option("display.max_columns", None)  # show all columns

In [2]:
# Load Data
filepath = "./data/accepted_subsampled_5percent.csv" #will be personalized
df = pd.read_csv(filepath, sep=",")

df_cleaned = df.copy() #work from second copy

In [3]:
# Features known to investors based on LC website
known_vars = ['acc_now_delinq',             # accounts now deliquent
              'collections_12_mths_ex_med', # collections excluding medical
              'fico_range_high',            # credit score range
              'fico_range_low',             # creit score range
              'delinq_2yrs',                # delinquencies in last two years
              'delinq_amnt',                # delinquency amount
              'earliest_cr_line',           # earliest credit line
              'home_ownership',             # home ownership
              'dti',                        # debt2income ratio
              'annual_inc',                 # annual income
              'initial_list_status',        # initial listing status
              'inq_last_6mths',             # credit inquires in last 6mo
              'int_rate',                   # interest rate
              'verification_status_joint',  # is this a joint app
              'emp_length',                 # length of employment (yr)
              'loan_amnt',                  # loan amount
              'id',                         # loan id
              'purpose',                    # purpose of the loan
              'term',                       # loan term (3 or 5yr)
              'addr_state',                 # borrower location state
              'installment',                # montly payment
              'mths_since_last_delinq',     # mo since last delinquency
              'mths_since_last_major_derog',# mo since last maj. derogatory
              'mths_since_last_record',     # mo since last public record
              'open_acc',                   # open credit line
              'pub_rec',                    # public records on file
              'revol_util',                 # revolving balance utilization (%)
              'revol_bal',                  # revolving credit balance ($)
              'tot_coll_amt',               # total collection amount ever
              'total_acc',                  # total credit lines
              'tot_cur_bal',                # total current balance
              'verification_status',        # verified income (Y/N I think)
              'grade'                       # loan grade
             ]

# Sanity check, print variable if not found within original dataframe
# [ print(var) for var in known_vars if (var not in df.columns)]


### Impute Missing Data of Known Variables

In [4]:
#Assess missingness of known variables
missingness = df_cleaned[known_vars].isnull().mean().T
missingness = missingness.loc[missingness>0].sort_values(ascending=False)
missingness


verification_status_joint      0.949313
mths_since_last_record         0.838543
mths_since_last_major_derog    0.744395
mths_since_last_delinq         0.513786
emp_length                     0.063868
tot_cur_bal                    0.033430
tot_coll_amt                   0.033430
revol_util                     0.001006
dti                            0.000829
collections_12_mths_ex_med     0.000476
open_acc                       0.000185
pub_rec                        0.000185
total_acc                      0.000185
inq_last_6mths                 0.000185
earliest_cr_line               0.000185
delinq_amnt                    0.000185
delinq_2yrs                    0.000185
acc_now_delinq                 0.000185
dtype: float64

In [5]:
# Drop features missing > 50% 
feat_wManyMissing = missingness.index[np.where(missingness > .5)].to_list()
df_cleaned[known_vars].drop(df_cleaned[feat_wManyMissing], axis=1, inplace=True)
[known_vars.remove(var) for var in feat_wManyMissing] #remove features from known_var list


[None, None, None, None]

In [6]:
# Replace NA's of numeric 'known_var' features with mean value
numeric_var = ['tot_cur_bal',
               'tot_coll_amt',
               'revol_util',
               'collections_12_mths_ex_med',
               'open_acc',
               'pub_rec',
               'total_acc',
               'inq_last_6mths',
               'delinq_amnt',
               'delinq_2yrs',
               'dti' ]

# List comprehension through numerica variables
[df_cleaned[var].fillna(df[var].mean(), inplace=True) for var in numeric_var]


[None, None, None, None, None, None, None, None, None, None, None]

In [7]:
# Helper function to replace missing character strings with randomly selected value
def fillna_random(var):
    #find index of missing values
    miss_idx = df_cleaned.loc[ df_cleaned[var].isnull()].index.tolist()
    
    #find new values to replace NaN values
    new_val = df_cleaned[var].loc[~df_cleaned.index.isin(miss_idx)].sample(len(miss_idx)).values.tolist()

    #replace values
    df_cleaned[var][miss_idx] = new_val

# ==================================================
# Replace NA's of character 'known_var' features with random
non_numeric_var = ['emp_length', 'earliest_cr_line', 'acc_now_delinq', 'delinq_2yrs'] #list of non-numeric variables
[fillna_random(var) for var in non_numeric_var]


[None, None, None, None]

In [8]:
#Sanity check that no more missing values
df_cleaned[known_vars].isnull().sum()


acc_now_delinq                0
collections_12_mths_ex_med    0
fico_range_high               0
fico_range_low                0
delinq_2yrs                   0
delinq_amnt                   0
earliest_cr_line              0
home_ownership                0
dti                           0
annual_inc                    0
initial_list_status           0
inq_last_6mths                0
int_rate                      0
emp_length                    0
loan_amnt                     0
id                            0
purpose                       0
term                          0
addr_state                    0
installment                   0
open_acc                      0
pub_rec                       0
revol_util                    0
revol_bal                     0
tot_coll_amt                  0
total_acc                     0
tot_cur_bal                   0
verification_status           0
grade                         0
dtype: int64

In [9]:
df_cleaned['grade'].unique()

array(['B', 'C', 'A', 'E', 'D', 'F', 'G'], dtype=object)

### Feature Engineering

In [10]:
# Simplify loan status (non-FullyPaid or ChargedOff loans will be converted to NAN)
df_cleaned['loan_status'] = df_cleaned['loan_status'].map({'Fully Paid':'Fully Paid',
                                                           'Charged Off':'Charged Off',
                                                           'Does not meet the credit policy. Status:Fully Paid': 'Fully Paid',
                                                           'Does not meet the credit policy. Status:Charged Off': 'Charged Off'})

# Remove non-completed loans
df_cleaned.drop(df_cleaned.loc[df_cleaned['loan_status'].isnull()].index.tolist(), axis=0, inplace=True)

# Simplify home ownership
df_cleaned['home_ownership'] = df_cleaned['home_ownership'].map({'MORTGAGE':'mortgage',
                                                                 'OWN':'own',
                                                                 'RENT':'rent'})

# Remove 25 observations without houses
df_cleaned.drop(df_cleaned.loc[df_cleaned['home_ownership'].isnull()].index.tolist(), axis=0, inplace=True)


# Reformat date features and calculate features related to prepayment  
df_cleaned['term_year'] = np.where(df_cleaned['term']==' 36 months', 3,5)
df_cleaned['earliest_cr_line'] =  pd.to_datetime(df_cleaned['earliest_cr_line'])
df_cleaned['issue_date'] =  pd.to_datetime(df_cleaned['issue_d'])
df_cleaned['last_pymnt_date'] = pd.to_datetime(df_cleaned['last_pymnt_d'])
df_cleaned['exp_last_pymnt_date'] = pd.to_datetime(df_cleaned['issue_d'].str[0:3]
                                                   +'-'
                                                   + (df_cleaned['issue_d'].str[-4:].astype('int')
                                                   + df_cleaned['term_year']).astype('str'))

# Calculate credit history ( in months )
date_ofloan = df_cleaned['issue_date'].dt.to_period('M').astype(int)
date_credline = df_cleaned['earliest_cr_line'].dt.to_period('M').astype(int)
df_cleaned['credit_hist_mths'] = date_ofloan - date_credline

# Log-transform skewed continuous features
df_cleaned['delinq_amnt_log'] = df_cleaned['delinq_amnt'].add(1).apply(np.log)
df_cleaned['annual_inc_log'] = df_cleaned['annual_inc'].add(1).apply(np.log)
df_cleaned['dti_log'] = df_cleaned['dti'].add(1).apply(np.log)
df_cleaned['funded_amnt_log'] = df_cleaned['funded_amnt'].add(1).apply(np.log)
df_cleaned['tot_coll_amt_log'] = df_cleaned['tot_coll_amt'].add(1).apply(np.log)
df_cleaned['tot_cur_bal_log'] = df_cleaned['tot_cur_bal'].add(1).apply(np.log)
df_cleaned['total_acc_log'] = df_cleaned['total_acc'].add(1).apply(np.log)
df_cleaned['revol_bal_log'] = df_cleaned['revol_bal'].add(1).apply(np.log)
df_cleaned['installment_log'] = df_cleaned['installment'].add(1).apply(np.log)
df_cleaned['open_acc_log'] = df_cleaned['open_acc'].add(1).apply(np.log)

# Simplify loan purpose - debt consolidation, credit card, and other
df_cleaned['purpose'] = df_cleaned['purpose'].map({'debt_consolidation':'debt_consolidation',
                                                   'credit_card':'credit_card'})
df_cleaned['purpose'].fillna('other',inplace=True)

# Convert loan grade to ordinal feature
df_cleaned['grade'] = df_cleaned['grade'].map({'A':1,
                                               'B':2,
                                               'C':3,
                                               'D':4,
                                               'E':5,
                                               'F':6,
                                               'G':7})

# Simplify employment length to four categories
df_cleaned['emp_length'] = df_cleaned['emp_length'].map({'< 1 year':0.5,
                                                         '1 year':1,
                                                         '2 years':2,
                                                         '3 years':3,
                                                         '4 years':4,
                                                         '5 years':5,
                                                         '6 years':6,
                                                         '7 years':7,
                                                         '8 years':8,
                                                         '9 years':9,
                                                         '10+ years':10})

# Create new binary features
df_cleaned['has_pub_rec'] = np.where(df_cleaned['pub_rec']>0,1,0) #0-=no public record
df_cleaned['has_paid_early'] = np.where((df_cleaned.loan_status=='Fully Paid')&(df_cleaned.last_pymnt_date < df_cleaned.exp_last_pymnt_date), 1, 0)
df_cleaned['has_36mo_loan'] = np.where(df_cleaned['term'].str.contains('36'),1,0) #0=60mo loan
df_cleaned['has_delinq_now'] = np.where(df_cleaned['acc_now_delinq']>0, 1, 0)
df_cleaned['has_delinq_past2yrs'] = np.where(df_cleaned['delinq_2yrs']>0, 1, 0) #0=no delinq within 2yrs
df_cleaned['has_whole_liststatus'] = np.where(df_cleaned['initial_list_status']=='w', 1, 0) #0=f
df_cleaned['has_fullypaid'] = np.where(df_cleaned['loan_status']=='Fully Paid', 1, 0) #0=charged off


In [158]:
# Create response variable based on 
threshold = 10 #goal of loans is loss of <10%
df_cleaned["roi_perc"] = df_cleaned["total_pymnt"].div(df_cleaned["funded_amnt"]).sub(1).mul(100)
df_cleaned['roi_response'] = np.where(df_cleaned['roi_perc'] > threshold, 1, 0) #84% of all loans > -20% ROI

df_cleaned['roi_response_5'] = np.where(df_cleaned['roi_perc'] > 5, 1, 0)
df_cleaned['credit_hist_mths'] = np.where(df_cleaned['credit_hist_mths'] < 0, 0, df_cleaned['credit_hist_mths'])

In [159]:
# Isolate list of predictor variables to be used for ML
predictor_vars = ['annual_inc_log',
                  'credit_hist_mths',
                  'delinq_amnt_log',
                  'dti_log',
                  'emp_length',
                  'fico_range_high',
                  'funded_amnt_log',
                  'grade',
                  'has_36mo_loan',
                  'has_delinq_now',
                  'has_delinq_past2yrs',
                  'has_fullypaid',
                  'has_paid_early',
                  'has_pub_rec',
                  'has_whole_liststatus',
                  'home_ownership',
                  'inq_last_6mths',
                  'installment_log',
                  'int_rate',
                  'open_acc_log',
                  'purpose',
                  'revol_bal_log',
                  'revol_util',
                  'tot_coll_amt_log',
                  'tot_cur_bal_log',
                  'total_acc_log',
                  'verification_status']

response_var = 'roi_response'
response_var_5 = 'roi_response_5'


# Machine Learning

### Logistic Regression
- Goal is to build classifier that predicts if loan results in desirable outcome
- Should also report feature importance

In [160]:
# Dummify categorical features
home_ownership_dummy = pd.get_dummies(df_cleaned['home_ownership'],
                                      prefix="home_ownership").drop('home_ownership_mortgage',axis=1)

purpose_dummy = pd.get_dummies(df_cleaned['purpose'],
                               prefix="purpose").drop('purpose_debt_consolidation',axis=1)

verification_status_dummy = pd.get_dummies(df_cleaned['verification_status'],
                               prefix="verification_status").drop('verification_status_Source Verified',axis=1)


In [161]:
# List of numeric features in the final dataframe
numeric_vars = ['annual_inc_log',
                'credit_hist_mths',
                'delinq_amnt_log',
                'dti_log',
                'emp_length',
                'fico_range_high',
                'funded_amnt_log',
                'grade',
                'has_36mo_loan',
                'has_delinq_now',
                'has_delinq_past2yrs',
                'has_pub_rec',
                'has_whole_liststatus',
                'inq_last_6mths',
                'installment_log',
                'int_rate',
                'open_acc_log',
                'revol_bal_log',
                'revol_util',
                'tot_coll_amt_log',
                'tot_cur_bal_log',
                'total_acc_log']

In [165]:
# final dataframe for model consumption

df_feature_final = pd.concat([df_cleaned[numeric_vars],
                      home_ownership_dummy,
                      purpose_dummy,
                      verification_status_dummy],axis=1)


In [168]:
# standardize final dataframe
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df_feature_final)
df_feature_final_scaled = pd.DataFrame(scaler.transform(df_feature_final))
df_feature_final_scaled.columns = df_feature_final.columns
df_feature_final_scaled.sample(10)

In [203]:
# Need to reset index because feature index changed
class_response = df_cleaned[response_var].reset_index().drop('index',axis=1)
class_response_5 = df_cleaned[response_var_5].reset_index().drop('index',axis=1)

### Logistic Regression

**Model based on 10% threshold**

In [361]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix 
from sklearn.feature_selection import SelectFromModel
import statsmodels.api as sm

In [362]:
result = sm.Logit(class_response,df_feature_final_scaled).fit()
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.667351
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:           roi_response   No. Observations:                67805
Model:                          Logit   Df Residuals:                    67777
Method:                           MLE   Df Model:                           27
Date:                Sun, 22 Mar 2020   Pseudo R-squ.:                 0.02950
Time:                        16:05:12   Log-Likelihood:                -45250.
converged:                       True   LL-Null:                       -46625.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                       coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
annual_inc_log                      -0.0443      0.011     -4.05

In [355]:
%time

lr = LogisticRegression(n_jobs=-1)
c_param = {'C':np.logspace(-5,5,10),'penalty':['l1','l2']}
gs_lr = GridSearchCV(lr, c_param, cv=5, scoring='accuracy')
gs_lr.fit(df_feature_final_scaled,class_response)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=-1, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': array([1.00000000e-05, 1.29154967e-04, 1.66810054e-03, 2.15443469e-02,
       2.78255940e-01, 3.59381366e+00, 4.64158883e+01, 5.99484250e+02,
       7.74263683e+03, 1.00000000e+05]),
                         'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
     

In [439]:
gs_lr.best_params_

{'C': 0.0016681005372000592, 'penalty': 'l1'}

In [440]:
# Model accuracy
gs_lr.score(df_feature_final_scaled,class_response)

0.5977287810633434

In [441]:
# Population accuracy
class_response['roi_response'].value_counts()[1]/(class_response['roi_response'].value_counts()[0]+class_response['roi_response'].value_counts()[1])

0.5524371359044318

In [442]:
#Type I error is very high
con_matrix = confusion_matrix(class_response, gs_lr.best_estimator_.predict(df_feature_final_scaled))
con_matrix

array([[11652, 18695],
       [ 8581, 28877]])

In [443]:
accuracy = np.diag(con_matrix/con_matrix.sum(axis=1).reshape((-1,1)))
print('Accuracy for predicting below 10 percentage threshold: %f' %accuracy[0])
print('Accuracy for predicting above 10 percentage threshold: %f' %accuracy[1])

Accuracy for predicting below 10 percentage threshold: 0.383959
Accuracy for predicting above 10 percentage threshold: 0.770917


**Feature Selection Using Lasso**

In [476]:
sel = SelectFromModel(LogisticRegression(C=gs_lr.best_params_["C"], penalty=gs_lr.best_params_["penalty"]))
sel.fit(df_feature_final_scaled, class_response)

# Features selected
selected_feat = df_feature_final_scaled.columns[(sel.get_support())]
set(np.array(selected_feat))

{'annual_inc_log',
 'fico_range_high',
 'has_whole_liststatus',
 'revol_util',
 'tot_coll_amt_log',
 'total_acc_log'}

In [477]:
# Features dropped
set(df_feature_final_scaled.columns)-set(selected_feat)

{'credit_hist_mths',
 'delinq_amnt_log',
 'dti_log',
 'emp_length',
 'funded_amnt_log',
 'grade',
 'has_36mo_loan',
 'has_delinq_now',
 'has_delinq_past2yrs',
 'has_pub_rec',
 'home_ownership_own',
 'home_ownership_rent',
 'inq_last_6mths',
 'installment_log',
 'int_rate',
 'open_acc_log',
 'purpose_credit_card',
 'purpose_other',
 'revol_bal_log',
 'tot_cur_bal_log',
 'verification_status_Not Verified',
 'verification_status_Verified'}

**Train Test Using Only 6 features  - generates similar results as GridSearchCV**

In [478]:
from sklearn.model_selection import train_test_split

In [479]:
X_train, X_test, y_train, y_test = train_test_split(df_feature_final_scaled[selected_feat], class_response, test_size=0.2)

In [480]:
lr_train = LogisticRegression(C=gs_lr.best_params_["C"], penalty=gs_lr.best_params_["penalty"], n_jobs=-1)
lr_train.fit(X_train,y_train)
print("Train accuracy: %f" %lr_train.score(X_train,y_train))
print("Test accuracy: %f" %lr_train.score(X_test,y_test))
print("Test Confusion Matrix:")
confusion_matrix(y_test, lr_train.predict(X_test))

Train accuracy: 0.594554
Test accuracy: 0.610353
Test Confusion Matrix:


array([[2384, 3613],
       [1671, 5893]])

### Model based on 5% threshold

In [466]:
from imblearn.over_sampling import SMOTE

ImportError: cannot import name '_to_object_array' from 'sklearn.utils' (/Users/invincible/opt/anaconda3/lib/python3.7/site-packages/sklearn/utils/__init__.py)

In [428]:
result_5 = sm.Logit(class_response_5,df_feature_final_scaled).fit()
print(result_5.summary())

Optimization terminated successfully.
         Current function value: 0.676850
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:         roi_response_5   No. Observations:                67805
Model:                          Logit   Df Residuals:                    67777
Method:                           MLE   Df Model:                           27
Date:                Sun, 22 Mar 2020   Pseudo R-squ.:                 -0.1328
Time:                        16:42:34   Log-Likelihood:                -45894.
converged:                       True   LL-Null:                       -40513.
Covariance Type:            nonrobust   LLR p-value:                     1.000
                                       coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
annual_inc_log                      -0.0006      0.011     -0.05

In [430]:
# Grid Search for C value and L1/L2 
%time

lr_5 = LogisticRegression(n_jobs=-1)
c_param = {'C':np.logspace(-5,5,10),'penalty':['l1','l2']}
gs_lr_5 = GridSearchCV(lr_5, c_param, cv=5, scoring='accuracy')
gs_lr_5.fit(df_feature_final_scaled,class_response_5)


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=-1, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': array([1.00000000e-05, 1.29154967e-04, 1.66810054e-03, 2.15443469e-02,
       2.78255940e-01, 3.59381366e+00, 4.64158883e+01, 5.99484250e+02,
       7.74263683e+03, 1.00000000e+05]),
                         'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
     

In [471]:
gs_lr_5.best_params_

{'C': 0.0001291549665014884, 'penalty': 'l1'}

In [472]:
gs_lr_5.score(df_feature_final_scaled,class_response_5)

0.715124253373645

In [473]:
class_response_5['roi_response_5'].value_counts()[1]/(class_response_5['roi_response_5'].value_counts()[0]+class_response_5['roi_response_5'].value_counts()[1])

0.715124253373645

In [474]:
con_matrix_5 = confusion_matrix(class_response_5, gs_lr.best_estimator_.predict(df_feature_final_scaled))
con_matrix_5

array([[ 6332, 12984],
       [13901, 34588]])

In [475]:
accuracy_5 = np.diag(con_matrix_5/con_matrix_5.sum(axis=1).reshape((-1,1)))
print('Accuracy for predicting below 10 percentage threshold: %f' %accuracy_5[0])
print('Accuracy for predicting above 10 percentage threshold: %f' %accuracy_5[1])

Accuracy for predicting below 10 percentage threshold: 0.327811
Accuracy for predicting above 10 percentage threshold: 0.713316


**Feature Selection**

In [455]:
sel_5 = SelectFromModel(LogisticRegression(C=gs_lr_5.best_params_["C"], penalty=gs_lr_5.best_params_["penalty"]))
sel_5.fit(df_feature_final_scaled, class_response_5)

# Features selected
selected_feat_5 = df_feature_final_scaled.columns[(sel_5.get_support())]
set(np.array(selected_feat_5))


set()

In [456]:
# Features dropped
set(df_feature_final_scaled.columns)-set(selected_feat_5)

{'annual_inc_log',
 'credit_hist_mths',
 'delinq_amnt_log',
 'dti_log',
 'emp_length',
 'fico_range_high',
 'funded_amnt_log',
 'grade',
 'has_36mo_loan',
 'has_delinq_now',
 'has_delinq_past2yrs',
 'has_pub_rec',
 'has_whole_liststatus',
 'home_ownership_own',
 'home_ownership_rent',
 'inq_last_6mths',
 'installment_log',
 'int_rate',
 'open_acc_log',
 'purpose_credit_card',
 'purpose_other',
 'revol_bal_log',
 'revol_util',
 'tot_coll_amt_log',
 'tot_cur_bal_log',
 'total_acc_log',
 'verification_status_Not Verified',
 'verification_status_Verified'}

In [457]:
lr_5_train = LogisticRegression(C=gs_lr_5.best_params_["C"], penalty=gs_lr_5.best_params_["penalty"], n_jobs=-1 )
lr_5_train.fit(X_train,y_train)
print("Train accuracy: %f" %lr_5_train.score(X_train,y_train))
print("Test accuracy: %f" %lr_5_train.score(X_test,y_test))
print("Test Confusion Matrix:")
confusion_matrix(y_test, lr_5_train.predict(X_test))

Train accuracy: 0.447626
Test accuracy: 0.447312
Test Confusion Matrix:


array([[6066,    0],
       [7495,    0]])

### Random Forest
- Goal is to build classifier that predicts if loan results in desirable outcome
- Should also report feature importance

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import ensemble

#Create test-train split of data
x = df_final
y = df_cleaned[response_var] #labels
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [None]:
# #Create random forest object
# randomForest = ensemble.RandomForestClassifier()

# #Set random forest parameters
# randomForest.set_params(random_state=0)

# #Fit model with data
# randomForest.fit(x_train, y_train)

# #Calculate the train and test accuracy
# train_acc = randomForest.score(x_train, y_train)
# test_acc = randomForest.score(x_test, y_test)

# print('The training error is: .5f' % train_acc)
# print('The test error is: %.5f' % test_acc)

### Create Partial Dependence Plot
- Link https://towardsdatascience.com/introducing-pdpbox-2aa820afd312

### Survival Analysis

**KM Curve**

**Cox model**
- Use same predictors as random forest