# Build Model

In [12]:
import os
import pandas as pd
import numpy as np
import requests
from matplotlib.cbook import boxplot_stats 

import warnings
warnings.simplefilter(action='ignore')

from sklearn.metrics import confusion_matrix as cm
from sklearn.metrics import roc_auc_score as ra

In [13]:
url = "https://raw.githubusercontent.com/joepollastrini/Loan-Prediction/master"
train_name = 'train_loan_data.csv'
test_name = 'test_loan_data.csv'

Below are the functions needed for imputation and cleaning based on decisions made in Impute and Clean step

In [14]:
def data_grab_online(base, name, out='output.csv'):
    #get data from url
    u = base + '/' + name
    r = requests.get(u).content
    df = pd.read_csv(u)
    return df

def gender_impute_and_dummy(row):
    #if unmarried and 1 dependent female, else male
    if pd.isnull(row['Gender']):
        if row['Married'] == 'No' and row['Dependents'] == '1':
            return 0
        else:
            return 1
    else:
        if row['Gender'] == 'Male':
            return 1
        else:
            return 0
        
def married_impute_and_dummy(x):
    #if female, not married, otherwise married
    if pd.isnull(x['Married']):
        if x['Gender'] == 'Female':
            return 0
        else:
            return 1
    else:
        if x['Married'] == 'Yes':
            return 1
        else:
            return 0
        
def dependents_impute_and_ordinal(x):
    #if married, one dependent, otherwise none
    #convert dependents to ordinal int as well
    if pd.isnull(x['Dependents']):
        if x['Married_IO'] == 1:
            if x['Gender_IO'] == 1:
                return 1
            else:
                return 0
        else:
            return 0
    elif x['Dependents'] == '3+':
        return 3
    else:
        return int(x['Dependents'])
    
def dual_income_dummy(x):
    if x['Married'] == 'Yes':
        if x['CoapplicantIncome'] > 0:
            return 1
        else:
            return 0
    else:
        return 0
    
#group loan terms
def loan_group(x):
    x = int(x)
    if x == 360:
        return '30'
    elif x == 180:
        return '15'
    elif x < 180:
        return '<15'
    elif x > 180 and x < 360:
        return '(15, 30)'
    elif x > 360:
        return '>30'
    else:
        return '??'
    
#log transform loan amount for better mean calculation later
#group by loan term for imputation later
def get_transform(df):
    df['LoanAmountLog'] = np.log(df['LoanAmount'].astype('float64'))
    group = df.groupby(['LoanTermGroups'])['LoanAmountLog']
    return group


def clean(df, train_orig, tf = False):
    col_drop = []
    col_rename = {}
    
    # GENDER #
    df['Male_IO'] = df.apply(lambda x: gender_impute_and_dummy(x), axis=1)
    col_drop.append('Gender')
    
    # MARRIED #
    df['Married_IO'] = df.apply(lambda x: married_impute_and_dummy(x), axis=1)
    col_drop.append('Married')
    
    # DEPENDENTS #
    df['Dependents2'] = df.apply(lambda x: dependents_impute_and_ordinal(x), axis=1)
    col_drop.append('Dependents')
    col_rename['Dependents2'] = 'Dependents'
    
    # EDUCATION #
    df['Education_IO'] = df['Education'].apply(lambda x: 1 if x == 'Graduate' else 0)
    col_drop.append('Education')
    
    # SELF EMPLOYED #
    df['Self_Employed'].replace(np.nan, 'No', inplace=True)
    df['Self_Employed_IO'] = df['Self_Employed'].apply(lambda x: 1 if x == 'Yes' else 0)
    col_drop.append('Self_Employed')

    # INCOME #
    df['FamilyIncome'] = df['ApplicantIncome'] + df['CoapplicantIncome']
    df['DualIncome_IO'] = df.apply(lambda x: dual_income_dummy(x), axis=1)
    col_drop.extend(['ApplicantIncome', 'CoapplicantIncome'])
    if tf == True:
        box = boxplot_stats(df['FamilyIncome'])[0]
        lowVal = box['whislo']
        highVal = box['whishi']        
        #training set, remove outliers
        df['income_out'] = df['FamilyIncome'].apply(lambda x: 1 if (x < lowVal or x > highVal) else 0)
        df = df.loc[df['income_out'] == 0]
        df.reset_index(inplace=True, drop=True)
        col_drop.append('income_out')
    else:
        pass
    
    # LOAN TERM #
    df['Loan_Amount_Term'].replace(np.nan, 360.0, inplace=True)
    df['LoanTermGroups'] = df['Loan_Amount_Term'].apply(lambda x: loan_group(x))
    
    train_orig['Loan_Amount_Term'].replace(np.nan, 360.0, inplace=True)
    train_orig['LoanTermGroups'] = train_orig['Loan_Amount_Term'].apply(lambda x: loan_group(x))
    col_drop.append('LoanTermGroups')
    
    # LOAN AMOUNT #
    df['LoanAmountLog'] = np.log(df['LoanAmount'].astype('float64'))
    t = get_transform(train_orig)
    df['LoanAmountLog'] = t.transform(lambda x: x.fillna(x.mean()))
    df['LoanAmount2'] = np.exp(df['LoanAmountLog'])
    if tf == True:
        box = boxplot_stats(df['LoanAmount2'])[0]
        lowVal = box['whislo']
        highVal = box['whishi']
        df['loan_out'] = df['LoanAmount2'].apply(lambda x: 1 if (x < lowVal or x > highVal) else 0)
        df = df.loc[df['loan_out'] == 0]
        df.reset_index(inplace=True, drop=True)
        col_drop.append('loan_out')
    else:
        pass
    col_drop.append('LoanAmount')
    
    # CREDIT_HISTORY #
    df['CreditHistory_IO'] = df['Credit_History'].fillna(1.0).astype('int')
    col_drop.append('Credit_History')
    
    # PROPERTY AREA #
    df['PA_Urban'] = df['Property_Area'].apply(lambda x: 1 if x == 'Urban' else 0)
    df['PA_Rural'] = df['Property_Area'].apply(lambda x: 1 if x == 'Rural' else 0)
    df['PA_Semiurban'] = df['Property_Area'].apply(lambda x: 1 if x == 'Semiurban' else 0)
    col_drop.append('Property_Area')
    
    # DEBT/EQUITY #
    df['Debt_Equity'] = (df['LoanAmount2'] * 1000) / df['FamilyIncome']
    df['Debt_Equity_Annual'] = (((df['LoanAmount2'] * 1000) / df['Loan_Amount_Term']) * 12) / df['FamilyIncome']
    col_drop.extend(['FamilyIncome', 'LoanAmount2'])
    
    # LOAN STATUS #
    try:
        df['Loan_Status'].replace('Y', 1, inplace=True)
        df['Loan_Status'].replace('N', 0, inplace=True)
    except KeyError:
        pass
    
    #column cleaning
    df.drop(columns = col_drop, inplace=True)
    df.rename(columns=col_rename, inplace=True)
    return df

In [15]:
def run_all(base, df1, df2):
    #grabs the data
    t1 = data_grab_online(base, df1)
    t2 = data_grab_online(base, df2)
    
    #cleans and imputes data
    orig = t1.copy()
    trc = clean(t1, orig, True)
    tsc = clean(t2, orig)
    
    #weird issues with extra column
    try:
        trc.drop(columns = ['LoanTermGroups'], inplace=True)
    except KeyError:
        pass
    
    return trc, tsc

Below is a function to print model statistics for scoring.

In [16]:
def score(actual, pred, pos, neg):
    #confusion matrix
    matrix = cm(actual, pred, labels = [pos, neg])
    tp, fn, fp, tn = matrix.ravel()
    #matrix statistics
    precision = tp / (tp + fp) # p(correct | predict pos)
    recall = tp / (tp + fn) # p(correct | actual pos)
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    f = (2*recall*precision) / (recall + precision) # f-score
    fpr = fp / (tn + fp) # false positive rate
    aucroc = ra(actual, pred) # auc roc score
    
    print('Accuracy:\t{:.1f}%'.format(accuracy * 100.0))
    print('Precision:\t{:.3f}'.format(precision))
    print('Recall:\t\t{:.3f}'.format(recall))
    print('F-Score:\t{:.3f}'.format(f))
    print('AUC-ROC Score:\t{:.3f}'.format(aucroc))
    print('FPR:\t\t{:.2f}%'.format(fpr * 100.0))
    print('\n\n')
    
    print('Actual (side) vs. Predicted (top)')
    print('\t|  {}  \t|  {}  \t|'.format(pos, neg))
    print('-------------------------')
    print('  {}  \t|  {} \t|  {} \t|'.format(pos, tp, fn))
    print('-------------------------')
    print('  {}  \t|  {} \t|  {} \t|'.format(neg, fp, tn))

***
# Model 2: Random Forest
Did not optimize, but chose max depth of 5 to allow for growth, but not overfit.  Will keep variables once all are above 1% variable importance.

In [17]:
from sklearn.ensemble import RandomForestClassifier as rfc

In [20]:
def model_build(base, tr, tst, col_drop):
    train, test = run_all(base, tr, tst)
    
    cols = train.columns
    
    x_cols = cols.drop(['Loan_ID', 'Loan_Status'] + col_drop)
    y_train = train[['Loan_ID', 'Loan_Status']]
    x_train = train[x_cols]

    y_test = test['Loan_ID']
    x_test = test[x_cols]
    
    model = rfc(random_state=1, max_depth = 5, min_samples_split=20, min_samples_leaf=10)
    model.fit(x_train, y_train.iloc[:, 1])

    features = pd.concat((pd.DataFrame(x_train.columns, columns = ['variable']), 
               pd.DataFrame(model.feature_importances_, columns = ['importance'])), 
              axis = 1).sort_values(by='importance', ascending = False)[:20]
    print('\n', features, '\n')

    y_true = y_train['Loan_Status']
    y_pred = model.predict(x_train)

    score(y_true, y_pred, 1, 0)

In [21]:
model_build(url, train_name, test_name, [])


               variable  importance
8     CreditHistory_IO    0.558941
7        LoanAmountLog    0.099321
12         Debt_Equity    0.084454
13  Debt_Equity_Annual    0.077844
10            PA_Rural    0.061274
11        PA_Semiurban    0.034206
4         Education_IO    0.017165
6        DualIncome_IO    0.015543
0     Loan_Amount_Term    0.014796
9             PA_Urban    0.012948
3           Dependents    0.009631
1              Male_IO    0.007923
5     Self_Employed_IO    0.003048
2           Married_IO    0.002905 

Accuracy:	80.9%
Precision:	0.789
Recall:		0.986
F-Score:	0.877
AUC-ROC Score:	0.701
FPR:		58.39%



Actual (side) vs. Predicted (top)
	|  1  	|  0  	|
-------------------------
  1  	|  352 	|  5 	|
-------------------------
  0  	|  94 	|  67 	|


In [22]:
model_build(url, train_name, test_name, ['Married_IO'])


               variable  importance
7     CreditHistory_IO    0.467994
11         Debt_Equity    0.137768
12  Debt_Equity_Annual    0.116902
6        LoanAmountLog    0.106202
10        PA_Semiurban    0.040389
2           Dependents    0.030527
9             PA_Rural    0.027220
8             PA_Urban    0.016579
5        DualIncome_IO    0.014120
3         Education_IO    0.013052
0     Loan_Amount_Term    0.012610
1              Male_IO    0.009976
4     Self_Employed_IO    0.006660 

Accuracy:	81.7%
Precision:	0.798
Recall:		0.983
F-Score:	0.881
AUC-ROC Score:	0.715
FPR:		55.28%



Actual (side) vs. Predicted (top)
	|  1  	|  0  	|
-------------------------
  1  	|  351 	|  6 	|
-------------------------
  0  	|  89 	|  72 	|


In [23]:
model_build(url, train_name, test_name, ['Married_IO', 'Self_Employed_IO'])


               variable  importance
6     CreditHistory_IO    0.375702
11  Debt_Equity_Annual    0.191188
10         Debt_Equity    0.144948
5        LoanAmountLog    0.123499
3         Education_IO    0.042399
8             PA_Rural    0.039400
9         PA_Semiurban    0.029180
7             PA_Urban    0.017266
2           Dependents    0.016848
4        DualIncome_IO    0.010595
0     Loan_Amount_Term    0.005592
1              Male_IO    0.003383 

Accuracy:	81.1%
Precision:	0.791
Recall:		0.986
F-Score:	0.878
AUC-ROC Score:	0.704
FPR:		57.76%



Actual (side) vs. Predicted (top)
	|  1  	|  0  	|
-------------------------
  1  	|  352 	|  5 	|
-------------------------
  0  	|  93 	|  68 	|


In [24]:
model_build(url, train_name, test_name, ['Married_IO', 'Self_Employed_IO', 'Male_IO'])


               variable  importance
5     CreditHistory_IO    0.503288
9          Debt_Equity    0.166162
4        LoanAmountLog    0.101199
10  Debt_Equity_Annual    0.100488
8         PA_Semiurban    0.045249
7             PA_Rural    0.029898
2         Education_IO    0.015491
0     Loan_Amount_Term    0.014464
1           Dependents    0.012069
6             PA_Urban    0.007897
3        DualIncome_IO    0.003795 

Accuracy:	82.6%
Precision:	0.807
Recall:		0.983
F-Score:	0.886
AUC-ROC Score:	0.731
FPR:		52.17%



Actual (side) vs. Predicted (top)
	|  1  	|  0  	|
-------------------------
  1  	|  351 	|  6 	|
-------------------------
  0  	|  84 	|  77 	|


In [25]:
model_build(url, train_name, test_name, ['Married_IO', 'Self_Employed_IO', 'Male_IO', 'DualIncome_IO'])


              variable  importance
4    CreditHistory_IO    0.592806
8         Debt_Equity    0.159455
9  Debt_Equity_Annual    0.117471
3       LoanAmountLog    0.039792
7        PA_Semiurban    0.023403
1          Dependents    0.023285
6            PA_Rural    0.020546
5            PA_Urban    0.010686
2        Education_IO    0.009294
0    Loan_Amount_Term    0.003262 

Accuracy:	81.7%
Precision:	0.798
Recall:		0.983
F-Score:	0.881
AUC-ROC Score:	0.715
FPR:		55.28%



Actual (side) vs. Predicted (top)
	|  1  	|  0  	|
-------------------------
  1  	|  351 	|  6 	|
-------------------------
  0  	|  89 	|  72 	|


In [26]:
model_build(url, train_name, test_name, ['Married_IO', 'Self_Employed_IO', 'Male_IO', 'DualIncome_IO', 'Loan_Amount_Term'])


              variable  importance
3    CreditHistory_IO    0.532164
8  Debt_Equity_Annual    0.152609
7         Debt_Equity    0.140558
2       LoanAmountLog    0.097669
6        PA_Semiurban    0.029464
5            PA_Rural    0.018447
1        Education_IO    0.017243
0          Dependents    0.006230
4            PA_Urban    0.005616 

Accuracy:	81.5%
Precision:	0.797
Recall:		0.980
F-Score:	0.879
AUC-ROC Score:	0.714
FPR:		55.28%



Actual (side) vs. Predicted (top)
	|  1  	|  0  	|
-------------------------
  1  	|  350 	|  7 	|
-------------------------
  0  	|  89 	|  72 	|


In [27]:
model_build(url, train_name, test_name, ['Married_IO', 'Self_Employed_IO', 'Male_IO', 'DualIncome_IO', 'Loan_Amount_Term'
                                        ,'PA_Urban'])


              variable  importance
3    CreditHistory_IO    0.550604
6         Debt_Equity    0.179375
7  Debt_Equity_Annual    0.107358
2       LoanAmountLog    0.057980
1        Education_IO    0.029953
5        PA_Semiurban    0.029121
4            PA_Rural    0.026613
0          Dependents    0.018996 

Accuracy:	81.9%
Precision:	0.802
Recall:		0.978
F-Score:	0.881
AUC-ROC Score:	0.722
FPR:		53.42%



Actual (side) vs. Predicted (top)
	|  1  	|  0  	|
-------------------------
  1  	|  349 	|  8 	|
-------------------------
  0  	|  86 	|  75 	|


Final model will have the following variables:
* Credit History
* Debt/Equity
* Debt/Equity per Year
* Loan Amount (Logged)
* Education
* Property Area (Semiurban)
* Property Area (Rural)
* Dependents

Model has good accuracy, a good F-score, a decent AUC-ROC score.

Model's false positive rate is high.  If looking to improve, targeting the FPR will likely help.