# Model Build
Use KNN imputation models (forModel data sets) and then build model random forest model using cross validation and hyper param optimization.  Use the same variables as original model. (Debt/Equity, Debt/Equity per Year, Loan Amount (Logged), Education, Property Area (Semiurban), Property Area (Rural), Dependents)

# Imports

In [1]:
import os
import pandas as pd
import requests

from sklearn.metrics import confusion_matrix as cm
from sklearn.metrics import roc_auc_score as ra

from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.model_selection import GridSearchCV as gscv
from sklearn.model_selection import RandomizedSearchCV as rscv

# Variables

In [2]:
url = "https://raw.githubusercontent.com/joepollastrini/Loan-Prediction/master"
train_name = 'forModel_train.csv'
test_name = 'forModel_test.csv'
response = 'Loan_Status'
k = 10 #number of k-folds for cross validation
n = 200 #number of iterations for randomized grid search

# Functions

In [3]:
def data_grab_online(base, name, out='output.csv'):
    #get data from url
    u = base + '/' + name
    r = requests.get(u).content
    df = pd.read_csv(u)
    return df

In [4]:
def score(actual, pred, pos, neg):
    #confusion matrix
    matrix = cm(actual, pred, labels = [pos, neg])
    tp, fn, fp, tn = matrix.ravel()
    #matrix statistics
    precision = tp / (tp + fp) # p(correct | predict pos)
    recall = tp / (tp + fn) # p(correct | actual pos)
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    f = (2*recall*precision) / (recall + precision) # f-score
    fpr = fp / (tn + fp) # false positive rate
    aucroc = ra(actual, pred) # auc roc score
    
    print('Accuracy:\t{:.1f}%'.format(accuracy * 100.0))
    print('Precision:\t{:.3f}'.format(precision))
    print('Recall:\t\t{:.3f}'.format(recall))
    print('F-Score:\t{:.3f}'.format(f))
    print('AUC-ROC Score:\t{:.3f}'.format(aucroc))
    print('FPR:\t\t{:.2f}%'.format(fpr * 100.0))
    print('\n\n')
    
    print('Actual (side) vs. Predicted (top)')
    print('\t|  {}  \t|  {}  \t|'.format(pos, neg))
    print('-------------------------')
    print('  {}  \t|  {} \t|  {} \t|'.format(pos, tp, fn))
    print('-------------------------')
    print('  {}  \t|  {} \t|  {} \t|'.format(neg, fp, tn))

# 1) Gather Data

In [5]:
train = data_grab_online(url, train_name)
test = data_grab_online(url, test_name)

In [10]:
train.columns

Index(['Loan_ID', 'Loan_Status', 'Male_IO', 'Married_IO', 'Dependents',
       'FamilySize', 'Education_IO', 'FamilyIncome', 'DualIncome_IO',
       'PA_Urban', 'PA_Rural', 'PA_Semiurban', 'IncomePerMember',
       'LoanTermGroups', 'Loan_Amount_Term', 'LoanAmountLog', 'LTG_<15',
       'LTG_15', 'LTG_1530', 'LTG_30', 'LTG_>30', 'LoanAmount', 'Debt_Equity',
       'Debt_Equity_Annual', 'Self_Employed_IO', 'Credit_History',
       'income_out_io', 'la_out_io'],
      dtype='object')

In [13]:
train_inlier = train.loc[(train['income_out_io'] == 0) & (train['la_out_io'] == 0)]
test_inlier = test.loc[(test['income_out_io'] == 0) & (test['la_out_io'] == 0)]

# 2) Paramater Tune
Start with a randomized grid to find a rough starting point.  Use the starting points to do a grid search.

In [6]:
random_grid =  {'n_estimators': list(range(50, 250, 10))
               ,'max_features': ['auto'] + list(range(3, 7, 1))
               ,'max_depth': list(range(4, 16, 2))
               ,'min_samples_split': [2, 5, 10, 20]
               ,'min_samples_leaf': [1, 2, 5, 10]
              }

In [14]:
features = ['Credit_History', 'Debt_Equity', 'Debt_Equity_Annual', 'LoanAmountLog', 'FamilyIncome', 'IncomePerMember']

x_train = train_inlier[features]
y_train = train_inlier[response]

In [15]:
model = rfc()
tuner = rscv(estimator = model
            ,param_distributions = random_grid
            ,n_iter = n
            ,cv = k
            ,random_state = 0
            ,verbose = 2
            ,n_jobs = -1)
tuner.fit(x_train, y_train)
print(tuner.best_params_)

Fitting 10 folds for each of 200 candidates, totalling 2000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   24.1s
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:   45.4s
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 1450 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 1977 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 2000 out of 2000 | elapsed:  4.2min finished


{'n_estimators': 210, 'min_samples_split': 20, 'min_samples_leaf': 10, 'max_features': 5, 'max_depth': 6}


In [16]:
grid_search = {'n_estimators': list(range(190, 230, 5))
               ,'max_features': list(range(4, 7, 1))
               ,'max_depth': list(range(4, 8, 1))
               ,'min_samples_split': list(range(16, 24, 2))
               ,'min_samples_leaf': list(range(6, 14, 2))
              }
model = rfc()
tuner = gscv(estimator = model
            ,param_grid = grid_search
            ,cv = k
            ,verbose = 2
            ,n_jobs = -1)
tuner.fit(x_train, y_train)
print(tuner.best_params_)

Fitting 10 folds for each of 1536 candidates, totalling 15360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   22.0s
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:   50.7s
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 1450 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 1977 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 2584 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 3273 tasks      | elapsed:  8.0min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed:  9.9min
[Parallel(n_jobs=-1)]: Done 4893 tasks      | elapsed: 11.9min
[Parallel(n_jobs=-1)]: Done 5824 tasks      | elapsed: 14.3min
[Parallel(n_jobs=-1)]: Done 6837 tasks      | elapsed: 16.9min
[Parallel(n_jobs=-1)]: Done 7930 tasks      | elapsed: 19.9min
[Parallel(n_jobs=-1)]: Done 9105 tasks      | 

{'max_depth': 5, 'max_features': 5, 'min_samples_leaf': 10, 'min_samples_split': 16, 'n_estimators': 195}


# 3) Build Model

In [17]:
d = tuner.best_params_
rf = rfc(n_estimators = d['n_estimators']
        ,max_depth = d['max_depth']
        ,min_samples_split = d['min_samples_split']
        ,min_samples_leaf = d['min_samples_leaf']
        ,max_features = d['max_features'])
rf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features=5, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=10, min_samples_split=16,
                       min_weight_fraction_leaf=0.0, n_estimators=195,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

# 4) Predictions and Score

In [18]:
preds = rf.predict(x_train)
score(y_train, preds, 1, 0)

Accuracy:	83.6%
Precision:	0.822
Recall:		0.974
F-Score:	0.892
AUC-ROC Score:	0.750
FPR:		47.34%



Actual (side) vs. Predicted (top)
	|  1  	|  0  	|
-------------------------
  1  	|  370 	|  10 	|
-------------------------
  0  	|  80 	|  89 	|


In [19]:
feat_imp = pd.concat((pd.DataFrame(x_train.columns, columns = ['variable']), 
               pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
              axis = 1).sort_values(by='importance', ascending = False)[:20]
print('\n', feat_imp, '\n')


              variable  importance
0      Credit_History    0.677214
1         Debt_Equity    0.129431
2  Debt_Equity_Annual    0.068167
5     IncomePerMember    0.058765
4        FamilyIncome    0.038897
3       LoanAmountLog    0.027526 



In [20]:
test_preds = rf.predict(test[features])
predictions = pd.DataFrame(test_preds, columns = ['Predictions'])
test_predictions = pd.concat((test['Loan_ID'], predictions), axis=1)
test_predictions['Loan_Status'] = test_predictions['Predictions'].apply(lambda x: 'Y' if x == 1 else 'N')
test_predictions.drop(columns = ['Predictions'], inplace=True)

In [21]:
direct = os.getcwd()
path = os.path.join(direct, 'submission.csv')
print('writing file to {}'.format(path))
test_predictions.to_csv(path, index=False)

writing file to C:\Users\joepo\Desktop\Project Portfolio\Loan Prediction\submission.csv


Same accuracy score of 0.7847