This model will utilize the data set that KNN models were utilized to impute missing values, instead of the mode as used previously.  It will also attempt to improve the random forest model by utilizing cross validation and hyperparameter optimization.

The benchmark rate to beat is 0.7847

# Imports

In [18]:
import os
import pandas as pd
import requests

from sklearn.metrics import confusion_matrix as cm
from sklearn.metrics import roc_auc_score as ra

from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.model_selection import RandomizedSearchCV as rscv
from sklearn.model_selection import GridSearchCV as gscv
from sklearn.feature_selection import SelectFromModel as sfm

from IPython.display import Audio  # remove later

# Variables

In [20]:
url = "https://raw.githubusercontent.com/joepollastrini/Loan-Prediction/master"
train_name = 'forModel_train.csv'
test_name = 'forModel_test.csv'
response = 'Loan_Status'

sound = 'C:\Windows\Media\Alarm07.wav'  # remove later

# Functions

In [3]:
def data_grab_online(base, name, out='output.csv'):
    #get data from url
    u = base + '/' + name
    r = requests.get(u).content
    df = pd.read_csv(u)
    return df

In [4]:
def tuning(rg, n, k, seed, drop, data, y, keep=False, rand=True):
    #create model and random search grid
    model = rfc()
    if rand==True:
        tuner = rscv(estimator = model
                    ,param_distributions = rg
                    ,n_iter = n
                    ,cv = k
                    ,random_state = seed
                    ,verbose = 0
                    ,n_jobs = -1)
    else:
        tuner = gscv(estimator = model
                     ,param_grid = rg
                     ,cv = k
                     ,verbose = 0
                     ,n_jobs = -1)
    
    if keep == False:
        #get data for model build
        allCols = data.columns
        x_cols = allCols.drop(drop)
        x_train = data[x_cols]
        y_train = data[y]
    else:
        x_train = data[drop]
        y_train = data[y]
    
    #fit model randomly
    tuner.fit(x_train, y_train)
    
    #return best model
    return tuner.best_params_, x_train, y_train

In [5]:
def model_build(pg, data, cols, y):
    model = rfc(n_estimators = pg['n_estimators']
               ,max_depth = pg['max_depth']
               ,min_samples_split = pg['min_samples_split']
               ,min_samples_leaf = pg['min_samples_leaf'])
    
    x_train = data[cols]
    y_train = data[y]
    model.fit(x_train, y_train)
    preds = model.predict(x_train)
    
    return model, preds

In [6]:
def score(actual, pred, pos, neg):
    matrix = cm(actual, pred, labels = [pos, neg])
    tp, fn, fp, tn = matrix.ravel()
    precision = tp / (tp + fp) # p(correct | predict pos)
    recall = tp / (tp + fn) # p(correct | actual pos)
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    f = (2*recall*precision) / (recall + precision)
    fpr = fp / (tn + fp)
    aucroc = ra(actual, pred)
    
    print('Accuracy:\t{:.1f}%'.format(accuracy * 100.0))
    print('Precision:\t{:.3f}'.format(precision))
    print('Recall:\t\t{:.3f}'.format(recall))
    print('F-Score:\t{:.3f}'.format(f))
    print('AUC-ROC Score:\t{:.3f}'.format(aucroc))
    print('FPR:\t\t{:.2f}%'.format(fpr * 100.0))
    print('\n\n')
    
    print('Actual (side) vs. Predicted (top)')
    print('\t|  {}  \t|  {}  \t|'.format(pos, neg))
    print('-------------------------')
    print('  {}  \t|  {} \t|  {} \t|'.format(pos, tp, fn))
    print('-------------------------')
    print('  {}  \t|  {} \t|  {} \t|'.format(neg, fp, tn))

# 1) Gather Data

In [7]:
train = data_grab_online(url, train_name)
test = data_grab_online(url, test_name)

***
### Data Set 1

Keep outliers, use Loan Amount, Dependents, and LoanAmountTerm

# 2) Parameter Tuning

In [8]:
random_grid = {'n_estimators': list(range(50, 250, 10))
               ,'max_features': ['auto'] + list(range(4, 10, 1))
               ,'max_depth': list(range(4, 16, 2))
               ,'min_samples_split': [2, 5, 10, 20]
               ,'min_samples_leaf': [1, 2, 5, 10]
              }
dropCols = ['Loan_ID', response, 'FamilySize', 'LoanAmountLog', 'LoanTermGroups'
            ,'LTG_<15', 'LTG_15', 'LTG_1530', 'LTG_30', 'LTG_>30']
    
param_start, x_t, y_t = tuning(random_grid, 200, 10, 0, dropCols, train, response)

In [9]:
param_start

{'n_estimators': 110,
 'min_samples_split': 10,
 'min_samples_leaf': 10,
 'max_features': 'auto',
 'max_depth': 10}

# 2b) Parameter Fine Tuning

In [12]:
random_grid = {'n_estimators': list(range(90, 130, 5))
               ,'max_depth': list(range(7, 13, 1))
               ,'min_samples_split': list(range(7, 13, 1))
               ,'min_samples_leaf': list(range(7, 13, 1))
              }
dropCols = ['Loan_ID', response, 'FamilySize', 'LoanAmountLog', 'LoanTermGroups'
            ,'LTG_<15', 'LTG_15', 'LTG_1530', 'LTG_30', 'LTG_>30']

param_finer, x_t, y_t = tuning(random_grid, 200, 10, 0, dropCols, train, response, rand=False)
param_finer

{'max_depth': 7,
 'min_samples_leaf': 8,
 'min_samples_split': 11,
 'n_estimators': 115}

# 3) Feature Selection

In [14]:
sel = sfm(rfc(n_estimators = param_finer['n_estimators']
              ,min_samples_split = param_finer['min_samples_split']
              ,min_samples_leaf = param_finer['min_samples_leaf']
              ,max_depth = param_finer['max_depth']
              ,verbose=0, n_jobs=-1))
sel.fit(x_t, y_t)
feats = x_t.columns[sel.get_support()]

In [15]:
feats

Index(['FamilyIncome', 'IncomePerMember', 'LoanAmount', 'Debt_Equity',
       'Debt_Equity_Annual', 'Credit_History'],
      dtype='object')

# 4) Re tune with selected variables

In [16]:
random_grid = {'n_estimators': list(range(95, 130, 5))
               ,'max_depth': list(range(5, 10, 1))
               ,'min_samples_split': list(range(8, 14, 1))
               ,'min_samples_leaf': list(range(5, 11, 1))
              }
    
params_v2, x_t, y_t = tuning(random_grid, 200, 10, 0, list(feats), train, response, keep=True)
params_v2

{'n_estimators': 100,
 'min_samples_split': 8,
 'min_samples_leaf': 10,
 'max_depth': 5}

# 4b) Parameter Fine Tuning

In [18]:
random_grid = {'n_estimators': list(range(90, 110, 5))
               ,'max_depth': list(range(4, 6, 1))
               ,'min_samples_split': list(range(6,10,1))
               ,'min_samples_leaf': list(range(8,12,1))
              }
    
final_params, x_t, y_t = tuning(random_grid, 200, 10, 0, list(feats), train, response, keep=True, rand=False)
final_params

{'max_depth': 4,
 'min_samples_leaf': 11,
 'min_samples_split': 9,
 'n_estimators': 100}

# 5) Model Build and Score

In [19]:
la_outliers, predictions = model_build(final_params, train, list(feats), response)

In [20]:
score(train[response], predictions, 1, 0)

Accuracy:	82.2%
Precision:	0.807
Recall:		0.974
F-Score:	0.883
AUC-ROC Score:	0.732
FPR:		51.04%



Actual (side) vs. Predicted (top)
	|  1  	|  0  	|
-------------------------
  1  	|  411 	|  11 	|
-------------------------
  0  	|  98 	|  94 	|


In [22]:
features = pd.concat((pd.DataFrame(train[list(feats)].columns, columns = ['variable']), 
               pd.DataFrame(la_outliers.feature_importances_, columns = ['importance'])), 
              axis = 1).sort_values(by='importance', ascending = False)[:20]
print('\n', features, '\n')


              variable  importance
5      Credit_History    0.614218
3         Debt_Equity    0.112968
1     IncomePerMember    0.083125
0        FamilyIncome    0.075190
4  Debt_Equity_Annual    0.070806
2          LoanAmount    0.043694 



# 6) Predict test values

In [30]:
preds = la_outliers.predict(test[list(feats)])
predictions = pd.DataFrame(preds, columns = ['Predictions'])
test_predictions = pd.concat((test['Loan_ID'], predictions), axis=1)
test_predictions['Loan_Status'] = test_predictions['Predictions'].apply(lambda x: 'Y' if x == 1 else 'N')
test_predictions.drop(columns = ['Predictions'], inplace=True)

In [31]:
direct = os.getcwd()
path = os.path.join(direct, 'sub.csv')
print('writing file to {}'.format(path))
test_predictions.to_csv(path, index=False)

writing file to C:\Users\joepo\Desktop\Project Portfolio\Loan Prediction\sub.csv


Test accuracy 0.78472 (from analytics vidhaya)

***
### Data Set 2
Replace LoanAmountLog with LoanAmount

# 2) Parameter Tuning

In [23]:
random_grid = {'n_estimators': list(range(50, 250, 10))
               ,'max_features': ['auto'] + list(range(4, 10, 1))
               ,'max_depth': list(range(4, 16, 2))
               ,'min_samples_split': [2, 5, 10, 20]
               ,'min_samples_leaf': [1, 2, 5, 10]
              }
dropCols = ['Loan_ID', response, 'FamilySize', 'LoanAmount', 'LoanTermGroups', 'Loan_Amount_Term']
    
param_start, x_t, y_t = tuning(random_grid, 200, 10, 0, dropCols, train, response)



In [28]:
param_start

{'n_estimators': 50,
 'min_samples_split': 10,
 'min_samples_leaf': 10,
 'max_features': 6,
 'max_depth': 10}

# 2b) Parameter Fine Tuning

In [32]:
random_grid = {'n_estimators': list(range(30, 70, 5))
               ,'max_features': list(range(3, 9, 1))
               ,'max_depth': list(range(6, 14, 2))
               ,'min_samples_split': list(range(6, 14, 2))
               ,'min_samples_leaf': list(range(6, 14, 2))
              }
dropCols = ['Loan_ID', response, 'FamilySize', 'LoanAmount', 'LoanTermGroups', 'Loan_Amount_Term']

param_finer, x_t, y_t = tuning(random_grid, 200, 10, 0, dropCols, train, response, rand=False)
param_finer



{'max_depth': 10,
 'max_features': 3,
 'min_samples_leaf': 6,
 'min_samples_split': 6,
 'n_estimators': 40}

# 3) Feature Selection

In [33]:
sel = sfm(rfc(n_estimators = param_finer['n_estimators']
              ,min_samples_split = param_finer['min_samples_split']
              ,min_samples_leaf = param_finer['min_samples_leaf']
              ,max_features = param_finer['max_features']
              ,max_depth = param_finer['max_depth']
              ,verbose=0, n_jobs=-1))
sel.fit(x_t, y_t)
feats = x_t.columns[sel.get_support()]

# 4) Re tune with selected variables

In [34]:
random_grid = {'n_estimators': list(range(35, 55, 5))
               ,'max_depth': list(range(8, 12, 1))
               ,'min_samples_split': list(range(4, 8, 1))
               ,'min_samples_leaf': list(range(4, 8, 1))
              }
    
params_v2, x_t, y_t = tuning(random_grid, 200, 10, 0, list(feats), train, response, keep=True)
params_v2

{'n_estimators': 45,
 'min_samples_split': 5,
 'min_samples_leaf': 7,
 'max_depth': 8}

# 4b) Parameter Fine Tuning

In [35]:
random_grid = {'n_estimators': list(range(43, 47, 1))
               ,'max_depth': list(range(6, 10, 1))
               ,'min_samples_split': list(range(4,6, 1))
               ,'min_samples_leaf': list(range(5, 9, 1))
              }
    
final_params, x_t, y_t = tuning(random_grid, 200, 10, 0, list(feats), train, response, keep=True, rand=False)
final_params

{'max_depth': 6,
 'min_samples_leaf': 7,
 'min_samples_split': 4,
 'n_estimators': 46}

# 5) Model Build and Score

In [36]:
laLog_outliers, predictions = model_build(final_params, train, list(feats), response)

In [37]:
score(train[response], predictions, 1, 0)

Accuracy:	82.7%
Precision:	0.813
Recall:		0.972
F-Score:	0.886
AUC-ROC Score:	0.741
FPR:		48.96%



Actual (side) vs. Predicted (top)
	|  1  	|  0  	|
-------------------------
  1  	|  410 	|  12 	|
-------------------------
  0  	|  94 	|  98 	|


In [39]:
features = pd.concat((pd.DataFrame(train[list(feats)].columns, columns = ['variable']), 
               pd.DataFrame(laLog_outliers.feature_importances_, columns = ['importance'])), 
              axis = 1).sort_values(by='importance', ascending = False)[:20]
print('\n', features, '\n')


              variable  importance
5      Credit_History    0.583363
3         Debt_Equity    0.122266
0        FamilyIncome    0.098940
4  Debt_Equity_Annual    0.075175
1     IncomePerMember    0.069348
2       LoanAmountLog    0.050908 



# 6) Predict Test Values

In [40]:
preds = laLog_outliers.predict(test[list(feats)])
predictions = pd.DataFrame(preds, columns = ['Predictions'])
test_predictions = pd.concat((test['Loan_ID'], predictions), axis=1)
test_predictions['Loan_Status'] = test_predictions['Predictions'].apply(lambda x: 'Y' if x == 1 else 'N')
test_predictions.drop(columns = ['Predictions'], inplace=True)

In [41]:
direct = os.getcwd()
path = os.path.join(direct, 'sub.csv')
print('writing file to {}'.format(path))
test_predictions.to_csv(path, index=False)

writing file to C:\Users\joepo\Desktop\Project Portfolio\Loan Prediction\sub.csv


Test accuracy of 0.78472

***
### Data set 3
Remove outliers, use same vars as data set 1

# 1b) Remove outliers

In [9]:
train_inlier = train.loc[(train['income_out_io'] == 0) & (train['la_out_io'] == 0)]

# 2) Parameter Tuning

In [11]:
random_grid = {'n_estimators': list(range(50, 250, 10))
               ,'max_features': ['auto'] + list(range(4, 10, 1))
               ,'max_depth': list(range(4, 16, 2))
               ,'min_samples_split': [2, 5, 10, 20]
               ,'min_samples_leaf': [1, 2, 5, 10]
              }
dropCols = ['Loan_ID', response, 'FamilySize', 'LoanAmountLog', 'LoanTermGroups'
            ,'LTG_<15', 'LTG_15', 'LTG_1530', 'LTG_30', 'LTG_>30']
    
param_start, x_t, y_t = tuning(random_grid, 200, 10, 0, dropCols, train_inlier, response)

In [13]:
param_start

{'n_estimators': 230,
 'min_samples_split': 5,
 'min_samples_leaf': 10,
 'max_features': 8,
 'max_depth': 8}

# 2b) Parameter Fine Tuning

In [16]:
random_grid = {'n_estimators': list(range(200, 250, 10))
               ,'max_features': list(range(6, 12, 2))
               ,'max_depth': list(range(6, 12, 2))
               ,'min_samples_split': list(range(3, 9, 2))
               ,'min_samples_leaf': list(range(6, 14, 2))
              }
dropCols = ['Loan_ID', response, 'FamilySize', 'LoanAmount', 'LoanTermGroups', 'Loan_Amount_Term']

param_finer, x_t, y_t = tuning(random_grid, 200, 10, 0, dropCols, train_inlier, response, rand=False)
param_finer

{'max_depth': 10,
 'max_features': 8,
 'min_samples_leaf': 8,
 'min_samples_split': 3,
 'n_estimators': 210}

# 3) Feature Selection

In [17]:
sel = sfm(rfc(n_estimators = param_finer['n_estimators']
              ,min_samples_split = param_finer['min_samples_split']
              ,min_samples_leaf = param_finer['min_samples_leaf']
              ,max_features = param_finer['max_features']
              ,max_depth = param_finer['max_depth']
              ,verbose=0, n_jobs=-1))
sel.fit(x_t, y_t)
feats = x_t.columns[sel.get_support()]

# 4) Retune with selected variables

In [23]:
random_grid = {'n_estimators': list(range(195, 225, 5))
               ,'max_depth': list(range(8, 12, 2))
               ,'min_samples_split': list(range(2, 5, 1))
               ,'min_samples_leaf': list(range(6, 12, 2))
              }
    
params_v2, x_t, y_t = tuning(random_grid, 200, 10, 0, list(feats), train_inlier, response, keep=True)
params_v2



{'n_estimators': 205,
 'min_samples_split': 2,
 'min_samples_leaf': 10,
 'max_depth': 8}

# 4b) Parameter Fine Tuning

In [25]:
random_grid = {'n_estimators': list(range(195, 215, 5))
               ,'max_depth': list(range(6, 10, 1))
               ,'min_samples_split': list(range(2, 5, 1))
               ,'min_samples_leaf': list(range(8, 12, 1))
              }
    
final_params, x_t, y_t = tuning(random_grid, 200, 10, 0, list(feats), train_inlier, response, keep=True, rand=False)
final_params

{'max_depth': 6,
 'min_samples_leaf': 9,
 'min_samples_split': 3,
 'n_estimators': 210}

# 5) Model Build and Score

In [27]:
la_inliers, predictions = model_build(final_params, train_inlier, list(feats), response)

In [29]:
score(train_inlier[response], predictions, 1, 0)

Accuracy:	83.4%
Precision:	0.820
Recall:		0.974
F-Score:	0.890
AUC-ROC Score:	0.747
FPR:		47.93%



Actual (side) vs. Predicted (top)
	|  1  	|  0  	|
-------------------------
  1  	|  370 	|  10 	|
-------------------------
  0  	|  81 	|  88 	|


In [30]:
features = pd.concat((pd.DataFrame(train_inlier[list(feats)].columns, columns = ['variable']), 
               pd.DataFrame(la_inliers.feature_importances_, columns = ['importance'])), 
              axis = 1).sort_values(by='importance', ascending = False)[:20]
print('\n', features, '\n')


              variable  importance
5      Credit_History    0.585317
3         Debt_Equity    0.134114
4  Debt_Equity_Annual    0.082525
1     IncomePerMember    0.072456
0        FamilyIncome    0.071426
2       LoanAmountLog    0.054162 



Test accuracy is 0.7847

***
### Data Set 4
Remove outliers (use train_inlier), use same cols as data set 2

# 2) Parameter Tuning

In [31]:
random_grid = {'n_estimators': list(range(50, 250, 10))
               ,'max_features': ['auto'] + list(range(4, 10, 1))
               ,'max_depth': list(range(4, 16, 2))
               ,'min_samples_split': [2, 5, 10, 20]
               ,'min_samples_leaf': [1, 2, 5, 10]
              }
dropCols = ['Loan_ID', response, 'FamilySize', 'LoanAmount', 'LoanTermGroups', 'Loan_Amount_Term']
    
param_start, x_t, y_t = tuning(random_grid, 200, 10, 0, dropCols, train_inlier, response)

In [32]:
param_start

{'n_estimators': 50,
 'min_samples_split': 20,
 'min_samples_leaf': 10,
 'max_features': 6,
 'max_depth': 14}

# 2b) Parameter Fine Tuning

In [34]:
random_grid = {'n_estimators': list(list(range(30, 90, 10)))
               ,'max_features': list(range(4, 10, 2))
               ,'max_depth': list(range(10, 20, 2))
               ,'min_samples_split': list(range(16, 24, 2))
               ,'min_samples_leaf': list(range(6, 14, 2))
              }
dropCols = ['Loan_ID', response, 'FamilySize', 'LoanAmount', 'LoanTermGroups', 'Loan_Amount_Term']

param_finer, x_t, y_t = tuning(random_grid, 200, 10, 0, dropCols, train_inlier, response, rand=False)
param_finer

{'max_depth': 10,
 'max_features': 8,
 'min_samples_leaf': 8,
 'min_samples_split': 22,
 'n_estimators': 30}

# 3) Feature Selection

In [36]:
sel = sfm(rfc(n_estimators = param_finer['n_estimators']
              ,min_samples_split = param_finer['min_samples_split']
              ,min_samples_leaf = param_finer['min_samples_leaf']
              ,max_features = param_finer['max_features']
              ,max_depth = param_finer['max_depth']
              ,verbose=0, n_jobs=-1))
sel.fit(x_t, y_t)
feats = x_t.columns[sel.get_support()]

# 4) Retune with Selected Variables

In [37]:
random_grid = {'n_estimators': list(range(30, 120, 10))
               ,'max_depth': list(range(6, 14, 2))
               ,'min_samples_split': list(range(14, 28, 4))
               ,'min_samples_leaf': list(range(4, 14, 2))
              }
    
params_v2, x_t, y_t = tuning(random_grid, 200, 10, 0, list(feats), train_inlier, response, keep=True)
params_v2

{'n_estimators': 30,
 'min_samples_split': 22,
 'min_samples_leaf': 12,
 'max_depth': 10}

# 4b) Parameter Fine Tuning

In [39]:
random_grid = {'n_estimators': list(range(20, 40, 5))
               ,'max_depth': list(range(8, 12, 1))
               ,'min_samples_split': list(range(20, 24, 1))
               ,'min_samples_leaf': list(range(10, 14, 1))
              }
    
final_params, x_t, y_t = tuning(random_grid, 200, 10, 0, list(feats), train_inlier, response, keep=True, rand=False)
final_params

{'max_depth': 9,
 'min_samples_leaf': 10,
 'min_samples_split': 21,
 'n_estimators': 20}

# 5) Model Build and Score

In [41]:
laLog_inliers, predictions = model_build(final_params, train_inlier, list(feats), response)

In [43]:
score(train_inlier[response], predictions, 1, 0)

Accuracy:	83.6%
Precision:	0.821
Recall:		0.976
F-Score:	0.892
AUC-ROC Score:	0.749
FPR:		47.93%



Actual (side) vs. Predicted (top)
	|  1  	|  0  	|
-------------------------
  1  	|  371 	|  9 	|
-------------------------
  0  	|  81 	|  88 	|


In [44]:
features = pd.concat((pd.DataFrame(train_inlier[list(feats)].columns, columns = ['variable']), 
               pd.DataFrame(laLog_inliers.feature_importances_, columns = ['importance'])), 
              axis = 1).sort_values(by='importance', ascending = False)[:20]
print('\n', features, '\n')


              variable  importance
5      Credit_History    0.496121
4  Debt_Equity_Annual    0.136619
3         Debt_Equity    0.117723
0        FamilyIncome    0.091333
1     IncomePerMember    0.088936
2       LoanAmountLog    0.069268 



Test accuracy is 0.7847

Exact same accuracy, but let's implement in the final model to get away from mode imputation and use an actual model