In [123]:
import requests
import os
import io
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [124]:
# Base URL of the GitHub folder that hosts the project's CSV files.
url = "https://raw.githubusercontent.com/jpollas2/Project-Data/master"
# File names of the training and test loan data; each is appended to `url`
# when the data is downloaded.
train_name = 'train_loan_data.csv'
test_name = 'test_loan_data.csv'

In [125]:
def data_grab_online(base, name, out='output.csv'):
    """Load a CSV located at ``base + '/' + name`` into a DataFrame.

    Parameters
    ----------
    base : str
        Base URL (or directory path) that contains the file. A trailing
        slash is tolerated.
    name : str
        File name to append to ``base``.
    out : str, optional
        Unused. Kept only so existing calls that pass it keep working.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents.
    """
    # pandas fetches URLs itself, so the file is downloaded exactly once
    # (the previous separate requests.get() result was never used).
    source = base.rstrip('/') + '/' + name
    return pd.read_csv(source)

In [126]:
def gender_impute_and_dummy(row):
    """Return a 0/1 male indicator for one record, imputing a missing Gender.

    A known gender maps to 1 for 'Male' and 0 for anything else. When the
    gender is missing, an unmarried applicant with exactly one dependent is
    treated as female (0); every other case is treated as male (1).
    """
    gender = row['Gender']
    if not pd.isnull(gender):
        return 1 if gender == 'Male' else 0

    # Gender is missing: fall back on marital status and dependents.
    unmarried_one_dependent = row['Married'] == 'No' and row['Dependents'] == '1'
    return 0 if unmarried_one_dependent else 1
        
def married_impute_and_dummy(x):
    """Return a 0/1 married indicator for one record, imputing a missing value.

    A known status maps to 1 for 'Yes' and 0 otherwise. When the status is
    missing, records flagged as female (Male_IO == 0) are treated as not
    married (0) and all others as married (1).
    """
    status = x['Married']
    if pd.isnull(status):
        return 0 if x['Male_IO'] == 0 else 1
    return 1 if status == 'Yes' else 0
        
def dependents_impute_and_ordinal(x):
    """Return the number of dependents as an int, imputing a missing value.

    A missing count becomes 1 for married records (Married_IO == 1) and 0
    otherwise. The label '3+' is mapped to 3; any other label is converted
    with int().
    """
    raw = x['Dependents']
    if pd.isnull(raw):
        return 1 if x['Married_IO'] == 1 else 0
    return 3 if raw == '3+' else int(raw)
    
def dual_income_dummy(x):
    """Return 1 when a married record has a positive co-applicant income, else 0."""
    if x['Married_IO'] != 1:
        return 0
    return 1 if x['CoapplicantIncome'] > 0 else 0
    
# bucket loan terms into coarse groups
def loan_group(x):
    """Map a loan term to a group label.

    The term is truncated to an int first. 180 maps to '15' and 360 to '30';
    other values fall into '<15', '(15, 30)' or '>30'.
    """
    months = int(x)
    if months < 180:
        return '<15'
    if months == 180:
        return '15'
    if months < 360:
        return '(15, 30)'
    if months == 360:
        return '30'
    if months > 360:
        return '>30'
    # Not reachable for integer input; kept as a defensive fallback label.
    return '??'
    
# work with the log of the loan amount so that group means are less skewed,
# and group it by loan term so missing amounts can be imputed per group later
def get_transform(df):
    """Add a 'LoanAmountLog' column to ``df`` and return it grouped by loan term.

    ``df`` is modified in place: 'LoanAmountLog' is set to the natural log of
    'LoanAmount' cast to float64. The return value is the groupby object of
    that column keyed by 'LoanTermGroups'.
    """
    log_amount = np.log(df['LoanAmount'].astype('float64'))
    df['LoanAmountLog'] = log_amount
    return df.groupby(['LoanTermGroups'])['LoanAmountLog']


def clean(df, train_orig):
    """Impute missing values and engineer model features for loan records.

    ``df`` is modified in place (columns are added, dropped and renamed) and
    the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw loan records. The columns read are Gender, Married, Dependents,
        Education, Self_Employed, ApplicantIncome, CoapplicantIncome,
        Loan_Amount_Term, LoanAmount and Property_Area.
    train_orig : pandas.DataFrame
        Raw (uncleaned) training records. Only 'Loan_Amount_Term' and
        'LoanAmount' are read, to derive the mean log loan amount of each
        loan-term group used for imputation. This frame is not modified.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the raw categorical columns replaced by indicator /
        ordinal columns, missing loan amounts imputed, and the derived
        FamilySize, FamilyIncome, LoanPerIncome, Income_for_Loan_Annual and
        IncomePerMember columns added.
    """
    col_drop = []
    col_rename = {}

    # GENDER #
    df['Male_IO'] = df.apply(gender_impute_and_dummy, axis=1)
    col_drop.append('Gender')

    # MARRIED # (imputation reads Male_IO, so it must come after GENDER)
    df['Married_IO'] = df.apply(married_impute_and_dummy, axis=1)
    col_drop.append('Married')

    # DEPENDENTS # (imputation reads Married_IO)
    df['Dependents2'] = df.apply(dependents_impute_and_ordinal, axis=1)
    col_drop.append('Dependents')
    col_rename['Dependents2'] = 'Dependents'

    # FAMILY SIZE # applicant + spouse (if married) + dependents
    df['FamilySize'] = df['Dependents2'] + df['Married_IO'] + 1

    # EDUCATION #
    df['Education_IO'] = (df['Education'] == 'Graduate').astype(int)
    col_drop.append('Education')

    # SELF EMPLOYED #
    # Assign the filled column back instead of calling replace(inplace=True)
    # on a column selection, which does not reliably update the frame when
    # pandas Copy-on-Write is active.
    df['Self_Employed'] = df['Self_Employed'].fillna('No')
    df['Self_Employed_IO'] = (df['Self_Employed'] == 'Yes').astype(int)
    col_drop.append('Self_Employed')

    # INCOME #
    df['FamilyIncome'] = df['ApplicantIncome'] + df['CoapplicantIncome']
    df['DualIncome_IO'] = df.apply(dual_income_dummy, axis=1)
    col_drop.extend(['ApplicantIncome', 'CoapplicantIncome'])

    # LOAN TERM # missing terms default to 360
    df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(360.0)
    df['LoanTermGroups'] = df['Loan_Amount_Term'].apply(loan_group)
    col_drop.append('LoanTermGroups')

    # LOAN AMOUNT #
    # Build the reference statistics on a private copy of the training
    # columns so the caller's train_orig frame is left untouched.
    ref = train_orig[['Loan_Amount_Term', 'LoanAmount']].copy()
    ref['Loan_Amount_Term'] = ref['Loan_Amount_Term'].fillna(360.0)
    ref['LoanTermGroups'] = ref['Loan_Amount_Term'].apply(loan_group)
    group_means = get_transform(ref).mean()   # mean log amount per term group
    overall_mean = ref['LoanAmountLog'].mean()

    # Fill only df's OWN missing log amounts, using the training mean of the
    # row's loan-term group. (Assigning the training frame's transformed
    # series directly would overwrite df's amounts with training rows that
    # merely share the same index label.) Groups that have no usable
    # training mean fall back to the overall training mean.
    df['LoanAmountLog'] = np.log(df['LoanAmount'].astype('float64'))
    fill_values = df['LoanTermGroups'].map(group_means).fillna(overall_mean)
    df['LoanAmountLog'] = df['LoanAmountLog'].fillna(fill_values)
    df['LoanAmount2'] = np.exp(df['LoanAmountLog'])
    col_drop.extend(['LoanAmountLog', 'LoanAmount'])
    col_rename['LoanAmount2'] = 'LoanAmount'

    # PROPERTY AREA # one indicator column per area
    df['PA_Urban'] = (df['Property_Area'] == 'Urban').astype(int)
    df['PA_Rural'] = (df['Property_Area'] == 'Rural').astype(int)
    df['PA_Semiurban'] = (df['Property_Area'] == 'Semiurban').astype(int)
    col_drop.append('Property_Area')

    # DEBT/EQUITY #
    # LoanAmount is scaled by 1000 before being compared with income.
    loan_value = df['LoanAmount2'] * 1000
    df['LoanPerIncome'] = loan_value / df['FamilyIncome']
    df['Income_for_Loan_Annual'] = ((loan_value / df['Loan_Amount_Term']) * 12) / df['FamilyIncome']

    # INCOME PER FAMILY MEMBER #
    df['IncomePerMember'] = df['FamilyIncome'] / df['FamilySize']

    # column cleaning: drop the raw / helper columns, then give the imputed
    # replacements their original names
    df.drop(columns=col_drop, inplace=True)
    df.rename(columns=col_rename, inplace=True)
    return df

In [127]:
# Download the raw training and test sets as DataFrames.
train = data_grab_online(url, train_name)
test = data_grab_online(url, test_name)

In [128]:
# Snapshot the raw training data before cleaning: clean() modifies the frame
# it is given in place, and takes the raw training data as its second
# argument as the reference for imputing loan amounts.
orig = train.copy()
train_clean = clean(train, orig)
test_clean = clean(test, orig)

614

In [132]:
# Keep only the rows that have no missing value in any remaining column.
train_forMod = train_clean.dropna()
test_forMod = test_clean.dropna()

***
***
# Model Building

In [133]:
def score(actual, pred, pos, neg):
    """Print classification metrics and a confusion matrix for binary labels.

    Parameters
    ----------
    actual : array-like
        True labels.
    pred : array-like
        Predicted labels.
    pos, neg : label values
        The labels treated as the positive and the negative class.

    Prints accuracy, precision, recall, F-score, AUC-ROC and the false
    positive rate, followed by the confusion matrix (actual labels down the
    side, predicted labels across the top). Returns nothing.
    """
    # With labels=[pos, neg] the flattened matrix reads tp, fn, fp, tn.
    tp, fn, fp, tn = cm(actual, pred, labels = [pos, neg]).ravel()

    precision = tp / (tp + fp)  # p(correct | predict pos)
    recall = tp / (tp + fn)     # p(correct | actual pos)
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    f_score = (2*recall*precision) / (recall + precision)
    fpr = fp / (tn + fp)
    aucroc = ra(actual, pred)

    metric_lines = (
        ('Accuracy:\t{:.1f}%', accuracy * 100.0),
        ('Precision:\t{:.3f}', precision),
        ('Recall:\t\t{:.3f}', recall),
        ('F-Score:\t{:.3f}', f_score),
        ('AUC-ROC Score:\t{:.3f}', aucroc),
        ('FPR:\t\t{:.2f}%', fpr * 100.0),
    )
    for template, value in metric_lines:
        print(template.format(value))
    print('\n\n')

    rule = '-------------------------'
    row_template = '  {}  \t|  {} \t|  {} \t|'
    print('Actual (side) vs. Predicted (top)')
    print('\t|  {}  \t|  {}  \t|'.format(pos, neg))
    for label, first, second in ((pos, tp, fn), (neg, fp, tn)):
        print(rule)
        print(row_template.format(label, first, second))

In [134]:
from sklearn.metrics import confusion_matrix as cm
from sklearn.metrics import roc_auc_score as ra

from scipy import stats
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression as LogReg

from sklearn.ensemble import RandomForestClassifier as rfc

***
## Logistic Regression

In [135]:
# Separate copies of the model-ready data for the logistic regression section.
logReg_train = train_forMod.copy()
logReg_test = test_forMod.copy()

In [138]:
# The target modelled in this section is Credit_History.
y_train = logReg_train['Credit_History']
y_test = logReg_test['Credit_History']

In [172]:
# Predictor columns: every column except the ones dropped here (the ID, the
# target, Loan_Status and the features excluded from this model).
cols = logReg_train.columns
x_cols = cols.drop(['Loan_ID', 'Loan_Status', 'Credit_History', 'Dependents', 'PA_Semiurban', 'Self_Employed_IO'
                   ,'LoanAmount', 'Male_IO', 'FamilySize', 'PA_Rural', 'PA_Urban', 'DualIncome_IO', 'Loan_Amount_Term'
                   ,'Income_for_Loan_Annual', 'LoanPerIncome', 'Married_IO', 'IncomePerMember', 'FamilyIncome'])
# Display the remaining predictor names.
x_cols

Index(['Education_IO'], dtype='object')

In [173]:
# Fit a statsmodels Logit on the selected predictors plus an intercept column
# (add_constant) and print the fitted model's summary.
model = sm.Logit(y_train, sm.add_constant(logReg_train[x_cols])).fit()
print(model.summary())

Optimization terminated successfully.
         Current function value: 0.432856
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:         Credit_History   No. Observations:                  564
Model:                          Logit   Df Residuals:                      562
Method:                           MLE   Df Model:                            1
Date:                Mon, 08 Jun 2020   Pseudo R-squ.:                0.007225
Time:                        15:08:46   Log-Likelihood:                -244.13
converged:                       True   LL-Null:                       -245.91
Covariance Type:            nonrobust   LLR p-value:                   0.05942
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
const            1.2958      0.221      5.854      0.000       0.862       1.730
Education_IO     0.5013

In [184]:
# Fit scikit-learn's LogisticRegression on Education_IO and FamilyIncome.
# Note: `model` is rebound here, replacing the statsmodels fit.
model = LogReg()
model.fit(logReg_train[['Education_IO', 'FamilyIncome']], y_train)
# Predictions are made on the same rows the model was fitted on (in-sample).
preds = model.predict(logReg_train[['Education_IO', 'FamilyIncome']])

# Report the metrics with 1 as the positive class and 0 as the negative class.
score(y_train, preds, 1, 0)

Accuracy:	84.4%
Precision:	0.844
Recall:		1.000
F-Score:	0.915
AUC-ROC Score:	0.506
FPR:		98.88%



Actual (side) vs. Predicted (top)
	|  1  	|  0  	|
-------------------------
  1  	|  475 	|  0 	|
-------------------------
  0  	|  88 	|  1 	|




The model predicts 1 for all but one row (AUC-ROC of only 0.506), so it is basically the exact same as the imputation I already had. Not worth using this model.

***
## Random Forest

In [185]:
# Separate copies of the model-ready data for the random forest section.
rf_train = train_forMod.copy()
rf_test = test_forMod.copy()

In [186]:
# The target is again Credit_History, now taken from the random forest copies.
y_train = rf_train['Credit_History']
y_test = rf_test['Credit_History']

In [213]:
# Predictor columns for the random forest: every column except the ones
# dropped here (the ID, the target, Loan_Status and the excluded features).
cols = rf_train.columns
x_cols = cols.drop(['Loan_ID', 'Loan_Status', 'Credit_History', 'Dependents', 'PA_Semiurban', 'Self_Employed_IO'
                   ,'Male_IO', 'Married_IO', 'Loan_Amount_Term', 'PA_Rural', 'DualIncome_IO','PA_Urban'])
# Display the remaining predictor names.
x_cols

Index(['FamilySize', 'Education_IO', 'FamilyIncome', 'LoanAmount',
       'LoanPerIncome', 'Income_for_Loan_Annual', 'IncomePerMember'],
      dtype='object')

In [214]:
# Random forest with a fixed random_state and limits on tree depth, the
# samples needed to split a node, and the samples required in a leaf.
model = rfc(random_state=1, max_depth = 5, min_samples_split=20, min_samples_leaf=10)
model.fit(rf_train[x_cols], y_train)

# Pair each predictor name with its feature importance, sort from most to
# least important, and keep at most the first 20 rows.
features = pd.concat((pd.DataFrame(rf_train[x_cols].columns, columns = ['variable']), 
           pd.DataFrame(model.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:20]
print('\n', features, '\n')


                  variable  importance
2            FamilyIncome    0.212009
6         IncomePerMember    0.187972
4           LoanPerIncome    0.176010
5  Income_for_Loan_Annual    0.158263
3              LoanAmount    0.155034
1            Education_IO    0.067408
0              FamilySize    0.043304 





In [215]:
# Predictions are made on the same rows the forest was fitted on (in-sample).
preds = model.predict(rf_train[x_cols])

# Report the metrics with 1 as the positive class and 0 as the negative class.
score(y_train, preds, 1, 0)

Accuracy:	84.2%
Precision:	0.842
Recall:		1.000
F-Score:	0.914
AUC-ROC Score:	0.500
FPR:		100.00%



Actual (side) vs. Predicted (top)
	|  1  	|  0  	|
-------------------------
  1  	|  475 	|  0 	|
-------------------------
  0  	|  89 	|  0 	|


Same as my imputation again: the random forest predicts 1 for every row (AUC-ROC = 0.500), so it adds nothing.