This is mostly the exact same as the model build.  At bottom of this, contains setup for prediction of test set and preparation for submission of test set to https://datahack.analyticsvidhya.com/contest/practice-problem-loan-prediction-iii/#SolutionChecker

In [1]:
import os
import pandas as pd
import numpy as np
import requests
from matplotlib.cbook import boxplot_stats 
import warnings
warnings.simplefilter(action='ignore')
from sklearn.metrics import confusion_matrix as cm
from sklearn.metrics import roc_auc_score as ra
from sklearn.ensemble import RandomForestClassifier as rf

In [2]:
url = "https://raw.githubusercontent.com/joepollastrini/Loan-Prediction/master"
train_name = 'train_loan_data.csv'
test_name = 'test_loan_data.csv'
out_name = 'submission rand forest.v4.csv'

In [3]:
def data_grab_online(base, name, out='output.csv'):
    #get data from url
    u = base + '/' + name
    r = requests.get(u).content
    df = pd.read_csv(u)
    return df

In [4]:
def gender_impute_and_dummy(row):
    #if unmarried and 1 dependent female, else male
    if pd.isnull(row['Gender']):
        if row['Married'] == 'No' and row['Dependents'] == '1':
            return 0
        else:
            return 1
    else:
        if row['Gender'] == 'Male':
            return 1
        else:
            return 0

In [5]:
def married_impute_and_dummy(x):
    #if female, not married, otherwise married
    if pd.isnull(x['Married']):
        if x['Gender'] == 'Female':
            return 0
        else:
            return 1
    else:
        if x['Married'] == 'Yes':
            return 1
        else:
            return 0

In [6]:
def dependents_impute_and_ordinal(x):
    #if married, one dependent, otherwise none
    #convert dependents to ordinal int as well
    if pd.isnull(x['Dependents']):
        if x['Married_IO'] == 1:
            return 1
        else:
            return 0
    elif x['Dependents'] == '3+':
        return 3
    else:
        return int(x['Dependents'])

In [7]:
def dual_income_dummy(x):
    if x['Married'] == 'Yes':
        if x['CoapplicantIncome'] > 0:
            return 1
        else:
            return 0
    else:
        return 0

In [8]:
#group loan terms
def loan_group(x):
    x = int(x)
    if x == 360:
        return '30'
    elif x == 180:
        return '15'
    elif x < 180:
        return '<15'
    elif x > 180 and x < 360:
        return '(15, 30)'
    elif x > 360:
        return '>30'
    else:
        return '??'

In [9]:
#log transform loan amount for better mean calculation later
#group by loan term for imputation later
def get_transform(df):
    df['LoanAmountLog'] = np.log(df['LoanAmount'].astype('float64'))
    group = df.groupby(['LoanTermGroups'])['LoanAmountLog']
    return group

In [10]:
def clean(df, train_orig, tf = False):
    col_drop = []
    col_rename = {}
    
    # GENDER #
    df['Male_IO'] = df.apply(lambda x: gender_impute_and_dummy(x), axis=1)
    col_drop.append('Gender')
    
    # MARRIED #
    df['Married_IO'] = df.apply(lambda x: married_impute_and_dummy(x), axis=1)
    col_drop.append('Married')
    
    # DEPENDENTS #
    df['Dependents2'] = df.apply(lambda x: dependents_impute_and_ordinal(x), axis=1)
    col_drop.append('Dependents')
    col_rename['Dependents2'] = 'Dependents'
    
    # EDUCATION #
    df['Education_IO'] = df['Education'].apply(lambda x: 1 if x == 'Graduate' else 0)
    col_drop.append('Education')
    
    # SELF EMPLOYED #
    df['Self_Employed'].replace(np.nan, 'No', inplace=True)
    df['Self_Employed_IO'] = df['Self_Employed'].apply(lambda x: 1 if x == 'Yes' else 0)
    col_drop.append('Self_Employed')

    # INCOME #
    df['FamilyIncome'] = df['ApplicantIncome'] + df['CoapplicantIncome']
    df['DualIncome_IO'] = df.apply(lambda x: dual_income_dummy(x), axis=1)
    col_drop.extend(['ApplicantIncome', 'CoapplicantIncome'])
    if tf == True:
        box = boxplot_stats(df['FamilyIncome'])[0]
        lowVal = box['whislo']
        highVal = box['whishi']        
        #training set, remove outliers
        df['income_out'] = df['FamilyIncome'].apply(lambda x: 1 if (x < lowVal or x > highVal) else 0)
        df = df.loc[df['income_out'] == 0]
        df.reset_index(inplace=True, drop=True)
        col_drop.append('income_out')
    else:
        pass
    
    # LOAN TERM #
    df['Loan_Amount_Term'].replace(np.nan, 360.0, inplace=True)
    df['LoanTermGroups'] = df['Loan_Amount_Term'].apply(lambda x: loan_group(x))
    
    train_orig['Loan_Amount_Term'].replace(np.nan, 360.0, inplace=True)
    train_orig['LoanTermGroups'] = train_orig['Loan_Amount_Term'].apply(lambda x: loan_group(x))
    col_drop.append('LoanTermGroups')
    
    # LOAN AMOUNT #
    df['LoanAmountLog'] = np.log(df['LoanAmount'].astype('float64'))
    t = get_transform(train_orig)
    df['LoanAmountLog'] = t.transform(lambda x: x.fillna(x.mean()))
    df['LoanAmount2'] = np.exp(df['LoanAmountLog'])
    if tf == True:
        box = boxplot_stats(df['LoanAmount2'])[0]
        lowVal = box['whislo']
        highVal = box['whishi']
        df['loan_out'] = df['LoanAmount2'].apply(lambda x: 1 if (x < lowVal or x > highVal) else 0)
        df = df.loc[df['loan_out'] == 0]
        df.reset_index(inplace=True, drop=True)
        col_drop.append('loan_out')
    else:
        pass
    col_drop.append('LoanAmount')
    
    # CREDIT_HISTORY #
    df['CreditHistory_IO'] = df['Credit_History'].fillna(1.0).astype('int')
    col_drop.append('Credit_History')
    
    # PROPERTY AREA #
    df['PA_Urban'] = df['Property_Area'].apply(lambda x: 1 if x == 'Urban' else 0)
    df['PA_Rural'] = df['Property_Area'].apply(lambda x: 1 if x == 'Rural' else 0)
    df['PA_Semiurban'] = df['Property_Area'].apply(lambda x: 1 if x == 'Semiurban' else 0)
    col_drop.append('Property_Area')
    
    # DEBT/EQUITY #
    df['Debt_Equity'] = (df['LoanAmount2'] * 1000) / df['FamilyIncome']
    df['Debt_Equity_Annual'] = (((df['LoanAmount2'] * 1000) / df['Loan_Amount_Term']) * 12) / df['FamilyIncome']
    col_drop.extend(['FamilyIncome', 'LoanAmount2'])
    
    # LOAN STATUS #
    try:
        df['Loan_Status'].replace('Y', 1, inplace=True)
        df['Loan_Status'].replace('N', 0, inplace=True)
    except KeyError:
        pass
    
    #column cleaning
    df.drop(columns = col_drop, inplace=True)
    df.rename(columns=col_rename, inplace=True)
    return df

In [11]:
def score(actual, pred, pos, neg):
    matrix = cm(actual, pred, labels = [pos, neg])
    tp, fn, fp, tn = matrix.ravel()
    precision = tp / (tp + fp) # p(correct | predict pos)
    recall = tp / (tp + fn) # p(correct | actual pos)
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    f = (2*recall*precision) / (recall + precision)
    fpr = fp / (tn + fp)
    aucroc = ra(actual, pred)
    
    print('Accuracy:\t{:.1f}%'.format(accuracy * 100.0))
    print('Precision:\t{:.3f}'.format(precision))
    print('Recall:\t\t{:.3f}'.format(recall))
    print('F-Score:\t{:.3f}'.format(f))
    print('AUC-ROC Score:\t{:.3f}'.format(aucroc))
    print('FPR:\t\t{:.2f}%'.format(fpr * 100.0))
    print('\n\n')
    
    print('Actual (side) vs. Predicted (top)')
    print('\t|  {}  \t|  {}  \t|'.format(pos, neg))
    print('-------------------------')
    print('  {}  \t|  {} \t|  {} \t|'.format(pos, tp, fn))
    print('-------------------------')
    print('  {}  \t|  {} \t|  {} \t|'.format(neg, fp, tn))

***
# 1) Gather Data

In [12]:
#gather data
train = data_grab_online(url, train_name)
test = data_grab_online(url, test_name)

***
# 2) Impute Missing Values and Clean

In [13]:
train_orig = train.copy()
train = clean(train, train_orig, True)
test = clean(test, train_orig)

# 3) Build Model

In [14]:
cols = train.columns
x_cols = cols.drop(['Loan_ID', 'Loan_Status', 'Married_IO', 'Self_Employed_IO', 'Male_IO', 'DualIncome_IO'
                   , 'Loan_Amount_Term', 'PA_Urban'])

y_train = train[['Loan_ID', 'Loan_Status']]
x_train = train[x_cols]

y_test = test['Loan_ID']
x_test = test[x_cols]

model = rf(random_state=1, max_depth = 5, min_samples_split=20, min_samples_leaf=10)
model.fit(x_train, y_train.iloc[:, 1])


features = pd.concat((pd.DataFrame(x_train.columns, columns = ['variable']), 
           pd.DataFrame(model.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:20]
print('\n', features, '\n')


              variable  importance
3    CreditHistory_IO    0.550604
6         Debt_Equity    0.179375
7  Debt_Equity_Annual    0.107358
2       LoanAmountLog    0.057980
1        Education_IO    0.029953
5        PA_Semiurban    0.029121
4            PA_Rural    0.026613
0          Dependents    0.018996 



# 4) Score Model

In [15]:
y_true = y_train['Loan_Status']
y_pred = model.predict(x_train)

In [16]:
score(y_true, y_pred, 1, 0)

Accuracy:	81.9%
Precision:	0.802
Recall:		0.978
F-Score:	0.881
AUC-ROC Score:	0.722
FPR:		53.42%



Actual (side) vs. Predicted (top)
	|  1  	|  0  	|
-------------------------
  1  	|  349 	|  8 	|
-------------------------
  0  	|  86 	|  75 	|


# 5) Predict Test Values

In [17]:
test_pred = model.predict(x_test)
predictions = pd.DataFrame(test_pred, columns=['Loan Status Prediction'])
test_predictions = pd.concat((y_test, predictions), axis=1)

# 6) Prepare for Submission

In [18]:
test_predictions['Loan_Status'] = test_predictions['Loan Status Prediction'].apply(lambda x: 'Y' if x==1 else 'N')
test_predictions.drop(columns = ['Loan Status Prediction'], inplace=True)


direct = os.getcwd()
path = os.path.join(direct, out_name)
print('writing file to '+path)
test_predictions.to_csv(path, index=False)

writing file to C:\Users\joepo\Desktop\Project Portfolio\Loan Prediction\To Publish\submission rand forest.v4.csv
