In [2]:
import os
from os.path import normpath, join
import csv
import random
from math import log, log10, floor

import pandas as pd
import numpy as np

from sklearn.metrics import roc_curve, auc
from sklearn.linear_model import LogisticRegression


In [3]:
# Resolve the input CSVs relative to the notebook's working directory so the
# paths printed below are exactly the files that get loaded.
CURRENT_DIRECTORY = os.getcwd()
TRAIN_CSV_NAME = "LoanStats3a-split-train.csv"
TEST_CSV_NAME = "LoanStats3a-split-test.csv"
train_filepath = normpath(join(CURRENT_DIRECTORY, TRAIN_CSV_NAME))
test_filepath = normpath(join(CURRENT_DIRECTORY, TEST_CSV_NAME))

print( "train_filepath: %s" % train_filepath)
print( "test_filepath: %s" % test_filepath)

# Read from the resolved paths rather than repeating the file-name literals,
# so renaming a CSV only requires touching the constants above.
training = pd.read_csv(train_filepath)
test = pd.read_csv(test_filepath)

train_filepath: /Users/jgroob/Documents/Resume/2017/Assignments/Octane Lending/lending-club-problem-11-07-2017/LoanStats3a-split-train.csv
test_filepath: /Users/jgroob/Documents/Resume/2017/Assignments/Octane Lending/lending-club-problem-11-07-2017/LoanStats3a-split-test.csv


In [4]:
# Create features from raw data

In [5]:
def df_to_features(df):
    """Build the modeling frame (target + engineered features) from raw loans.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw loan records. The columns read are ``loan_status``, ``dti``,
        ``annual_inc``, ``revol_util``, ``revol_bal``,
        ``mths_since_last_delinq``, ``home_ownership``, ``purpose``,
        ``emp_length``, ``emp_title`` and ``term``.

    Returns
    -------
    pandas.DataFrame
        One row per input row (same index) holding ``is_default`` plus the
        engineered features. The one-hot ``home_*`` and ``purpose_*`` columns
        are always emitted for the full fixed list of levels, so frames built
        from different splits share the same columns.

    Raises
    ------
    Exception
        If a ``loan_status`` value is not "Charged Off", "Default" or
        "Fully Paid".
    """
    # Fixed level lists (kept in sorted order, matching the column order
    # get_dummies produces when every level is present in the data).
    home_levels = ["MORTGAGE", "OWN", "RENT"]
    purpose_levels = ["car", "credit_card", "debt_consolidation",
                      "home_improvement", "major_purchase", "other",
                      "small_business"]

    new_df = pd.DataFrame()

    # Prediction Variable
    def is_default(status):
        if status in {"Charged Off", "Default"}:
            return 1.0
        elif status == "Fully Paid":
            return 0.0
        # Report the offending value itself; the previous code referenced an
        # undefined name here and failed with NameError instead.
        raise Exception("Invalid status: %s" % status)

    def one_hot(column, levels, prefix):
        # Values outside `levels` become missing and therefore get an all-zero
        # row; casting to a fixed categorical dtype guarantees one column per
        # level even when a level never occurs in this particular frame.
        kept = column.where(column.isin(levels))
        as_category = kept.astype(pd.api.types.CategoricalDtype(categories=levels))
        return pd.get_dummies(as_category, prefix=prefix)

    new_df['is_default'] = df['loan_status'].apply(is_default)

    ##########################################################
    # Creating Features

    # Per Lending Club, any Debt-to-Income Ratio > 20 is 'high'
    new_df['high_dti'] = (df['dti'].astype(float) > 20).astype(int)
    new_df['dti'] = df['dti']

    # Log Annual Income (+1 keeps a zero income finite)
    new_df['annual_inc_log'] = np.log10(df['annual_inc'] + 1)

    # Revolving line utilization rate, or the amount of credit
    # the borrower is using relative to all available revolving credit.
    # Stored as text such as "50%"; missing values end up as 0.
    new_df['revol_util'] = (df['revol_util']
                            .apply(lambda raw: float(str(raw).split('%')[0]) / 100)
                            .fillna(0)
                           )
    # revol_bal
    new_df['revol_bal_log'] = np.log10(df['revol_bal'] + 1)

    # Previous Delinquency: 1 when mths_since_last_delinq is populated
    new_df['previous_delinq'] = df['mths_since_last_delinq'].notnull().astype(int)

    # Homeownership - OHE
    home_ohe = one_hot(df['home_ownership'], home_levels, 'home')
    new_df = pd.concat([new_df, home_ohe], axis=1)

    # Purpose - OHE
    purpose_ohe = one_hot(df['purpose'], purpose_levels, 'purpose')
    new_df = pd.concat([new_df, purpose_ohe], axis=1)

    # Employment History Provided?
    new_df['emp_length'] = df['emp_length'].notna().astype(int)

    # Employment Title Provided?
    new_df['emp_title'] = df['emp_title'].notna().astype(int)

    # Term: the leading integer of the text (e.g. " 60 months") compared to 36
    new_df['term_60m'] = df['term'].apply(lambda text: 1 if int(text.split()[0]) > 36 else 0)

    return new_df

In [6]:
def log_interaction_terms(df):
    """Return a frame of pairwise products between selected feature columns.

    Reads ``annual_inc_log``, ``dti``, ``revol_util`` and ``revol_bal_log``
    from ``df`` and returns a new frame containing only the four product
    columns; ``df`` itself is not modified.
    """
    # (output column, left factor, right factor)
    products = (
        ('int_inc_dti', 'annual_inc_log', 'dti'),
        ('int_inc_revol_util', 'annual_inc_log', 'revol_util'),
        ('int_dti_revol_util', 'dti', 'revol_util'),
        ('int_inc_revol_bal_log', 'annual_inc_log', 'revol_bal_log'),
    )

    interactions = pd.DataFrame()
    for target, left, right in products:
        interactions[target] = df[left] * df[right]

    return interactions

# Creating Test / Train Feature Sets

In [7]:
# Run the same feature pipeline over the raw training frame, then the raw test frame.
df_train, df_test = map(df_to_features, (training, test))

# Create Results DataFrames

In [8]:
# AUC metric
def calculate_auc(y, y_pred):
    """Return the area under the ROC curve of scores ``y_pred`` against labels ``y``."""
    # roc_curve yields (fpr, tpr, thresholds); only the two rates feed auc().
    false_pos_rate, true_pos_rate, _ = roc_curve(y, y_pred)
    return auc(false_pos_rate, true_pos_rate)

def pred_results(name, actual, pred, thresholds=(0.25, 0.50, 0.75)):
    """Summarise one model's predictions as a single-row DataFrame.

    Parameters
    ----------
    name : str
        Row label (index value) for the returned frame.
    actual : sequence or pandas.Series
        Observed outcomes; the mean of this column is reported as the
        default rate, so it is expected to hold 0/1 values.
    pred : sequence
        Predicted scores, one per entry of ``actual``.
    thresholds : iterable of float, optional
        Score cut-offs to report on. Defaults to 0.25, 0.50 and 0.75, which
        reproduces the original column set.

    Returns
    -------
    pandas.DataFrame
        One row indexed by ``name`` with an ``auc`` column and, for each
        threshold ``t`` (shown as a whole percent ``NN``):
        ``pct_preds_greater_NNpct`` - share of predictions strictly above t;
        ``defaultRate_preds_greater_NNpct`` - mean of ``actual`` among them.
        Missing values (e.g. no prediction exceeds a threshold) are reported
        as 0.
    """
    # Reuse the module-level AUC helper rather than shadowing it with a
    # nested duplicate definition.
    auc_score = calculate_auc(actual, pred)

    combined = pd.DataFrame({'actual': actual, 'pred': pred})

    columns = ['auc']
    values = [auc_score]
    for threshold in thresholds:
        above = combined['pred'] > threshold
        pct_label = int(round(threshold * 100))
        columns.append('pct_preds_greater_%dpct' % pct_label)
        values.append(above.mean())
        columns.append('defaultRate_preds_greater_%dpct' % pct_label)
        # Mean over an empty selection is NaN; fillna(0) below maps it to 0.
        values.append(combined.loc[above, 'actual'].mean())

    summary = pd.DataFrame([values], index=[name], columns=columns)

    return summary.fillna(0)

# Modeling

## Naive Baseline

Predict the training-set default rate for every loan in the test set.

In [9]:
# Constant-score baseline: every test loan receives the training-set default rate.
baseline_default_rate = df_train['is_default'].mean()

y_pred = [baseline_default_rate] * len(df_test['is_default'])

auc_score = calculate_auc(df_test['is_default'], y_pred)
print("Baseline Default Rate AUC: %s" % auc_score)

results_baseline = pred_results(
    'avg_default_baseline',
    df_test['is_default'],
    y_pred,
)

Baseline Default Rate AUC: 0.5


## Random Guess Baseline

In [10]:
# Draw the coin flips from a dedicated, seeded generator so this baseline's
# AUC is identical on every run (the unseeded module-level generator gave a
# different number each time the notebook was executed).
random_baseline_rng = random.Random(101010)
y_pred = [ random_baseline_rng.choice([0.0, 1.0]) for _ in range(len(df_test['is_default'])) ]
auc_score = calculate_auc(df_test['is_default'], y_pred)
print("random-guess AUC: %s" % auc_score)

results_random = pred_results('random_baseline', df_test['is_default'], y_pred)

random-guess AUC: 0.456442136643


## Logistic Regression

In [11]:
# Fit a model
prediction_col = 'is_default'
feature_cols = [col for col in df_train if col != prediction_col]


# Fit a model
seed =101010
log_classifier = LogisticRegression(random_state = seed)
log_classifier.fit(df_train[feature_cols],df_train[prediction_col] )


# Annual income results
test_preds = log_classifier.predict_proba(df_test[feature_cols])
auc_score = calculate_auc(df_test[prediction_col].values, test_preds[:,1])
print("annual_inc-model AUC: %s" % auc_score)

results_logistic = pred_results('basic_logistic_regression', df_test['is_default'], test_preds[:,1])

annual_inc-model AUC: 0.695437380867


## Logistic Regression with Interaction Terms

In [12]:
train_int = log_interaction_terms(df_train)
test_int = log_interaction_terms(df_test)

# Pin the target and feature list here instead of inheriting them from
# whichever cell ran last: another cell rebinds feature_cols to a limited
# list, which would silently change this model if cells run out of order.
prediction_col = 'is_default'
feature_cols = [col for col in df_train if col != prediction_col]

# Fit a model on all base features plus the interaction terms
seed =101010
log_classifier = LogisticRegression(random_state = seed)
log_classifier.fit(pd.concat([df_train[feature_cols],train_int], axis=1)
                   ,df_train[prediction_col] )


# Hold-out results; column 1 of predict_proba is used as the default score.
test_preds = log_classifier.predict_proba(pd.concat([df_test[feature_cols],test_int], axis=1))
auc_score = calculate_auc(df_test[prediction_col].values, test_preds[:,1])
# Label matches the row name below (previously mislabelled "annual_inc-model").
print("logistic_regression_interaction AUC: %s" % auc_score)

results_logistic_interaction = pred_results('logistic_regression_interaction', df_test['is_default'], test_preds[:,1])

annual_inc-model AUC: 0.697992311884


## Logistic Regression Limited Variables -- with Interaction

In [13]:
train_int = log_interaction_terms(df_train)
test_int = log_interaction_terms(df_test)

# Restrict the base features to a hand-picked subset; the interaction terms
# are appended on top of these.
feature_cols = ['annual_inc_log', 'dti','high_dti', 'revol_util', 'revol_bal_log', 'term_60m']

# Fit a model
seed =101010
log_classifier = LogisticRegression(random_state = seed)
log_classifier.fit(pd.concat([df_train[feature_cols],train_int], axis=1)
                   ,df_train[prediction_col] )


# Hold-out results; column 1 of predict_proba is used as the default score.
test_preds = log_classifier.predict_proba(pd.concat([df_test[feature_cols],test_int], axis=1))
auc_score = calculate_auc(df_test[prediction_col].values, test_preds[:,1])
# Label matches the row name below (previously mislabelled "annual_inc-model").
print("logistic_limited_interaction AUC: %s" % auc_score)

results_logistic_limited_interaction = pred_results('logistic_limited_interaction', df_test['is_default'], test_preds[:,1])

annual_inc-model AUC: 0.678010843085


## AdaBoost

In [14]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

prediction_col = 'is_default'
feature_cols = [col for col in df_train if col != prediction_col]

# Fix the seed (same value the logistic-regression cells use) so the boosted
# trees, and therefore the reported AUC, are reproducible across runs.
adaboost_seed = 101010
adaboost_model = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=5),
    n_estimators=400,
    learning_rate=1,
    random_state=adaboost_seed)

adaboost_model.fit(df_train[feature_cols],df_train[prediction_col])

# Column 1 of predict_proba is used as the default score.
ada_preds = adaboost_model.predict_proba(df_test[feature_cols])[:,1]

auc_score = calculate_auc(df_test[prediction_col].values, ada_preds)
# Share of test loans the model labels as default (mean of the 0/1 predictions).
ada_default_rate = adaboost_model.predict(df_test[feature_cols]).mean()
print("AdaBoost AUC: %s" % auc_score)
print("AdaBoost Predicted Default Rate: %s" % ada_default_rate)

results_adaboost = pred_results('adaboost', df_test['is_default'], ada_preds)

AdaBoost AUC: 0.869721841162
AdaBoost Predicted Default Rate: 0.0882620564149


# Final Comparison of Models

Validation done on hold-out test set

In [15]:
# Stack the one-row summaries (baselines first, then the fitted models) into
# a single comparison table, one model per row.
model_results = pd.concat(
    (
        results_baseline,
        results_random,
        results_logistic,
        results_logistic_interaction,
        results_logistic_limited_interaction,
        results_adaboost,
    ),
    axis=0,
)

model_results

Unnamed: 0,auc,pct_preds_greater_25pct,defaultRate_preds_greater_25pct,pct_preds_greater_50pct,defaultRate_preds_greater_50pct,pct_preds_greater_75pct,defaultRate_preds_greater_75pct
avg_default_baseline,0.5,0.0,0.0,0.0,0.0,0.0,0.0
random_baseline,0.456442,0.509099,0.09294,0.509099,0.09294,0.509099,0.09294
basic_logistic_regression,0.695437,0.071884,0.310127,0.003185,0.428571,0.0,0.0
logistic_regression_interaction,0.697992,0.072793,0.29375,0.00364,0.375,0.0,0.0
logistic_limited_interaction,0.678011,0.069609,0.313725,0.000455,0.0,0.0,0.0
adaboost,0.869722,1.0,0.109645,0.088262,0.871134,0.0,0.0


# Conclusions

The current models simply assess, given the current loan status, whether someone will default.  In my mind, this is interesting but not that practical.  Specifically, if the model determines that a user is going to default, there's not much to do because the loan has already been issued.  Lending Club could perhaps do outreach to try to prevent the user from defaulting, but if the user simply doesn't have the money, not much can be done.

A better model would be to predict if someone is going to default before issuing the loan.

To improve the model, it would be interesting to look at all of the loans at the same stage (e.g., 6 months after issuance).  This would help improve the predictive power of the model by normalizing the loan states.

##  Model Validation

Out of the box, the logistic regression models with the simple features seem to do pretty well.  They all have AUCs in the same range (roughly 0.68–0.70), which is considerably better than the naive / random models.  Also, a higher predicted default probability seems to correlate with a higher actual default rate. This is a good secondary metric for assessing model quality.

The AdaBoost model performs really well in terms of AUC (~87%), but the predicted probabilities appear to be on a different scale than the other models.  For example, the high-risk users may have a probability >50% for AdaBoost but >25% for the logistic regressions.  This isn't bad, but just something to keep in mind when comparing results between the two.


## Additional Feature Engineering Ideas
Here are a few ideas on additional feature engineering:
1. Include features on the month data
2. Up-sample the training default rate to give more signal
3. Create different models based on different income levels / loan sizes (i.e. use a more hierarchical approach)
4. Dive into the variable importances of the various models to better finetune the performance.

## Outstanding questions

With these models, I ended up not using any time variables because it wasn't clear to me exactly what they represent.  For example, it's not clear whether all of the loans are currently outstanding with the user making payments, or whether some of the older loans are paid off.  This distinction is important when looking at a feature like 'months since last payment'. I would need to be 100% sure of when the next payment is due to use this variable.

