In [22]:
import pandas as pd
# Load the pre-cleaned loan data; every later cell reads from `loans`.
loans = pd.read_csv('cleaned_loans_2007.csv')
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line after the summary.
loans.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39724 entries, 0 to 39723
Data columns (total 38 columns):
loan_amnt                              39724 non-null float64
int_rate                               39724 non-null float64
installment                            39724 non-null float64
emp_length                             39724 non-null int64
annual_inc                             39724 non-null float64
loan_status                            39724 non-null int64
dti                                    39724 non-null float64
delinq_2yrs                            39724 non-null float64
inq_last_6mths                         39724 non-null float64
open_acc                               39724 non-null float64
pub_rec                                39724 non-null float64
revol_bal                              39724 non-null float64
revol_util                             39724 non-null float64
total_acc                              39724 non-null float64
home_ownership_MORTGAGE    

In [6]:
#Set up a function to calculate True/False positive rate

def tpr_fpr(predictions, loans):
    """Print the true positive rate and false positive rate of `predictions`.

    Each prediction is compared with the entry of ``loans["loan_status"]`` at
    the same position (index labels are ignored). A label of 1 is the positive
    class and 0 is the negative class; any other value is counted in neither.

    Parameters
    ----------
    predictions : array-like
        Predicted 0/1 labels, one per row of `loans` (list, NumPy array or
        pandas Series).
    loans : object indexable by "loan_status"
        Holds the actual 0/1 labels, e.g. the loans DataFrame.

    Returns
    -------
    None
        The rates are only printed.

    Raises
    ------
    ValueError
        If `predictions` and ``loans["loan_status"]`` differ in length.
    """
    # Work on raw arrays so the comparison is positional. Combining two Series
    # with different indexes would align on labels and silently drop matches.
    predicted = pd.Series(predictions).values
    actual = pd.Series(loans["loan_status"]).values
    if len(predicted) != len(actual):
        raise ValueError(
            "predictions has {} entries but loan_status has {}".format(
                len(predicted), len(actual)
            )
        )

    tn = int(((predicted == 0) & (actual == 0)).sum())
    tp = int(((predicted == 1) & (actual == 1)).sum())
    fn = int(((predicted == 0) & (actual == 1)).sum())
    fp = int(((predicted == 1) & (actual == 0)).sum())

    # A rate is undefined when its class never occurs in the actual labels;
    # report nan instead of raising ZeroDivisionError.
    fpr = fp / (fp + tn) if (fp + tn) else float("nan")
    tpr = tp / (tp + fn) if (tp + fn) else float("nan")

    print("True Positve Rate: {}, False Postiive Rate: {}.".format(tpr, fpr))

In [8]:
#First try a basic logistic regression model.

from sklearn.linear_model import LogisticRegression

# Separate the label column from the predictors.
target = loans['loan_status']
features = loans.drop('loan_status', axis=1)

# Baseline: fit on every row, then score those same rows (no hold-out yet).
lr = LogisticRegression()
predictions = lr.fit(features, target).predict(features)

tpr_fpr(predictions, loans)

#This model will overfit, because we are training and testing on the same data.

True Positve Rate: 0.999354327473366, False Postiive Rate: 0.9982304016988144.


In [10]:
#Now try using cross_val_predict

from sklearn.linear_model import LogisticRegression

# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20. Prefer sklearn.model_selection and fall back only on old installs.
try:
    from sklearn.model_selection import cross_val_predict, KFold
    # Three contiguous, unshuffled folds: the same split the legacy
    # KFold(n, random_state=1) produced (its n_folds default was 3, and
    # random_state has no effect without shuffle=True).
    kf = KFold(n_splits=3)
except ImportError:
    from sklearn.cross_validation import cross_val_predict, KFold
    kf = KFold(features.shape[0], random_state=1)

# Out-of-fold predictions: each row is scored by a model that never saw it.
lr = LogisticRegression()
predictions = cross_val_predict(lr, features, y=target, cv=kf)
predictions = pd.Series(predictions)

tpr_fpr(predictions,loans)

#We still see that true/false positive rates are close to 1.  This is due to the class imbalance in the dataset.  
#There is approximately a 6:1 ratio of Fully Paid: Charged Off in the dataset.

True Positve Rate: 0.9991488862148915, False Postiive Rate: 0.9989382410192886.


In [11]:
#Next, try using a class weight of "balanced"

# Fold assignment is deterministic (no shuffling), so reuse the `kf` splitter
# built in the first cross-validation cell instead of constructing an
# identical one here.
lr = LogisticRegression(class_weight='balanced')
predictions = pd.Series(cross_val_predict(lr, features, y=target, cv=kf))

tpr_fpr(predictions,loans)

#We reduce the false positive rate to approximately 0.4, but also reduce the true positive rate to 0.66.  
#This reduces our number of possible loans to invest in, but also lowers the risk. 

True Positve Rate: 0.6594664397029907, False Postiive Rate: 0.3935586621836843.


In [12]:
#To reduce the false positive rate further, we can assign a harsher penalty to the bad loans 
#by changing the class_weight parameter.


# Misclassifying a bad loan (class 0) is weighted ten times as heavily as
# misclassifying a good loan (class 1).
penalty = {
    0: 10,
    1: 1
}

# Fold assignment is deterministic (no shuffling), so reuse the `kf` splitter
# built in the first cross-validation cell instead of constructing an
# identical one here.
lr = LogisticRegression(class_weight=penalty)
predictions = pd.Series(cross_val_predict(lr, features, y=target, cv=kf))

tpr_fpr(predictions,loans)


#The false positive rate has been reduced to approximately 8%, but now we are rejecting more than 3/4 of the good loans.
#As a conservative investor, this might make sense, but it does eliminate the possibility to make money on the other loans.
#However, this model is successful, because randomly choosing loans will lead to 14% of borrowers failing to repay the loan.

True Positve Rate: 0.2229624629472016, False Postiive Rate: 0.08228632100513184.


In [13]:
#We can try using a random forest classifier, which may increase accuracy because it can account for columns
#that correlate non-linearly with loan_status

from sklearn.ensemble import RandomForestClassifier

# Fold assignment is deterministic (no shuffling), so reuse the `kf` splitter
# built in the first cross-validation cell instead of constructing an
# identical one here.
rf = RandomForestClassifier(class_weight='balanced',random_state=1)
predictions = pd.Series(cross_val_predict(rf, features, y=target, cv=kf))

tpr_fpr(predictions,loans)

#Unfortunately this model did not reduce the false positive rate.  We can try adjusting some of the parameters.

True Positve Rate: 0.9766677427875444, False Postiive Rate: 0.9470890107945497.


In [14]:
#Try using a harsher penalty for bad loans.

# Misclassifying a bad loan (class 0) is weighted ten times as heavily as
# misclassifying a good loan (class 1).
penalty = {
    0: 10,
    1: 1
}

# Fold assignment is deterministic (no shuffling), so reuse the `kf` splitter
# built in the first cross-validation cell instead of constructing an
# identical one here.
rf = RandomForestClassifier(class_weight=penalty,random_state=1)
predictions = pd.Series(cross_val_predict(rf, features, y=target, cv=kf))

tpr_fpr(predictions,loans)

#This did not change the false positive rate much at all; it seems that the random forest classifier is largely
#insensitive to changes in the class weight.

True Positve Rate: 0.9789275966307633, False Postiive Rate: 0.9477968501150239.


In [20]:
#Try manipulating the max_depth

# One output line per max_depth value, 1 through 9, in order.
# The splitter does not depend on the loop variable and is deterministic, so
# the existing `kf` from the first cross-validation cell is reused rather than
# rebuilt on every iteration.
for depth in range(1, 10):
    rf = RandomForestClassifier(class_weight='balanced',random_state=1,max_depth=depth)
    predictions = pd.Series(cross_val_predict(rf, features, y=target, cv=kf))

    tpr_fpr(predictions,loans)


#Limiting the depth of the tree reduces the false positive rate at the expense of the true positive rate
#However, none of the models perform better than randomly choosing loans to fund.

True Positve Rate: 0.49602324421095884, False Postiive Rate: 0.2712794195717572.
True Positve Rate: 0.5454171924984592, False Postiive Rate: 0.3087949035568926.
True Positve Rate: 0.634842837437267, False Postiive Rate: 0.3820562732259777.
True Positve Rate: 0.5995949872332932, False Postiive Rate: 0.350734383294992.
True Positve Rate: 0.6216358993924809, False Postiive Rate: 0.36825340647672977.
True Positve Rate: 0.6577348633815631, False Postiive Rate: 0.3970978587860556.
True Positve Rate: 0.6847944119977695, False Postiive Rate: 0.4347903026013095.
True Positve Rate: 0.7029025914947319, False Postiive Rate: 0.4622190762696868.
True Positve Rate: 0.7366243066357526, False Postiive Rate: 0.5153070253052557.


In [44]:
#Try manipulating min_samples_split
# One output line per min_samples_split value: 5, 10, ..., 45, in order.
# The splitter does not depend on the loop variable and is deterministic, so
# the existing `kf` from the first cross-validation cell is reused rather than
# rebuilt on every iteration.
for min_split in range(5, 50, 5):
    rf = RandomForestClassifier(class_weight='balanced',random_state=1,min_samples_split=min_split)
    predictions = pd.Series(cross_val_predict(rf, features, y=target, cv=kf))

    tpr_fpr(predictions,loans)

True Positve Rate: 0.9756698852463828, False Postiive Rate: 0.9417802158909927.
True Positve Rate: 0.9496375429225486, False Postiive Rate: 0.8957706600601664.
True Positve Rate: 0.9244563143838229, False Postiive Rate: 0.8322420810476022.
True Positve Rate: 0.9030610747512693, False Postiive Rate: 0.7885330030083171.
True Positve Rate: 0.8862442403075749, False Postiive Rate: 0.7554415147761459.
True Positve Rate: 0.867372993279136, False Postiive Rate: 0.719341709431959.
True Positve Rate: 0.8550465177706689, False Postiive Rate: 0.6961599716864272.
True Positve Rate: 0.8464179849147418, False Postiive Rate: 0.6754556715625553.
True Positve Rate: 0.8342676019135385, False Postiive Rate: 0.6611219253229517.


In [46]:
#Use a combination of penalty, min_samples_split, and max_depth.

# Misclassifying a bad loan (class 0) is weighted twelve times as heavily as
# misclassifying a good loan (class 1).
penalty = {
    0: 12,
    1: 1
}

# Fold assignment is deterministic (no shuffling), so reuse the `kf` splitter
# built in the first cross-validation cell instead of constructing an
# identical one here.
rf = RandomForestClassifier(class_weight=penalty,random_state=1,min_samples_split=20,max_depth=5)
predictions = pd.Series(cross_val_predict(rf, features, y=target, cv=kf))

tpr_fpr(predictions,loans)

#Using a combination of penalty, min_samples_split, and max_depth, we can find a model using the 
#random forest classifier that has both a lower false positive rate and higher true positive 
#rate than the logistic regression model.

True Positve Rate: 0.23825316232794294, False Postiive Rate: 0.07591576712086356.
