# Lending Club Prediction Models

The following was inspired by a very good yhat blog post that created a prediction model for Lending Club loans using R:
http://blog.yhathq.com/posts/machine-learning-for-predicting-bad-loans.html

The point of this exercise was to translate the yhat post into Python as closely as possible and to use scikit-learn to evaluate several machine learning models.

In [None]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

import model_eval

# Turn off pandas' chained-assignment (SettingWithCopy) warning; later cells
# assign new columns into frames that were produced by filtering.
pd.options.mode.chained_assignment = None

# Apply seaborn's default theme and remap matplotlib's one-letter colour
# codes ("b", "g", ...) to the seaborn palette.
sns.set(color_codes=True)

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

## Data Acquisition
Lending Club helpfully publishes anonymized loan data in csv format, which we'll use for this analysis.
You need a Lending Club login to access the data, which is available here:
https://www.lendingclub.com/info/download-data.action

I've copied the files used in this analysis to a public Dropbox folder here:
https://dl.dropboxusercontent.com/u/12406727/Data/

To start, we'll load the files into a single pandas data frame:

In [None]:
# file_base_url = "https://dl.dropboxusercontent.com/u/12406727/Data/{0}"
file_base_url = '~/Dropbox/Public/Data/{0}'

files = ['LoanStats3a_securev1.csv', 
         'LoanStats3b_securev1.csv', 
         'LoanStats3c_securev1.csv', 
         'LoanStats3d_securev1.csv']

df = pd.DataFrame()

for f in files:
    d = pd.read_csv(file_base_url.format(f), low_memory=False, 
                          index_col='id', header=1, parse_dates=['issue_d'])
    print len(d)
    df = df.append(d)

In [None]:
print 'Record:', len(df)
df.describe()

## Data Prep & Cleanup
We'll need to do a bit of data cleanup and we'll follow the yhat post pretty closely here.

In [None]:
# Remove two columns from the raw frame in place before any filtering.
for unused_col in ('desc', 'mths_since_last_record'):
    del df[unused_col]

### Drop columns with more than 20% NA values
A column is kept only if at least 80% of its values are present. The `member_id > 0` filter also drops any totals rows that might have snuck in.

In [None]:
# Keep only rows with a positive member id, then keep only columns whose
# non-NA count reaches 80% of the number of non-null member ids in df.
has_member = df.member_id > 0
min_non_na = df.member_id.count()*.80
data = df[has_member].dropna(axis=1, thresh=min_non_na)

In [None]:
# Evaluate the filtered frame; the trailing semicolon keeps the notebook from
# displaying it.
data;

In [None]:
print len(data)
data.describe()

### Tag bad loans

In [None]:
# Values of loan_status that mark a loan as "bad" when tagging is_bad.
bad_indicators = [
    'Late (16-30 days)',
    'Late (31-120 days)',
    'Default',
    'Charged Off',
]

In [None]:
# Build the row masks up front, then fill is_bad in three passes:
# default False, True for the "bad" statuses, NaN where the status is blank.
is_bad_status = data['loan_status'].isin(bad_indicators)
is_blank_status = data['loan_status'] ==""

data.loc[:, 'is_bad'] = False
data.loc[is_bad_status, 'is_bad'] = True
data.loc[is_blank_status, 'is_bad'] = np.nan

In [None]:
# how many bad loans overall?
print 'Total loans:\t', data.member_id.count()
print 'Bad loans:\t', data[data['is_bad']==True].member_id.count()
print 'Bad loan %:\t', data[data['is_bad']==True].member_id.count()*1./data.member_id.count()*1.

## Data Type Cleanup

In [None]:
def convert_date(x):
    """Parse a month-year string such as ``'Dec-2011'`` into a datetime.

    Parameters
    ----------
    x : str
        Text in ``'%b-%Y'`` form (abbreviated month name, four-digit year).

    Returns
    -------
    datetime.datetime
        The first day of the given month, at midnight.

    Raises
    ------
    ValueError
        If ``x`` does not match the expected format.
    TypeError
        If ``x`` is not a string.
    """
    try:
        # Use the stdlib class directly: pd.datetime was only an alias for it
        # and is no longer available in current pandas releases.
        return datetime.datetime.strptime(x, '%b-%Y')
    except (TypeError, ValueError):
        # Print the offending value so bad rows are easy to find, then let
        # the original error propagate.
        print(x)
        raise
        
# sometimes this is necessary, sometimes not...probably depends on the version of pandas installed
# dateparse = lambda x: convert_date(x)
# data.loc[:,'issue_d'] = data.issue_d.map(dateparse)

### Dates

In [None]:
# Parse the date columns and derive the issue year/month used for the
# per-year views later in the notebook.
data.loc[:,'issue_d'] = pd.to_datetime(data.issue_d)
data.loc[:,'year_issued'] = data.issue_d.dt.year
data.loc[:,'month_issued'] = data.issue_d.dt.month
data.loc[:,'earliest_cr_line'] = pd.to_datetime(data.earliest_cr_line)
# revol_util is stored as text containing a percent sign. Strip it with a
# plain '%' pattern rather than the character class "[%]": the plain form
# gives the same result whether pandas treats the pattern as a regex or as a
# literal string.
data.loc[:,'revol_util'] = data['revol_util'].str.replace('%', '').astype(float)

### Categorical Features

In [None]:
# Store home_ownership as a categorical and derive a boolean renter flag.
data['home_ownership'] = data['home_ownership'].astype('category')
renters = data['home_ownership'].isin(['RENT'])
data.loc[:, 'is_rent'] = False
data.loc[renters, 'is_rent'] = True

# Two views of the upper FICO bound: a categorical and a plain float.
fico_high = data.fico_range_high
data.loc[:, 'fico_range'] = fico_high.astype('category')
data.loc[:, 'fico_ordered'] = fico_high.astype('float')

In [None]:
# Count of non-null member ids per fico_ordered value; the trailing semicolon
# suppresses the notebook display.
data.groupby('fico_ordered').member_id.count();

In [None]:
# Number of loans in each loan_status, broken down by issue year.
pd.crosstab(data.year_issued, data.loan_status)

In [None]:
# Share of bad loans per issue year: sum of is_bad over the count of
# non-null is_bad values, plotted as a line.
by_year = data.groupby('year_issued').is_bad
bad_rate = by_year.sum()/by_year.count()
pd.DataFrame(bad_rate).plot()

### Get only mature loans to make sure they had enough time to be paid off

In [None]:
# Restrict the analysis to loans issued in 2012 or earlier.
issued_by_2012 = data['year_issued'] <= 2012
mature_loans = data[issued_by_2012]

In [None]:
print 'Total loans:\t', len(mature_loans)
print 'Bad loans:\t', len(mature_loans[mature_loans['is_bad']==True])
print 'Bad loan %:\t', len(mature_loans[mature_loans['is_bad']==True])*1./len(mature_loans)*1.

In [None]:
# Share of bad loans per issue year, restricted to the mature loans.
mature_by_year = mature_loans.groupby('year_issued').is_bad
mature_bad_rate = mature_by_year.sum()/mature_by_year.count()
pd.DataFrame(mature_bad_rate).plot()

In [None]:
# (rows, columns) of the mature-loan subset.
mature_loans.shape

In [None]:
# Columns used as model inputs, in the order they appear in X.
# NOTE(review): several of these (out_prncp*, total_pymnt*, total_rec_*,
# recoveries, collection_recovery_fee, last_pymnt_amnt, last_fico_range_*)
# look like they are recorded after the loan was issued, which would leak
# the outcome into the features -- confirm against the Lending Club data
# dictionary.
feature_cols = [
    'loan_amnt',
    'annual_inc',
    'fico_range_low',
    'fico_range_high',
    'inq_last_6mths',
    'open_acc',
    'pub_rec',
    'revol_bal',
    'total_acc',
    'out_prncp',
    'out_prncp_inv',
    'total_pymnt',
    'total_pymnt_inv',
    'total_rec_prncp',
    'total_rec_int',
    'total_rec_late_fee',
    'recoveries',
    'collection_recovery_fee',
    'last_pymnt_amnt',
    'last_fico_range_high',
    'last_fico_range_low',
    'is_rent',
]

# Kept as a one-element list, so selecting with it yields a one-column frame.
label = ['is_bad']



In [None]:
X = mature_loans[feature_cols].fillna(0)
y = mature_loans[label].fillna(0)#.values
print 'X:', X.shape
print 'y:', y.shape


In [None]:
# Summary statistics of the model inputs after NA filling.
X.describe()

### Scale numeric columns
We'll standardize every feature column by subtracting its mean and dividing by its standard deviation.

In [None]:
from sklearn.preprocessing import StandardScaler

stdsc = StandardScaler()

# fit_transform returns a bare array, so re-attach both the column names and
# X's row index. Without index=X.index the scaled frame would get a fresh
# 0..n-1 index and no longer line up with y by label.
# NOTE(review): the scaler is fit on all rows, before any train/test split.
X_scaled = pd.DataFrame(stdsc.fit_transform(X), columns=feature_cols, index=X.index)

## Prediction Modeling
We'll use scikit-learn for all models, each designed to predict whether a loan will be 'bad'.

In [None]:
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import roc_curve, auc

from sklearn import metrics

import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression

### Let's try some basic logistic regression first

In [None]:
%%time
# Logistic regression on the scaled features, run through the shared
# model_eval helper. The helper hands back the model, a dict describing the
# data split (read below via keys such as 'y_test', 'y_pred' and 'X_test'),
# and the scores later passed to plot_roc.
log = LogisticRegression(C=1e3, penalty='l2') # params learned from GridSearchCV
log, log_data_split, log_y_score = model_eval.run_prediction(X_scaled, y, log)


In [None]:
%%time
model = LogisticRegression(C=1e3, penalty='l2') # params learned from GridSearchCV
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=.25)
    
#fit on training data
fit = model.fit(X_train, y_train)

# predict on test data
y_pred = model.predict(X_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
print "Accuracy of model:\t", accuracy
print 'MSE\t', metrics.mean_squared_error(y_test, y_pred)
print 'RMSE\t', np.sqrt(metrics.mean_squared_error(y_test, y_pred))

score_train = model.score(X_train, y_train)
score_test = model.score(X_test, y_test)

print '\n'
print '-----------------------------------------'
print 'Scores:'
print '-----------------------------------------'
print 'Train\t', score_train
print 'Test\t', score_test
print '-----------------------------------------\n'

In [None]:
scores = cross_val_score(model, X_scaled, y['is_bad'].values, cv=10, scoring='accuracy', n_jobs=1, verbose=1)
score_cross_val_mean = scores.mean()
print 'Mean Cross-Val Score:', score_cross_val_mean

In [None]:
# Pull the test labels and predictions out of the split returned by
# run_prediction for the logistic regression.
log_y_test = log_data_split['y_test']
log_y_pred = log_data_split['y_pred']

# The helper returns two matrices; only the second (cm_norm, presumably the
# normalised one -- confirm in model_eval) is plotted.
cm, cm_norm = model_eval.confusion_matrix(log_y_test, log_y_pred)

# print cm_norm
model_eval.plot_confusion_matrix(cm_norm)
# test_score = model_eval.model_metrics(log, X, y, log_data_split)


In [None]:
print '\n'
print 'Coefficients for each X:\n'
coeff = pd.DataFrame(zip(feature_cols, log.coef_[0]), columns=['Feature', 'Weight'])
coeff.sort_values(by='Weight', ascending=False)

In [None]:
# Class probabilities for the test rows; column 1 is the probability of the
# second class in log.classes_ (the "bad" class for a False/True target).
p = log.predict_proba(log_data_split['X_test'])
model_eval.plot_histogram(p[:,1])

## Parameter tuning

In [None]:
# ROC curve for the logistic regression, from the split's y_test labels and
# the scores returned by run_prediction.
model_eval.plot_roc(log_data_split['y_test'], log_y_score)

In [None]:
from sklearn.grid_search import GridSearchCV

C_range = 10.0 ** np.arange(-2, 9)

param_grid = [{
              'C': C_range, 
                'penalty': ['l1', 'l2']
             }]

log = LogisticRegression()
print X_scaled.shape
# print y[:,0].values.shape
print param_grid

In [None]:
 # Labels as a flat 1-D array -- the form passed to GridSearchCV.fit below.
 y['is_bad'].values

In [None]:
# The same labels taken from the whole one-column frame: a 2-D array with a
# single column rather than a flat 1-D array.
y.values

In [None]:
# Search every combination in param_grid, scored by accuracy, on plain
# arrays of the scaled features and the 1-D labels.
grid = GridSearchCV(log, param_grid, scoring='accuracy', n_jobs = 1)
grid_features = X_scaled.values
grid_labels = y['is_bad'].values
grid.fit(grid_features, grid_labels)

In [None]:
print 'best params', grid.best_params_
print 'best params', grid.best_estimator_
print 'best params', grid.best_score_

grid_mean_scores = [result.mean_validation_score for result in grid.grid_scores_]
print grid_mean_scores

### K Neighbors

In [None]:
%%time

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)
knn, knn_data_split, knn_y_score = run_prediction(X, y, knn)

knn_y_test = knn_data_split['y_test']
knn_y_pred = knn_data_split['y_pred']

confusion_matrix(knn_y_test, knn_y_pred)

model_metrics(knn, X, y, knn_data_split)
# scores = cross_validation(knn, X, y, n_jobs=5)

In [None]:
# Histogram of the k-NN probabilities for the second class on the test rows.
# plot_histogram is called through model_eval, the only place it is
# available from.
p = knn.predict_proba(knn_data_split['X_test'])
model_eval.plot_histogram(p[:,1])

#### Using GridSearch to test n_neighbors parameters

In [None]:
# %%time
# from sklearn.grid_search import GridSearchCV

# n_range = range(1,11)
# param_grid = dict(
#               n_neighbors=n_range
#              )
# knn = KNeighborsClassifier()

# grid = GridSearchCV(knn, param_grid, scoring='accuracy', n_jobs = 1)
# grid.fit(X, y);

In [None]:
# ROC curve for the k-NN model; plot_roc is called through model_eval, the
# only place it is available from.
model_eval.plot_roc(knn_data_split['y_test'], knn_y_score)

In [None]:
# grid_mean_scores = [result.mean_validation_score for result in grid.grid_scores_]
# print grid_mean_scores
# print 'best params', grid.best_params_
# print 'best params', grid.best_estimator_
# print 'best params', grid.best_score_

In [None]:
# plt.plot(n_range, score grid.grid_scores_)
# plt.xlabel('Value of N for n_neighbors')
# plt.ylabel('Cross-Validated Accuracy')

### Random Forest

In [None]:
%%time

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_jobs=5)
print rf

rf, rf_data_split, rf_y_score = run_prediction(X, y, rf)

rf_y_test = rf_data_split['y_test']
rf_y_pred = rf_data_split['y_pred']

confusion_matrix(rf_y_test, rf_y_pred)

model_metrics(log, X, y, rf_data_split)

scores = cross_validation(rf, X, y, n_jobs=5)

In [None]:
# 0.977352599188  (value recorded by the original author; presumably a score
# from an earlier run)
# Histogram of the random-forest probabilities for the second class.
p = rf.predict_proba(rf_data_split['X_test'])
model_eval.plot_histogram(p[:,1])

In [None]:
# plot_roc(rf_data_split['y_test'], rf_y_score)
print pd.DataFrame(zip(cols, rf.feature_importances_)).sort(1, ascending=False)

In [None]:
# Total `recoveries` for each is_bad value -- presumably a check on how
# closely this feature tracks the label.
data.groupby("is_bad")["recoveries"].sum()

## Gradient Boosted Regression Tree

In [None]:
%%time

from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(n_estimators=3000, max_features=1.0, learning_rate=0.01, 
                                    max_depth=4, min_samples_leaf=5)

gb, data_split, y_score = model_eval.run_prediction(X_scaled, y, gb)

y_test = data_split['y_test']
y_pred = data_split['y_pred']

model_eval.confusion_matrix(y_test, y_pred)

model_eval.model_metrics(log, X, y, data_split)
# scores = cross_validation(log, X, y, n_jobs=5)

In [None]:
print pd.DataFrame(zip(cols, gb.feature_importances_)).sort(1, ascending=False)

In [None]:
# Histogram of the boosted model's probabilities for the second class.
# plot_histogram is called through model_eval, the only place it is
# available from.
p = gb.predict_proba(data_split['X_test'])
model_eval.plot_histogram(p[:,1])

## ROC Curves

In [None]:
# ROC curve for the gradient-boosted model; plot_roc is called through
# model_eval, the only place it is available from.
model_eval.plot_roc(data_split['y_test'], y_score)

## Selecting Hyperparameters With Cross Validation


In [None]:
from sklearn.grid_search import GridSearchCV

# est_range = [100, 150, 200]
# depth_range = range(1,6)
# n_scores = []
# Every combination of these four settings is tried, each with 3000 trees.
param_grid = dict(
              learning_rate=[0.1, 0.05, 0.02, 0.01],
              max_depth=[4, 6],
              min_samples_leaf=[3, 5, 9, 17],
              max_features=[1.0, 0.3, 0.1]
             )
gb = GradientBoostingClassifier(n_estimators=3000)

grid = GridSearchCV(gb, param_grid, scoring='accuracy', n_jobs = 4)
# Pass the labels as a flat 1-D array, as the logistic-regression grid
# search does, rather than as the one-column DataFrame y.
grid.fit(X, y['is_bad'].values)

In [None]:
print 'best params', grid.best_params_
print 'best params', grid.best_estimator_
print 'best params', grid.best_score_

grid_mean_scores = [result.mean_validation_score for result in grid.grid_scores_]
print grid_mean_scores

In [None]:
# r = pd.DataFrame(grid.grid_scores_)

In [None]:
# res = zip([x['n_estimators'] for x in r[0]], [x['max_depth'] for x in r[0]], [x for x in r[1]])

In [None]:
# dr = pd.DataFrame(res)
# dr;

In [None]:
# # plot the results
# est_score = dr.groupby(dr[0]).mean()
# print est_score

# plt.plot(est_range, est_score[2])
# plt.xlabel('Value of N for n_estimators')
# plt.ylabel('Cross-Validated Accuracy')
# plt.ylim(dr[2].min(), dr[2].max())

In [None]:
# plot the results
md_score = dr.groupby(dr[1]).mean()
print md_score
plt.plot(depth_range, md_score[2])
plt.xlabel('Value of N for max_depth')
plt.ylabel('Cross-Validated Accuracy')
plt.ylim(dr[2].min(), dr[2].max())

In [None]:
# Mean-centre the per-depth scores and scale them by their own range
# (max - min). Both bounds come from md_score: `.xmax()` is not a pandas
# method and `est_score` only exists in a commented-out cell.
(md_score[2]-md_score[2].mean())/(md_score[2].max()-md_score[2].min())