In [60]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import ttest_ind
from sqlalchemy import create_engine
from scipy.stats.mstats import winsorize
from scipy.stats import boxcox
from scipy.stats import jarque_bera
from scipy.stats import normaltest
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import cross_val_score
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from statsmodels.tools.eval_measures import mse, rmse
from wordcloud import WordCloud
import statsmodels.api as sm

from sklearn import ensemble

import warnings

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's default plot styling.
sns.set()

# NOTE(review): this silences every warning for the rest of the session,
# including pandas / scikit-learn deprecation warnings — consider narrowing it.
warnings.filterwarnings('ignore')

In [19]:

# Location of the source CSV (Dropbox link).
LOAN_STATS_URL = 'https://www.dropbox.com/s/0so14yudedjmm5m/LoanStats3d.csv?dl=1'

# header=1 takes the column names from the second line of the file (the first
# line is skipped); skipinitialspace drops whitespace that follows a delimiter.
y2015 = pd.read_csv(LOAN_STATS_URL, header=1, skipinitialspace=True)

In [22]:
# Work on an independent copy of the raw frame. A plain assignment would only
# create a second name for the same object, so the in-place cleaning applied to
# `y2015_df` in later cells would also mutate `y2015`. With a copy, cleaning can
# be restarted from this cell without downloading the CSV again.
y2015_df = y2015.copy()
y2015_df.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
0,68009401,72868139.0,16000.0,16000.0,16000.0,60 months,14.85%,379.39,C,C5,...,0.0,2.0,78.9,0.0,0.0,2.0,298100.0,31329.0,281300.0,13400.0
1,68354783,73244544.0,9600.0,9600.0,9600.0,36 months,7.49%,298.58,A,A4,...,0.0,2.0,100.0,66.7,0.0,0.0,88635.0,55387.0,12500.0,75635.0
2,68466916,73356753.0,25000.0,25000.0,25000.0,36 months,7.49%,777.55,A,A4,...,0.0,0.0,100.0,20.0,0.0,0.0,373572.0,68056.0,38400.0,82117.0
3,68466961,73356799.0,28000.0,28000.0,28000.0,36 months,6.49%,858.05,A,A2,...,0.0,0.0,91.7,22.2,0.0,0.0,304003.0,74920.0,41500.0,42503.0
4,68495092,73384866.0,8650.0,8650.0,8650.0,36 months,19.89%,320.99,E,E3,...,0.0,12.0,100.0,50.0,1.0,0.0,38998.0,18926.0,2750.0,18248.0


In [23]:
# Convert ID and interest rate to numeric; values that cannot be parsed become NaN.
y2015_df['id'] = pd.to_numeric(y2015_df['id'], errors='coerce')
# Interest rates are stored as strings such as '14.85%', so strip the percent sign first.
y2015_df['int_rate'] = pd.to_numeric(y2015_df['int_rate'].str.strip('%'), errors='coerce')

# Drop other columns with many unique values.
# `columns=` replaces the positional axis argument (`drop(labels, 1, ...)`),
# which pandas deprecated in 1.3 and no longer accepts from 2.0 onwards.
y2015_df.drop(
    columns=['url', 'emp_title', 'zip_code', 'earliest_cr_line', 'revol_util',
             'sub_grade', 'addr_state', 'desc'],
    inplace=True,
)

In [24]:
# The last two rows of the file are summary lines rather than loan records; discard them.
# Note: this slice is positional, so re-running the cell trims two more rows each time.
y2015_df = y2015_df.iloc[:-2]

## Remove as much data as possible while keeping the average accuracy of a 10-fold cross-validation at or above 90%.

In [53]:
# Convert the target variable into numeric codes for the correlation matrix.
#
# Code 5 is deliberately absent: the earlier if/elif chain tested 'Fully Paid'
# twice, so the branch that assigned 5 could never run. The remaining codes are
# kept exactly as they were so anything computed from them is unchanged.
# NOTE(review): the codes are arbitrary labels, not an ordered scale — confirm
# that a linear correlation against them is what is intended.
LOAN_STATUS_CODES = {
    'Charged Off': 1,
    'Current': 2,
    'Default': 3,
    'Fully Paid': 4,
    'In Grace Period': 6,
    'Late (16-30 days)': 7,
}
# Any status not listed above (including missing values) falls back to this code.
OTHER_STATUS_CODE = 8

loan_status_numeric = [
    LOAN_STATUS_CODES.get(status, OTHER_STATUS_CODE)
    for status in y2015_df['loan_status']
]

In [58]:
# Attach the numeric status codes as a new column. The list is assigned by
# position, so it must contain exactly one entry per row of y2015_df.
y2015_df['loan_status_numeric'] = loan_status_numeric

In [59]:
# Rank features by the absolute value of their correlation with the encoded target.
# Only int64/float64 columns are considered, and the first of those is skipped by position.
numeric_features = y2015_df.select_dtypes(['int64', 'float64']).iloc[:, 1:]
target_corr = numeric_features.corr()['loan_status_numeric']
target_corr.abs().sort_values(ascending=False).head(20)


loan_status_numeric        1.000000
last_pymnt_amnt            0.433013
total_rec_prncp            0.359556
total_pymnt                0.304540
total_pymnt_inv            0.304522
out_prncp_inv              0.180147
out_prncp                  0.180105
collection_recovery_fee    0.150975
recoveries                 0.150363
acc_open_past_24mths       0.068256
dti_joint                  0.066225
inq_last_12m               0.066203
num_tl_op_past_12m         0.065039
open_acc_6m                0.061385
open_il_24m                0.059374
total_rec_int              0.056529
open_il_12m                0.056208
open_rv_24m                0.051654
member_id                  0.051370
annual_inc_joint           0.049166
Name: loan_status_numeric, dtype: float64

In [67]:
# Fit a random forest on the five features most correlated with the encoded
# target and report the mean accuracy over a 10-fold cross-validation.
# NOTE(review): these columns look like payment-history fields recorded after
# the loan was issued — confirm they would be available at prediction time.
FEATURE_COLUMNS = ['last_pymnt_amnt', 'total_rec_prncp', 'total_pymnt',
                   'total_pymnt_inv', 'out_prncp_inv']

# Fix the seed so the forest, and therefore the reported score, is reproducible.
rfc = ensemble.RandomForestClassifier(random_state=42)
X = y2015_df[FEATURE_COLUMNS]
Y = y2015_df['loan_status']
# Drops any feature *column* that contains a missing value (rows are kept).
X = X.dropna(axis=1)

cv_scores = cross_val_score(rfc, X, Y, cv=10)
print('The 10-fold cross validation average is ', cv_scores.mean())

The 10-fold cross validation average is  0.9293252489228732
