In [103]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from classifier import runclassifiers
from sklearn.linear_model import LogisticRegressionCV
import pickle
from sklearn.externals import joblib

In [104]:
# Read the raw loan records from disk and preview the first rows
# (head() defaults to five rows).
data = pd.read_csv('../data/loan.csv')
data.head()

Unnamed: 0,loan_amnt,term,int_rate,emp_length,home_ownership,annual_inc,purpose,addr_state,dti,delinq_2yrs,revol_util,total_acc,bad_loan,longest_credit_length,verification_status
0,5000,36 months,10.65,10.0,RENT,24000.0,credit_card,AZ,27.65,0.0,83.7,9.0,0,26.0,verified
1,2500,60 months,15.27,0.0,RENT,30000.0,car,GA,1.0,0.0,9.4,4.0,1,12.0,verified
2,2400,36 months,15.96,10.0,RENT,12252.0,small_business,IL,8.72,0.0,98.5,10.0,0,10.0,not verified
3,10000,36 months,13.49,10.0,RENT,49200.0,other,CA,20.0,0.0,21.0,37.0,0,15.0,verified
4,5000,36 months,7.9,3.0,RENT,36000.0,wedding,AZ,11.2,0.0,28.3,12.0,0,7.0,verified


In [105]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(163987, 15)

In [106]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns; a count below the row total indicates missing values in that column.
data.describe()

Unnamed: 0,loan_amnt,int_rate,emp_length,annual_inc,dti,delinq_2yrs,revol_util,total_acc,bad_loan,longest_credit_length
count,163987.0,163987.0,158183.0,163983.0,163987.0,163958.0,163794.0,163958.0,163987.0,163958.0
mean,13074.169141,13.715904,5.684353,71915.67,15.88153,0.227357,54.079173,24.579734,0.183039,14.854274
std,7993.556189,4.39194,3.610664,59070.92,7.587668,0.694168,25.285367,11.68519,0.3867,6.947733
min,500.0,5.42,0.0,1896.0,0.0,0.0,0.0,1.0,0.0,0.0
25%,7000.0,10.65,2.0,45000.0,10.23,0.0,35.6,16.0,0.0,10.0
50%,11325.0,13.49,6.0,61000.0,15.62,0.0,55.8,23.0,0.0,14.0
75%,18000.0,16.32,10.0,85000.0,21.26,0.0,74.2,31.0,0.0,18.0
max,35000.0,26.06,10.0,7141778.0,39.99,29.0,150.7,118.0,1.0,65.0


In [107]:
# Per-column dtypes, used to tell the numeric columns apart from the
# `object` (string) ones that need encoding.
data.dtypes

loan_amnt                  int64
term                      object
int_rate                 float64
emp_length               float64
home_ownership            object
annual_inc               float64
purpose                   object
addr_state                object
dti                      float64
delinq_2yrs              float64
revol_util               float64
total_acc                float64
bad_loan                   int64
longest_credit_length    float64
verification_status       object
dtype: object

In [108]:
# Distinct values of the `term` column.
data['term'].unique()

array(['36 months', '60 months'], dtype=object)

In [109]:
# Distinct values of the `home_ownership` column.
data['home_ownership'].unique()

array(['RENT', 'OWN', 'MORTGAGE', 'OTHER', 'NONE', 'ANY'], dtype=object)

In [110]:
# Distinct values of the `purpose` column.
data['purpose'].unique()

array(['credit_card', 'car', 'small_business', 'other', 'wedding',
       'debt_consolidation', 'home_improvement', 'major_purchase',
       'medical', 'moving', 'vacation', 'house', 'renewable_energy',
       'educational'], dtype=object)

In [111]:
# Distinct values of the `addr_state` column.
data['addr_state'].unique()

array(['AZ', 'GA', 'IL', 'CA', 'TX', 'VA', 'MO', 'CT', 'UT', 'FL', 'NY',
       'PA', 'MN', 'NJ', 'OR', 'KY', 'OH', 'SC', 'RI', 'LA', 'MA', 'WA',
       'WI', 'AL', 'NV', 'CO', 'MD', 'WV', 'VT', 'MI', 'DC', 'SD', 'NC',
       'AR', 'KS', 'NM', 'HI', 'AK', 'OK', 'MT', 'WY', 'NH', 'DE', 'MS',
       'TN', 'IA', 'NE', 'ID', 'IN', 'ME'], dtype=object)

In [112]:
# Distinct values of the `verification_status` column.
data['verification_status'].unique()

array(['verified', 'not verified'], dtype=object)

Isolating the target variable

In [113]:
# Keep the label (`bad_loan`) aside as `y`, then remove it -- together with
# `int_rate` -- from the feature frame.
# NOTE: `data` is overwritten here, so this cell cannot be re-run on its own.
y = data['bad_loan']
data = data.drop(labels=['bad_loan', 'int_rate'], axis='columns')

Persist the column name order

In [114]:
# Save the feature column index (after the target/int_rate drop, before any
# encoding) so the same column order can be reloaded later.
joblib.dump(value=data.columns, filename='../results/columns.pkl')

['../results/columns.pkl']

Some cleaning

In [115]:
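# Disabled: manual 0/1 encoding of `term`. The column is instead encoded
# together with the other categorical columns further down.
# data['term'] = data.term.apply(lambda x: int(x.split(' ')[0])).replace([36, 60], [0, 1])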

In [116]:
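# Disabled: manual 0/1 encoding of `verification_status`. The column is instead
# encoded together with the other categorical columns further down.
# data['verification_status'] = data.verification_status.replace(['verified', 'not verified'], [1, 0])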

In [86]:
# Names of the string-valued columns that need encoding.
cat_cols = [
    'term',
    'home_ownership',
    'purpose',
    'addr_state',
    'verification_status',
]
# Move those columns into their own frame; `data` keeps only the numeric ones.
# NOTE: `data` is overwritten here, so this cell cannot be re-run on its own.
cat_data = data.loc[:, cat_cols]
data = data.drop(labels=cat_cols, axis='columns')

In [87]:
# Preview the first five rows of the categorical frame.
cat_data.head(n=5)

Unnamed: 0,term,home_ownership,purpose,addr_state,verification_status
0,36 months,RENT,credit_card,AZ,verified
1,60 months,RENT,car,GA,verified
2,36 months,RENT,small_business,IL,not verified
3,36 months,RENT,other,CA,verified
4,36 months,RENT,wedding,AZ,verified


In [88]:
# Try a LabelEncoder on a single column (`purpose`) first.
le = LabelEncoder()
le.fit(cat_data['purpose'].tolist())

LabelEncoder()

In [89]:
# Integer codes the fitted encoder assigns to each row's `purpose`.
le.transform(cat_data['purpose'])

array([ 1,  0, 11, ...,  2,  1,  2])

In [90]:
# Labels learned by the encoder; a label's position in this array is the
# integer code that transform() emits for it.
le.classes_

array(['car', 'credit_card', 'debt_consolidation', 'educational',
       'home_improvement', 'house', 'major_purchase', 'medical', 'moving',
       'other', 'renewable_energy', 'small_business', 'vacation', 'wedding'], 
      dtype='<U18')

In [91]:
# Container for the fitted per-column LabelEncoders (the following cell
# initialises it again before filling it).
label_encoders = list()

In [92]:
# Integer-encode each categorical column with its own LabelEncoder.
# `label_encoders` keeps the fitted encoders in the same order as `cat_cols`
# so they can be persisted alongside the model.
label_encoders = []
encoded_columns = []
for col in cat_cols:
    le = LabelEncoder()
    # fit_transform learns the column's classes and returns its integer codes.
    encoded = le.fit_transform(cat_data[col].tolist())
    encoded_columns.append(encoded)
    label_encoders.append(le)

# Build the (n_rows, len(cat_cols)) code matrix with a single stack instead of
# seeding it with an uninitialised np.empty column, re-stacking the whole
# matrix on every iteration and deleting the placeholder afterwards.
# The float64 cast keeps the dtype the previous construction produced.
cat_data_encoded = np.column_stack(encoded_columns).astype(np.float64)


In [93]:
# One-hot encode the integer-coded categorical matrix.
# `enc` stays fitted so it can be persisted later; the sparse result is
# densified so it can be stacked next to the numeric columns.
# NOTE: `cat_data_encoded` is overwritten, so this cell cannot be re-run alone.
enc = OneHotEncoder()
cat_data_encoded = enc.fit_transform(cat_data_encoded).toarray()

In [96]:
# Join the numeric columns and the one-hot block side by side.
# From here on `data` is a plain ndarray rather than a DataFrame, so this
# cell cannot be re-run on its own.
data = np.concatenate((data.values, cat_data_encoded), axis=1)
data.shape

(163987, 82)

In [97]:
# Flag every row containing at least one NaN feature, then keep only the
# complete rows in both the feature matrix and the target.
# NOTE: `y` is overwritten with its filtered version, so re-running this cell
# without re-creating `y` first will fail on a length mismatch.
nans = np.isnan(data).any(axis=1)
X, y = data[~nans], y[~nans]

Model evaluation

In [98]:
# Compare candidate models with 3-fold cross-validation using the
# project-local `runclassifiers` helper (imported from `classifier`).
# NOTE(review): 'l1', 'rf' and 'gbm' presumably select L1-penalised logistic
# regression, random forest and gradient boosting -- confirm in classifier.py.
runclassifiers(X, y, cv=3, classifiers=['l1', 'rf', 'gbm'])

GBM-Accuracy: 0.820046076177 std: 0.000291567808161 model-fit: 0.00116142199172 score-time: 0.0370916660572 fit-time: 0.597977774309
GBM-Precision: 0.516680638004 std: 0.0330190023976 model-fit: 0.0971913342079 score-time: 0.0370916660572 fit-time: 0.597977774309
GBM-Recall: 0.0160627846416 std: 0.00233042487766 model-fit: 0.00293477192607 score-time: 0.0370916660572 fit-time: 0.597977774309
GBM-Roc_auc: 0.675273275874 std: 0.00472715992771 model-fit: 0.0185915274553 score-time: 0.0370916660572 fit-time: 0.597977774309

L1-Accuracy: 0.820065065201 std: 0.000294928240412 model-fit: 0.000186714031539 score-time: 0.00922480532149 fit-time: 0.440462885267
L1-Precision: 0.514701276151 std: 0.0311355535059 model-fit: 0.0124195538148 score-time: 0.00922480532149 fit-time: 0.440462885267
L1-Recall: 0.016871297093 std: 0.0037448611363 model-fit: 0.000860913122304 score-time: 0.00922480532149 fit-time: 0.440462885267
L1-Roc_auc: 0.669992637736 std: 0.00313033555864 model-fit: 0.00687332139306 sc

Final model

In [99]:
# Fit a cross-validated logistic regression (library defaults) on all the
# complete rows, then write the fitted estimator to disk.
lr = LogisticRegressionCV().fit(X, y)
joblib.dump(value=lr, filename='../results/final_model.pkl')

['../results/final_model.pkl']

Also persist the label and one-hot encoders

In [100]:
# Save the fitted preprocessing objects: the per-column LabelEncoders
# (in `cat_cols` order) and the OneHotEncoder.
joblib.dump(value=label_encoders, filename='../results/label_encoders.pkl')
joblib.dump(value=enc, filename='../results/onehot_encoder.pkl')

['../results/onehot_encoder.pkl']