In [1]:
def get_lending_club_data():
    raw_data = pd.read_csv("LendingClub2012to2013.csv", low_memory = False, skiprows=[0])
    return(raw_data)
    
def data_clean(raw_data):
    no_incomplete_rows = raw_data[raw_data['loan_status'].isin(['Fully Paid', 'Charged Off', 'Default'])]
    no_incomplete_rows['loan_status'] = no_incomplete_rows['loan_status'].apply(lambda x: 0 if x == "Fully Paid" else 1)
    leakage_to_drop = ['issue_d', 'recoveries', 'collection_recovery_fee', 'last_fico_range_high', 'last_fico_range_low', 'last_credit_pull_d', 'total_rec_prncp', 'last_pymnt_amnt', 'total_pymnt', 'total_pymnt_inv', 'last_pymnt_d', 'total_rec_late_fee', 'total_rec_int', 'num_tl_120dpd_2m', 'num_tl_30dpd', 'out_prncp', 'out_prncp_inv', 'pymnt_plan', 'next_pymnt_d']
    no_information_features_to_drop = ['all_util', 'dti_joint', 'il_util', 'inq_fi', 'inq_last_12m', 'max_bal_bc', 'mths_since_rcnt_il', 'open_acc_6m', 'open_il_12m', 'open_il_24m', 'open_il_6m', 'open_rv_12m', 'open_rv_24m', 'total_bal_il', 'verification_status_joint', 'annual_inc_joint', 'application_type',  'policy_code', 'total_cu_tl']
    text_columns = ['emp_title', 'url', 'desc', 'purpose', 'title', 'zip_code', 'id']


    no_leakage = no_incomplete_rows.drop(text_columns + leakage_to_drop + no_information_features_to_drop, axis = 1)

    no_leakage['earliest_cr_line'] = pd.to_datetime(no_leakage['earliest_cr_line'], format='%b-%Y')
    no_leakage['time_since_earliest_cr_line'] = no_leakage['earliest_cr_line'].apply(lambda x: pd.to_datetime('20000101', format='%Y%m%d') - x).dt.days

    # and it looks like there's a pesky % symbol in my interest rate variable
    no_leakage['int_rate'] = pd.to_numeric(no_leakage['int_rate'].str.strip('%'), errors='coerce')
    no_leakage['revol_util'] = pd.to_numeric(no_leakage['revol_util'].str.strip('%'), errors='coerce')

    no_leakage['term'] = no_leakage['term'].apply(lambda x: x.strip().replace(" ", "_"))

    no_leakage = no_leakage.drop(['earliest_cr_line'], axis = 1)
    return(no_leakage)

def data_pre_process(no_leakage):
    categorical = no_leakage.select_dtypes(include=['object'])
    numeric = no_leakage.select_dtypes(exclude=['object'])
    
    # create dummy variables
    for name, values in categorical.items():
        dummies = pd.get_dummies(values.str.strip(), prefix = name, dummy_na=True)
        cleaned_data = pd.concat([numeric, dummies], axis=1)
    return(cleaned_data)

def data_Imputation(cleaned_data):
    for name in cleaned_data:
        if pd.isnull(cleaned_data[name]).sum() > 0:
            cleaned_data["%s_mi" % (name)] = pd.isnull(cleaned_data[name])
            median = cleaned_data[name].median()
            cleaned_data[name] = cleaned_data[name].apply(lambda x: median if pd.isnull(x) else x)
    return(cleaned_data)

In [5]:
import pandas as pd
pd.options.mode.chained_assignment = None
from sklearn import preprocessing

import numpy as np
from sklearn.model_selection import train_test_split

# Import the raw data from the csv file
raw_data = get_lending_club_data()

# Clean and process the data 
no_leakage = data_clean(raw_data)

# Pre-process the data for modelling
cleaned_data  = data_pre_process(no_leakage)

# Imputation of missing values
clean_imputed_data = data_Imputation(cleaned_data)

#Defining Target and independent/predictor variables
y = clean_imputed_data['loan_status']
X = clean_imputed_data.drop(['loan_status'], axis = 1)

#Creating the holdout
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

#Decision Tree model
from sklearn import tree
dt = tree.DecisionTreeClassifier()

# set up cv
from sklearn import model_selection
cv = model_selection.KFold(5)

# pipeline
from sklearn.pipeline import Pipeline
pipeline = Pipeline(steps=[('standardize', preprocessing.StandardScaler()),('model', dt) ])


# Parameters for tuning the model
tree_depth = [5,6,7]
tree_min_samples_split = [2, 10, 20]
tree_min_samples_leaf = [1, 5, 10]
tree_max_leaf_nodes = [10,20,25]
#Implementing GridSearchCV
from sklearn.model_selection import GridSearchCV
optimized_dt = GridSearchCV(estimator=pipeline
                            , cv=cv
                            , param_grid=dict(model__max_depth= tree_depth, 
                                              model__min_samples_split = tree_min_samples_split,
                                              model__min_samples_leaf = tree_min_samples_leaf,
                                              model__max_leaf_nodes = tree_max_leaf_nodes)
                            , scoring = 'roc_auc'
                            , verbose = 1
                            , n_jobs = -1)

#Fitting the model with Grid Search
optimized_dt.fit(X_train, y_train)

#Get the best estimator from the grid Search
print(optimized_dt.best_estimator_)
print(optimized_dt.best_score_)

# Evaluate on holdout
from sklearn.metrics import roc_auc_score
y_pred = optimized_dt.predict_proba(X_test)[:, 1]

roc_on_holdout = roc_auc_score(y_test, y_pred)

print(roc_on_holdout)

# train model on entire dataset
final = pipeline.fit(X, y)



Fitting 5 folds for each of 135 candidates, totalling 675 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 13.9min
[Parallel(n_jobs=-1)]: Done 675 out of 675 | elapsed: 21.8min finished


Pipeline(steps=[('standardize', StandardScaler(copy=True, with_mean=True, with_std=True)), ('model', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
            max_features=None, max_leaf_nodes=25, min_impurity_split=1e-07,
            min_samples_leaf=10, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))])
0.684773303774


In [6]:
tree.export_graphviz(dt, out_file='treepipe1.dot')

In [23]:
from sklearn import tree
dt = tree.DecisionTreeClassifier(max_depth=4,max_leaf_nodes=25, min_samples_leaf=10,min_samples_split=2)
dt.fit(X,y)
from sklearn.tree import export_graphviz
dot_data=export_graphviz(dt,out_file="tree.dot",feature_names=X.columns.values)
