In [45]:
import sys

import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier

# Show complete cell contents and complete arrays instead of truncated reprs.
# `None` is the supported "no limit" value for max_colwidth (-1 is deprecated), and
# NumPy rejects NaN as a print threshold, so sys.maxsize is used to disable summarization.
pd.set_option('display.max_colwidth', None)
np.set_printoptions(threshold=sys.maxsize)

# Data preparation

In [2]:
# Load the loans table. `desc` and `next_pymnt_d` are forced to string dtype; the builtin
# `str` is used because the `np.str` alias (which was just `str`) was removed from NumPy.
loans = pd.read_csv('lending-club-data.csv', dtype={'desc': str, 'next_pymnt_d': str})


In [3]:
# Re-encode the target so it is more intuitive and consistent with the lectures:
# bad_loans == 0 becomes +1 (safe); every other value becomes -1 (risky).

is_safe = loans['bad_loans'] == 0
loans['safe_loans'] = is_safe.map({True: +1, False: -1})

# Columns used as model inputs -- only a subset of the available features.
features = [
    'grade',                  # grade of the loan
    'sub_grade',              # sub-grade of the loan
    'short_emp',              # one year or less of employment
    'emp_length_num',         # number of years of employment
    'home_ownership',         # home ownership status: own, mortgage or rent
    'dti',                    # debt-to-income ratio
    'purpose',                # the purpose of the loan
    'term',                   # the term of the loan
    'last_delinq_none',       # has borrower had a delinquency
    'last_major_derog_none',  # has borrower had 90 day or worse rating
    'revol_util',             # percent of available credit being used
    'total_rec_late_fee',     # total late fees received to date
]

# Prediction target (y): +1 means safe, -1 means risky.
target = 'safe_loans'

# Keep only the selected feature columns followed by the target column.
model_columns = features + [target]
loans = loans[model_columns]

In [4]:
# Load the precomputed index lists that define the training / validation split.

import json


def _read_index(path):
    """Return the parsed JSON content of `path` (later used with `.iloc` to pick rows)."""
    with open(path) as handle:
        return json.load(handle)


valid_index = _read_index('module-5-assignment-1-validation-idx.json')
train_index = _read_index('module-5-assignment-1-train-idx.json')

In [5]:
# One-hot encode every object-dtype column of `loans`.
categorical_variables = [
    column for column, dtype in loans.dtypes.items() if dtype == "object"
]

for feature in categorical_variables:
    # Indicator columns are named "<feature>_<level>".
    dummies = pd.get_dummies(loans[feature])
    dummies.columns = ['{}_{}'.format(feature, level) for level in dummies.columns]
    # The indicator frame is also published as a module-level `df_<feature>` variable.
    globals()['df_' + feature] = dummies
    # Swap the original column for its indicator columns (appended on the right).
    loans = pd.concat([loans.drop(feature, axis=1), dummies], axis=1)

# Partition the rows by label: +1 marks a safe loan, -1 a risky one.
safe_loans_raw = loans[loans['safe_loans'] == 1]
risky_loans_raw = loans[loans['safe_loans'] == -1]

print("Number of safe loans: %s" % len(safe_loans_raw))
# This count is for the risky loans; the label previously repeated "safe loans".
print("Number of risky loans: %s" % len(risky_loans_raw))


### Undersample the larger class to balance the data

One way to combat class imbalance is to undersample the larger class until the class distribution is approximately half and half. Here, we undersample the larger class (safe loans) to balance out our dataset, which means we are throwing away many data points. We use a fixed seed (`random_state=1`) so everyone gets the same results.

# Since there are fewer risky loans than safe loans, find the ratio of the sizes
# and use that fraction to undersample the safe loans.
percentage = len(risky_loans_raw) / float(len(safe_loans_raw))

risky_loans = risky_loans_raw
# random_state is fixed so the same safe loans are drawn on every run.
safe_loans = safe_loans_raw.sample(frac=percentage, random_state=1)

# Stack the risky loans on top of the downsampled safe loans.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
loans_data = pd.concat([risky_loans, safe_loans])


### Create dummy variables for categorical variables

# Collect the object-dtype columns of `loans_data`; those are the ones to one-hot encode.
categorical_variables = [
    name
    for name, kind in zip(loans_data.columns, loans_data.dtypes)
    if kind == "object"
]

for feature in categorical_variables:
    # Indicator columns are named "<feature>_<level>".
    encoded = pd.get_dummies(loans_data[feature])
    encoded.columns = ['{}_{}'.format(feature, level) for level in encoded.columns]
    # The indicator frame is also published as a module-level `df_<feature>` variable.
    globals()['df_' + feature] = encoded
    # Remove the original column in place, then append its indicator columns.
    deleted_col = loans_data.pop(feature)
    loans_data = pd.concat([loans_data, encoded], axis=1)


### Split data into training and validation

In [67]:
# Select the training / validation rows by position using the loaded index lists.
# NOTE(review): the split is taken from `loans`, not from the undersampled `loans_data`
# -- confirm this is intended.
train_data = loans.iloc[train_index]
validation_data = loans.iloc[valid_index]

# Every column except the target is a model input.
model_feature = [column for column in train_data.columns if column != 'safe_loans']

# Plain NumPy arrays for scikit-learn: x_* hold the inputs, y_* the +1 / -1 labels.
x_train, y_train = train_data[model_feature].values, train_data['safe_loans'].values
x_valid, y_valid = validation_data[model_feature].values, validation_data['safe_loans'].values

In [35]:
# Two trees trained on the same data: max_depth=6 is the main model, max_depth=2 the small one.
# random_state is pinned because scikit-learn permutes the features at every split, so ties
# between equally good splits could otherwise be broken differently from run to run.
classifier_2 = DecisionTreeClassifier(max_depth = 2, random_state = 1)
classifier_6 = DecisionTreeClassifier(max_depth = 6, random_state = 1)

# fit() returns the estimator itself, so each model name refers to its fitted classifier.
decision_tree_model = classifier_6.fit(x_train,y_train)

small_model = classifier_2.fit(x_train,y_train)

In [39]:
# Split the validation rows by true label (+1 safe, -1 risky).
validation_safe_loans = validation_data[validation_data[target] == 1]
validation_risky_loans = validation_data[validation_data[target] == -1]

# Take the first two rows of each class as a small hand-inspectable sample.
sample_validation_data_risky = validation_risky_loans[0:2]
sample_validation_data_safe = validation_safe_loans[0:2]

# Safe rows first, then risky rows. pd.concat is used because DataFrame.append was
# removed in pandas 2.0.
sample_validation_data = pd.concat([sample_validation_data_safe, sample_validation_data_risky])

x_sample_validation = sample_validation_data[model_feature].values

y_sample_validation = sample_validation_data['safe_loans'].values


In [40]:
# Predicted labels from the max_depth=6 tree for the four sample rows
# (two safe rows first, then two risky rows).
decision_tree_model.predict(x_sample_validation)

array([ 1, -1, -1,  1])

In [41]:
# Predicted labels from the max_depth=2 tree for the same four sample rows.
small_model.predict(x_sample_validation)

array([ 1, -1, -1,  1])

In [42]:
# Class probabilities from the max_depth=6 tree: one row per sample, one column per class.
# NOTE(review): column order presumably follows the model's classes_ ([-1, +1]) -- confirm.
decision_tree_model.predict_proba(x_sample_validation)

array([[ 0.34156543,  0.65843457],
       [ 0.53630646,  0.46369354],
       [ 0.64750958,  0.35249042],
       [ 0.20789474,  0.79210526]])

In [43]:
# Class probabilities from the max_depth=2 tree: one row per sample, one column per class.
# NOTE(review): column order presumably follows the model's classes_ ([-1, +1]) -- confirm.
small_model.predict_proba(x_sample_validation)

array([[ 0.41896585,  0.58103415],
       [ 0.59255339,  0.40744661],
       [ 0.59255339,  0.40744661],
       [ 0.23120112,  0.76879888]])

In [46]:
# Confusion matrix of the max_depth=6 tree on the validation set
# (true labels are passed first, predictions second).
confusion_matrix(y_valid,decision_tree_model.predict(x_valid))

array([[3013, 1661],
       [1717, 2893]])

In [47]:
# Confusion matrix of the max_depth=2 tree on the validation set
# (true labels are passed first, predictions second).
confusion_matrix(y_valid,small_model.predict(x_valid))

array([[3342, 1332],
       [2202, 2408]])

In [59]:
# Validation accuracy of the max_depth=6 tree. Computed from the model instead of from
# counts copied by hand out of a printed confusion matrix, so it cannot go stale on a re-run.
decision_tree_model.score(x_valid, y_valid)

0.6361482119775959

In [62]:
# Deeper tree (max_depth=10) for comparison. random_state is pinned because scikit-learn
# permutes the features at every split, so tie-breaking could otherwise vary between runs.
classifier_10 = DecisionTreeClassifier(max_depth = 10, random_state = 1)

In [64]:
# Train the max_depth=10 tree on the training arrays and keep the fitted result as big_model.
big_model = classifier_10.fit(x_train,y_train)

In [65]:
# Confusion matrix of the max_depth=10 tree on the validation set
# (true labels are passed first, predictions second).
confusion_matrix(y_valid,big_model.predict(x_valid))

array([[3024, 1650],
       [1823, 2787]])