In [1]:
import numpy as np
import pandas as pd
import json
from sklearn import tree
from os import system

Load the dataset:

In [2]:
# Read the LendingClub CSV into a DataFrame. low_memory=False asks pandas to
# parse the file in one pass rather than in chunks, which avoids mixed-dtype
# inference on wide files.
loans_csv_path = 'lending-club-data.csv'
loans = pd.read_csv(loans_csv_path, low_memory=False)

Look at columns:

In [3]:
# List every column name so the available raw fields can be inspected.
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a syntax error on Python 3.
print(loans.columns.tolist())

['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'annual_inc', 'is_inc_v', 'issue_d', 'loan_status', 'pymnt_plan', 'url', 'desc', 'purpose', 'title', 'zip_code', 'addr_state', 'dti', 'delinq_2yrs', 'earliest_cr_line', 'inq_last_6mths', 'mths_since_last_delinq', 'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'initial_list_status', 'out_prncp', 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d', 'last_credit_pull_d', 'collections_12_mths_ex_med', 'mths_since_last_major_derog', 'policy_code', 'not_compliant', 'status', 'inactive_loans', 'bad_loans', 'emp_length_num', 'grade_num', 'sub_grade_num', 'delinq_2yrs_zero', 'pub_rec_zero', 'collections_12_mths_zero', 'short_emp', 'payment_in

In [4]:
# Derive the +1/-1 target: bad_loans == 0 maps to +1, any other value maps to -1.
loans['safe_loans'] = loans['bad_loans'].map(lambda flag: -1 if flag != 0 else 1)
# The 0/1 source column is dropped in place once the new target exists.
loans.drop('bad_loans', axis=1, inplace=True)

Check the distribution of safe vs. bad loans:

In [5]:
# Count each class by its label instead of unpacking value_counts(): that
# result is ordered by frequency, so the first entry is not guaranteed to be
# the safe (+1) class.
safe = (loans['safe_loans'] == 1).sum()
bad = (loans['safe_loans'] == -1).sum()
total_loans = float(len(loans))  # float() avoids integer division on Python 2
# The printed values are fractions in [0, 1], formatted to two decimals.
print('Percentage of safe loans is: {0:.2f}.'.format(safe / total_loans))
# This line reports the bad-loan share; it was previously mislabeled as "safe".
print('Percentage of bad loans is: {0:.2f}.'.format(bad / total_loans))

Percentage of safe loans is: 0.81.
Percentage of safe loans is: 0.19.


In [6]:
# Input columns used by the model, in the order they are selected below.
features = [
    'grade',                  # grade of the loan
    'sub_grade',              # sub-grade of the loan
    'short_emp',              # one year or less of employment
    'emp_length_num',         # number of years of employment
    'home_ownership',         # home ownership status: own, mortgage or rent
    'dti',                    # debt-to-income ratio
    'purpose',                # the purpose of the loan
    'term',                   # the term of the loan
    'last_delinq_none',       # has the borrower had a delinquency
    'last_major_derog_none',  # has the borrower had a 90-day-or-worse rating
    'revol_util',             # percent of available credit being used
    'total_rec_late_fee',     # total late fees received to date
]

# Prediction target (y): +1 means safe, -1 means risky.
target = 'safe_loans'

# Keep only the feature columns plus the target, then one-hot encode the
# selection with get_dummies. Note that this rebinds `loans` to the encoded frame.
selected_columns = features + [target]
loans = pd.get_dummies(loans[selected_columns])

In [7]:
# Load the row positions that define the training and validation splits.
# json.load parses the whole file, so the index list no longer has to fit
# entirely on the first line as json.loads(next(f)) required.
with open('module-5-assignment-1-train-idx.json') as f:
    train_idx = json.load(f)
with open('module-5-assignment-1-validation-idx.json') as f:
    validation_idx = json.load(f)

In [8]:
# Select the training and validation rows from the encoded frame by integer position.
train_data, validation_data = (
    loans.iloc[positions] for positions in (train_idx, validation_idx)
)

In [25]:
# Count each class by its label instead of unpacking value_counts(): that
# result is ordered by frequency, so with a near 50/50 split the first entry
# may be either class.
safe = (train_data['safe_loans'] == 1).sum()
bad = (train_data['safe_loans'] == -1).sum()
total_train = float(len(train_data))  # float() avoids integer division on Python 2
# The printed values are fractions in [0, 1], formatted to two decimals.
print('Percentage of safe loans is: {0:.2f}.'.format(safe / total_train))
# This line reports the bad-loan share; it was previously mislabeled as "safe".
print('Percentage of bad loans is: {0:.2f}.'.format(bad / total_train))

Percentage of safe loans is: 0.50.
Percentage of safe loans is: 0.50.


In [10]:
# Split the training frame into a feature matrix and a label vector.
# .values replaces DataFrame.as_matrix(), which pandas deprecated and later
# removed; it yields the same NumPy array and matches how y is extracted.
X = train_data.drop('safe_loans', axis=1).values
y = train_data['safe_loans'].values

Train the decision tree classifier from sklearn:

In [11]:
# Main classifier: an unfitted decision tree capped at depth 6.
main_tree_depth = 6
decision_tree_model = tree.DecisionTreeClassifier(max_depth=main_tree_depth)

In [12]:
# Train the main tree on the training matrix and labels. fit() is the last
# expression in the cell, so the fitted estimator's repr is shown as output.
decision_tree_model.fit(X=X, y=y)

DecisionTreeClassifier(compute_importances=None, criterion='gini',
            max_depth=6, max_features=None, max_leaf_nodes=None,
            min_density=None, min_samples_leaf=1, min_samples_split=2,
            random_state=None, splitter='best')

In [13]:
# A shallow (depth 2) tree, small enough to visualize.
small_model = tree.DecisionTreeClassifier(max_depth=2).fit(X, y)
# fit() returns the estimator itself; ending the cell with it keeps the repr output.
small_model

DecisionTreeClassifier(compute_importances=None, criterion='gini',
            max_depth=2, max_features=None, max_leaf_nodes=None,
            min_density=None, min_samples_leaf=1, min_samples_split=2,
            random_state=None, splitter='best')

In [14]:
# Write the small tree in Graphviz DOT format, then render it to PNG with the
# `dot` command-line tool. The value returned by system() is the cell's output.
dot_file, png_file = 'tree.dot', 'tree.png'
tree.export_graphviz(small_model, out_file=dot_file)
system('dot ' + dot_file + ' -Tpng -o ' + png_file)

0

Grab 2 positive and 2 negative examples to test our prediction:

In [15]:
# Split the validation rows by label (+1 safe, -1 risky).
validation_safe_loans = validation_data[validation_data[target] == 1]
validation_risky_loans = validation_data[validation_data[target] == -1]

# Take the first two rows of each class by position.
sample_validation_data_risky = validation_risky_loans.iloc[:2]
sample_validation_data_safe = validation_safe_loans.iloc[:2]

# Stack the safe rows on top of the risky rows. pd.concat replaces
# DataFrame.append, which pandas deprecated and then removed in 2.0.
sample_validation_data = pd.concat(
    [sample_validation_data_safe, sample_validation_data_risky]
)
# Separate the inputs from the labels, using the shared `target` column name.
sample_validation_features = sample_validation_data.drop(target, axis=1)
sample_validation_labels = sample_validation_data[target]

In [16]:
# Predicted class label (+1 or -1) for each of the sampled validation loans.
predictions = decision_tree_model.predict(X=sample_validation_features)

What percentage of the predictions on sample_validation_data did decision_tree_model get correct?

In [17]:
# check the accuracy
sum(predictions == sample_validation_labels) / float(len(predictions))

0.5

Which loan has the highest probability of being classified as a safe loan?

In [18]:
# Per-class probabilities for each sampled loan, one row per loan.
# print(...) with a single argument works on both Python 2 and Python 3.
p = decision_tree_model.predict_proba(sample_validation_features)
print(p)
# Keep the second column of each row, treated here as the probability of the
# loan being safe, as a plain list for convenience.
p = [row[1] for row in p]
# 1-based position of the loan with the highest safe probability.
print(p.index(max(p)) + 1)

[[ 0.34156543  0.65843457]
 [ 0.53630646  0.46369354]
 [ 0.64750958  0.35249042]
 [ 0.20789474  0.79210526]]
4


In [19]:
# Class probabilities from the shallow tree for the same sampled loans; the
# resulting array is displayed as the cell output.
small_model.predict_proba(X=sample_validation_features)

array([[ 0.41896585,  0.58103415],
       [ 0.59255339,  0.40744661],
       [ 0.59255339,  0.40744661],
       [ 0.23120112,  0.76879888]])

Let's check how well these models do on the validation dataset:

In [20]:
# Feature matrix and label vector for the full validation set.
# .values replaces DataFrame.as_matrix(), which pandas deprecated and later
# removed; it yields the same NumPy array and matches how the labels are extracted.
validation_data_features = validation_data.drop('safe_loans', axis=1).values
validation_data_labels = validation_data['safe_loans'].values

In [21]:
# Validation score of the main tree, then the small tree, rounded to two decimals.
# print(...) with a single argument works on both Python 2 and Python 3.
for fitted_model in (decision_tree_model, small_model):
    print(round(fitted_model.score(validation_data_features, validation_data_labels), 2))

0.64
0.62


In [22]:
# A deeper (depth 10) tree trained on the same data, for comparison with the main model.
big_model = tree.DecisionTreeClassifier(max_depth=10).fit(X, y)
# fit() returns the estimator itself; ending the cell with it keeps the repr output.
big_model

DecisionTreeClassifier(compute_importances=None, criterion='gini',
            max_depth=10, max_features=None, max_leaf_nodes=None,
            min_density=None, min_samples_leaf=1, min_samples_split=2,
            random_state=None, splitter='best')

How does the performance of big_model on the validation set compare to decision_tree_model on the validation set? Is this a sign of overfitting?

In [23]:
# Validation score of the depth-10 tree, rounded to two decimals.
# print(...) with a single argument works on both Python 2 and Python 3.
print(round(big_model.score(validation_data_features, validation_data_labels), 2))

0.63


Let us assume that each mistake costs money:

Assume a cost of \$10,000 per false negative.
Assume a cost of \$20,000 per false positive.

Compute cost of mistakes made by model:

In [24]:
# Cost assumptions: 10,000 per false negative and 20,000 per false positive.
COST_PER_FALSE_NEGATIVE = 10000
COST_PER_FALSE_POSITIVE = 20000

# Predicted labels for every validation row.
predictions = decision_tree_model.predict(validation_data_features)

# False positive: predicted safe (+1) but actually risky (-1).
# False negative: predicted risky (-1) but actually safe (+1).
# Summing the boolean mask counts the matching rows directly.
false_positives = int(np.sum((predictions == 1) & (validation_data_labels == -1)))
false_negatives = int(np.sum((predictions == -1) & (validation_data_labels == 1)))

total_cost = (false_negatives * COST_PER_FALSE_NEGATIVE
              + false_positives * COST_PER_FALSE_POSITIVE)
# print(...) with a single argument works on both Python 2 and Python 3.
print('Total cost is %d.' % total_cost)

Total cost is 50390000.
