In [1]:
#Identifying Safe loans with Decision trees

In [2]:
import graphlab
# Point GraphLab Canvas at the 'ipynb' target; presumably this makes the
# .show() calls used throughout this notebook render inline -- confirm against the GraphLab docs.
graphlab.canvas.set_target('ipynb')

In [5]:
# Load the loans dataset from the 'lending-club-data.gl/' directory (path is relative to the working directory).
loans = graphlab.SFrame('lending-club-data.gl/')
# Open a Canvas view of the 'grade' column.
loans['grade'].show()

In [14]:
# Open a Canvas view of the 'home_ownership' column.
loans['home_ownership'].show()

# List every column name in the SFrame; as the last expression of the cell,
# its value is what the notebook displays.
loans.column_names()

['id',
 'member_id',
 'loan_amnt',
 'funded_amnt',
 'funded_amnt_inv',
 'term',
 'int_rate',
 'installment',
 'grade',
 'sub_grade',
 'emp_title',
 'emp_length',
 'home_ownership',
 'annual_inc',
 'is_inc_v',
 'issue_d',
 'loan_status',
 'pymnt_plan',
 'url',
 'desc',
 'purpose',
 'title',
 'zip_code',
 'addr_state',
 'dti',
 'delinq_2yrs',
 'earliest_cr_line',
 'inq_last_6mths',
 'mths_since_last_delinq',
 'mths_since_last_record',
 'open_acc',
 'pub_rec',
 'revol_bal',
 'revol_util',
 'total_acc',
 'initial_list_status',
 'out_prncp',
 'out_prncp_inv',
 'total_pymnt',
 'total_pymnt_inv',
 'total_rec_prncp',
 'total_rec_int',
 'total_rec_late_fee',
 'recoveries',
 'collection_recovery_fee',
 'last_pymnt_d',
 'last_pymnt_amnt',
 'next_pymnt_d',
 'last_credit_pull_d',
 'collections_12_mths_ex_med',
 'mths_since_last_major_derog',
 'policy_code',
 'not_compliant',
 'status',
 'inactive_loans',
 'emp_length_num',
 'grade_num',
 'sub_grade_num',
 'delinq_2yrs_zero',
 'pub_rec_zero',
 'coll

In [None]:
#Exploring the target column
# Re-encode the label: a 'bad_loans' value of 0 becomes +1 (safe), any other
# value becomes -1 (risky). The original column is dropped afterwards, so this
# cell cannot be re-run on the already-transformed SFrame.
loans['safe_loans'] = loans['bad_loans'].apply(lambda is_bad: -1 if is_bad != 0 else +1)
loans = loans.remove_column('bad_loans')


In [15]:
# Canvas view of the label column, requested with the 'Categorical' view.
loans['safe_loans'].show(view = 'Categorical')

In [16]:

# Columns used as model inputs. The inline notes describe each column.
features = ['grade',                     # grade of the loan
            'sub_grade',                 # sub-grade of the loan
            'short_emp',                 # one year or less of employment
            'emp_length_num',            # number of years of employment
            'home_ownership',            # home_ownership status: own, mortgage or rent
            'dti',                       # debt to income ratio
            'purpose',                   # the purpose of the loan
            'term',                      # the term of the loan
            'last_delinq_none',          # has borrower had a delinquency
            'last_major_derog_none',     # has borrower had 90 day or worse rating
            'revol_util',                # percent of available credit being used
            'total_rec_late_fee',        # total late fees received to date
           ]

target = 'safe_loans'                   # prediction target (y) (+1 means safe, -1 is risky)

# Keep only the feature columns and the target column.
# NOTE: this rebinds `loans`, so every other column is unavailable to later cells.
loans = loans[features + [target]]

In [17]:
safe_loans_raw = loans[loans[target] == +1]
risky_loans_raw = loans[loans[target] == -1]
print "Number of safe loans  : %s" % len(safe_loans_raw)
print "Number of risky loans : %s" % len(risky_loans_raw)

Number of safe loans  : 99457
Number of risky loans : 23150


In [18]:
print "Percentage of safe loans  :", (len(safe_loans_raw)/float(len(safe_loans_raw) + len(risky_loans_raw)))
print "Percentage of risky loans :", (len(risky_loans_raw)/float(len(safe_loans_raw) + len(risky_loans_raw)))

Percentage of safe loans  : 0.811185331996
Percentage of risky loans : 0.188814668004


In [19]:
# The risky class is the smaller one, so balance the data by randomly
# undersampling the safe class. The sampling fraction is the ratio
# (number of risky loans) / (number of safe loans).
percentage = float(len(risky_loans_raw)) / len(safe_loans_raw)

# seed=1 keeps the random sample reproducible between runs.
safe_loans = safe_loans_raw.sample(percentage, seed=1)
risky_loans = risky_loans_raw  # every risky loan is kept as-is

# Balanced dataset: all risky loans followed by the sampled safe loans.
loans_data = risky_loans.append(safe_loans)

In [20]:
print "Percentage of safe loans                 :", len(safe_loans) / float(len(loans_data))
print "Percentage of risky loans                :", len(risky_loans) / float(len(loans_data))
print "Total number of loans in our new dataset :", len(loans_data)

Percentage of safe loans                 : 0.502236174422
Percentage of risky loans                : 0.497763825578
Total number of loans in our new dataset : 46508


In [21]:
#TRAINING AND VALIDATION DATA SPLIT
# Randomly split the balanced dataset with a .8 fraction; the first result is
# used as the training set and the second as the validation set.
# seed=1 makes the split reproducible.
train_data, validation_data = loans_data.random_split(.8, seed=1)


In [22]:
#BUILDING A DECISION TREE CLASSIFIER
# Fit a decision tree on train_data, predicting `target` from `features`.
# No max_depth is given, so the library default applies.
# validation_set=None is passed explicitly, so no validation data is handed to
# create(); validation_data is only used by later cells.
decision_tree_model = graphlab.decision_tree_classifier.create(train_data, validation_set=None,
                                target = target, features = features)

In [23]:
# Canvas "Tree" view of the fitted model.
decision_tree_model.show(view="Tree")

In [25]:
#VISUALIZING THE LEARNED MODEL
# Same training data, target and features as decision_tree_model, but with
# max_depth = 2 so the resulting tree is small enough to inspect by eye.
small_model = graphlab.decision_tree_classifier.create(train_data, validation_set=None,
                   target = target, features = features, max_depth = 2)

In [26]:
# Canvas "Tree" view of the depth-2 model.
small_model.show(view='Tree')

In [27]:
#MAKING PREDICTIONS
# Split the validation set by true label.
validation_risky_loans = validation_data[validation_data[target] == -1]
validation_safe_loans = validation_data[validation_data[target] == +1]

# Take the first two rows of each class as a small hand-checkable sample.
sample_validation_data_safe = validation_safe_loans[:2]
sample_validation_data_risky = validation_risky_loans[:2]

# Sample layout: the two safe rows first, then the two risky rows.
sample_validation_data = sample_validation_data_safe.append(sample_validation_data_risky)
sample_validation_data

grade,sub_grade,short_emp,emp_length_num,home_ownership,dti,purpose,term,last_delinq_none
B,B3,0,11,OWN,11.18,credit_card,36 months,1
D,D1,0,10,RENT,16.85,debt_consolidation,36 months,1
D,D2,0,3,RENT,13.97,other,60 months,0
A,A5,0,11,MORTGAGE,16.33,debt_consolidation,36 months,1

last_major_derog_none,revol_util,total_rec_late_fee,safe_loans
1,82.4,0.0,1
1,96.4,0.0,1
1,59.5,0.0,-1
1,62.1,0.0,-1


In [28]:
# Predicted class labels for the four sample rows.
decision_tree_model.predict(sample_validation_data)


dtype: int
Rows: 4
[1, -1, -1, 1]

In [29]:
# Fraction of the sample rows whose predicted label equals the true label.
sample_predictions = decision_tree_model.predict(sample_validation_data)
num_sample_correct = (sample_validation_data['safe_loans'] == sample_predictions).sum()
num_sample_correct / float(len(sample_validation_data))

0.5

In [30]:
#EXPLORING PROBABILITY PREDICTIONS
# Ask for probabilities instead of class labels. Compared with the recorded
# label predictions for the same rows, values above 0.5 line up with a +1
# prediction, so this looks like the probability of the +1 (safe) class --
# confirm against the GraphLab documentation.
decision_tree_model.predict(sample_validation_data, output_type='probability')

dtype: float
Rows: 4
[0.5473502278327942, 0.4891221821308136, 0.4559234082698822, 0.5864479541778564]

In [31]:
# Same probability query, this time against the depth-2 model.
small_model.predict(sample_validation_data, output_type='probability')

dtype: float
Rows: 4
[0.5242817997932434, 0.47226759791374207, 0.47226759791374207, 0.5798847675323486]

In [33]:
#VISUALIZE THE PREDICTIONS ON A TREE
sample_validation_data[1]
small_model.show(view="Tree")



In [34]:
# Predicted class label from the depth-2 model for the second sample row only.
small_model.predict(sample_validation_data[1])


dtype: int
Rows: 1
[-1]

In [37]:
#EVALUATING THE ACCURACY OF DECISION TREE MODEL
print "Accuracy of small model=", small_model.evaluate(train_data)['accuracy']
print "Acuuracy of decision tree=", decision_tree_model.evaluate(train_data)['accuracy']

Accuracy of small model= 0.613502041694
Acuuracy of decision tree= 0.640581345369


In [38]:
#EVALUATING ACCURACY OF COMPLEX DECISION TREE MODEL
# Same training data, target and features as the other two models, but with
# max_depth = 10 to allow a deeper, more complex tree.
big_model = graphlab.decision_tree_classifier.create(train_data, validation_set=None,
                   target = target, features = features, max_depth = 10)

In [39]:
print "Big model accuracy on train data=", big_model.evaluate(train_data)['accuracy']
print "Big model accuracy on Validataion data=", big_model.evaluate(validation_data)['accuracy']


Big model accuracy on train data= 0.665538362347
Big model accuracy on Validataion data= 0.627423524343


In [40]:
#COST OF MISTAKES: 10,000 per false negative (FN) and 20,000 per false positive (FP).
# These are the weights applied in the cost_of_mistakes computation later in this notebook.
# Predicted class labels for every row of the validation set.
predictions = decision_tree_model.predict(validation_data)
# Canvas "Evaluation" view of the model.
decision_tree_model.show(view='Evaluation')


In [44]:
#False positives are predictions where the model predicts +1 but the true label is -1.
print "total predictions =", len(predictions)

false_positives = (validation_data[validation_data['safe_loans'] != predictions]['safe_loans'] == -1).sum()
print "total false positives=", false_positives

false_negatives = (validation_data[validation_data['safe_loans'] != predictions]['safe_loans'] == +1).sum()
print "total false negatives=", false_negatives


total predictions = 9284
total false positives= 1656
total false negatives= 1716


In [47]:
cost_of_mistakes = (false_negatives * 10000) + (false_positives * 20000)
print "cost of mistakes=", cost_of_mistakes

cost of mistakes= 50280000
