In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
%matplotlib inline

In [32]:
start = time.time()

# Replace the path below with the correct path for your data
y2015 = pd.read_csv(
    'https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/LoanStats3d.csv',
    skipinitialspace=True,
    header=1
)

print(f'Total time to read in raw data: {round(time.time() - start, 2)} seconds.')

# Note the warning about dtypes

  y2015 = pd.read_csv(


Total time to read in raw data: 20.78 seconds.


In [33]:
# Number of rows and columns
y2015.shape

(421097, 111)

In [34]:
y2015.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
0,68009401,72868139.0,16000.0,16000.0,16000.0,60 months,14.85%,379.39,C,C5,...,0.0,2.0,78.9,0.0,0.0,2.0,298100.0,31329.0,281300.0,13400.0
1,68354783,73244544.0,9600.0,9600.0,9600.0,36 months,7.49%,298.58,A,A4,...,0.0,2.0,100.0,66.7,0.0,0.0,88635.0,55387.0,12500.0,75635.0
2,68466916,73356753.0,25000.0,25000.0,25000.0,36 months,7.49%,777.55,A,A4,...,0.0,0.0,100.0,20.0,0.0,0.0,373572.0,68056.0,38400.0,82117.0
3,68466961,73356799.0,28000.0,28000.0,28000.0,36 months,6.49%,858.05,A,A2,...,0.0,0.0,91.7,22.2,0.0,0.0,304003.0,74920.0,41500.0,42503.0
4,68495092,73384866.0,8650.0,8650.0,8650.0,36 months,19.89%,320.99,E,E3,...,0.0,12.0,100.0,50.0,1.0,0.0,38998.0,18926.0,2750.0,18248.0


In [35]:
# Distribution of response values (note that this is a multiclass problem, not binary)
y2015['loan_status'].value_counts()

loan_status
Current               287414
Fully Paid             87989
Charged Off            29178
Late (31-120 days)      9510
In Grace Period         4320
Late (16-30 days)       1888
Default                  796
Name: count, dtype: int64

In [36]:
# eda 
y2015['term'].unique()

array([' 60 months', ' 36 months', nan], dtype=object)

In [37]:
pd.get_dummies(y2015['term'].head(),dtype=int)

Unnamed: 0,36 months,60 months
0,0,1
1,1,0
2,1,0
3,1,0
4,1,0


In [38]:
#EDA
# Convert `member_id` to string; it generally isn't appropriate to represent identifier fields as numbers
y2015['member_id'] = y2015['member_id'].astype(str)

# Identify categorical columns
categorical = y2015.select_dtypes(include=['object'])
categorical_dict = {}

# Create dictionary of categorical columns and the number of unique categories for each one
for i in categorical:
    column = categorical[i]
    num_unique = column.nunique()
    categorical_dict[i] = num_unique

for key, value in categorical_dict.items():
    print(f'{key}: {value}')

id: 421097
member_id: 421096
term: 2
int_rate: 110
grade: 7
sub_grade: 35
emp_title: 120812
emp_length: 11
home_ownership: 4
verification_status: 3
issue_d: 12
loan_status: 7
pymnt_plan: 1
url: 421095
desc: 34
purpose: 14
title: 27
zip_code: 914
addr_state: 49
earliest_cr_line: 668
revol_util: 1211
initial_list_status: 2
last_pymnt_d: 25
next_pymnt_d: 4
last_credit_pull_d: 26
application_type: 2
verification_status_joint: 3


In [39]:
# Created a cleaned data DataFrame, which is initially a copy of the raw data
y2015_cleaned = y2015.copy()

# Convert `int_rate` to numeric
y2015_cleaned['int_rate'] = pd.to_numeric(y2015['int_rate'].str.strip('%'), errors='coerce')

# Drop high-cardinality categorical features
drop_cols = [col for col, value in categorical_dict.items() if value >= 30 and col != 'int_rate']
print(f'High-cardinality columns to drop: {drop_cols}\n')
print(y2015_cleaned.columns)
y2015_cleaned.drop(columns=drop_cols, axis=1, inplace=True)

# Drop date columns
date_cols = list(y2015_cleaned.filter(regex='_d$').columns)
y2015_cleaned.drop(columns=date_cols, axis=1, inplace=True)

print(y2015_cleaned.shape)

High-cardinality columns to drop: ['id', 'member_id', 'sub_grade', 'emp_title', 'url', 'desc', 'zip_code', 'addr_state', 'earliest_cr_line', 'revol_util']

Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'grade', 'sub_grade',
       ...
       'num_tl_90g_dpd_24m', 'num_tl_op_past_12m', 'pct_tl_nvr_dlq',
       'percent_bc_gt_75', 'pub_rec_bankruptcies', 'tax_liens',
       'tot_hi_cred_lim', 'total_bal_ex_mort', 'total_bc_limit',
       'total_il_high_credit_limit'],
      dtype='object', length=111)
(421097, 97)


In [40]:
y2015_cleaned.tail()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,emp_length,home_ownership,annual_inc,...,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
421092,13000.0,13000.0,13000.0,60 months,15.99,316.07,D,5 years,RENT,35000.0,...,0.0,3.0,100.0,50.0,1.0,0.0,51239.0,34178.0,10600.0,33239.0
421093,12000.0,12000.0,12000.0,60 months,19.99,317.86,E,1 year,RENT,64400.0,...,1.0,2.0,95.0,66.7,0.0,0.0,96919.0,58418.0,9700.0,69919.0
421094,20000.0,20000.0,20000.0,36 months,11.99,664.2,B,10+ years,RENT,100000.0,...,0.0,1.0,100.0,50.0,0.0,1.0,43740.0,33307.0,41700.0,0.0
421095,,,,,,,,,,,...,,,,,,,,,,
421096,,,,,,,,,,,...,,,,,,,,,,


In [41]:
# Remove two summary rows (that don't actually contain data) at the end
y2015_cleaned = y2015_cleaned[:-2]

# Drop columns with missing values
y2015_cleaned.dropna(axis=1, inplace=True)

print(y2015_cleaned.shape)

(421095, 65)


In [42]:
pd.get_dummies(y2015_cleaned)

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding,initial_list_status_f,initial_list_status_w,application_type_INDIVIDUAL,application_type_JOINT
0,16000.0,16000.0,16000.0,14.85,379.39,48000.0,33.18,0.0,0.0,11.0,...,False,False,False,False,False,False,False,True,True,False
1,9600.0,9600.0,9600.0,7.49,298.58,60000.0,22.44,0.0,0.0,7.0,...,False,False,False,False,False,False,False,True,True,False
2,25000.0,25000.0,25000.0,7.49,777.55,109000.0,26.02,0.0,1.0,9.0,...,False,False,False,False,False,False,False,True,True,False
3,28000.0,28000.0,28000.0,6.49,858.05,92000.0,21.60,0.0,0.0,16.0,...,False,False,False,False,False,False,False,True,True,False
4,8650.0,8650.0,8650.0,19.89,320.99,55000.0,25.49,0.0,4.0,18.0,...,False,False,False,False,False,False,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421090,10000.0,10000.0,10000.0,11.99,332.10,31000.0,28.69,0.0,0.0,9.0,...,False,False,False,False,False,False,True,False,True,False
421091,24000.0,24000.0,24000.0,11.99,797.03,79000.0,3.90,0.0,1.0,5.0,...,False,False,False,False,False,False,True,False,True,False
421092,13000.0,13000.0,13000.0,15.99,316.07,35000.0,30.90,0.0,0.0,9.0,...,False,False,False,False,False,False,False,True,True,False
421093,12000.0,12000.0,12000.0,19.99,317.86,64400.0,27.19,1.0,2.0,17.0,...,False,False,False,False,False,False,False,True,True,False


In [43]:
# split X and Y
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
import time
X = y2015_cleaned.drop('loan_status', axis=1)
Y = y2015_cleaned['loan_status']
X = pd.get_dummies(X)

In [44]:
Y.unique().tolist()

['Current',
 'Fully Paid',
 'Charged Off',
 'Late (31-120 days)',
 'In Grace Period',
 'Default',
 'Late (16-30 days)']

In [48]:
# Decision trees
# This is the model that you'll be using
from sklearn import tree

# A convenience for displaying visualizations
from IPython.display import Image

# Packages for rendering the tree
import pydotplus
import graphviz
# random farest
start = time.time()
# Initialize and train the tree
decision_tree = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_features=1,
    max_depth=4,
    random_state = 1337
)

print(f'{cross_val_score(decision_tree, X, Y, cv=10)}\n')

print(f'Cross-validation time elapsed: {round(time.time() - start, 2)} seconds.')

[0.89313702 0.89275706 0.89342199 0.89375445 0.89297079 0.89318198
 0.89318198 0.89232706 0.89332447 0.89220832]

Cross-validation time elapsed: 5.17 seconds.


In [49]:
# random farest
start = time.time()

rfc = ensemble.RandomForestClassifier(n_estimators=10, n_jobs=-1)

print(f'{cross_val_score(rfc, X, Y, cv=10)}\n')

print(f'Cross-validation time elapsed: {round(time.time() - start, 2)} seconds.')

[0.95528378 0.96646877 0.96238423 0.96079316 0.96060318 0.96060225
 0.96055475 0.96048351 0.960531   0.96026978]

Cross-validation time elapsed: 31.29 seconds.
