In [1]:
import pandas as pd
import string
import re
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder 
from sklearn.metrics import accuracy_score
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
loans = pd.read_csv('lending-club-data.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
loans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122607 entries, 0 to 122606
Data columns (total 68 columns):
id                             122607 non-null int64
member_id                      122607 non-null int64
loan_amnt                      122607 non-null int64
funded_amnt                    122607 non-null int64
funded_amnt_inv                122607 non-null int64
term                           122607 non-null object
int_rate                       122607 non-null float64
installment                    122607 non-null float64
grade                          122607 non-null object
sub_grade                      122607 non-null object
emp_title                      115767 non-null object
emp_length                     118516 non-null object
home_ownership                 122607 non-null object
annual_inc                     122603 non-null float64
is_inc_v                       122607 non-null object
issue_d                        122607 non-null object
loan_status                

In [4]:
loans.columns.values

array(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'grade', 'sub_grade',
       'emp_title', 'emp_length', 'home_ownership', 'annual_inc',
       'is_inc_v', 'issue_d', 'loan_status', 'pymnt_plan', 'url', 'desc',
       'purpose', 'title', 'zip_code', 'addr_state', 'dti', 'delinq_2yrs',
       'earliest_cr_line', 'inq_last_6mths', 'mths_since_last_delinq',
       'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal',
       'revol_util', 'total_acc', 'initial_list_status', 'out_prncp',
       'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv',
       'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee',
       'recoveries', 'collection_recovery_fee', 'last_pymnt_d',
       'last_pymnt_amnt', 'next_pymnt_d', 'last_credit_pull_d',
       'collections_12_mths_ex_med', 'mths_since_last_major_derog',
       'policy_code', 'not_compliant', 'status', 'inactive_loans',
       'bad_loans', 'emp_length_num', 'grade_num', '

In [5]:
loans['safe_loans'] = loans['bad_loans'].apply(lambda x: 1 if x==0 else -1)

In [6]:
loans.drop('bad_loans', axis=1,inplace=True)

In [7]:
loans.columns.values

array(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'grade', 'sub_grade',
       'emp_title', 'emp_length', 'home_ownership', 'annual_inc',
       'is_inc_v', 'issue_d', 'loan_status', 'pymnt_plan', 'url', 'desc',
       'purpose', 'title', 'zip_code', 'addr_state', 'dti', 'delinq_2yrs',
       'earliest_cr_line', 'inq_last_6mths', 'mths_since_last_delinq',
       'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal',
       'revol_util', 'total_acc', 'initial_list_status', 'out_prncp',
       'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv',
       'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee',
       'recoveries', 'collection_recovery_fee', 'last_pymnt_d',
       'last_pymnt_amnt', 'next_pymnt_d', 'last_credit_pull_d',
       'collections_12_mths_ex_med', 'mths_since_last_major_derog',
       'policy_code', 'not_compliant', 'status', 'inactive_loans',
       'emp_length_num', 'grade_num', 'sub_grade_num

In [8]:
percentage_safe = loans.safe_loans[loans.safe_loans==1].count()/loans.safe_loans.count()
percentage_safe

0.8111853319957262

In [9]:
percentage_risky = loans.safe_loans[loans.safe_loans==-1].count()/loans.safe_loans.count()
percentage_risky

0.18881466800427382

In [10]:
features = ['grade',                     # grade of the loan
            'sub_grade',                 # sub-grade of the loan
            'short_emp',                 # one year or less of employment
            'emp_length_num',            # number of years of employment
            'home_ownership',            # home_ownership status: own, mortgage or rent
            'dti',                       # debt to income ratio
            'purpose',                   # the purpose of the loan
            'term',                      # the term of the loan
            'last_delinq_none',          # has borrower had a delinquincy
            'last_major_derog_none',     # has borrower had 90 day or worse rating
            'revol_util',                # percent of available credit being used
            'total_rec_late_fee',        # total late fees received to day
           ]

target = 'safe_loans'                    # prediction target (y) (+1 means safe, -1 is risky)

# Extract the feature columns and target column
loans = loans[features + [target]]

In [11]:
subset_percentage_safe = loans.safe_loans[loans.safe_loans==1].count()/loans.safe_loans.count()
subset_percentage_safe

0.8111853319957262

In [12]:
train_idx = pd.read_json('module-5-assignment-1-train-idx.json')
validation_idx = pd.read_json('module-5-assignment-1-validation-idx.json')

In [13]:
train_idx[0]

0             1
1             6
2             7
3            10
4            12
          ...  
37219    122572
37220    122575
37221    122588
37222    122599
37223    122603
Name: 0, Length: 37224, dtype: int64

In [14]:
train_data = loans.iloc[train_idx[0].to_numpy()]
validation_data = loans.iloc[validation_idx[0].to_numpy()]

In [15]:
#train_data = train_data[features + [target]]
#validation_data = validation_data[features + [target]]

In [16]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37224 entries, 1 to 122603
Data columns (total 13 columns):
grade                    37224 non-null object
sub_grade                37224 non-null object
short_emp                37224 non-null int64
emp_length_num           37224 non-null int64
home_ownership           37224 non-null object
dti                      37224 non-null float64
purpose                  37224 non-null object
term                     37224 non-null object
last_delinq_none         37224 non-null int64
last_major_derog_none    37224 non-null int64
revol_util               37224 non-null float64
total_rec_late_fee       37224 non-null float64
safe_loans               37224 non-null int64
dtypes: float64(3), int64(5), object(5)
memory usage: 4.0+ MB


In [17]:
validation_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9284 entries, 24 to 122480
Data columns (total 13 columns):
grade                    9284 non-null object
sub_grade                9284 non-null object
short_emp                9284 non-null int64
emp_length_num           9284 non-null int64
home_ownership           9284 non-null object
dti                      9284 non-null float64
purpose                  9284 non-null object
term                     9284 non-null object
last_delinq_none         9284 non-null int64
last_major_derog_none    9284 non-null int64
revol_util               9284 non-null float64
total_rec_late_fee       9284 non-null float64
safe_loans               9284 non-null int64
dtypes: float64(3), int64(5), object(5)
memory usage: 1015.4+ KB


In [18]:
train_data.sample(5)

Unnamed: 0,grade,sub_grade,short_emp,emp_length_num,home_ownership,dti,purpose,term,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,safe_loans
111488,C,C1,0,7,RENT,25.3,debt_consolidation,36 months,0,0,59.8,0.0,-1
71000,B,B4,0,5,RENT,17.75,debt_consolidation,36 months,1,1,55.7,0.0,1
6819,A,A3,0,5,OWN,7.45,small_business,36 months,1,1,1.6,0.0,1
3920,D,D2,1,0,RENT,11.12,debt_consolidation,36 months,0,1,77.9,0.0,-1
16733,E,E3,0,3,RENT,7.62,debt_consolidation,60 months,0,1,0.0,0.0,-1


In [19]:
safe_loans = train_data[train_data[target] == +1]
risky_loans = train_data[train_data[target] == -1]
print ("Number of safe loans  : %s" % len(safe_loans))
print ("Number of risky loans : %s" % len(risky_loans))

Number of safe loans  : 18748
Number of risky loans : 18476


In [20]:
print ("Percentage of safe loans                 :", len(safe_loans) / float(len(train_data)))
print ("Percentage of risky loans                :", len(risky_loans) / float(len(train_data)))
print ("Total number of loans in our new dataset :", len(train_data))

Percentage of safe loans                 : 0.5036535568450462
Percentage of risky loans                : 0.4963464431549538
Total number of loans in our new dataset : 37224


In [21]:
train_data_objects = train_data.select_dtypes(['object'])

In [22]:
object_columns_train = train_data_objects.columns.values

In [23]:
train_data = pd.get_dummies(train_data, columns=object_columns_train, drop_first=True) 

In [24]:
validation_data_objects = validation_data.select_dtypes(['object'])
object_columns_validation = validation_data_objects.columns.values
validation_data = pd.get_dummies(validation_data, columns=object_columns_validation, drop_first=True) 

In [25]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37224 entries, 1 to 122603
Data columns (total 63 columns):
short_emp                     37224 non-null int64
emp_length_num                37224 non-null int64
dti                           37224 non-null float64
last_delinq_none              37224 non-null int64
last_major_derog_none         37224 non-null int64
revol_util                    37224 non-null float64
total_rec_late_fee            37224 non-null float64
safe_loans                    37224 non-null int64
grade_B                       37224 non-null uint8
grade_C                       37224 non-null uint8
grade_D                       37224 non-null uint8
grade_E                       37224 non-null uint8
grade_F                       37224 non-null uint8
grade_G                       37224 non-null uint8
sub_grade_A2                  37224 non-null uint8
sub_grade_A3                  37224 non-null uint8
sub_grade_A4                  37224 non-null uint8
sub_grade_A5     

In [26]:
validation_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9284 entries, 24 to 122480
Data columns (total 63 columns):
short_emp                     9284 non-null int64
emp_length_num                9284 non-null int64
dti                           9284 non-null float64
last_delinq_none              9284 non-null int64
last_major_derog_none         9284 non-null int64
revol_util                    9284 non-null float64
total_rec_late_fee            9284 non-null float64
safe_loans                    9284 non-null int64
grade_B                       9284 non-null uint8
grade_C                       9284 non-null uint8
grade_D                       9284 non-null uint8
grade_E                       9284 non-null uint8
grade_F                       9284 non-null uint8
grade_G                       9284 non-null uint8
sub_grade_A2                  9284 non-null uint8
sub_grade_A3                  9284 non-null uint8
sub_grade_A4                  9284 non-null uint8
sub_grade_A5                  9284

In [27]:
y_train = train_data['safe_loans']
X_train = train_data.drop('safe_loans', axis=1)

In [28]:
decison_tree_model = DecisionTreeClassifier()
decison_tree_model.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [29]:
small_model = DecisionTreeClassifier(max_depth=2)
small_model.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [30]:
validation_safe_loans = validation_data[validation_data[target] == 1]
validation_risky_loans = validation_data[validation_data[target] == -1]

sample_validation_data_risky = validation_risky_loans[0:2]
sample_validation_data_safe = validation_safe_loans[0:2]

sample_validation_data = sample_validation_data_safe.append(sample_validation_data_risky)
sample_validation_data


Unnamed: 0,short_emp,emp_length_num,dti,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,safe_loans,grade_B,grade_C,...,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_small_business,purpose_vacation,purpose_wedding,term_ 60 months
19,0,11,11.18,1,1,82.4,0.0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
79,0,10,16.85,1,1,96.4,0.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
24,0,3,13.97,0,1,59.5,0.0,-1,0,0,...,0,0,0,0,0,1,0,0,0,1
41,0,11,16.33,1,1,62.1,0.0,-1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
y_sample_validation_data = sample_validation_data.safe_loans
x_sample_validation_data = sample_validation_data.drop('safe_loans', axis=1)

In [32]:
decison_tree_model.predict(x_sample_validation_data)

array([ 1, -1,  1,  1], dtype=int64)

In [33]:
(sample_validation_data['safe_loans'] == decison_tree_model.predict(x_sample_validation_data)).sum()/float(len(sample_validation_data))

0.25

In [34]:
decison_tree_model.predict_proba(x_sample_validation_data)

array([[0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.]])

In [35]:
small_model.predict(x_sample_validation_data)

array([ 1,  1, -1,  1], dtype=int64)

In [36]:
small_model.predict_proba(x_sample_validation_data)

array([[0.42788678, 0.57211322],
       [0.42788678, 0.57211322],
       [0.6356058 , 0.3643942 ],
       [0.42788678, 0.57211322]])

In [37]:
y_validation = validation_data['safe_loans']
X_validation = validation_data.drop('safe_loans', axis=1)

In [38]:
decision_predict = decison_tree_model.predict(X_validation)

In [39]:
(decision_predict==y_validation).sum()/len(y_validation)

0.5546100818612667

In [40]:
from sklearn.metrics import accuracy_score
decision_accuracy = accuracy_score(decision_predict, y_validation)

In [41]:
decision_accuracy

0.5546100818612667

In [42]:
decision_predict_train = decison_tree_model.predict(X_train)
decision_accuracy_train = accuracy_score(decision_predict_train, y_train)
decision_accuracy_train

1.0

In [43]:
small_predict = small_model.predict(X_validation)
small_accuracy = accuracy_score(small_predict, y_validation)
small_accuracy

0.5996337785437311

In [44]:
small_predict_train = small_model.predict(X_train)
small_accuracy_train = accuracy_score(small_predict_train, y_train)
small_accuracy_train

0.5998280679131743

In [45]:
decison_tree_model.tree_.max_depth

44

In [46]:
big_model = DecisionTreeClassifier(max_depth=10)
big_model.fit(X_train,y_train)
big_predict = big_model.predict(X_validation)
big_accuracy = accuracy_score(big_predict, y_validation)
big_accuracy

0.6287160706591987

In [47]:
big_predict_train = big_model.predict(X_train)
big_accuracy_train = accuracy_score(big_predict_train, y_train)
big_accuracy_train

0.6656995486782721

In [48]:
len(big_predict)

9284

In [49]:
from sklearn.metrics import confusion_matrix
tn, fp, fn, tp = confusion_matrix(big_predict, y_validation).ravel()

In [50]:
print(fp)
print(fn)

1737
1710


In [51]:
(fp * 20000) + (fn * 10000)

51840000