In [1]:
import pandas as pd
import string
import re
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder 
from sklearn.metrics import accuracy_score
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
loans = pd.read_csv('lending-club-data.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
loans['safe_loans'] = loans['bad_loans'].apply(lambda x : +1 if x==0 else -1)
loans.drop('bad_loans', axis=1,inplace=True)

In [4]:
target = 'safe_loans'
features = ['grade',                     # grade of the loan (categorical)
            'sub_grade_num',             # sub-grade of the loan as a number from 0 to 1
            'short_emp',                 # one year or less of employment
            'emp_length_num',            # number of years of employment
            'home_ownership',            # home_ownership status: own, mortgage or rent
            'dti',                       # debt to income ratio
            'purpose',                   # the purpose of the loan
            'payment_inc_ratio',         # ratio of the monthly payment to income
            'delinq_2yrs',               # number of delinquincies
             'delinq_2yrs_zero',          # no delinquincies in last 2 years
            'inq_last_6mths',            # number of creditor inquiries in last 6 months
            'last_delinq_none',          # has borrower had a delinquincy
            'last_major_derog_none',     # has borrower had 90 day or worse rating
            'open_acc',                  # number of open credit accounts
            'pub_rec',                   # number of derogatory public records
            'pub_rec_zero',              # no derogatory public records
            'revol_util',                # percent of available credit being used
            'total_rec_late_fee',        # total late fees received to day
            'int_rate',                  # interest rate of the loan
            'total_rec_int',             # interest received to date
            'annual_inc',                # annual income of borrower
            'funded_amnt',               # amount committed to the loan
            'funded_amnt_inv',           # amount committed by investors for the loan
            'installment',               # monthly payment owed by the borrower
           ]


In [5]:
num_rows_with_na = loans.shape[0]
loans = loans[[target] + features].dropna()

# Count the number of rows with missing data

num_rows = loans.shape[0]
print ('Dropping %s observations; keeping %s ' % (num_rows_with_na, num_rows))

Dropping 122607 observations; keeping 122578 


In [6]:
train_idx = pd.read_json('module-8-assignment-1-train-idx.json')
validation_idx = pd.read_json('module-8-assignment-1-validation-idx.json')
train_data = loans.iloc[train_idx[0].to_numpy()]
validation_data = loans.iloc[validation_idx[0].to_numpy()]

In [7]:
train_data_objects = train_data.select_dtypes(['object'])
object_columns_train = train_data_objects.columns.values
train_data = pd.get_dummies(train_data, columns=object_columns_train, drop_first=False)
features_train = train_data.columns.values
features_train = np.delete(features_train, 0)
features_train

array(['sub_grade_num', 'short_emp', 'emp_length_num', 'dti',
       'payment_inc_ratio', 'delinq_2yrs', 'delinq_2yrs_zero',
       'inq_last_6mths', 'last_delinq_none', 'last_major_derog_none',
       'open_acc', 'pub_rec', 'pub_rec_zero', 'revol_util',
       'total_rec_late_fee', 'int_rate', 'total_rec_int', 'annual_inc',
       'funded_amnt', 'funded_amnt_inv', 'installment', 'grade_A',
       'grade_B', 'grade_C', 'grade_D', 'grade_E', 'grade_F', 'grade_G',
       'home_ownership_MORTGAGE', 'home_ownership_OTHER',
       'home_ownership_OWN', 'home_ownership_RENT', 'purpose_car',
       'purpose_credit_card', 'purpose_debt_consolidation',
       'purpose_home_improvement', 'purpose_house',
       'purpose_major_purchase', 'purpose_medical', 'purpose_moving',
       'purpose_other', 'purpose_small_business', 'purpose_vacation',
       'purpose_wedding'], dtype=object)

In [8]:
validation_data_objects = validation_data.select_dtypes(['object'])
object_columns_validation = validation_data_objects.columns.values
validation_data = pd.get_dummies(validation_data, columns=object_columns_validation, drop_first=False) 
features_validation = validation_data.columns.values
features_validation = np.delete(features_validation, 0)
features_validation

array(['sub_grade_num', 'short_emp', 'emp_length_num', 'dti',
       'payment_inc_ratio', 'delinq_2yrs', 'delinq_2yrs_zero',
       'inq_last_6mths', 'last_delinq_none', 'last_major_derog_none',
       'open_acc', 'pub_rec', 'pub_rec_zero', 'revol_util',
       'total_rec_late_fee', 'int_rate', 'total_rec_int', 'annual_inc',
       'funded_amnt', 'funded_amnt_inv', 'installment', 'grade_A',
       'grade_B', 'grade_C', 'grade_D', 'grade_E', 'grade_F', 'grade_G',
       'home_ownership_MORTGAGE', 'home_ownership_OTHER',
       'home_ownership_OWN', 'home_ownership_RENT', 'purpose_car',
       'purpose_credit_card', 'purpose_debt_consolidation',
       'purpose_home_improvement', 'purpose_house',
       'purpose_major_purchase', 'purpose_medical', 'purpose_moving',
       'purpose_other', 'purpose_small_business', 'purpose_vacation',
       'purpose_wedding'], dtype=object)

In [9]:
validation_data.shape

(9284, 45)

In [10]:
validation_safe_loans = validation_data[validation_data[target] == 1]
validation_risky_loans = validation_data[validation_data[target] == -1]

sample_validation_data_risky = validation_risky_loans[0:2]
sample_validation_data_safe = validation_safe_loans[0:2]

sample_validation_data = sample_validation_data_safe.append(sample_validation_data_risky)
sample_validation_data

Unnamed: 0,safe_loans,sub_grade_num,short_emp,emp_length_num,dti,payment_inc_ratio,delinq_2yrs,delinq_2yrs_zero,inq_last_6mths,last_delinq_none,...,purpose_debt_consolidation,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_small_business,purpose_vacation,purpose_wedding
22,1,0.2,0,3,29.44,6.30496,0.0,1.0,0.0,1,...,0,0,0,0,0,0,0,0,0,0
26,1,0.6,1,1,12.19,13.4952,0.0,1.0,0.0,1,...,0,0,0,0,0,0,0,0,0,0
24,-1,0.4,0,3,13.97,2.96736,3.0,0.0,0.0,0,...,0,0,0,0,0,0,1,0,0,0
41,-1,1.0,0,11,16.33,1.90524,0.0,1.0,0.0,1,...,1,0,0,0,0,0,0,0,0,0


In [11]:
X_train=train_data[features_train]
y_train=train_data[target]
model_5 = GradientBoostingClassifier(max_depth=6 ,n_estimators=5)
model_5.fit(X_train,y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=6,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=5,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [12]:
model_5.predict(sample_validation_data[features_validation])

array([ 1,  1, -1,  1], dtype=int64)

In [13]:
model_5.predict_proba(sample_validation_data[features_validation])

array([[0.41642331, 0.58357669],
       [0.46949689, 0.53050311],
       [0.53807792, 0.46192208],
       [0.39591639, 0.60408361]])

In [14]:
model_5.score(validation_data[features_validation],validation_data[target])

0.6614605773373546

In [15]:
y_true = validation_data[target]
y_pred = model_5.predict(validation_data[features_validation])
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
fp

1652

In [16]:
fn

1491

In [17]:
cost = 10000 * fn  + 20000 * fp
cost

47950000

In [18]:
validation_data.shape

(9284, 45)

In [19]:
probability_predictions = model_5.predict_proba(validation_data[features_validation])
probability_predictions.shape

(9284, 2)

In [20]:
validation_data['prediction_0'] = probability_predictions[:,0]
validation_data['prediction_1'] = probability_predictions[:,1]

In [21]:
validation_data['prediction'] = pd.DataFrame(probability_predictions).apply(lambda x: x[0] if x[0]> x[1] else x[1], axis=1).values
    

In [34]:
validation_data[validation_data['safe_loans']==1].sort_values(by='prediction', ascending=False)

Unnamed: 0,safe_loans,sub_grade_num,short_emp,emp_length_num,dti,payment_inc_ratio,delinq_2yrs,delinq_2yrs_zero,inq_last_6mths,last_delinq_none,...,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_small_business,purpose_vacation,purpose_wedding,prediction_0,prediction_1,prediction
16985,1,0.6,0,2,15.15,15.28930,0.0,1.0,1.0,0,...,0,0,0,0,0,0,0,0.667450,0.332550,0.667450
109053,1,1.0,0,2,23.60,3.73994,0.0,1.0,3.0,1,...,0,0,0,0,0,0,0,0.663762,0.336238,0.663762
30367,1,0.4,0,5,15.93,13.75400,0.0,1.0,0.0,1,...,0,0,0,0,0,0,0,0.663197,0.336803,0.663197
93333,1,0.2,0,3,3.74,1.55419,1.0,0.0,1.0,0,...,0,0,0,0,0,0,0,0.662409,0.337591,0.662409
739,1,0.4,0,3,11.09,6.86200,0.0,1.0,1.0,1,...,0,0,0,0,1,0,0,0.662409,0.337591,0.662409
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66523,1,0.6,0,11,23.84,6.69304,1.0,0.0,0.0,0,...,0,0,0,0,0,0,0,0.500002,0.499998,0.500002
118538,1,0.2,0,4,18.36,5.18855,0.0,1.0,0.0,1,...,0,0,0,1,0,0,0,0.500002,0.499998,0.500002
39007,1,0.4,0,4,22.51,6.80014,0.0,1.0,3.0,0,...,0,0,0,0,0,0,0,0.500002,0.499998,0.500002
74697,1,0.6,0,6,26.04,7.60308,0.0,1.0,2.0,1,...,0,0,0,0,0,0,0,0.500002,0.499998,0.500002


In [None]:
model_10 = GradientBoostingClassifier(max_depth=10 ,n_estimators=5)
model_10.fit(train_data[features_train],train_data[target])
model_10.score(validation_data[features_validation],validation_data[target])

In [None]:
model_50 = GradientBoostingClassifier(max_depth=50 ,n_estimators=5)
model_50.fit(train_data[features_train],train_data[target])
model_50.score(validation_data[features_validation],validation_data[target])