## Import Data

In [1]:
# Option A. Call the helper get_train_test_split().
#
# Imports "credit_data.ipynb", i.e. the processed train and test data
# and the corresponding target variables. Disabled; kept as real comments
# (not as a string literal) so that it is clearly not executed:
#
# import import_ipynb
# from ipynb.fs.full.credit_data import get_train_test_split
# loan_data_inputs_train, loan_data_inputs_test, loan_data_targets_train, loan_data_targets_test = \
#     get_train_test_split()

# Option B. Load the processed loan data sets from .csv files.

import numpy as np
import pandas as pd
import re
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# The first CSV column holds the row labels, so it is used as the index.
loan_data_inputs_train = pd.read_csv('loan_data_inputs_train.csv', index_col = 0)
loan_data_targets_train = pd.read_csv('loan_data_targets_train.csv', index_col = 0)
loan_data_inputs_test = pd.read_csv('loan_data_inputs_test.csv', index_col = 0)
loan_data_targets_test = pd.read_csv('loan_data_targets_test.csv', index_col = 0)

# Sanity check: every input row must have exactly one target row.
assert len(loan_data_inputs_train) == len(loan_data_targets_train), \
    "train inputs and train targets have different row counts"
assert len(loan_data_inputs_test) == len(loan_data_targets_test), \
    "test inputs and test targets have different row counts"

In [2]:
# Preview the first rows of the training inputs (displayed as the cell's last expression).
loan_data_inputs_train.head()

Unnamed: 0,Unnamed: 0.1,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,...,annual_inc:30K-40K,annual_inc:40K-50K,annual_inc:50K-60K,annual_inc:60K-70K,annual_inc:70K-80K,annual_inc:80K-90K,annual_inc:90K-100K,annual_inc:100K-120K,annual_inc:120K-140K,annual_inc:>140K
321284,321284,24846405,27279208,4075,"(3950.0, 4640.0]",4075.0,36 months,6.49,124.88,A,...,1,0,0,0,0,0,0,0,0,0
68126,68126,8574913,10337036,10075,"(9470.0, 10160.0]",10075.0,36 months,15.1,349.75,C,...,0,0,1,0,0,0,0,0,0,0
137748,137748,5144573,6466728,12000,"(11540.0, 12230.0]",12000.0,36 months,13.11,404.97,B,...,0,1,0,0,0,0,0,0,0,0
228145,228145,1112416,1342401,8000,"(7400.0, 8090.0]",8000.0,36 months,10.65,260.59,B,...,1,0,0,0,0,0,0,0,0,0
320995,320995,24726617,27169376,5900,"(5330.0, 6020.0]",5900.0,36 months,10.15,190.8,B,...,1,0,0,0,0,0,0,0,0,0


In [3]:
# Preview the first rows of the training targets.
loan_data_targets_train.head()

Unnamed: 0,good_bad
321284,1
68126,0
137748,1
228145,1
320995,0


In [4]:
# Print the (rows, columns) shape of each data set on its own line, in the
# order: train inputs, test inputs, train targets, test targets.
print(
    loan_data_inputs_train.shape,
    loan_data_inputs_test.shape,
    loan_data_targets_train.shape,
    loan_data_targets_test.shape,
    sep="\n",
)

(373028, 288)
(93257, 288)
(373028, 1)
(93257, 1)


In [5]:
# Show all column names of the training inputs as a NumPy array.
loan_data_inputs_train.columns.values
# The same check for the test inputs (disabled):
#loan_data_inputs_test.columns.values

array(['Unnamed: 0.1', 'id', 'member_id', 'loan_amnt', 'funded_amnt',
       'funded_amnt_inv', 'term', 'int_rate', 'installment', 'grade',
       'sub_grade', 'emp_title', 'emp_length', 'home_ownership',
       'annual_inc', 'verification_status', 'issue_d', 'loan_status',
       'pymnt_plan', 'url', 'desc', 'purpose', 'title', 'zip_code',
       'addr_state', 'dti', 'delinq_2yrs', 'earliest_cr_line',
       'inq_last_6mths', 'mths_since_last_delinq',
       'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal',
       'revol_util', 'total_acc', 'initial_list_status', 'out_prncp',
       'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv',
       'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee',
       'recoveries', 'collection_recovery_fee', 'last_pymnt_d',
       'last_pymnt_amnt', 'next_pymnt_d', 'last_credit_pull_d',
       'collections_12_mths_ex_med', 'mths_since_last_major_derog',
       'policy_code', 'application_type', 'annual_inc_joint', 'dti_joint',
       've

### [Video 41.] Loading Data & Selecting the Features (for the PD Model)

In [6]:
# Build a new dataframe that holds only the dummy variables considered for the
# PD model. The reference category of each variable is still included here.

# TECHNICAL NOTE: the labels are passed to .loc as a list, so each one has to
# exist as a column of loan_data_inputs_train. See
# https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike

feature_columns_with_ref_cat = [
    # grade
    'grade:A',
    'grade:B',
    'grade:C',
    'grade:D',
    'grade:E',
    'grade:F',
    'grade:G',
    # home_ownership
    'home_ownership:RENT_OTHER_NONE_ANY',
    'home_ownership:OWN',
    'home_ownership:MORTGAGE',
    # addr_state
    'addr_state:ND_NE_IA_NV_FL_HI_AL',
    'addr_state:NM_VA',
    'addr_state:NY',
    'addr_state:OK_TN_MO_LA_MD_NC',
    'addr_state:CA',
    'addr_state:UT_KY_AZ_NJ',
    'addr_state:AR_MI_PA_OH_MN',
    'addr_state:RI_MA_DE_SD_IN',
    'addr_state:GA_WA_OR',
    'addr_state:WI_MT',
    'addr_state:TX',
    'addr_state:IL_CT',
    'addr_state:KS_SC_CO_VT_AK_MS',
    'addr_state:WV_NH_WY_DC_ME_ID',
    # verification_status
    'verification_status:Not Verified',
    'verification_status:Source Verified',
    'verification_status:Verified',
    # purpose
    'purpose:educ__sm_b__wedd__ren_en__mov__house',
    'purpose:credit_card',
    'purpose:debt_consolidation',
    'purpose:oth__med__vacation',
    'purpose:major_purch__car__home_impr',
    # initial_list_status
    'initial_list_status:f',
    'initial_list_status:w',
    # term
    'term:36',
    'term:60',
    # emp_length
    'emp_length:0',
    'emp_length:1',
    'emp_length:2-4',
    'emp_length:5-6',
    'emp_length:7-9',
    'emp_length:10',
    # mths_since_issue_d
    'mths_since_issue_d:<38',
    'mths_since_issue_d:38-39',
    'mths_since_issue_d:40-41',
    'mths_since_issue_d:42-48',
    'mths_since_issue_d:49-52',
    'mths_since_issue_d:53-64',
    'mths_since_issue_d:65-84',
    'mths_since_issue_d:>84',
    # int_rate
    'int_rate:<9.548',
    'int_rate:9.548-12.025',
    'int_rate:12.025-15.74',
    'int_rate:15.74-20.281',
    'int_rate:>20.281',
    # mths_since_earliest_cr_line
    'mths_since_earliest_cr_line:<140',
    'mths_since_earliest_cr_line:141-164',
    'mths_since_earliest_cr_line:165-247',
    'mths_since_earliest_cr_line:248-270',
    'mths_since_earliest_cr_line:271-352',
    'mths_since_earliest_cr_line:>352',
    # delinq_2yrs
    'delinq_2yrs:0',
    'delinq_2yrs:1-3',
    'delinq_2yrs:>=4',
    # inq_last_6mths
    'inq_last_6mths:0',
    'inq_last_6mths:1-2',
    'inq_last_6mths:3-6',
    'inq_last_6mths:>6',
    # open_acc
    'open_acc:0',
    'open_acc:1-3',
    'open_acc:4-12',
    'open_acc:13-17',
    'open_acc:18-22',
    'open_acc:23-25',
    'open_acc:26-30',
    'open_acc:>=31',
    # pub_rec
    'pub_rec:0-2',
    'pub_rec:3-4',
    'pub_rec:>=5',
    # total_acc
    'total_acc:<=27',
    'total_acc:28-51',
    'total_acc:>=52',
    # annual_inc
    'annual_inc:<20K',
    'annual_inc:20K-30K',
    'annual_inc:30K-40K',
    'annual_inc:40K-50K',
    'annual_inc:50K-60K',
    'annual_inc:60K-70K',
    'annual_inc:70K-80K',
    'annual_inc:80K-90K',
    'annual_inc:90K-100K',
    'annual_inc:100K-120K',
    'annual_inc:120K-140K',
    'annual_inc:>140K',
]

inputs_train_with_ref_cat = loan_data_inputs_train.loc[:, feature_columns_with_ref_cat]

#'acc_now_delinq:0',
#'acc_now_delinq:>=1',          TO-DO: Consider factorizing 'acc_now_delinq'

#'total_rev_hi_lim:<=5K',
#'total_rev_hi_lim:5K-10K',
#'total_rev_hi_lim:10K-20K',
#'total_rev_hi_lim:20K-30K',   TO-DO: Consider factorizing 'total_rev_hi_lim'
#'total_rev_hi_lim:30K-40K',
#'total_rev_hi_lim:40K-55K',
#'total_rev_hi_lim:55K-95K',
#'total_rev_hi_lim:>95K',

#'dti:<=1.4',
#'dti:1.4-3.5',
#'dti:3.5-7.7',                TO-DO: Consider factorizing 'dti'
#'dti:7.7-10.5',
#'dti:10.5-16.1',
#'dti:16.1-20.3',
#'dti:20.3-21.7',
#'dti:21.7-22.4',
#'dti:22.4-35',
#'dti:>35',

#'mths_since_last_delinq:Missing',
#'mths_since_last_delinq:0-3',
#'mths_since_last_delinq:4-30',  TO-DO: Consider factorizing 'mths_since_last_delinq'
#'mths_since_last_delinq:31-56',
#'mths_since_last_delinq:>=57',
#'mths_since_last_record:Missing',

#'mths_since_last_record:0-2',
#'mths_since_last_record:3-20',  TO-DO: Consider factorizing 'mths_since_last_record'
#'mths_since_last_record:21-31',
#'mths_since_last_record:32-80',
#'mths_since_last_record:81-86',
#'mths_since_last_record:>=86'
#]]

In [7]:
# Preview the selected dummy variables (reference categories still included).
inputs_train_with_ref_cat.head()

Unnamed: 0,grade:A,grade:B,grade:C,grade:D,grade:E,grade:F,grade:G,home_ownership:RENT_OTHER_NONE_ANY,home_ownership:OWN,home_ownership:MORTGAGE,...,annual_inc:30K-40K,annual_inc:40K-50K,annual_inc:50K-60K,annual_inc:60K-70K,annual_inc:70K-80K,annual_inc:80K-90K,annual_inc:90K-100K,annual_inc:100K-120K,annual_inc:120K-140K,annual_inc:>140K
321284,1,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
68126,0,0,1,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
137748,0,1,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
228145,0,1,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
320995,0,1,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0


In [8]:
# Names of the dummy variables that act as the reference (benchmark) category:
# one entry for each original variable kept in the feature selection.
ref_categories = [
    'grade:G',
    'home_ownership:RENT_OTHER_NONE_ANY',
    'addr_state:ND_NE_IA_NV_FL_HI_AL',
    'verification_status:Verified',
    'purpose:educ__sm_b__wedd__ren_en__mov__house',
    'initial_list_status:f',
    'term:60',
    'emp_length:0',
    'mths_since_issue_d:>84',
    'int_rate:>20.281',
    'mths_since_earliest_cr_line:<140',
    'delinq_2yrs:>=4',
    'inq_last_6mths:>6',
    'open_acc:0',
    'pub_rec:0-2',
    'total_acc:<=27',
    'annual_inc:<20K',
]

# The variables below were deliberately left out of the
# feature selection two cells earlier, so adding their
# reference categories to this list would make the later
# drop() call fail (those columns do not exist).

#'acc_now_delinq:0',
#'total_rev_hi_lim:<=5K',
#'dti:>35',
#'mths_since_last_delinq:0-3',
#'mths_since_last_record:0-2']

In [9]:
# Drop the columns named in ref_categories from the selected input variables;
# the result is a new dataframe and inputs_train_with_ref_cat is left as is.
inputs_train = inputs_train_with_ref_cat.drop(columns=ref_categories)
inputs_train.head()

Unnamed: 0,grade:A,grade:B,grade:C,grade:D,grade:E,grade:F,home_ownership:OWN,home_ownership:MORTGAGE,addr_state:NM_VA,addr_state:NY,...,annual_inc:30K-40K,annual_inc:40K-50K,annual_inc:50K-60K,annual_inc:60K-70K,annual_inc:70K-80K,annual_inc:80K-90K,annual_inc:90K-100K,annual_inc:100K-120K,annual_inc:120K-140K,annual_inc:>140K
321284,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
68126,0,0,1,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
137748,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
228145,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
320995,0,1,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0


In [10]:
# Save a copy of inputs_train (the 77 columns left after dropping the
# reference categories) to CSV for future use.
inputs_train.to_csv("inputs_train_lower_dimension.csv")

## (Automatic) Feature Selection

In [11]:
""" import numpy as np
import pandas as pd
import re
import plotly.graph_objects as go
from plotly.subplots import make_subplots """

# Reload the reduced training inputs from the CSV written in the previous cell;
# the first CSV column is used as the index.
inputs_train = pd.read_csv('inputs_train_lower_dimension.csv', index_col = 0)
# Preview the first rows.
inputs_train.head()
 

Unnamed: 0,grade:A,grade:B,grade:C,grade:D,grade:E,grade:F,home_ownership:OWN,home_ownership:MORTGAGE,addr_state:NM_VA,addr_state:NY,...,annual_inc:30K-40K,annual_inc:40K-50K,annual_inc:50K-60K,annual_inc:60K-70K,annual_inc:70K-80K,annual_inc:80K-90K,annual_inc:90K-100K,annual_inc:100K-120K,annual_inc:120K-140K,annual_inc:>140K
321284,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
68126,0,0,1,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
137748,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
228145,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
320995,0,1,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0


In [12]:
from sklearn.feature_selection import SelectKBest, chi2

# Univariate (filter) feature selection: score every dummy variable against the
# target with the chi-squared statistic and keep the k best-scoring ones.
#
# NOTE: chi2 is a supervised score function, so the target has to be passed to
# fit(). Per the scikit-learn docs, y may be omitted only when the selector's
# score function is unsupervised.
# See https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html
# and https://medium.com/@amerjabar0000/feature-selection-for-dummies-a-simple-introduction-9cf8a9ad737a

# Courtesy: https://www.datatechnotes.com/2021/02/seleckbest-feature-selection-example-in-python.html

selection = SelectKBest(chi2, k=50)   # << --- Default k is 10
# fit() is sufficient: the transformed array is not needed, because the selected
# columns are taken from the dataframe below so that they keep their names.
selection.fit(inputs_train, loan_data_targets_train)

# Boolean mask over the columns of inputs_train; True marks a selected feature.
# (Not named `filter`, which would shadow the Python builtin.)
selected_mask = selection.get_support()
names_of_selected_features = list(inputs_train.columns.values[selected_mask])
print("Selected features: ", names_of_selected_features)

# Keep only the selected columns, as a dataframe with the original labels.
input_train_features_selected = inputs_train.loc[:, names_of_selected_features]
print("Shape of training data after f.selection: ", input_train_features_selected.shape, "\n")
input_train_features_selected.head()

#input_train_features_select_filter = SelectKBest(chi2).fit_transform(inputs_train, loan_data_targets_train)

Selected features:  ['grade:A', 'grade:B', 'grade:C', 'grade:D', 'grade:E', 'grade:F', 'home_ownership:MORTGAGE', 'addr_state:TX', 'addr_state:IL_CT', 'addr_state:KS_SC_CO_VT_AK_MS', 'addr_state:WV_NH_WY_DC_ME_ID', 'verification_status:Not Verified', 'verification_status:Source Verified', 'purpose:credit_card', 'purpose:debt_consolidation', 'purpose:oth__med__vacation', 'purpose:major_purch__car__home_impr', 'initial_list_status:w', 'term:36', 'emp_length:5-6', 'emp_length:10', 'mths_since_issue_d:<38', 'mths_since_issue_d:38-39', 'mths_since_issue_d:40-41', 'mths_since_issue_d:42-48', 'mths_since_issue_d:53-64', 'mths_since_issue_d:65-84', 'int_rate:<9.548', 'int_rate:9.548-12.025', 'int_rate:12.025-15.74', 'int_rate:15.74-20.281', 'mths_since_earliest_cr_line:141-164', 'mths_since_earliest_cr_line:165-247', 'mths_since_earliest_cr_line:271-352', 'mths_since_earliest_cr_line:>352', 'inq_last_6mths:0', 'inq_last_6mths:1-2', 'inq_last_6mths:3-6', 'open_acc:1-3', 'total_acc:28-51', 'annu

Unnamed: 0,grade:A,grade:B,grade:C,grade:D,grade:E,grade:F,home_ownership:MORTGAGE,addr_state:TX,addr_state:IL_CT,addr_state:KS_SC_CO_VT_AK_MS,...,annual_inc:20K-30K,annual_inc:30K-40K,annual_inc:40K-50K,annual_inc:50K-60K,annual_inc:70K-80K,annual_inc:80K-90K,annual_inc:90K-100K,annual_inc:100K-120K,annual_inc:120K-140K,annual_inc:>140K
321284,1,0,0,0,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
68126,0,0,1,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
137748,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
228145,0,1,0,0,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
320995,0,1,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [35]:
#   HOWEVER, WE COULD ALSO DIRECTLY INTEGRATE THE FEATURE SELECTION INTO A
#   M.L. CLASSIFICATION PIPELINE, STILL WITH SKLEARN.

from sklearn.base import clone
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Print tables in full. This is a global pandas option, so it also affects
# every cell executed after this one.
pd.options.display.max_rows = None

reg = LogisticRegression()
# Put an unfitted copy of the selector into the pipeline. Passing `selection`
# itself would refit it on the data below and silently overwrite the state it
# got when it was fitted on inputs_train.
ml_pipe = make_pipeline(clone(selection), reg)
# The targets are a one-column dataframe; ravel() hands the estimator the 1-D
# array it expects and avoids scikit-learn's column-vector warning.
ml_pipe.fit(input_train_features_selected, loan_data_targets_train.values.ravel())
#   Hence, ml_pipe {SelectKBest > LogisticRegression}
#
#   NOTE(review): the pipeline is fitted on the frame that already holds only
#   the selected columns, so its SelectKBest step (same k) keeps all of them.
#   Fitting on inputs_train would let the pipeline do the selection itself;
#   left as is so the pipeline keeps accepting the same input columns.

#   Let's see the relevant fitted model stats...

#ml_pipe[-1].intercept_
#ml_pipe[-1].coef_

# Label the coefficients with exactly the columns the pipeline's selector kept,
# so that names and coefficients cannot get out of step.
kept_mask = ml_pipe[0].get_support()
names = ["Intercept"] + list(input_train_features_selected.columns.values[kept_mask])
# coef_ is a single row here (one row of weights for the binary target).
coeffs = ml_pipe[-1].coef_.ravel().tolist()
intercept = list(ml_pipe[-1].intercept_)
coefficients = intercept + coeffs

summary_table = pd.DataFrame(data={'Feature name': names, 'Coefficients': coefficients})
print(summary_table)

  return f(**kwargs)


                           Feature name  Coefficients
0                             Intercept     -0.474730
1                               grade:A      1.172401
2                               grade:B      1.021787
3                               grade:C      0.794241
4                               grade:D      0.590267
5                               grade:E      0.354712
6                               grade:F      0.163350
7               home_ownership:MORTGAGE      0.095061
8                         addr_state:TX      0.154320
9                      addr_state:IL_CT      0.202951
10         addr_state:KS_SC_CO_VT_AK_MS      0.226162
11         addr_state:WV_NH_WY_DC_ME_ID      0.428616
12     verification_status:Not Verified      0.093380
13  verification_status:Source Verified      0.012565
14                  purpose:credit_card      0.311402
15           purpose:debt_consolidation      0.193708
16           purpose:oth__med__vacation      0.228254
17  purpose:major_purch__car