## Import Data

In [7]:
import numpy as np
import pandas as pd

inputs_train = pd.read_csv('inputs_train_lower_dimension.csv', index_col = 0)
loan_data_targets_train = pd.read_csv('loan_data_targets_train.csv', index_col = 0)
inputs_train.head() 

Unnamed: 0,grade:A,grade:B,grade:C,grade:D,grade:E,grade:F,home_ownership:OWN,home_ownership:MORTGAGE,addr_state:NM_VA,addr_state:NY,...,annual_inc:30K-40K,annual_inc:40K-50K,annual_inc:50K-60K,annual_inc:60K-70K,annual_inc:70K-80K,annual_inc:80K-90K,annual_inc:90K-100K,annual_inc:100K-120K,annual_inc:120K-140K,annual_inc:>140K
321284,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
68126,0,0,1,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
137748,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
228145,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
320995,0,1,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0


## Build a Logistic Regression Model with P-Values

### Beyond auto feature selection, one thing we'd love to do is be able to decide what features are best retained from the chi2 or anova, etc. For example, based on computing p-values. That is, any var with p-value < 0.05 should be on our choice list, i.e., it's statistically significant.

### Since we have also done fine-classing, where one or more of the finer categories that make up the original independent var are significant, then we'd retain all of the peer categories that make up the original independent variable. To do this, we have to compute p-values. Hence, we override the sklearn fit function as follows.

In [8]:
#   P values for sklearn logistic regression.

#   Class to display p-values for logistic regression in sklearn.

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, chi2
import scipy.stats as stat

class LogisticRegression_with_p_values:

    """
    This class overrides the linera_model.LinearRegression class
    function, fit(). The custom fit() fuction is then used to 
    return alongside other stats, p-values.

    """
    def __init__(self,*args,**kwargs):#,**kwargs):
        self.model = LogisticRegression(*args,**kwargs)#,**args)

    def fit(self,X,y):
        
        self.model.fit(X,y)
        
        #### Get p-values for the fitted model ####
        denom = (2.0 * (1.0 + np.cosh(self.model.decision_function(X))))
        denom = np.tile(denom,(X.shape[1],1)).T
        F_ij = np.dot((X / denom).T,X) ## Fisher Information Matrix
        Cramer_Rao = np.linalg.inv(F_ij) ## Inverse Information Matrix
        sigma_estimates = np.sqrt(np.diagonal(Cramer_Rao))
        z_scores = self.model.coef_[0] / sigma_estimates # z-score for eaach model coefficient
        p_values = [stat.norm.sf(abs(x)) * 2 for x in z_scores] ### two tailed test for p-values
        
        self.coef_ = self.model.coef_
        self.intercept_ = self.model.intercept_
        self.p_values = p_values

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

pd.options.display.max_rows = None

#   Integrate the class into a ML pipeline

reg = LogisticRegression_with_p_values()
selection = SelectKBest(chi2, k=inputs_train.shape[1])
ml_pipe = make_pipeline(selection, reg)

#   Hence, ml_pipe {SelectKBest > LogisticRegression_with_p_values}
ml_pipe.fit(inputs_train, loan_data_targets_train)

#   But glue the intercept and the coefficients together into a list
names =["Intercept"] + list(inputs_train.columns.values)
coefficients = pd.DataFrame(np.append(np.array(ml_pipe[-1].intercept_), np.transpose(ml_pipe[-1].coef_)), \
    columns=['coeffs'])['coeffs'].tolist()
d = {'Feature name': names, 'Coefficients': coefficients, 'P-Value': ['NaN'] + ml_pipe[-1].p_values}
summary_table = pd.DataFrame(data=d)
print(summary_table)

  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


                           Feature name  Coefficients       P-Value
0                             Intercept     -1.045570           NaN
1                               grade:A      1.042289   8.33696e-30
2                               grade:B      0.874150   5.53017e-47
3                               grade:C      0.676292   3.48325e-33
4                               grade:D      0.515324   6.77241e-22
5                               grade:E      0.332800   3.97404e-12
6                               grade:F      0.157324     0.0016699
7                    home_ownership:OWN      0.091454   5.51849e-06
8               home_ownership:MORTGAGE      0.100867   1.68381e-15
9                      addr_state:NM_VA      0.048835      0.127286
10                        addr_state:NY      0.091661   0.000144693
11         addr_state:OK_TN_MO_LA_MD_NC      0.058300     0.0116192
12                        addr_state:CA      0.097879    3.7624e-06
13               addr_state:UT_KY_AZ_NJ      0.0

## Fit the Final Model

In [12]:
inputs_train_with_ref_cat = inputs_train.loc[: , ['grade:A',
'grade:B',
'grade:C',
'grade:D',
'grade:E',
'grade:F',
'home_ownership:OWN',
'home_ownership:MORTGAGE',
'addr_state:NM_VA',
'addr_state:NY',
'addr_state:OK_TN_MO_LA_MD_NC',
'addr_state:CA',
'addr_state:UT_KY_AZ_NJ',
'addr_state:AR_MI_PA_OH_MN',
'addr_state:RI_MA_DE_SD_IN',
'addr_state:GA_WA_OR',
'addr_state:WI_MT',
'addr_state:TX',
'addr_state:IL_CT',
'addr_state:KS_SC_CO_VT_AK_MS',
'addr_state:WV_NH_WY_DC_ME_ID',
'verification_status:Not Verified',
'verification_status:Source Verified',
'purpose:credit_card',
'purpose:debt_consolidation',
'purpose:oth__med__vacation',
'purpose:major_purch__car__home_impr',
'initial_list_status:w',
'term:36',
'emp_length:1',
'emp_length:2-4',
'emp_length:5-6',
'emp_length:7-9',
'emp_length:10',
'mths_since_issue_d:<38',
'mths_since_issue_d:38-39',
'mths_since_issue_d:40-41',
'mths_since_issue_d:42-48',
'mths_since_issue_d:49-52',
'mths_since_issue_d:53-64',
'mths_since_issue_d:65-84',
'int_rate:<9.548',
'int_rate:9.548-12.025',
'int_rate:12.025-15.74',
'int_rate:15.74-20.281',
'mths_since_earliest_cr_line:141-164',
'mths_since_earliest_cr_line:165-247',
'mths_since_earliest_cr_line:248-270',
'mths_since_earliest_cr_line:271-352',
'mths_since_earliest_cr_line:>352',
'delinq_2yrs:0',
'delinq_2yrs:1-3',
'inq_last_6mths:0',
'inq_last_6mths:1-2',
'inq_last_6mths:3-6',
'open_acc:1-3',
'open_acc:4-12',
'open_acc:13-17',
'open_acc:18-22',
'open_acc:23-25',
'open_acc:26-30',
'open_acc:>=31',
'pub_rec:3-4',
'pub_rec:>=5',
'total_acc:28-51',
'total_acc:>=52',
'annual_inc:20K-30K',
'annual_inc:30K-40K',
'annual_inc:40K-50K',
'annual_inc:50K-60K',
'annual_inc:60K-70K',
'annual_inc:70K-80K',
'annual_inc:80K-90K',
'annual_inc:90K-100K',
'annual_inc:100K-120K',
'annual_inc:120K-140K',
'annual_inc:>140K']]

# Here ref_categories is the remove list, such that it equals
# reference categories for the statistically insignificant vars
# (if any) and the variables themselves. For statistically insignificant,
# see list here: 
# https://docs.google.com/spreadsheets/d/1B_oSsxVBaeFmC2K4sKuF8u_MqL0bkW260CA7-NsDYBI/edit?usp=sharing

ref_categories = [
'delinq_2yrs:0',
'delinq_2yrs:1-3',
'open_acc:1-3',
'open_acc:4-12',
'open_acc:13-17',
'open_acc:18-22',
'open_acc:23-25',
'open_acc:26-30',
'open_acc:>=31',
'total_acc:28-51',
'total_acc:>=52',]

inputs_train = inputs_train_with_ref_cat.drop(ref_categories, axis = 1)
inputs_train.head()

Unnamed: 0,grade:A,grade:B,grade:C,grade:D,grade:E,grade:F,home_ownership:OWN,home_ownership:MORTGAGE,addr_state:NM_VA,addr_state:NY,...,annual_inc:30K-40K,annual_inc:40K-50K,annual_inc:50K-60K,annual_inc:60K-70K,annual_inc:70K-80K,annual_inc:80K-90K,annual_inc:90K-100K,annual_inc:100K-120K,annual_inc:120K-140K,annual_inc:>140K
321284,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
68126,0,0,1,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
137748,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
228145,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
320995,0,1,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0


In [13]:
#   We fit the final model. No selections again.

reg2 = LogisticRegression_with_p_values()
ml_pipe = make_pipeline(reg2)
ml_pipe.fit(inputs_train, loan_data_targets_train)
names =["Intercept"] + list(inputs_train.columns.values)
coefficients = pd.DataFrame(np.append(np.array(ml_pipe[-1].intercept_), np.transpose(ml_pipe[-1].coef_)), \
    columns=['coeffs'])['coeffs'].tolist()
d = {'Feature name': names, 'Coefficients': coefficients, 'P-Value': ['NaN'] + ml_pipe[-1].p_values}
summary_table = pd.DataFrame(data=d)
print(summary_table)

  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


                           Feature name  Coefficients       P-Value
0                             Intercept     -1.064552           NaN
1                               grade:A      1.013538   2.31089e-28
2                               grade:B      0.863516   8.34086e-47
3                               grade:C      0.669065    2.2344e-33
4                               grade:D      0.510834   2.65982e-22
5                               grade:E      0.331630   8.46209e-13
6                               grade:F      0.158028     0.0010351
7                    home_ownership:OWN      0.088225   1.14035e-05
8               home_ownership:MORTGAGE      0.095653   3.36505e-14
9                      addr_state:NM_VA      0.047503      0.137453
10                        addr_state:NY      0.086733   0.000308045
11         addr_state:OK_TN_MO_LA_MD_NC      0.055734     0.0155084
12                        addr_state:CA      0.098922    2.7175e-06
13               addr_state:UT_KY_AZ_NJ      0.0

## Save the Final Model 

In [14]:
#   Here we export our model to a 'SAV' file with file name 'pd_model.sav'
import pickle
pickle.dump(reg2, open('pd_model.sav', 'wb'))