In [1]:
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
# Load the loan dataset; the relative path is resolved against the kernel's working directory.
df = pd.read_csv('loan.csv')

In [3]:
# Preview the first five rows to check column names and value types.
df.head(5)

Unnamed: 0,Default,Checking_amount,Term,Credit_score,Gender,Marital_status,Car_loan,Personal_loan,Home_loan,Education_loan,Emp_status,Amount,Saving_amount,Emp_duration,Age,No_of_credit_acc
0,0,988,15,796,Female,Single,1,0,0,0,employed,1536,3455,12,38,1
1,0,458,15,813,Female,Single,1,0,0,0,employed,947,3600,25,36,1
2,0,158,14,756,Female,Single,0,1,0,0,employed,1678,3093,43,34,1
3,1,300,25,737,Female,Single,0,0,0,1,employed,1804,2449,0,29,1
4,1,63,24,662,Female,Single,0,0,0,1,unemployed,1184,2867,4,30,1


In [4]:
## Are the modelling assumptions met?
## Sample size? 1000 rows (see the `count` row below)
## Outliers? Yes -- compare the min/max rows with the quartiles below
df.describe()

Unnamed: 0,Default,Checking_amount,Term,Credit_score,Car_loan,Personal_loan,Home_loan,Education_loan,Amount,Saving_amount,Emp_duration,Age,No_of_credit_acc
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.3,362.411,17.815,760.475,0.353,0.474,0.056,0.112,1218.681,3179.266,49.389,31.209,2.546
std,0.458487,300.901202,3.240567,77.556214,0.478142,0.499573,0.230037,0.315524,305.753699,339.549751,37.759521,4.093174,1.652457
min,0.0,-665.0,9.0,376.0,0.0,0.0,0.0,0.0,244.0,2082.0,0.0,18.0,1.0
25%,0.0,164.75,16.0,725.75,0.0,0.0,0.0,0.0,1016.0,2951.0,15.0,29.0,1.0
50%,0.0,351.5,18.0,770.5,0.0,0.0,0.0,0.0,1225.5,3203.0,41.0,32.0,2.0
75%,1.0,553.5,20.0,812.0,1.0,1.0,0.0,0.0,1419.75,3402.25,85.0,34.0,3.0
max,1.0,1319.0,27.0,1029.0,1.0,1.0,1.0,1.0,2362.0,4108.0,120.0,42.0,9.0


# This is the Sklearn Method

In [None]:
# Single-predictor sklearn model: Default explained by Credit_score.
# X stays two-dimensional (a one-column DataFrame); y is a plain Series.
y = df['Default']
X = df[['Credit_score']]
model = LogisticRegression()
model.fit(X, y)

In [None]:
# Fitted coefficient(s) for the feature(s) the model was trained on.
model.coef_

In [None]:
# Fitted intercept term of the model.
model.intercept_

In [None]:
# Predicted class label for one observation whose single feature value is 500.
# NOTE(review): the model was fit on a DataFrame, so recent scikit-learn releases may
# warn about missing feature names for a bare list input -- confirm.
model.predict([[500]])[0]

In [None]:
# Per-class probabilities for one observation whose single feature value is 500.
model.predict_proba([[500]])

# This is the Stats Models Method

In [None]:
## Simple logistic model: Credit_score is the only predictor (X), Default is the response (y).
## statsmodels does not add an intercept on its own, so the constant column is added here.
y = df[['Default']]
X = sm.add_constant(df[['Credit_score']])

In [None]:
# Fit the logit model and compute in-sample predicted probabilities.
model = sm.Logit(y, X).fit()
y_pred = model.predict(X)
# y is a one-column DataFrame. Subtracting a Series from a DataFrame aligns the Series
# index against the frame's COLUMNS, which yields an all-NaN wide frame rather than
# residuals. Select the 'Default' column so the subtraction is row-aligned.
resid = y['Default'] - y_pred
print(model.summary())

In [None]:
# Predicted probability against credit score; the red lines mark the 0 and 1 bounds.
plt.scatter(df['Credit_score'], y_pred)
plt.axhline(y=0, color='r', linestyle='-')
plt.axhline(y=1, color='r', linestyle='-')
# Label the figure so it can be read on its own when the notebook is skimmed.
plt.xlabel('Credit_score')
plt.ylabel('Predicted probability of Default')
plt.title('Logit fit: predicted probability vs. credit score')
plt.show()

In [None]:
# Predict for one row laid out as [const, Credit_score] = [1, 500].
# Assumes `model` is the statsmodels fit whose design matrix was built with add_constant.
model.predict([[1,500]])

# Let's Create a Full Logistic Regression Model

First we want to transform our categorical variables into dummy variables

In [5]:
# Encode each categorical column as a single 0/1 indicator column.
# dtype=int is requested explicitly: pandas >= 2.0 returns boolean dummies by default,
# and mixing bool with integer columns turns the design matrix's `.values` into an
# object array, which the VIF computation and sm.Logit cannot work with.
# NOTE: 'Marital_status ' keeps its trailing space -- that is the column key used here.
df['female'] = pd.get_dummies(df['Gender'], dtype=int)['Female']
df['married'] = pd.get_dummies(df['Marital_status '], dtype=int)['Married']
df['employed'] = pd.get_dummies(df['Emp_status'], dtype=int)['employed']

In [6]:
# Confirm the new indicator columns (female, married, employed) were appended.
df.head(5)

Unnamed: 0,Default,Checking_amount,Term,Credit_score,Gender,Marital_status,Car_loan,Personal_loan,Home_loan,Education_loan,Emp_status,Amount,Saving_amount,Emp_duration,Age,No_of_credit_acc,female,married,employed
0,0,988,15,796,Female,Single,1,0,0,0,employed,1536,3455,12,38,1,1,0,1
1,0,458,15,813,Female,Single,1,0,0,0,employed,947,3600,25,36,1,1,0,1
2,0,158,14,756,Female,Single,0,1,0,0,employed,1678,3093,43,34,1,1,0,1
3,1,300,25,737,Female,Single,0,0,0,1,employed,1804,2449,0,29,1,1,0,1
4,1,63,24,662,Female,Single,0,0,0,1,unemployed,1184,2867,4,30,1,1,0,0


In [7]:
# Summary statistics again, now including the indicator columns.
df.describe()

Unnamed: 0,Default,Checking_amount,Term,Credit_score,Car_loan,Personal_loan,Home_loan,Education_loan,Amount,Saving_amount,Emp_duration,Age,No_of_credit_acc,female,married,employed
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.3,362.411,17.815,760.475,0.353,0.474,0.056,0.112,1218.681,3179.266,49.389,31.209,2.546,0.31,0.548,0.308
std,0.458487,300.901202,3.240567,77.556214,0.478142,0.499573,0.230037,0.315524,305.753699,339.549751,37.759521,4.093174,1.652457,0.462725,0.49794,0.461898
min,0.0,-665.0,9.0,376.0,0.0,0.0,0.0,0.0,244.0,2082.0,0.0,18.0,1.0,0.0,0.0,0.0
25%,0.0,164.75,16.0,725.75,0.0,0.0,0.0,0.0,1016.0,2951.0,15.0,29.0,1.0,0.0,0.0,0.0
50%,0.0,351.5,18.0,770.5,0.0,0.0,0.0,0.0,1225.5,3203.0,41.0,32.0,2.0,0.0,1.0,0.0
75%,1.0,553.5,20.0,812.0,1.0,1.0,0.0,0.0,1419.75,3402.25,85.0,34.0,3.0,1.0,1.0,1.0
max,1.0,1319.0,27.0,1029.0,1.0,1.0,1.0,1.0,2362.0,4108.0,120.0,42.0,9.0,1.0,1.0,1.0


In [8]:
# Full design matrix: every numeric predictor plus the three indicator columns,
# with an intercept column added for statsmodels.
# NOTE: 'Emp_duration ' keeps its trailing space -- that is the column key used here.
y = df[['Default']]
X = sm.add_constant(
    df[[
        'Checking_amount', 'Term', 'Credit_score',
        'Car_loan', 'Personal_loan', 'Home_loan', 'Education_loan',
        'Amount', 'Saving_amount', 'Emp_duration ', 'Age', 'No_of_credit_acc',
        'female', 'married', 'employed',
    ]]
)

In [9]:
# Variance inflation factor for every column of the design matrix (constant included).
vif_data = pd.DataFrame({
    "feature": X.columns,
    "VIF": [
        variance_inflation_factor(X.values, col_idx)
        for col_idx in range(X.shape[1])
    ],
})
print(vif_data)

             feature         VIF
0              const  477.670587
1    Checking_amount    1.150100
2               Term    1.117594
3       Credit_score    1.168940
4           Car_loan   46.669305
5      Personal_loan   50.944167
6          Home_loan   11.598034
7     Education_loan   20.927025
8             Amount    1.031541
9      Saving_amount    1.192806
10     Emp_duration     1.115217
11               Age    1.345568
12  No_of_credit_acc    1.050759
13            female    2.223521
14           married    2.337169
15          employed    1.118671


Personal_loan has the highest VIF (about 51), which signals strong collinearity with the other predictors. Let's drop it and check again.

In [None]:
# Same design matrix as the full model, with 'Personal_loan' left out.
# NOTE: 'Emp_duration ' keeps its trailing space -- that is the column key used here.
y = df[['Default']]
X = sm.add_constant(
    df[[
        'Checking_amount', 'Term', 'Credit_score',
        'Car_loan', 'Home_loan', 'Education_loan',
        'Amount', 'Saving_amount', 'Emp_duration ', 'Age', 'No_of_credit_acc',
        'female', 'married', 'employed',
    ]]
)

In [None]:
# Recompute the variance inflation factors for the current design matrix.
vif_data = pd.DataFrame({
    "feature": X.columns,
    "VIF": [
        variance_inflation_factor(X.values, col_idx)
        for col_idx in range(X.shape[1])
    ],
})
print(vif_data)

Looks good now! We always ignore the constant term when looking at the VIFs.

In [10]:
# Fit the multi-predictor logit model on the current X / y and print the summary table.
# NOTE(review): the output saved under this cell reports "Df Model: 15", which matches a
# 15-predictor X rather than a 14-predictor one -- re-run the notebook top to bottom
# to refresh it.
model = sm.Logit(endog=y, exog=X).fit()
print(model.summary())

Optimization terminated successfully.
         Current function value: 0.148824
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:                Default   No. Observations:                 1000
Model:                          Logit   Df Residuals:                      984
Method:                           MLE   Df Model:                           15
Date:                Sat, 13 Nov 2021   Pseudo R-squ.:                  0.7564
Time:                        23:00:14   Log-Likelihood:                -148.82
converged:                       True   LL-Null:                       -610.86
Covariance Type:            nonrobust   LLR p-value:                2.471e-187
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
const               40.7590      4.817      8.462      0.000      31.319      50.199
Checking_am

It appears that female is not statistically significant. Let's drop it

In [11]:
# Refit with 'female' removed from the predictor list.
# NOTE: 'Emp_duration ' keeps its trailing space -- that is the column key used here.
y = df[['Default']]
X = sm.add_constant(
    df[[
        'Checking_amount', 'Term', 'Credit_score',
        'Car_loan', 'Home_loan', 'Education_loan',
        'Amount', 'Saving_amount', 'Emp_duration ', 'Age', 'No_of_credit_acc',
        'married', 'employed',
    ]]
)
model = sm.Logit(y, X).fit()
print(model.summary())

Optimization terminated successfully.
         Current function value: 0.149073
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:                Default   No. Observations:                 1000
Model:                          Logit   Df Residuals:                      986
Method:                           MLE   Df Model:                           13
Date:                Sat, 13 Nov 2021   Pseudo R-squ.:                  0.7560
Time:                        23:00:31   Log-Likelihood:                -149.07
converged:                       True   LL-Null:                       -610.86
Covariance Type:            nonrobust   LLR p-value:                4.438e-189
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
const               39.0433      3.862     10.109      0.000      31.473      46.613
Checking_am

Married is insignificant. We will drop that one and refit

In [13]:
# Refit with 'married' removed from the predictor list.
# NOTE: 'Emp_duration ' keeps its trailing space -- that is the column key used here.
y = df[['Default']]
X = sm.add_constant(
    df[[
        'Checking_amount', 'Term', 'Credit_score',
        'Car_loan', 'Home_loan', 'Education_loan',
        'Amount', 'Saving_amount', 'Emp_duration ', 'Age', 'No_of_credit_acc',
        'employed',
    ]]
)
model = sm.Logit(y, X).fit()
print(model.summary())

Optimization terminated successfully.
         Current function value: 0.149230
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:                Default   No. Observations:                 1000
Model:                          Logit   Df Residuals:                      987
Method:                           MLE   Df Model:                           12
Date:                Sat, 13 Nov 2021   Pseudo R-squ.:                  0.7557
Time:                        23:01:18   Log-Likelihood:                -149.23
converged:                       True   LL-Null:                       -610.86
Covariance Type:            nonrobust   LLR p-value:                5.778e-190
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
const               38.9675      3.857     10.104      0.000      31.409      46.526
Checking_am

Employment duration (Emp_duration) is insignificant. We will drop it and refit.

In [14]:
# Refit with the employment-duration column removed from the predictor list.
y = df[['Default']]
X = sm.add_constant(
    df[[
        'Checking_amount', 'Term', 'Credit_score',
        'Car_loan', 'Home_loan', 'Education_loan',
        'Amount', 'Saving_amount', 'Age', 'No_of_credit_acc',
        'employed',
    ]]
)
model = sm.Logit(y, X).fit()
print(model.summary())

Optimization terminated successfully.
         Current function value: 0.149341
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:                Default   No. Observations:                 1000
Model:                          Logit   Df Residuals:                      988
Method:                           MLE   Df Model:                           11
Date:                Sat, 13 Nov 2021   Pseudo R-squ.:                  0.7555
Time:                        23:01:23   Log-Likelihood:                -149.34
converged:                       True   LL-Null:                       -610.86
Covariance Type:            nonrobust   LLR p-value:                6.872e-191
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
const               38.8124      3.833     10.127      0.000      31.300      46.324
Checking_am

In [15]:
# Refit with 'No_of_credit_acc' removed from the predictor list.
y = df[['Default']]
X = sm.add_constant(
    df[[
        'Checking_amount', 'Term', 'Credit_score',
        'Car_loan', 'Home_loan', 'Education_loan',
        'Amount', 'Saving_amount', 'Age',
        'employed',
    ]]
)
model = sm.Logit(y, X).fit()
print(model.summary())

Optimization terminated successfully.
         Current function value: 0.149817
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:                Default   No. Observations:                 1000
Model:                          Logit   Df Residuals:                      989
Method:                           MLE   Df Model:                           10
Date:                Sat, 13 Nov 2021   Pseudo R-squ.:                  0.7547
Time:                        23:01:34   Log-Likelihood:                -149.82
converged:                       True   LL-Null:                       -610.86
Covariance Type:            nonrobust   LLR p-value:                1.117e-191
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
const              38.7391      3.829     10.118      0.000      31.235      46.243
Checking_amoun

In [17]:
# Refit with 'Amount' removed from the predictor list; this is the model used for the
# individual predictions that follow.
y = df[['Default']]
X = sm.add_constant(
    df[[
        'Checking_amount', 'Term', 'Credit_score',
        'Car_loan', 'Home_loan', 'Education_loan',
        'Saving_amount', 'Age',
        'employed',
    ]]
)
model = sm.Logit(y, X).fit()
print(model.summary())

Optimization terminated successfully.
         Current function value: 0.151061
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:                Default   No. Observations:                 1000
Model:                          Logit   Df Residuals:                      990
Method:                           MLE   Df Model:                            9
Date:                Sat, 13 Nov 2021   Pseudo R-squ.:                  0.7527
Time:                        23:02:51   Log-Likelihood:                -151.06
converged:                       True   LL-Null:                       -610.86
Covariance Type:            nonrobust   LLR p-value:                3.688e-192
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
const              39.7330      3.811     10.427      0.000      32.264      47.202
Checking_amoun

Let's see if Jerry would get approved for a loan. He has 700 in checking, 25 month term, 600 credit score, a car loan, no home loan, an education loan, 2000 in savings, is 40, and unemployed.

In [18]:
# One applicant as a row of the design matrix; the order must match the columns the
# model was fitted on (constant first).
jerry = [
    1,     # const
    700,   # Checking_amount
    25,    # Term
    600,   # Credit_score
    1,     # Car_loan
    0,     # Home_loan
    1,     # Education_loan
    2000,  # Saving_amount
    40,    # Age
    0,     # employed
]
model.predict(jerry)

array([0.81140153])

Jane has 1000 in checking, an 18-month loan term, an 800 credit score, a car loan, a home loan, and an education loan, 3500 in savings, is 39, and is employed.

In [None]:
# One applicant as a row of the design matrix; the order must match the columns the
# model was fitted on (constant first).
jane = [
    1,     # const
    1000,  # Checking_amount
    18,    # Term
    800,   # Credit_score
    1,     # Car_loan
    1,     # Home_loan
    1,     # Education_loan
    3500,  # Saving_amount
    39,    # Age
    1,     # employed
]
model.predict(jane)