In [1]:
import numpy as np
import pandas as pd

In [3]:
# Load the bank-loan dataset from 'bankloan.csv' (path is relative to the
# notebook's working directory) and display the resulting frame.
data = pd.read_csv('bankloan.csv')
data

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
0,41,3,17,12,176,9.3,11.359392,5.008608,1
1,27,1,10,6,31,17.3,1.362202,4.000798,0
2,40,1,15,14,55,5.5,0.856075,2.168925,0
3,41,1,15,14,120,2.9,2.658720,0.821280,0
4,24,2,2,0,28,17.3,1.787436,3.056564,1
...,...,...,...,...,...,...,...,...,...
695,36,2,6,15,27,4.6,0.262062,0.979938,1
696,29,2,6,4,21,11.5,0.369495,2.045505,0
697,33,1,15,3,32,7.6,0.491264,1.940736,0
698,45,1,19,22,77,8.4,2.302608,4.165392,0


In [4]:
# Response: the `default` column, kept as a one-column DataFrame.
target = data[['default']]
# Predictors ("fitur" = features) used by the logistic regression below.
fitur = data[
    [
        'employ',
        'debtinc',
        'creddebt',
        'othdebt',
    ]
]

In [5]:
# Summary statistics of the four predictors (count, mean, std, quartiles).
fitur.describe()

Unnamed: 0,employ,debtinc,creddebt,othdebt
count,700.0,700.0,700.0,700.0
mean,8.388571,10.260571,1.553553,3.058209
std,6.658039,6.827234,2.117197,3.287555
min,0.0,0.4,0.011696,0.045584
25%,3.0,5.0,0.369059,1.044178
50%,7.0,8.6,0.854869,1.987567
75%,12.0,14.125,1.901955,3.923065
max,31.0,41.3,20.56131,27.0336


In [6]:
import statsmodels.api as sm

In [7]:
# Logistic regression of `default` on all four predictors; add_constant
# supplies the intercept column that is reported as `const` in the summary.
model_logit = sm.Logit(target, sm.add_constant(fitur))

  x = pd.concat(x[::order], 1)


In [8]:
# Fit the model ("hasil" = result); the fit log below reports convergence
# and the final value of the objective function.
hasil = model_logit.fit()

Optimization terminated successfully.
         Current function value: 0.411165
         Iterations 7


In [9]:
# Print the fitted-model summary: coefficients, Wald z-tests, LLR test and
# pseudo R-squared, all of which are interpreted in the notes further down.
print(hasil.summary())

                           Logit Regression Results                           
Dep. Variable:                default   No. Observations:                  700
Model:                          Logit   Df Residuals:                      695
Method:                           MLE   Df Model:                            4
Date:                Tue, 14 Sep 2021   Pseudo R-squ.:                  0.2844
Time:                        21:53:18   Log-Likelihood:                -287.82
converged:                       True   LL-Null:                       -402.18
Covariance Type:            nonrobust   LLR p-value:                 2.473e-48
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.2302      0.236     -5.210      0.000      -1.693      -0.767
employ        -0.2436      0.029     -8.456      0.000      -0.300      -0.187
debtinc        0.0885      0.021      4.200      0.0

# logit(P(Y=1)) = ln(P(Y=1) / (1 - P(Y=1))) = -1.2302 - 0.2436*employ + 0.0885*debtinc + 0.5041*creddebt

In [10]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
def hit_VIF(X, add_intercept=False):
    """Compute the variance inflation factor (VIF) of every column of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor columns. ``X.columns`` supplies the labels of the result.
    add_intercept : bool, default False
        If False, the VIFs are computed on ``X`` exactly as given (the
        original behavior of this helper). If True, a column of ones is
        placed in front of the predictors before the VIFs are computed, so
        each auxiliary regression contains an intercept; the intercept itself
        is not reported.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``X`` with the columns ``'Variables'`` and
        ``'VIF'``.
    """
    design = X.values
    offset = 0
    if add_intercept:
        # Column 0 becomes the intercept, so predictor i sits at index i + 1.
        design = np.column_stack([np.ones(len(X)), design])
        offset = 1

    # Build the frame in one step instead of growing an empty DataFrame.
    return pd.DataFrame({
        'Variables': X.columns,
        'VIF': [
            variance_inflation_factor(design, idx + offset)
            for idx in range(X.shape[1])
        ],
    })

In [11]:
# Multicollinearity check: VIF of each of the four predictors.
hit_VIF(fitur)

Unnamed: 0,Variables,VIF
0,employ,2.222753
1,debtinc,3.045977
2,creddebt,2.816577
3,othdebt,4.116876


# Interpret

In [12]:
np.exp(-0.2436*1)
# B1 = -0.2436 : when employment increases by one year and the other variables are unchanged,
# the odds of default are multiplied by exp(-0.2436) ~ 0.78, i.e. they decrease by a factor of about 1.28 (1/0.78).

0.7838010920039364

In [13]:
np.exp(0.0885*1)
# B2 = 0.0885 : when the debt-to-income ratio increases by one unit and the other variables are unchanged,
# the odds of default are multiplied by about 1.09.

1.0925342526104793

In [14]:
np.exp(0.5041*5)
# B3 = 0.5041 : when creddebt increases by 5 units and the other variables are unchanged,
# the odds of default are multiplied by about 12.43.

12.434812515742879

LLR-Test:
<br>
B1, B2, B3, B4 : The p-value of the log-likelihood ratio test (2.473e-48) is below 0.05. We have enough evidence that at least one of employ, debtinc, creddebt, and othdebt has a significant effect on the credit default rate.
<br>
Wald Test:

1. B0 : The p-value of the partial test is below 0.05. We need B0 in the model.
1. B1 : The p-value of the partial test is below 0.05. At the 5% significance level, we have enough evidence that employ decreases the default rate.
1. B2 : The p-value of the partial test is below 0.05. At the 5% significance level, we have enough evidence that debtinc increases the default rate.
1. B3 : The p-value of the partial test is below 0.05. At the 5% significance level, we have enough evidence that creddebt increases the default rate.
1. B4 : The p-value of the partial test is greater than 0.05. At the 5% significance level, we do not have enough evidence that othdebt has a significant effect on the default rate.

Parameter Estimate:

1. B1 = -0.2436 : Employment decreases the default rate. When employment increases by one year and the other variables are unchanged, the odds of default are multiplied by exp(-0.2436) ≈ 0.78, i.e. they decrease by a factor of about 1.28.
1. B2 = 0.0885 : The debt-to-income ratio increases the default rate. When the debt-to-income ratio increases by one unit and the other variables are unchanged, the odds of default increase about 1.09 times.
1. B3 = 0.5041 : creddebt increases the default rate. When creddebt increases by 5 units and the other variables are unchanged, the odds of default increase about 12.43 times.
1. It is strongly recommended to interpret the coefficients only within the observed range of each predictor.

Coefficient Determination:
<br>
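Pseudo R-Square = 28.44% : This model explains about 28.44% of the variation in the default rate. (This is McFadden's pseudo R-squared, 1 - LL/LL-Null = 1 - (-287.82)/(-402.18), so it measures the improvement in log-likelihood over the intercept-only model rather than a literal share of variance.)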

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [16]:
# Hold out 20% of the rows as a test set. Only employ, debtinc and creddebt
# are passed on (othdebt is left out), the split is stratified on the target
# so both parts keep the same default rate, and the seed is fixed.
X_train, X_test, y_train, y_test = train_test_split(
    fitur[['employ', 'debtinc', 'creddebt']],
    target,
    test_size=0.2,
    random_state=2020,
    stratify=target,
)

In [17]:
# Refit the logistic regression on the training rows only, using the three
# retained predictors plus an intercept column.
model_logit_train = sm.Logit(y_train, sm.add_constant(X_train))
hasil_train = model_logit_train.fit()

Optimization terminated successfully.
         Current function value: 0.411153
         Iterations 7


  x = pd.concat(x[::order], 1)


In [18]:
# Predicted probability of default for the held-out rows.
# has_constant='add' forces the intercept column to be added: with the default
# ('skip') add_constant silently leaves it out whenever some column of the new
# data happens to be constant, which would misalign the design matrix with the
# fitted coefficients.
y_pred_proba = hasil_train.predict(sm.add_constant(X_test, has_constant='add'))

In [19]:
# Show the predicted probabilities (indexed by the original row labels).
y_pred_proba

118    0.585242
309    0.308055
339    0.309282
686    0.398234
639    0.235796
         ...   
597    0.540755
58     0.011932
467    0.102155
148    0.059840
681    0.214318
Length: 140, dtype: float64

In [20]:
# Convert probabilities to class labels: 1 when the predicted probability
# is strictly above the 0.5 cut-off, otherwise 0.
y_pred_class = np.where(y_pred_proba>0.5,1,0)
y_pred_class

array([1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 0])

In [21]:
# Test-set accuracy as a percentage rounded to two decimals
# (the printed label is Indonesian for "The model accuracy is").
print('Akurasi Model Adalah : ', round(accuracy_score(y_test,y_pred_class)*100,2))

Akurasi Model Adalah :  82.14


In [22]:
# Display the full dataset again before drawing a sample from it.
data

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
0,41,3,17,12,176,9.3,11.359392,5.008608,1
1,27,1,10,6,31,17.3,1.362202,4.000798,0
2,40,1,15,14,55,5.5,0.856075,2.168925,0
3,41,1,15,14,120,2.9,2.658720,0.821280,0
4,24,2,2,0,28,17.3,1.787436,3.056564,1
...,...,...,...,...,...,...,...,...,...
695,36,2,6,15,27,4.6,0.262062,0.979938,1
696,29,2,6,4,21,11.5,0.369495,2.045505,0
697,33,1,15,3,32,7.6,0.491264,1.940736,0
698,45,1,19,22,77,8.4,2.302608,4.165392,0


In [23]:
# Draw 50 rows to stand in for "new" ("baru") applicants. The fixed
# random_state makes the sample, and every output derived from it,
# reproducible across kernel restarts.
# NOTE: the rows come from the same frame the model was trained on, so this
# only demonstrates the prediction workflow, not out-of-sample performance.
baru = data.sample(50, random_state=2020)

In [24]:
# Remove the label so `baru` looks like unlabeled new data. Reassigning with
# errors='ignore' (instead of inplace=True) keeps this cell safe to re-run:
# a second execution no longer raises KeyError once 'default' is gone.
baru = baru.drop(columns='default', errors='ignore')

In [25]:
# Peek at the first rows of the sampled data (the `default` column is gone).
baru.head()

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt
512,29,2,6,9,25,4.2,0.58485,0.46515
628,26,1,2,6,21,19.8,1.022868,3.135132
551,22,2,0,1,35,9.0,0.7812,2.3688
503,38,1,3,19,23,14.7,0.831726,2.549274
417,43,1,4,2,29,4.6,0.05336,1.28064


In [26]:
# Predicted probability of default for the sampled rows, using the same three
# predictors the training model was fitted on.
# has_constant='add' forces the intercept column to be added: with the default
# ('skip') add_constant silently leaves it out whenever some column of a small
# batch happens to be constant, which would misalign the design matrix with
# the fitted coefficients.
pred_data_baru = hasil_train.predict(
    sm.add_constant(baru[['employ', 'debtinc', 'creddebt']], has_constant='add')
)

  x = pd.concat(x[::order], 1)


In [27]:
# Show the predicted default probabilities for the sampled rows.
pred_data_baru

512    0.118630
628    0.605039
551    0.467168
503    0.419813
417    0.142236
668    0.087264
398    0.538281
72     0.022837
401    0.137408
397    0.158375
273    0.054281
37     0.239821
5      0.200173
625    0.041087
693    0.313478
236    0.213799
459    0.045741
354    0.347463
527    0.321164
556    0.066400
387    0.014544
402    0.240012
404    0.046121
232    0.045000
227    0.141140
206    0.108228
81     0.278110
294    0.258858
685    0.340157
344    0.129435
584    0.950195
661    0.240488
1      0.192807
674    0.150988
506    0.006412
78     0.015137
699    0.225134
540    0.028061
615    0.007657
386    0.086224
62     0.327061
99     0.325046
70     0.661543
41     0.027555
591    0.401808
687    0.197575
480    0.067135
265    0.116156
305    0.038587
422    0.270449
dtype: float64