In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import statsmodels.api as sm

import warnings
warnings.filterwarnings("ignore")
from sklearn.utils.testing import ignore_warnings

* Problem = Melakukan prediksi apakah customer baru memiliki resiko gagal bayar
* prediction = kemungkinan seseorang gagal bayar
* Data yang diperlukan = data demografis, data transaksi, income, dll
* Tujuan = memaksimalkan keuntungan dari pemberian pinjaman 
* action = tidak memberikan pinjaman pada orang yang berpotensi gagal bayar
* value = peningkatan profit dan menghindari resiko

In [2]:
# Load the bank-loan dataset from the working directory and display it.
bankloan = pd.read_csv('bankloan.csv')
bankloan

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
0,41,3,17,12,176,9.3,11.359392,5.008608,1
1,27,1,10,6,31,17.3,1.362202,4.000798,0
2,40,1,15,14,55,5.5,0.856075,2.168925,0
3,41,1,15,14,120,2.9,2.658720,0.821280,0
4,24,2,2,0,28,17.3,1.787436,3.056564,1
...,...,...,...,...,...,...,...,...,...
695,36,2,6,15,27,4.6,0.262062,0.979938,1
696,29,2,6,4,21,11.5,0.369495,2.045505,0
697,33,1,15,3,32,7.6,0.491264,1.940736,0
698,45,1,19,22,77,8.4,2.302608,4.165392,0


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
bankloan.describe()

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
count,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0
mean,34.86,1.722857,8.388571,8.278571,45.601429,10.260571,1.553553,3.058209,0.261429
std,7.997342,0.928206,6.658039,6.824877,36.814226,6.827234,2.117197,3.287555,0.439727
min,20.0,1.0,0.0,0.0,14.0,0.4,0.011696,0.045584,0.0
25%,29.0,1.0,3.0,3.0,24.0,5.0,0.369059,1.044178,0.0
50%,34.0,1.0,7.0,7.0,34.0,8.6,0.854869,1.987567,0.0
75%,40.0,2.0,12.0,12.0,55.0,14.125,1.901955,3.923065,1.0
max,56.0,5.0,31.0,34.0,446.0,41.3,20.56131,27.0336,1.0


In [4]:
# Predictor columns used by the model, and the name of the 0/1 label column.
fitur = ['employ','debtinc','creddebt','othdebt']
target = 'default'

In [5]:
# Split the frame into the feature matrix (x) and the label vector (y).
x = bankloan[fitur]
y = bankloan[target]

In [6]:
# Summary statistics restricted to the four selected predictors.
x.describe()

Unnamed: 0,employ,debtinc,creddebt,othdebt
count,700.0,700.0,700.0,700.0
mean,8.388571,10.260571,1.553553,3.058209
std,6.658039,6.827234,2.117197,3.287555
min,0.0,0.4,0.011696,0.045584
25%,3.0,5.0,0.369059,1.044178
50%,7.0,8.6,0.854869,1.987567
75%,12.0,14.125,1.901955,3.923065
max,31.0,41.3,20.56131,27.0336


Analisa :
* Employ   = lama bekerja para customer mulai dari 0 (fresh graduate/tidak pernah bekerja) sampai yang paling lama selama 31 tahun, dengan median lama kerja 7 tahun (rata-rata 8,39 tahun)
* Debtinc  = rasio antara hutang/cicilan dengan income, dengan rasio paling rendah sebesar 0,4 dan rasio tertinggi 41,3; median rasio sebesar 8,6 (rata-rata 10,26)
* Creddebt = customer memiliki hutang kartu kredit paling kecil sebesar 0,01 dan paling besar sebesar 20,56, dengan median hutang kartu kredit sebesar 0,85 (rata-rata 1,55)
* Othdebt  = hutang di tempat lain dari customer memiliki median 1,99 (rata-rata 3,06), dengan hutang terkecil sebesar 0,05 dan hutang terbesar sebesar 27,03

Secara keseluruhan, rata-rata customer memiliki hutang yang lebih besar pada othdebt (3,06) dibandingkan dengan creddebt (1,55)

In [7]:
# Fit a logistic regression on the full dataset. sm.add_constant adds the
# intercept column, which shows up as the `const` row in the summary.
sm_logit = sm.Logit(y, sm.add_constant(x))
result = sm_logit.fit()

Optimization terminated successfully.
         Current function value: 0.411165
         Iterations 7


In [8]:
# Print the fitted model's summary table (fit statistics and coefficients).
print(result.summary())

                           Logit Regression Results                           
Dep. Variable:                default   No. Observations:                  700
Model:                          Logit   Df Residuals:                      695
Method:                           MLE   Df Model:                            4
Date:                Thu, 24 Jun 2021   Pseudo R-squ.:                  0.2844
Time:                        12:43:48   Log-Likelihood:                -287.82
converged:                       True   LL-Null:                       -402.18
Covariance Type:            nonrobust   LLR p-value:                 2.473e-48
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.2302      0.236     -5.210      0.000      -1.693      -0.767
employ        -0.2436      0.029     -8.456      0.000      -0.300      -0.187
debtinc        0.0885      0.021      4.200      0.0

Pada logistic regression, uji simultan dilihat dari Log-Likelihood (LLR test); pada linear regression, uji simultan dilihat dari F-test

### Test LLR
* LLR p-value (2.473e-48) < 0.05, maka ada cukup bukti bahwa minimal salah satu dari feature yang kita pakai mempengaruhi peluang seseorang untuk gagal bayar

### Partial test (Z-Test)
* const  = p val < 0.05, kita butuh b0 untuk model ini
* employ = p val < 0.05, ada cukup bukti bahwa employ menurunkan peluang gagal bayar
* debtinc = p val < 0.05, ada cukup bukti bahwa debtinc menaikkan peluang gagal bayar
* creddebt = p val < 0.05, ada cukup bukti bahwa creddebt menaikkan peluang gagal bayar
* othdebt  = p val > 0.05, tidak ada cukup bukti bahwa othdebt memiliki pengaruh signifikan terhadap peluang gagal bayar

### Coefficient determination
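* Pseudo R-square: 0.2844. Nilai ini dihitung sebagai 1 - (Log-Likelihood / LL-Null) = 1 - (-287.82 / -402.18), artinya log-likelihood model 28.44 % lebih baik daripada model yang hanya berisi konstanta. Nilai ini bukan proporsi variance yang dijelaskan seperti R-square pada regresi linear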

# Model Interpretation
Semua fitur signifikan kecuali othdebt (p>|z| = 0.94)

### Feature Employ

In [9]:
# Magnitude of the `employ` coefficient. The fitted value in the summary is
# negative (-0.2436), so exp(β) here is the factor by which the odds shrink.
β = 0.2436
np.exp(β)

1.2758338948511923

Ketika employ (lama bekerja) bertambah 1 tahun dan tidak ada perubahan pada feature lain, maka odds untuk default akan turun sebanyak 1,28 kali (menjadi 1/1,276 ≈ 0,78 kali dari odds semula)

### Feature Debtinc

In [10]:
# `debtinc` coefficient copied from the summary; exp(β) is its odds ratio.
β = 0.0885
np.exp(β)

1.0925342526104793

Ketika debtinc bertambah 1 dan tidak ada perubahan pada feature lainnya, maka odds untuk default akan naik sebanyak 1,09 kali

### Feature Creddebt

In [11]:
# Hard-coded coefficient used for `creddebt`; exp(β) is its odds ratio.
# NOTE(review): the creddebt row is cut off in the printed summary above —
# confirm 0.5041 against result.params.
β = 0.5041
np.exp(β)

1.6554949043702933

Ketika creddebt bertambah 1 dan tidak ada perubahan pada feature lainnya, maka odds untuk default akan naik sebanyak 1,655 kali

### Feature othdebt

Hasil tidak signifikan karena p-value lebih besar dari 0.05, sehingga koefisien othdebt tidak diinterpretasikan

# Multicollinearity

In [12]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
def calc_vif(X, add_constant=False):
    """Compute the variance inflation factor (VIF) of every column in X.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor columns to check for multicollinearity.
    add_constant : bool, default False
        When False (the default) the VIFs are computed on X exactly as given,
        i.e. without an intercept column. When True an intercept column is
        added with ``sm.add_constant`` before computing, so each auxiliary
        regression includes an intercept; the added column is left out of the
        returned table.

    Returns
    -------
    pandas.DataFrame
        One row per column of X, with columns 'variables' and 'VIF'.
    """
    design = sm.add_constant(X) if add_constant else X
    matrix = design.values  # extracted once instead of once per column

    vif = pd.DataFrame({
        'variables': list(design.columns),
        'VIF': [variance_inflation_factor(matrix, i)
                for i in range(design.shape[1])],
    })

    if add_constant:
        # Report only the caller's own columns, not the helper intercept.
        vif = vif[vif['variables'].isin(X.columns)].reset_index(drop=True)

    return vif

In [13]:
# VIF of each predictor in the bankloan feature matrix.
calc_vif(x)

Unnamed: 0,variables,VIF
0,employ,2.222753
1,debtinc,3.045977
2,creddebt,2.816577
3,othdebt,4.116876


Hasil yang didapat dari uji multicollinearity menunjukkan hanya 1 fitur (othdebt, VIF = 4,12) yang memiliki nilai di atas 4 dari fitur-fitur yang digunakan, namun feature othdebt tetap bisa diterima dengan toleransi (hanya sedikit di atas batas)

## <center> Validation

In [14]:
from sklearn.model_selection import train_test_split  # Untuk split data menjadi train dan test
from sklearn.metrics import accuracy_score

In [15]:
# Inspect the feature matrix and the label vector before splitting.
(x,y)

(     employ  debtinc   creddebt   othdebt
 0        17      9.3  11.359392  5.008608
 1        10     17.3   1.362202  4.000798
 2        15      5.5   0.856075  2.168925
 3        15      2.9   2.658720  0.821280
 4         2     17.3   1.787436  3.056564
 ..      ...      ...        ...       ...
 695       6      4.6   0.262062  0.979938
 696       6     11.5   0.369495  2.045505
 697      15      7.6   0.491264  1.940736
 698      19      8.4   2.302608  4.165392
 699      12     14.7   2.994684  3.473316
 
 [700 rows x 4 columns],
 0      1
 1      0
 2      0
 3      0
 4      1
       ..
 695    1
 696    0
 697    0
 698    0
 699    0
 Name: default, Length: 700, dtype: int64)

In [16]:
# Hold out 20% of the rows for testing. stratify=y keeps the class ratio the
# same in both subsets; the fixed random_state makes the shuffle repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=2020, stratify=y
)

In [17]:
# Class counts of the label in the full dataset.
bankloan['default'].value_counts()

0    517
1    183
Name: default, dtype: int64

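Kita memiliki perbandingan kelas 517 : 183 (sekitar 74% : 26%). Apabila tidak digunakan stratify, proporsi kelas di data train dan data test bisa berbeda jauh (misalnya kelas 1 hampir semuanya masuk ke data train); dengan stratify, proporsi kelas di data train dan data test dijaga sama dengan proporsi di data asli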

In [18]:
# Class counts in the training subset.
y_train.value_counts()

0    414
1    146
Name: default, dtype: int64

In [19]:
# Class counts in the test subset.
y_test.value_counts()

0    103
1     37
Name: default, dtype: int64

In [20]:
# Refit the logistic regression on the training subset only, so the test
# subset stays unseen for validation.
sm_logit_train = sm.Logit(y_train, sm.add_constant(x_train))
result_train = sm_logit_train.fit()

Optimization terminated successfully.
         Current function value: 0.411145
         Iterations 7


In [21]:
# Predicted probabilities for the held-out rows from the train-only model.
y_predict_proba = result_train.predict(sm.add_constant(x_test))
y_predict_proba # probability of default (class 1)

118    0.584807
309    0.308932
339    0.308407
686    0.398796
639    0.236440
         ...   
597    0.544007
58     0.012081
467    0.102066
148    0.060111
681    0.214943
Length: 140, dtype: float64

Diubah ke bentuk kategorikal, karena target kategorikal

In [22]:
# Turn probabilities into hard 0/1 labels: strictly above 0.5 becomes 1.
y_predict_class = np.where(y_predict_proba > 0.5, 1, 0).tolist()

In [23]:
# Display the predicted class labels for the test subset.
y_predict_class

[1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0]

# Cek accuracy score

In [24]:
# Share of test rows whose predicted class matches the true label.
print('model accuracy in test dataset: ',accuracy_score(y_test,y_predict_class))

model accuracy in test dataset:  0.8214285714285714


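Model kita mampu memprediksi dengan benar sekitar 8 dari 10 data test (82%). Sebagai pembanding, selalu menebak kelas 0 pada data test akan menghasilkan akurasi 103/140 ≈ 74%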

# Latihan

In [25]:
# Load the white-wine data, discarding every row that has a missing value.
df = pd.read_csv('white_wine.csv').dropna()
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.0010,3.00,0.45,8.8,6.0
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.9940,3.30,0.49,9.5,6.0
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.9951,3.26,0.44,10.1,6.0
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.40,9.9,6.0
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.40,9.9,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...
514,7.8,0.28,0.32,9.0,0.036,34.0,115.0,1.0002,3.17,0.39,10.3,7.0
515,6.1,0.31,0.26,2.2,0.051,28.0,167.0,0.9926,3.37,0.47,10.4,6.0
516,6.8,0.18,0.37,1.6,0.055,47.0,154.0,0.9934,3.08,0.45,9.1,5.0
517,7.4,0.15,0.42,1.7,0.045,49.0,154.0,0.9920,3.00,0.60,10.4,6.0


target = quality > buat kolom target, jika quality >6 maka = 1, selain itu 0; 1 = good wine 0 = bad wine

fitur = density dan alkohol

tugas = Buatlah model untuk memprediksi kualitas wine dengan menggunakan log reg, lakukan validasi untuk melihat seberapa baik performa model yang dibuat

In [29]:
# Binary label: 1 when quality is strictly above 6, otherwise 0.
quality_ord = np.where(df['quality'] > 6, 1, 0).tolist()

In [30]:
# Attach the 0/1 label to the frame as a new column.
df['quality_ord'] = quality_ord

In [31]:
# Rebind the feature list and target name for the wine exercise
# (these names were used for the bankloan model earlier in the notebook).
fitur = ['density','alcohol']
target = 'quality_ord'

In [32]:
# Feature matrix and label vector for the wine model (overwrites x and y).
x = df[fitur]
y = df[target]

In [33]:
# Fit a logistic regression with an intercept on the full wine dataset.
sm_logit = sm.Logit(y, sm.add_constant(x))
result = sm_logit.fit()

Optimization terminated successfully.
         Current function value: 0.085400
         Iterations 13


In [34]:
# Print the wine model's summary table (fit statistics and coefficients).
print(result.summary())

                           Logit Regression Results                           
Dep. Variable:            quality_ord   No. Observations:                  519
Model:                          Logit   Df Residuals:                      516
Method:                           MLE   Df Model:                            2
Date:                Thu, 24 Jun 2021   Pseudo R-squ.:                  0.8237
Time:                        12:44:42   Log-Likelihood:                -44.323
converged:                       True   LL-Null:                       -251.46
Covariance Type:            nonrobust   LLR p-value:                 1.097e-90
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const      -2253.7567    328.276     -6.865      0.000   -2897.165   -1610.348
density     2201.6603    321.421      6.850      0.000    1571.687    2831.634
alcohol        5.6002      0.798      7.018      0.0

In [35]:
# Hold out 20% of the wine rows for testing, keeping the class ratio equal in
# both subsets (stratify=y) and the shuffle repeatable (fixed random_state).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=2020, stratify=y
)

In [36]:
# Refit the wine model on the training subset only.
sm_logit_train = sm.Logit(y_train, sm.add_constant(x_train))
result_train = sm_logit_train.fit()

Optimization terminated successfully.
         Current function value: 0.091062
         Iterations 13


In [37]:
# Predicted probabilities for the held-out wine rows.
y_predict_proba = result_train.predict(sm.add_constant(x_test))
y_predict_proba # probability of quality_ord == 1 (quality above 6)

84     9.266852e-05
486    9.096207e-07
476    9.933057e-01
327    1.884552e-04
440    9.926806e-01
           ...     
163    9.032858e-04
399    9.874021e-04
166    1.613145e-02
433    4.893133e-07
150    9.998851e-01
Length: 104, dtype: float64

In [39]:
# Turn probabilities into hard 0/1 labels: strictly above 0.5 becomes 1.
y_predict_class = np.where(y_predict_proba > 0.5, 1, 0).tolist()

In [40]:
# Display the predicted class labels for the wine test subset.
y_predict_class

[0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1]

In [41]:
# Share of wine test rows whose predicted class matches the true label.
print('model accuracy in test dataset: ',accuracy_score(y_test,y_predict_class))

model accuracy in test dataset:  0.9807692307692307


Model memiliki akurasi 98% pada data test, performa yang sangat baik

In [None]:
## Simpler way: build the same 0/1 label column with one vectorised call
df['quality_ord'] = np.where(df['quality'] >6,1,0)