In [48]:
import json
import pickle
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

## Load Model

In [2]:
# Load the persisted artefacts (model, pre-processing objects, column lists)
# from the current working directory.


def _load_pickle(path):
    """Return the object un-pickled from the binary file at ``path``."""
    # SECURITY: pickle.load can execute arbitrary code while deserialising;
    # only point this at artefact files you produced or otherwise trust.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _load_json(path):
    """Return the value parsed from the JSON text file at ``path``."""
    with open(path, 'r') as handle:
        return json.load(handle)


# Modelling
best_estimator = _load_pickle('adaboost_model.pkl')

# Pre-processing
scaler = _load_pickle('scaler.pkl')
winsoriser = _load_pickle('winsoriser.pkl')

# Column-name lists (stored as JSON despite the .txt extension)
num_cols_scaled = _load_json('num_cols_sc.txt')
num_cols_non_scaled = _load_json('num_cols_nsc.txt')
    

# 11. Model Inferencing

In [28]:
# Read the source CSV into a DataFrame; the first rows are used for inference below.
inference_csv_path = 'h8dsft_P1M1_iqbal.csv'
data_inferences = pd.read_csv(inference_csv_path)

In [29]:
# Keep only the first ten rows as the inference sample.
df_inferences = data_inferences.iloc[:10]

In [30]:
# Display the rows selected for inference (target column still included).
df_inferences

Unnamed: 0,limit_balance,sex,education_level,marital_status,age,pay_0,pay_2,pay_3,pay_4,pay_5,...,bill_amt_4,bill_amt_5,bill_amt_6,pay_amt_1,pay_amt_2,pay_amt_3,pay_amt_4,pay_amt_5,pay_amt_6,default_payment_next_month
0,80000.0,1,6,1,54.0,0.0,0.0,0.0,0.0,0.0,...,29296.0,26210.0,17643.0,2545.0,2208.0,1336.0,2232.0,542.0,348.0,1
1,200000.0,1,4,1,49.0,0.0,0.0,0.0,0.0,0.0,...,50146.0,50235.0,48984.0,1689.0,2164.0,2500.0,3480.0,2500.0,3000.0,0
2,20000.0,2,6,2,22.0,0.0,0.0,0.0,0.0,0.0,...,1434.0,500.0,0.0,4641.0,1019.0,900.0,0.0,1500.0,0.0,1
3,260000.0,2,4,2,33.0,0.0,0.0,0.0,0.0,0.0,...,27821.0,30767.0,29890.0,5000.0,5000.0,1137.0,5000.0,1085.0,5000.0,0
4,150000.0,1,4,2,32.0,0.0,0.0,0.0,-1.0,0.0,...,150464.0,143375.0,146411.0,4019.0,146896.0,157436.0,4600.0,4709.0,5600.0,0
5,300000.0,2,4,2,32.0,0.0,0.0,0.0,0.0,0.0,...,65150.0,-450.0,700.0,15235.0,1491.0,1303.0,0.0,2000.0,1400.0,0
6,130000.0,1,1,1,45.0,0.0,0.0,0.0,0.0,0.0,...,62377.0,63832.0,65099.0,2886.0,2908.0,2129.0,2354.0,2366.0,2291.0,0
7,200000.0,1,1,1,58.0,0.0,0.0,0.0,0.0,0.0,...,124647.0,126921.0,129167.0,7822.0,4417.0,4446.0,4597.0,4677.0,4698.0,0
8,500000.0,1,1,1,39.0,0.0,0.0,0.0,0.0,0.0,...,174500.0,137406.0,204975.0,54209.0,4607.0,4603.0,5224.0,207440.0,7509.0,0
9,230000.0,1,1,1,48.0,0.0,0.0,0.0,0.0,0.0,...,105508.0,108101.0,110094.0,7000.0,6607.0,3773.0,4290.0,4164.0,2000.0,0


In [31]:
# Build the model input by removing the target column from the sample.
df_inferences_use = df_inferences.drop(columns=['default_payment_next_month'])

## Cardinality

In [33]:
# Handling cardinality
# education_level codes 5, 6 and 0 are folded into 4; marital_status code 0
# is folded into 3. All other listed codes map to themselves.
category_remap = {
    'education_level': {1: 1, 2: 2, 3: 3, 4: 4, 5: 4, 6: 4, 0: 4},
    'marital_status': {1: 1, 2: 2, 3: 3, 0: 3},
}
df_inferences_use = df_inferences_use.replace(category_remap)

# In every repayment-status column the codes -1 and -2 are replaced by 0.
for pay_col in ('pay_0', 'pay_2', 'pay_3', 'pay_4', 'pay_5', 'pay_6'):
    df_inferences_use[pay_col] = df_inferences_use[pay_col].replace([-1, -2], [0, 0])


## Outlier

In [35]:
# Pass the prepared inference frame through the winsoriser object loaded from
# 'winsoriser.pkl'.
# NOTE(review): presumably this caps outliers using limits learned when the
# object was fitted - confirm against the training notebook.
df_inferences_out= winsoriser.transform(df_inferences_use)

## Splitting data before Scaling

In [36]:
# Separate the columns listed for scaling from the columns listed as not scaled.
df_inferences_sc = df_inferences_out.loc[:, num_cols_scaled]
df_inferences_nsc = df_inferences_out.loc[:, num_cols_non_scaled]

## Feature Scaling

In [37]:
# Transform the to-be-scaled columns with the scaler loaded from 'scaler.pkl'.
# NOTE: the result is bound to the same name as the input, so running this
# cell a second time would feed already-transformed values back into the scaler.
df_inferences_sc=scaler.transform(df_inferences_sc)

In [38]:
# Wrap the scaler output in a DataFrame and restore the scaled column names.
df_inferences_sc = pd.DataFrame(
    data=df_inferences_sc,
    columns=num_cols_scaled,
)

In [39]:
# Give both parts a fresh 0..n-1 index so they line up row by row when joined.
df_inferences_sc = df_inferences_sc.reset_index(drop=True)
df_inferences_nsc = df_inferences_nsc.reset_index(drop=True)

In [40]:
# Put the scaled and non-scaled columns back side by side in one frame.
df_con = pd.concat(
    objs=[df_inferences_sc, df_inferences_nsc],
    axis='columns',
)

In [41]:
# Columns handed to the model, in this exact order.
feature_columns = [
    'pay_0', 'pay_2', 'pay_3', 'pay_4', 'pay_5', 'pay_6',
    'bill_amt_1', 'bill_amt_2', 'bill_amt_3', 'bill_amt_4', 'bill_amt_5', 'bill_amt_6',
    'pay_amt_1', 'pay_amt_2', 'pay_amt_3', 'pay_amt_4', 'pay_amt_5', 'pay_amt_6',
]
feature = df_con.loc[:, feature_columns]

## Modeling

In [42]:
# Predict a class label for each inference row with the estimator loaded from
# 'adaboost_model.pkl'.
y_predict_inferences = best_estimator.predict(feature)

In [43]:
# Display the predicted labels, one per inference row.
y_predict_inferences 

array([1, 0, 1, 0, 0, 0, 1, 0, 0, 0], dtype=int64)

In [44]:
# Hold the predicted labels in a single-column frame named 'Prediction'.
df_y_predict_inferences = pd.DataFrame({'Prediction': y_predict_inferences})

In [45]:
# Attach the predictions to the original inference rows.
# pd.concat(axis=1) aligns on index labels, so the prediction frame is first
# given the same index as df_inferences. Without this, any sample whose index
# is not 0..n-1 would be padded with NaN instead of matched row by row.
# set_index raises ValueError if the two lengths differ.
df_inferences_final = pd.concat(
    [df_inferences, df_y_predict_inferences.set_index(df_inferences.index)],
    axis=1,
)

In [46]:
# Display the inference rows with the actual target and the 'Prediction' column.
df_inferences_final

Unnamed: 0,limit_balance,sex,education_level,marital_status,age,pay_0,pay_2,pay_3,pay_4,pay_5,...,bill_amt_5,bill_amt_6,pay_amt_1,pay_amt_2,pay_amt_3,pay_amt_4,pay_amt_5,pay_amt_6,default_payment_next_month,Prediction
0,80000.0,1,6,1,54.0,0.0,0.0,0.0,0.0,0.0,...,26210.0,17643.0,2545.0,2208.0,1336.0,2232.0,542.0,348.0,1,1
1,200000.0,1,4,1,49.0,0.0,0.0,0.0,0.0,0.0,...,50235.0,48984.0,1689.0,2164.0,2500.0,3480.0,2500.0,3000.0,0,0
2,20000.0,2,6,2,22.0,0.0,0.0,0.0,0.0,0.0,...,500.0,0.0,4641.0,1019.0,900.0,0.0,1500.0,0.0,1,1
3,260000.0,2,4,2,33.0,0.0,0.0,0.0,0.0,0.0,...,30767.0,29890.0,5000.0,5000.0,1137.0,5000.0,1085.0,5000.0,0,0
4,150000.0,1,4,2,32.0,0.0,0.0,0.0,-1.0,0.0,...,143375.0,146411.0,4019.0,146896.0,157436.0,4600.0,4709.0,5600.0,0,0
5,300000.0,2,4,2,32.0,0.0,0.0,0.0,0.0,0.0,...,-450.0,700.0,15235.0,1491.0,1303.0,0.0,2000.0,1400.0,0,0
6,130000.0,1,1,1,45.0,0.0,0.0,0.0,0.0,0.0,...,63832.0,65099.0,2886.0,2908.0,2129.0,2354.0,2366.0,2291.0,0,1
7,200000.0,1,1,1,58.0,0.0,0.0,0.0,0.0,0.0,...,126921.0,129167.0,7822.0,4417.0,4446.0,4597.0,4677.0,4698.0,0,0
8,500000.0,1,1,1,39.0,0.0,0.0,0.0,0.0,0.0,...,137406.0,204975.0,54209.0,4607.0,4603.0,5224.0,207440.0,7509.0,0,0
9,230000.0,1,1,1,48.0,0.0,0.0,0.0,0.0,0.0,...,108101.0,110094.0,7000.0,6607.0,3773.0,4290.0,4164.0,2000.0,0,0


Perbandingan 10 data awal dengan hasil prediksi menunjukkan hanya ada 1 prediksi yang salah (baris indeks 6: nilai aktual 0, diprediksi 1).

# 12. Kesimpulan

Exploratory Data Analysis (EDA) : 

* Sex `Female` mendominasi seluruh data dengan perbandingan 60.76 % : 39.24 % terhadap `male`
* Tipe education_level 2 (university) menjadi yang terbanyak dalam default_payment_next_month berkategori (0), jadi dapat disimpulkan **lulusan universitas mempunyai track record kredit yang baik**
* jumlah `limit_balance` yang ditanggung perusahaan sebesar 484190000.0 



Modeling : 

* Di antara seluruh model yang sudah diuji, `AdaBoost` mempunyai hasil yang lebih baik dibanding model yang lain
* hasil lebih baik di sini didasarkan pada nilai `std`
* model `AdaBoost` lebih baik setelah dilakukan hyperparameter tuning karena menghasilkan nilai `recall` yang lebih baik
* model `AdaBoost` tergolong overfitting

nilai `Accuracy`

* karena distribusi kelas target tidak seimbang maka kurang bijak jika menggunakan accuracy sebagai patokan untuk menilai model baik/tidak 

nilai `precision`  

* Jika customer yang seharusnya dapat membayar kredit tetapi terprediksi tidak dapat membayar (false positive) lalu perusahaan memblokir kreditnya, maka customer akan merasa kecewa dan hal ini mempengaruhi tingkat kepercayaan kepada perusahaan.



nilai `recall` 

* Jika customer yang seharusnya **tidak dapat** membayar kredit terprediksi bisa membayar kredit (false negative), tentunya ini berbahaya bagi keuangan perusahaan

Kesimpulan Bisnis : saya lebih memilih model yang tinggi tingkat prediksi `Recall` nya. Karena menurut saya lebih baik kehilangan customer daripada memberi customer kredit sementara ia tak bisa membayar. Jika seperti itu dapat mengganggu CashFlow perusahaan. Masalah customer bisa kita serahkan kepada Departemen Marketing untuk menarik Customer.

## Konseptual Problem

1. Apa yang dimaksud dengan criterion pada Decision Tree ? Jelaskan criterion yang kalian pakai dalam kasus ini !

2. Jelaskan apa yang dimaksud dengan pruning pada Tree-based model (alasan, definisi, jenis, dll) !

3. Bagaimana cara memilih K yang optimal pada KNN ?

4. Jelaskan apa yang dimaksud dengan Cross Validation !

5. Apa yang dimaksud dengan metrics-metrics berikut : Accuracy, Precision, Recall, F1 Score, dan kapan waktu yang tepat untuk menggunakannya ?

#### Jawaban

1.  Criterion adalah metode yang digunakan untuk mengukur kualitas pemisahan atau pemilihan fitur terbaik pada setiap langkah dalam pembentukan pohon. Pada kasus ini saya memakai `Gini Impurity` karena mengukur peluang sebuah sampel yang dipilih secara acak salah diklasifikasikan pada suatu simpul.

2. Pruning pada Tree adalah teknik untuk mengurangi kompleksitas pohon dan mencegah overfitting dengan menghapus simpul yang tidak relevan. Pruning memperbaiki overfitting dengan menghapus bagian pohon yang tidak relevan, dan karenanya memperkecil pohon. Secara umum, pruning dapat dilakukan dengan dua cara, yaitu pre-pruning dan post-pruning.

3. Cara memilih K yang optimal pada KNN adalah dengan mencoba beberapa nilai K dan membandingkan skor cross-validation-nya, misalnya menggunakan `RandomizedSearchCV`

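4. Cross-Validation adalah suatu metode untuk mengevaluasi kinerja model menggunakan data yang tersedia dengan membagi data menjadi beberapa fold yang bergantian menjadi data latih dan data validasi, sehingga membantu mendeteksi dan mencegah terjadinya overfitting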

5.  * Accuracy (Akurasi): Accuracy mengukur sejauh mana model berhasil mengklasifikasikan data dengan benar.
    * Precision (Presisi): Precision mengukur sejauh mana prediksi positif model adalah benar.
    * Recall (Recall atau Sensitivitas): Recall mengukur sejauh mana model dapat menemukan semua sampel positif yang ada.
    * F1 Score: F1 score adalah ukuran yang menggabungkan presisi (precision) dan recall melalui rata-rata harmonik keduanya