<a href="https://colab.research.google.com/github/ingwerludwig/machine-learning-bpjs/blob/main/BPJS_Train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import dask.dataframe as dd
warnings.filterwarnings("ignore")
pd.set_option('max_column',None)

# Import Dataset

In [40]:
from google.colab import drive
drive.mount('/content/gdrive')
df = pd.read_csv('/content/gdrive/My Drive/BPJS_train.csv',chunksize=10000)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [41]:
test_df = pd.read_csv('/content/gdrive/My Drive/BPJS_test.csv',chunksize=1)

In [42]:
for chunk in test_df:
  display(chunk.info())
  break

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 26 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Unnamed: 0                   1 non-null      int64  
 1   no_peserta                   1 non-null      float64
 2   no_keluarga                  1 non-null      int64  
 3   bobot                        1 non-null      float64
 4   id_kunjungan_fktp            1 non-null      int64  
 5   tgl_dtg_fktp                 1 non-null      object 
 6   tgl_plg_fktp                 1 non-null      object 
 7   provinsi_fktp                1 non-null      int64  
 8   kab/kota_fktp                1 non-null      int64  
 9   kepemilikan_fktp             1 non-null      int64  
 10  jenis_fktp                   1 non-null      int64  
 11  tipe_fktp                    1 non-null      int64  
 12  tingkat_pelayanan_fktp       1 non-null      int64  
 13  jenis_poli_fktp         

None

In [43]:
for chunk in df:
  display(chunk.info())
  break

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 27 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Unnamed: 0                   10000 non-null  int64  
 1   no_peserta                   10000 non-null  float64
 2   no_keluarga                  10000 non-null  int64  
 3   bobot                        10000 non-null  float64
 4   id_kunjungan_fktp            10000 non-null  object 
 5   tgl_dtg_fktp                 10000 non-null  object 
 6   tgl_plg_fktp                 10000 non-null  object 
 7   provinsi_fktp                10000 non-null  int64  
 8   kab/kota_fktp                10000 non-null  int64  
 9   kepemilikan_fktp             10000 non-null  int64  
 10  jenis_fktp                   10000 non-null  int64  
 11  tipe_fktp                    10000 non-null  int64  
 12  tingkat_pelayanan_fktp       10000 non-null  int64  
 13  jenis_poli_fktp  

None

In [25]:
full_df = pd.DataFrame()
for chunk in df:
    full_df = pd.concat([full_df,chunk])

In [44]:
print('Rows length : ' + (str(full_df.shape[0])))
print('Column length : '+ (str(full_df.shape[1])))

Rows length : 4056898
Column length : 27


# Column Analysis


Details Column Description
---


1. bobot --> (maybe urgency or not) <br>
2. kepemilikan_fktp --> (related to swasta, pemerintah, etc)<br>
reference [here](https://www.dinkes.jogjaprov.go.id/berita/detail/kebijakan-akreditasi-fasilitas-kesehatan-tingkat-pertama-klinik-pratama)<br>
3. jenis_fktp --> (related to hospital, clinic, etc)<br>
reference [here](https://lifepal.co.id/media/fktp-adalah-fasilitas-kesehatan-tingkat-pertama/)
4. tipe_fktp (?) <br>
5. tingkat_pelayanan_fktp --> (?) <br>
6. segmen_peserta --> (related to Pekerja Penerima Upah (PPU), PD Pemda, Pekerja Bukan Penerima Upah (PBPU) dan Bukan Pekerja (BP), serta Penerima Bantuan Iuran Jaminan Kesehatan (PBI JK)) <br>
reference [here](https://money.kompas.com/read/2022/09/16/093043026/iuran-bpjs-kesehatan-2022-sesuai-jenis-kepesertaan#:~:text=Terdapat%20empat%20jenis%20kepesertaan%20BPJS,Jaminan%20Kesehatan%20(PBI%20JK).)
7. nama_diagnosis_fkp15	--> (doctor diagnosis) <br>
8. kode_diagnosis_ICD10 --> (related to nama_diagnosis nya) <br>
9. kode_diagnosis --> (related to kode_diagnosis_ICD10) <br>
10. kode_nama_ICD10 --> (related to diagnosis) <br>
11. jenis_kunjungan_fktp --> (related to kontrol, rawat jalan, etc)<br>
reference [here](https://blog.assist.id/perbedaan-kunjungan-sakit-dengan-perawatan-saat-mendaftarkan-kunjungan-pasien/)

<br>

Column that may not be required
---
1. id_kunjungan_fktp (maybe not required) <br>
2. no_keluarga (maybe not required) <br>
3. tgl_dtg_fktp (maybe not required) <br>
4. tgl_plg_fktp (maybe not required) <br>


# Exploratory Data Analysis

In [48]:
temp_df = full_df.iloc[1:1000000]

In [49]:
df_peserta_occurences = temp_df[['no_peserta']]
df_peserta_occurences = temp_df.groupby(['no_peserta'])['no_peserta'].count().reset_index(name="banyak_kedatangan")
df_peserta_occurences.sort_values(by=['banyak_kedatangan'],ascending=False)

Unnamed: 0,no_peserta,banyak_kedatangan
325993,100846584.0,67
120975,37301192.0,57
196054,60566306.0,56
44708,13821901.0,54
237566,73374830.0,53
...,...,...
103883,31984491.0,1
103881,31984411.0,1
103880,31984182.0,1
103879,31984056.0,1


Patient maybe come to medical center more than once as their needs

In [None]:
df_sehat = temp_df[(temp_df.kelas_status == 'Sehat')]
df_sehat

Unnamed: 0.1,Unnamed: 0,no_peserta,no_keluarga,bobot,id_kunjungan_fktp,tgl_dtg_fktp,tgl_plg_fktp,provinsi_fktp,kab/kota_fktp,kepemilikan_fktp,jenis_fktp,tipe_fktp,tingkat_pelayanan_fktp,jenis_poli_fktp,segmen_peserta,kode_nama_ICD10,kode_diagnosis_ICD10,kode_diagnosis,nama_diagnosis_fkp15,provisin_faskes_rujukan,kab/kota_faskes_rujukan,kepemilikan_faskes_rujukan,jenis_faskes_tujuan_rujukan,tipe_faskes_tujuan_rujukan,poli_faskes_tujuan_rujukan,jenis_kunjungan_fktp,kelas_status
6,6,96816042.0,58294471,11.555749,920145688,2019-07-17,2019-07-17,18,1801,9,3,2,1,12.0,2,9999,,9999,9999,98,9998,98,98,98,98,2,Sehat
7,7,52210982.0,14650656,17.963938,1047912445,2019-11-07,2019-11-07,33,3374,9,2,1,1,12.0,5,9999,,9999,9999,98,9998,98,98,98,98,2,Sehat
8,8,80877043.0,80877043,1.890941,881495888,2019-06-13,2019-06-13,51,5103,9,3,2,1,13.0,2,9999,,9999,9999,98,9998,98,98,98,98,2,Sehat
10,10,52579871.0,73011766,15.022474,852281724,2019-05-04,2019-05-04,63,6309,3,1,4,1,13.0,5,9999,,9999,9999,98,9998,98,98,98,98,2,Sehat
12,12,33986049.0,33986049,27.628746,1187702881,2020-03-17,2020-03-17,73,7317,9,3,2,1,13.0,2,9999,,9999,9999,98,9998,98,98,98,98,2,Sehat
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999985,999985,172482958.0,172482952,6.093031,1381154808,2020-11-07,2020-11-07,13,1373,3,1,3,1,12.0,3,9999,,9999,9999,98,9998,98,98,98,98,2,Sehat
999990,999990,19914558.0,63597457,44.016899,1161274986,2020-02-15,2020-02-15,61,6101,3,1,4,1,12.0,4,9999,,9999,9999,98,9998,98,98,98,98,2,Sehat
999991,999991,76094629.0,54632889,16.283100,1295213060,2020-08-11,2020-08-11,16,1605,3,1,3,1,28.0,5,9999,,9999,9999,98,9998,98,98,98,98,2,Sehat
999995,999995,35886009.0,64538089,67.338501,1264746246,2020-07-02,2020-07-02,74,7471,3,1,3,1,12.0,2,9999,,9999,9999,98,9998,98,98,98,98,2,Sehat


9999 and 9998 are indicate patients have already cured

In [None]:
df.isnull().sum()

Unnamed: 0                           0
no_peserta                           0
no_keluarga                          0
bobot                                0
id_kunjungan_fktp                    0
tgl_dtg_fktp                         0
tgl_plg_fktp                         0
provinsi_fktp                        0
kab/kota_fktp                        0
kepemilikan_fktp                     0
jenis_fktp                           0
tipe_fktp                            0
tingkat_pelayanan_fktp               0
jenis_poli_fktp                      1
segmen_peserta                       0
kode_nama_ICD10                      0
kode_diagnosis_ICD10           1380241
kode_diagnosis                       0
nama_diagnosis_fkp15                 0
provisin_faskes_rujukan              0
kab/kota_faskes_rujukan              0
kepemilikan_faskes_rujukan           0
jenis_faskes_tujuan_rujukan          0
tipe_faskes_tujuan_rujukan           0
poli_faskes_tujuan_rujukan           0
jenis_kunjungan_fktp     

In [None]:
print('Null data diagnosis classification and has been cured : '+ str(round((len(df[(df.nama_diagnosis_fkp15 == '9999') & (df.kelas_status == 'Sehat')])/df.shape[0])*100,2))  + '%')

Null data diagnosis classification and has been cured : 34.02%


# Check all unique values independent variable for further explanation 

# Feature Engineering

In [None]:
id_columns = ['no_keluarga','id_kunjungan_fktp','tgl_dtg_fktp','tgl_plg_fktp']
df = df.drop(id_columns,axis=1)

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,bobot,provinsi_fktp,kab/kota_fktp,kepemilikan_fktp,jenis_fktp,tipe_fktp,tingkat_pelayanan_fktp,jenis_poli_fktp,segmen_peserta,kode_nama_ICD10,kode_diagnosis_ICD10,kode_diagnosis,nama_diagnosis_fkp15,provisin_faskes_rujukan,kab/kota_faskes_rujukan,kepemilikan_faskes_rujukan,jenis_faskes_tujuan_rujukan,tipe_faskes_tujuan_rujukan,poli_faskes_tujuan_rujukan,jenis_kunjungan_fktp,kelas_status
0,0,20.064983,61,6108,3,1,4,1,12.0,2,9999,,9999,9999,98,9998,98,98,98,98,2,Sehat
1,1,2.626307,51,5171,9,3,2,1,1.0,4,773,K29,K297,"Gastritis, unspecified",98,9998,98,98,98,98,1,Belum_Sehat
2,2,1.05183,35,3516,9,2,1,1,3.0,4,1757,Z30,Z309,"Contraceptive management, unspecified",98,9998,98,98,98,98,1,Belum_Sehat
3,3,364.741455,34,3402,3,1,4,1,1.0,5,621,I10,I10,Essential (primary) hypertension,98,9998,98,98,98,98,1,Belum_Sehat
4,4,1.050523,35,3509,3,1,4,1,1.0,1,622,I11,I110,Hypertensive heart disease with (congestive) h...,98,9998,98,98,98,98,1,Belum_Sehat
