# I. PEMAHAMAN DOMAIN DAN TUJUAN

## 1.1. Pemahaman Penyakit Ginjal Kronis

Penyakit Ginjal Kronis (PGK) adalah suatu proses patofisiologis dengan etiologi yang beragam yang mengakibatkan penurunan fungsi ginjal secara progresif. Penurunan fungsi ini bersifat kronis dan ireversibel.

Data didapatkan dari UCI Machine Learning Repository. Dataset berisi 400 data dengan 25 atribut, terdiri dari 24 atribut prediktor dan 1 atribut kelas target.

## 1.2. Tujuan

Mengidentifikasi PGK menggunakan atribut-atribut yang paling berpengaruh.

# II. PEMBUATAN DATASET DAN TARGET

## 2.1 Data UCI

Data didapat dari: https://archive.ics.uci.edu/ml/datasets/chronic_kidney_disease

Data tersebut diimpor ke dalam MariaDB.

In [1]:
import numpy as np
import pandas as pd
import pymysql as pskl
import matplotlib.pyplot as plt

# Show full arrays when printing (disabled; newer NumPy rejects threshold=np.nan, use sys.maxsize instead)
# np.set_printoptions(threshold=np.nan)

In [2]:
def dataset(retrieve="numeric", id=0, target=0):
    """
    Load the CKD table from the local database and return it as a dataframe.

    parameters
    ----------
    retrieve: (all, numeric, polinom)
        all    : every column
        numeric: numeric columns only
        polinom: nominal columns only
    id: (0, 1)
        0: without the id column
        1: with the id column
    target: (0, 1)
        0: without the class column
        1: with the class column

    output
    ------
    dataframe with the selected columns; missing values are stored as NaN

    raises
    ------
    ValueError: when `retrieve` is not one of the supported selections
    """
    numeric_cols = ["age", "bp", "sg", "al", "su", "bgr", "bu", "sc", "sod", "pot", "hemo", "pcv", "wbcc", "rbcc"]
    nominal_cols = ["rbc", "pc", "pcc", "ba", "htn", "dm", "cad", "appet", "pe", "ane"]
    selections = {
        "all": numeric_cols + nominal_cols,
        "numeric": numeric_cols,
        "polinom": nominal_cols,
    }

    # Validate before touching the database so a typo fails fast and
    # never leaves a connection open.
    if retrieve not in selections:
        raise ValueError(
            "retrieve must be one of %s, got %r" % (sorted(selections), retrieve)
        )

    # id and class are always fetched and optionally dropped afterwards.
    cols = ["id"] + selections[retrieve] + ["class"]
    query = "SELECT " + ", ".join(cols) + " FROM ckd_preprocessing3"

    connection = pskl.connect(host="localhost", user="root", passwd="", database="knn")
    try:
        cursor = connection.cursor()
        try:
            cursor.execute(query)
            rows = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # Always release the connection, even when the query fails.
        connection.close()

    # Passing the column names here also keeps an empty result set well-formed.
    data = pd.DataFrame(list(rows), columns=cols)
    if id == 0:
        data = data.drop(["id"], axis=1)
    if target == 0:
        data = data.drop(["class"], axis=1)
    data = data.fillna(value=np.nan)  # turn missing values into NaN
    return data

In [3]:
# Preview the first rows of the full table (id and class are dropped by the defaults).
dataset(retrieve="all").head()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,...,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane
0,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,,,...,,normal,notpresent,notpresent,yes,yes,no,good,no,no
1,7.0,50.0,1.02,4.0,0.0,,18.0,0.8,,,...,,normal,notpresent,notpresent,no,no,no,good,no,no
2,62.0,80.0,1.01,2.0,3.0,423.0,53.0,1.8,,,...,normal,normal,notpresent,notpresent,no,yes,no,poor,no,yes
3,48.0,70.0,1.005,4.0,0.0,117.0,56.0,3.8,111.0,2.5,...,normal,abnormal,present,notpresent,yes,no,no,poor,yes,yes
4,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,,,...,normal,normal,notpresent,notpresent,no,no,no,good,no,no


## 2.2. Menentukan Atribut Target

Atribut "class" sebagai target dataset

# III. DATA CLEANING DAN PREPROCESSING

## 3.1 Menghilangkan Outliers

referensi: http://digilib.unila.ac.id/20585/4/II.%20TINJAUAN%20PUSTAKA.pdf

In [4]:
def outliers(df, k1=0.25, k3=0.75):
    """
    Flag the values of a dataframe that fall outside the lower/upper fences.

    parameters:
    -----------
    df: dataframe, numeric columns only
    k1: lower quantile used for the fences
    k3: upper quantile used for the fences

    For every column, with Q(k) the k-quantile of that column:
    lower fence = Q(k1) - (Q(k3) - Q(k1)) * 1.5
    upper fence = Q(k3) + (Q(k3) - Q(k1)) * 1.5

    output:
    -------
    boolean dataframe, True where a value lies below the lower fence or
    above the upper fence of its column
    """
    def flag_column(values):
        low_q = values.quantile(k1)
        high_q = values.quantile(k3)
        margin = (high_q - low_q) * 1.5
        below = values < low_q - margin
        above = values > high_q + margin
        return below | above

    return df.apply(flag_column, axis=0)

In [5]:
# Flag outliers on the numeric columns only (dataset() defaults to retrieve="numeric").
df = dataset()
pencilan = outliers(df)
pencilan.head()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,True,True,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,True,True,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,True,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [6]:
def outliers_removing(df, pencilan):
    """
    Drop every row that contains at least one flagged outlier.

    parameters:
    ----------
    df: dataframe to filter
    pencilan: boolean dataframe, True where a value is an outlier

    output:
    -------
    dataframe holding only the rows of df whose pencilan row is all False
    """
    has_outlier = pencilan.any(axis=1)
    keep_row = ~has_outlier
    return df[keep_row]

In [7]:
# Reload every column (keeping id) and drop the rows flagged in `pencilan`.
# Assumes the rows of `pencilan` line up with the rows of `df` (both are read
# from the same table) — TODO confirm.
df = dataset(retrieve="all", id=1)

filtered_df = outliers_removing(df, pencilan)
filtered_df.head()

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,...,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane
0,1,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,,...,,normal,notpresent,notpresent,yes,yes,no,good,no,no
4,5,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,,...,normal,normal,notpresent,notpresent,no,no,no,good,no,no
5,6,60.0,90.0,1.015,3.0,0.0,74.0,25.0,1.1,142.0,...,,,notpresent,notpresent,yes,yes,no,good,yes,no
13,14,68.0,70.0,,,,98.0,86.0,4.6,135.0,...,,,notpresent,notpresent,yes,yes,yes,poor,yes,no
16,17,47.0,70.0,1.015,2.0,0.0,99.0,46.0,2.2,138.0,...,,normal,notpresent,notpresent,no,no,no,good,no,no


# IV. TRANSFORMASI DATA

## 4.1 Data Nominal Menjadi Numeric

Mengubah atribut / kolom polinom menjadi numerik

Karena semua kolom nominal hanya memiliki 2 kelas, nilainya dapat ditransformasi menjadi 0 dan 1.

In [8]:
def encode(df, column="all_nominal"):
    """
    Return a copy of df in which nominal columns are replaced by integer codes.

    parameters:
    ----------
    df: input dataframe; it is never modified
    column: string, name of the nominal column to encode, or "all_nominal"
        to encode every known nominal column that is present in df

    output:
    ----------
    dataframe where each encoded column holds 0, 1, 2, ... assigned in order
    of first appearance within that column; missing values stay NaN

    notes:
    ----------
    Codes are assigned per column, so the same label (e.g. "yes") can map to
    different numbers in different columns.

    raises:
    ----------
    KeyError: when an explicitly named column does not exist in df
    """
    if column == "all_nominal":
        all_nominal = ["rbc", "pc", "pcc", "ba", "htn", "dm", "cad", "appet", "pe", "ane"]
        # Skip absent columns so partial frames can be encoded as well.
        targets = [name for name in all_nominal if name in df.columns]
    else:
        targets = [column]

    # One copy up front keeps the caller's frame unchanged.
    encoded_df = df.copy()
    for name in targets:
        # factorize numbers the labels in order of appearance and marks
        # missing values with -1.
        codes, _ = pd.factorize(encoded_df[name])
        numbered = pd.Series(codes, index=encoded_df.index)
        # Turn the -1 sentinel back into NaN so missing values stay missing.
        encoded_df[name] = numbered.where(codes >= 0)

    return encoded_df

In [21]:
# Encode all nominal columns of the outlier-free frame as numbers.
encoded_df = encode(filtered_df, "all_nominal")
encoded_df

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,...,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane
0,1,48.0,80.0,1.020,1.0,0.0,121.0,36.0,1.20,,...,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,51.0,80.0,1.010,2.0,0.0,106.0,26.0,1.40,,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
5,6,60.0,90.0,1.015,3.0,0.0,74.0,25.0,1.10,142.0,...,,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
13,14,68.0,70.0,,,,98.0,86.0,4.60,135.0,...,,,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
16,17,47.0,70.0,1.015,2.0,0.0,99.0,46.0,2.20,138.0,...,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
17,18,47.0,80.0,,,,114.0,87.0,5.20,139.0,...,,,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
19,20,62.0,60.0,1.015,1.0,0.0,100.0,31.0,1.60,,...,,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
23,24,21.0,70.0,1.010,0.0,0.0,,,,,...,,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0
25,26,61.0,60.0,1.025,0.0,0.0,108.0,75.0,1.90,141.0,...,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
26,27,75.0,80.0,1.015,0.0,0.0,156.0,45.0,2.40,140.0,...,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [10]:
# Number of columns in the encoded frame.
len(encoded_df.columns)

25

In [11]:
from sklearn.preprocessing import Imputer

In [23]:
# Fit a mean imputer on the "rbc" column only.
# NOTE(review): `Imputer` is no longer shipped by recent scikit-learn releases
# (sklearn.impute.SimpleImputer replaces it) — confirm against the installed version.
imputer = Imputer(missing_values="NaN", strategy="mean", axis = 0)    
imputer = imputer.fit(encoded_df[["rbc"]])
imputer

Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)

In [36]:
# Apply the fitted imputer to the "rbc" column; the result is a NumPy array, not a dataframe.
filledmissing_df = imputer.transform(encoded_df[["rbc"]])
filledmissing_df

array([[ 0.08383234],
       [ 0.        ],
       [ 0.08383234],
       [ 0.08383234],
       [ 0.08383234],
       [ 0.08383234],
       [ 0.08383234],
       [ 0.08383234],
       [ 0.08383234],
       [ 0.08383234],
       [ 1.        ],
       [ 0.08383234],
       [ 0.        ],
       [ 0.08383234],
       [ 1.        ],
       [ 0.        ],
       [ 0.08383234],
       [ 0.08383234],
       [ 0.08383234],
       [ 0.08383234],
       [ 0.08383234],
       [ 1.        ],
       [ 0.08383234],
       [ 0.08383234],
       [ 0.08383234],
       [ 1.        ],
       [ 0.08383234],
       [ 0.        ],
       [ 0.08383234],
       [ 0.08383234],
       [ 0.08383234],
       [ 0.08383234],
       [ 0.08383234],
       [ 0.        ],
       [ 0.08383234],
       [ 0.08383234],
       [ 0.        ],
       [ 0.08383234],
       [ 0.08383234],
       [ 0.08383234],
       [ 1.        ],
       [ 0.08383234],
       [ 0.08383234],
       [ 0.08383234],
       [ 0.08383234],
       [ 0

In [59]:
# Flatten the imputed values to a one-dimensional array.
df_change = filledmissing_df.ravel()
df_change

array([ 0.08383234,  0.        ,  0.08383234,  0.08383234,  0.08383234,
        0.08383234,  0.08383234,  0.08383234,  0.08383234,  0.08383234,
        1.        ,  0.08383234,  0.        ,  0.08383234,  1.        ,
        0.        ,  0.08383234,  0.08383234,  0.08383234,  0.08383234,
        0.08383234,  1.        ,  0.08383234,  0.08383234,  0.08383234,
        1.        ,  0.08383234,  0.        ,  0.08383234,  0.08383234,
        0.08383234,  0.08383234,  0.08383234,  0.        ,  0.08383234,
        0.08383234,  0.        ,  0.08383234,  0.08383234,  0.08383234,
        1.        ,  0.08383234,  0.08383234,  0.08383234,  0.08383234,
        0.08383234,  0.08383234,  1.        ,  1.        ,  0.08383234,
        1.        ,  0.08383234,  0.08383234,  0.08383234,  0.08383234,
        0.08383234,  0.08383234,  0.08383234,  0.08383234,  1.        ,
        1.        ,  0.08383234,  0.08383234,  1.        ,  0.08383234,
        0.        ,  0.08383234,  1.        ,  0.08383234,  0.08

In [54]:
# The same flattened values as a plain Python list.
change = filledmissing_df.ravel().tolist()

In [67]:
# Wrap the flattened values in a dataframe and show its single column.
change_df = pd.DataFrame(filledmissing_df.ravel())
change_df[0]

0      0.083832
1      0.000000
2      0.083832
3      0.083832
4      0.083832
5      0.083832
6      0.083832
7      0.083832
8      0.083832
9      0.083832
10     1.000000
11     0.083832
12     0.000000
13     0.083832
14     1.000000
15     0.000000
16     0.083832
17     0.083832
18     0.083832
19     0.083832
20     0.083832
21     1.000000
22     0.083832
23     0.083832
24     0.083832
25     1.000000
26     0.083832
27     0.000000
28     0.083832
29     0.083832
         ...   
221    0.000000
222    0.000000
223    0.000000
224    0.000000
225    0.000000
226    0.000000
227    0.000000
228    0.000000
229    0.000000
230    0.000000
231    0.000000
232    0.083832
233    0.000000
234    0.000000
235    0.000000
236    0.000000
237    0.000000
238    0.000000
239    0.000000
240    0.000000
241    0.000000
242    0.000000
243    0.000000
244    0.000000
245    0.000000
246    0.000000
247    0.000000
248    0.000000
249    0.000000
250    0.000000
Name: 0, Length: 251, dt

In [81]:
# Write the imputed values back into the "rbc" column.
encoded_df["rbc"] = df_change
encoded_df

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,...,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane
0,1,48.0,80.0,1.020,1.0,0.0,121.0,36.0,1.20,,...,0.083832,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,51.0,80.0,1.010,2.0,0.0,106.0,26.0,1.40,,...,0.000000,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
5,6,60.0,90.0,1.015,3.0,0.0,74.0,25.0,1.10,142.0,...,0.083832,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
13,14,68.0,70.0,,,,98.0,86.0,4.60,135.0,...,0.083832,,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
16,17,47.0,70.0,1.015,2.0,0.0,99.0,46.0,2.20,138.0,...,0.083832,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
17,18,47.0,80.0,,,,114.0,87.0,5.20,139.0,...,0.083832,,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
19,20,62.0,60.0,1.015,1.0,0.0,100.0,31.0,1.60,,...,0.083832,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
23,24,21.0,70.0,1.010,0.0,0.0,,,,,...,0.083832,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0
25,26,61.0,60.0,1.025,0.0,0.0,108.0,75.0,1.90,141.0,...,0.083832,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
26,27,75.0,80.0,1.015,0.0,0.0,156.0,45.0,2.40,140.0,...,0.083832,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [88]:
def missing_handling(df, column="all", method="mean"):
    """
    Return a copy of df in which missing values are filled per column.

    parameters:
    ----------
    df: input dataframe; it is never modified. The columns to fill must
        already be numeric (nominal columns encoded beforehand).
    column: string, name of the column to fill, or "all" to fill every known
        dataset column that is present in df
    method: (mean, median, most_frequent)
        statistic of the non-missing values of a column that replaces its
        missing values; ties for most_frequent resolve to the smallest value

    output:
    ----------
    dataframe whose processed columns are float and contain no NaN, unless a
    column has no observed value at all, in which case it stays NaN

    raises:
    ----------
    ValueError: when `method` is not a supported strategy
    KeyError: when an explicitly named column does not exist in df
    """
    # Implemented with pandas so the function does not depend on sklearn's
    # `Imputer`, which newer scikit-learn releases no longer provide.
    def most_frequent(values):
        modes = values.mode()  # sorted ascending, NaN ignored
        return modes.iloc[0] if len(modes) else np.nan

    strategies = {
        "mean": lambda values: values.mean(),
        "median": lambda values: values.median(),
        "most_frequent": most_frequent,
    }
    if method not in strategies:
        raise ValueError(
            "method must be one of %s, got %r" % (sorted(strategies), method)
        )

    if column == "all":
        all_col = ["age", "bp", "sg", "al", "su", "bgr", "bu", "sc", "sod", "pot", "hemo", "pcv", "wbcc", "rbcc", "rbc", "pc", "pcc", "ba", "htn", "dm", "cad", "appet", "pe", "ane"]
        # Skip absent columns so partial frames can be processed as well.
        targets = [name for name in all_col if name in df.columns]
    else:
        targets = [column]

    # Work on a copy: the caller's frame must not change.
    copy_df = df.copy()
    fill_value = strategies[method]
    for name in targets:
        values = copy_df[name].astype(float)
        copy_df[name] = values.fillna(fill_value(values))
    return copy_df

In [93]:
# Fill the missing values of every column using the default "mean" method.
a = missing_handling(encoded_df, "all")
a

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,...,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane
0,1,48.0,80.0,1.020000,1.000000,0.0,121.000000,36.000000,1.200000,140.50495,...,0.083832,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,51.0,80.0,1.010000,2.000000,0.0,106.000000,26.000000,1.400000,140.50495,...,0.000000,0.000000,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
5,6,60.0,90.0,1.015000,3.000000,0.0,74.000000,25.000000,1.100000,142.00000,...,0.083832,0.100917,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
13,14,68.0,70.0,1.019432,0.458515,0.0,98.000000,86.000000,4.600000,135.00000,...,0.083832,0.100917,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
16,17,47.0,70.0,1.015000,2.000000,0.0,99.000000,46.000000,2.200000,138.00000,...,0.083832,0.000000,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
17,18,47.0,80.0,1.019432,0.458515,0.0,114.000000,87.000000,5.200000,139.00000,...,0.083832,0.100917,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
19,20,62.0,60.0,1.015000,1.000000,0.0,100.000000,31.000000,1.600000,140.50495,...,0.083832,1.000000,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
23,24,21.0,70.0,1.010000,0.000000,0.0,116.254386,38.378571,1.290966,140.50495,...,0.083832,0.000000,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0
25,26,61.0,60.0,1.025000,0.000000,0.0,108.000000,75.000000,1.900000,141.00000,...,0.083832,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
26,27,75.0,80.0,1.015000,0.000000,0.0,156.000000,45.000000,2.400000,140.00000,...,0.083832,0.000000,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
