# I. PEMAHAMAN DOMAIN DAN TUJUAN

## 1.1. Pemahaman Penyakit Ginjal Kronis

Penyakit Ginjal Kronis (PGK) adalah suatu proses patofisiologis dengan etiologi yang beragam yang mengakibatkan penurunan fungsi ginjal secara progresif. Penurunan fungsi ini bersifat kronis dan ireversibel.

Data didapatkan dari UCI Machine Learning Repository. Dataset berisi 400 data dengan 25 atribut, yang terdiri dari 24 atribut prediktor dan 1 atribut kelas target.

## 1.2. Tujuan

Identifikasi PGK dengan atribut yang paling berpengaruh

# II. PEMBUATAN DATASET DAN TARGET

## 2.1 Data UCI

Data didapat dari: https://archive.ics.uci.edu/ml/datasets/chronic_kidney_disease

Data tersebut diimpor ke dalam MariaDB.

In [5]:
import numpy as np
import pandas as pd
import pymysql as pskl
import matplotlib.pyplot as plt

# Menampilkan semua array
# np.set_printoptions(threshold=np.nan)

In [6]:
def dataset(retrieve="numeric", id=0, target=0):
    """Load the CKD table from the local database into a DataFrame.

    Rows are read from table ``ckd_preprocessing3`` of database ``knn`` on
    ``localhost``.

    Parameters
    ----------
    retrieve : {"all", "numeric", "polinom"}
        all     : every feature column
        numeric : numeric feature columns only
        polinom : nominal feature columns only
    id : {0, 1}
        0 drops the "id" column, any other value keeps it.
    target : {0, 1}
        0 drops the "class" column, any other value keeps it.

    Returns
    -------
    pandas.DataFrame
        The selected columns, with missing values represented as NaN.

    Raises
    ------
    ValueError
        If ``retrieve`` is not one of the supported selections.
    """
    numeric_cols = ["age", "bp", "sg", "al", "su", "bgr", "bu", "sc", "sod",
                    "pot", "hemo", "pcv", "wbcc", "rbcc"]
    nominal_cols = ["rbc", "pc", "pcc", "ba", "htn", "dm", "cad", "appet",
                    "pe", "ane"]
    selections = {
        "all": numeric_cols + nominal_cols,
        "numeric": numeric_cols,
        "polinom": nominal_cols,
    }
    # Validate before touching the database so a typo fails fast and the raw
    # argument is never sent to the server as SQL.
    if retrieve not in selections:
        raise ValueError(
            "retrieve must be one of %s, got %r" % (sorted(selections), retrieve)
        )

    # "id" and "class" are always fetched and dropped afterwards on request.
    cols = ["id"] + selections[retrieve] + ["class"]
    query = "SELECT " + ", ".join(cols) + " FROM ckd_preprocessing3"

    connection = pskl.connect(host="localhost", user="root", passwd="", database="knn")
    try:
        with connection.cursor() as cursor:
            cursor.execute(query)
            rows = cursor.fetchall()
    finally:
        # Release the connection even when the query fails.
        connection.close()

    # Passing the column names here also works for an empty result set.
    data = pd.DataFrame(list(rows), columns=cols)
    if id == 0:
        data = data.drop(["id"], axis=1)
    if target == 0:
        data = data.drop(["class"], axis=1)
    # Represent missing values as NaN.
    data = data.fillna(value=np.nan)
    return data

In [7]:
# Preview the first rows of every feature column; "id" and "class" are
# dropped because dataset() defaults to id=0 and target=0.
dataset(retrieve="all").head()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,...,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane
0,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,,,...,,normal,notpresent,notpresent,yes,yes,no,good,no,no
1,7.0,50.0,1.02,4.0,0.0,,18.0,0.8,,,...,,normal,notpresent,notpresent,no,no,no,good,no,no
2,62.0,80.0,1.01,2.0,3.0,423.0,53.0,1.8,,,...,normal,normal,notpresent,notpresent,no,yes,no,poor,no,yes
3,48.0,70.0,1.005,4.0,0.0,117.0,56.0,3.8,111.0,2.5,...,normal,abnormal,present,notpresent,yes,no,no,poor,yes,yes
4,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,,,...,normal,normal,notpresent,notpresent,no,no,no,good,no,no


## 2.2. Menentukan Atribut Target

Atribut "class" sebagai target dataset

# III. DATA CLEANING DAN PREPROCESSING

## 3.1 Menghilangkan Outliers

referensi: http://digilib.unila.ac.id/20585/4/II.%20TINJAUAN%20PUSTAKA.pdf

In [8]:
def outliers(df, k1=0.25, k3=0.75):
    """Flag the values of each column that fall outside its quantile fences.

    Parameters
    ----------
    df : DataFrame containing numeric columns only.
    k1 : lower quantile level (default 0.25).
    k3 : upper quantile level (default 0.75).

    Returns
    -------
    Boolean DataFrame shaped like ``df``; True marks a value that is
    below  Q(k1) - 1.5 * (Q(k3) - Q(k1))  or
    above  Q(k3) + 1.5 * (Q(k3) - Q(k1)),
    where Q is the quantile of that value's own column.
    """
    def _outside_fences(series):
        low_q = series.quantile(k1)
        high_q = series.quantile(k3)
        margin = (high_q - low_q) * 1.5
        too_small = series < low_q - margin
        too_large = series > high_q + margin
        return too_small | too_large

    return df.apply(_outside_fences, axis=0)

In [9]:
# Flag outliers on the numeric columns only: dataset() defaults to
# retrieve="numeric" without the "id" and "class" columns.
df = dataset()
pencilan = outliers(df)
pencilan.head()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,True,True,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,True,True,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,True,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [43]:
def outliers_removing(df, pencilan):
    """Drop every row that has at least one flagged outlier.

    Parameters
    ----------
    df : DataFrame to filter.
    pencilan : boolean DataFrame of outlier flags (True = outlier).

    Returns
    -------
    The rows of ``df`` for which no column of ``pencilan`` is True.
    """
    row_has_outlier = pencilan.any(axis=1)
    keep_rows = ~row_has_outlier
    return df[keep_rows]

In [44]:
# Reload every feature column (keeping "id") and drop the rows flagged in
# `pencilan`.
# NOTE(review): `pencilan` comes from a separate query; this assumes both
# queries return the rows in the same order (no ORDER BY is used) - confirm.
df = dataset(retrieve="all", id=1)

filtered_df = outliers_removing(df, pencilan)
filtered_df.head()

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,...,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane
0,1,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,,...,,normal,notpresent,notpresent,yes,yes,no,good,no,no
4,5,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,,...,normal,normal,notpresent,notpresent,no,no,no,good,no,no
5,6,60.0,90.0,1.015,3.0,0.0,74.0,25.0,1.1,142.0,...,,,notpresent,notpresent,yes,yes,no,good,yes,no
13,14,68.0,70.0,,,,98.0,86.0,4.6,135.0,...,,,notpresent,notpresent,yes,yes,yes,poor,yes,no
16,17,47.0,70.0,1.015,2.0,0.0,99.0,46.0,2.2,138.0,...,,normal,notpresent,notpresent,no,no,no,good,no,no


# IV. TRANSFORMASI DATA

## 4.1 Data Nominal Menjadi Numeric

Mengubah atribut / kolom polinom menjadi numerik

Karena semua kolom nominal hanya memiliki 2 kelas, kolom-kolom tersebut dapat ditransformasi menjadi 0 dan 1.

In [45]:
def encode(df, column="all_nominal"):
    """Replace the labels of nominal columns with integer codes.

    Parameters
    ----------
    df : input DataFrame; it is not modified.
    column : name of the single nominal column to encode, or
        "all_nominal" to encode every nominal column of the dataset.

    Returns
    -------
    A copy of ``df`` in which each distinct label of the selected column(s)
    is replaced by 0, 1, ... in order of first appearance. NaN entries are
    left untouched.
    """
    if column == "all_nominal":
        targets = ["rbc", "pc", "pcc", "ba", "htn", "dm", "cad", "appet", "pe", "ane"]
    else:
        targets = [column]

    # Work on a copy so the caller's DataFrame stays unchanged.
    copy_df = df.copy()
    for name in targets:
        seen = copy_df[name].unique().tolist()
        # Skip the NaN marker so missing entries do not get a code.
        labels = [value for value in seen if str(value) != 'nan']
        codes = list(range(len(labels)))
        copy_df[name] = copy_df[name].replace(labels, codes)

    return copy_df

In [46]:
# Encode all nominal columns of the outlier-free data as integer codes.
encoded_df = encode(filtered_df, "all_nominal")
encoded_df.head()

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,...,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane
0,1,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,,...,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
5,6,60.0,90.0,1.015,3.0,0.0,74.0,25.0,1.1,142.0,...,,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
13,14,68.0,70.0,,,,98.0,86.0,4.6,135.0,...,,,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
16,17,47.0,70.0,1.015,2.0,0.0,99.0,46.0,2.2,138.0,...,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0


## 4.2 Penanganan Data yang Hilang

Mengganti nilai NaN dengan nilai hasil strategi tertentu; dalam hal ini NaN diganti dengan rata-rata dari setiap kolom.

In [14]:
from sklearn.preprocessing import Imputer

In [41]:
def missing_handling(df, column="all", method="mean"):
    """Fill missing values (NaN) in one or all feature columns.

    Implemented with pandas rather than ``sklearn.preprocessing.Imputer``,
    which newer scikit-learn releases no longer provide.

    Parameters
    ----------
    df : input DataFrame; it is not modified.
    column : name of the column to impute, or "all" to impute every feature
        column of the dataset.
    method : imputation strategy: "mean", "median" or "most_frequent".

    Returns
    -------
    A copy of ``df`` whose selected column(s) are float columns with each
    NaN replaced by that column's statistic. A column without any observed
    value is left as NaN.

    Raises
    ------
    ValueError
        If ``method`` is not a supported strategy, or a selected column
        holds values that cannot be converted to float.
    KeyError
        If a selected column is not present in ``df``.
    """
    if method not in ("mean", "median", "most_frequent"):
        raise ValueError(
            "method must be 'mean', 'median' or 'most_frequent', got %r" % (method,)
        )

    if column == "all":
        targets = ["age", "bp", "sg", "al", "su", "bgr", "bu", "sc", "sod", "pot",
                   "hemo", "pcv", "wbcc", "rbcc", "rbc", "pc", "pcc", "ba", "htn",
                   "dm", "cad", "appet", "pe", "ane"]
    else:
        targets = [column]

    copy_df = df.copy()
    for name in targets:
        # Encoded columns may arrive as object dtype; compute on floats.
        values = copy_df[name].astype(float)
        if method == "mean":
            fill = values.mean()
        elif method == "median":
            fill = values.median()
        else:
            # mode() is sorted, so ties resolve to the smallest value.
            modes = values.mode()
            fill = modes.iloc[0] if len(modes) else np.nan
        # With no observed value there is nothing to impute with.
        if pd.notna(fill):
            values = values.fillna(fill)
        copy_df[name] = values
    return copy_df

In [42]:
# Impute every feature column of the encoded data; the method argument is
# omitted, so the default "mean" strategy is used.
missing_handling_df = missing_handling(encoded_df, "all")
missing_handling_df.head()

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,...,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane
0,1,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,140.50495,...,0.083832,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,140.50495,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
5,6,60.0,90.0,1.015,3.0,0.0,74.0,25.0,1.1,142.0,...,0.083832,0.100917,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
13,14,68.0,70.0,1.019432,0.458515,0.0,98.0,86.0,4.6,135.0,...,0.083832,0.100917,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
16,17,47.0,70.0,1.015,2.0,0.0,99.0,46.0,2.2,138.0,...,0.083832,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0


## 4.3 Normalisasi

In [29]:
from sklearn.preprocessing import MinMaxScaler

In [30]:
# Scaler configured for the target range (0, 1).
# NOTE(review): the name `normalization` is rebound later in this file by a
# function of the same name, so this cell only works before that definition.
normalization = MinMaxScaler(feature_range=(0,1))

In [32]:
# Fit the scaler on the "age" column and scale it; the double brackets pass
# a one-column DataFrame, and the result is a 2-D array with one column.
normalization_array = normalization.fit_transform(missing_handling_df[["age"]])
normalization_array

array([[ 0.46835443],
       [ 0.50632911],
       [ 0.62025316],
       [ 0.72151899],
       [ 0.4556962 ],
       [ 0.4556962 ],
       [ 0.64556962],
       [ 0.12658228],
       [ 0.63291139],
       [ 0.81012658],
       [ 0.72151899],
       [ 0.78481013],
       [ 0.82278481],
       [ 0.7721519 ],
       [ 0.73417722],
       [ 0.44303797],
       [ 0.43037975],
       [ 0.46835443],
       [ 0.        ],
       [ 0.53164557],
       [ 0.53164557],
       [ 0.30379747],
       [ 0.05063291],
       [ 0.55696203],
       [ 0.41772152],
       [ 0.70886076],
       [ 0.6835443 ],
       [ 0.44303797],
       [ 0.70886076],
       [ 0.74683544],
       [ 0.56962025],
       [ 0.79746835],
       [ 0.34177215],
       [ 0.46835443],
       [ 0.74683544],
       [ 0.49367089],
       [ 0.75949367],
       [ 0.6835443 ],
       [ 0.64556962],
       [ 0.6835443 ],
       [ 0.29113924],
       [ 0.75949367],
       [ 0.07594937],
       [ 0.55696203],
       [ 0.43037975],
       [ 0

In [34]:
# Flatten the one-column 2-D array into a 1-D array of scaled ages.
change = normalization_array.ravel()
change

array([ 0.46835443,  0.50632911,  0.62025316,  0.72151899,  0.4556962 ,
        0.4556962 ,  0.64556962,  0.12658228,  0.63291139,  0.81012658,
        0.72151899,  0.78481013,  0.82278481,  0.7721519 ,  0.73417722,
        0.44303797,  0.43037975,  0.46835443,  0.        ,  0.53164557,
        0.53164557,  0.30379747,  0.05063291,  0.55696203,  0.41772152,
        0.70886076,  0.6835443 ,  0.44303797,  0.70886076,  0.74683544,
        0.56962025,  0.79746835,  0.34177215,  0.46835443,  0.74683544,
        0.49367089,  0.75949367,  0.6835443 ,  0.64556962,  0.6835443 ,
        0.29113924,  0.75949367,  0.07594937,  0.55696203,  0.43037975,
        0.5443038 ,  0.65822785,  0.49028853,  0.01265823,  0.4556962 ,
        0.49028853,  0.49028853,  0.55696203,  0.62025316,  0.5443038 ,
        0.81012658,  0.44303797,  0.78481013,  0.37974684,  0.62025316,
        0.6835443 ,  0.82278481,  0.35443038,  0.49367089,  0.69620253,
        0.64556962,  0.88607595,  0.64556962,  0.44303797,  0.03

In [54]:
# Overwrite "age" with its scaled values; this modifies missing_handling_df
# in place.
missing_handling_df["age"] = change
missing_handling_df.head()

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,...,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane
0,1,0.468354,80.0,1.02,1.0,0.0,121.0,36.0,1.2,140.50495,...,0.083832,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,0.506329,80.0,1.01,2.0,0.0,106.0,26.0,1.4,140.50495,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
5,6,0.620253,90.0,1.015,3.0,0.0,74.0,25.0,1.1,142.0,...,0.083832,0.100917,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
13,14,0.721519,70.0,1.019432,0.458515,0.0,98.0,86.0,4.6,135.0,...,0.083832,0.100917,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
16,17,0.455696,70.0,1.015,2.0,0.0,99.0,46.0,2.2,138.0,...,0.083832,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0


In [55]:
def normalization(df, column="all", f_range=(0,1)):
    """Min-max scale one or all feature columns.

    Parameters
    ----------
    df : input DataFrame; it is not modified.
    column : name of the column to scale, or "all" to scale every feature
        column of the dataset.
    f_range : (min, max) target range handed to MinMaxScaler.

    Returns
    -------
    A copy of ``df`` whose selected column(s) are rescaled to ``f_range``.
    Each column is fitted and scaled independently.
    """
    if column == "all":
        targets = ["age", "bp", "sg", "al", "su", "bgr", "bu", "sc", "sod", "pot", "hemo", "pcv", "wbcc", "rbcc", "rbc", "pc", "pcc", "ba", "htn", "dm", "cad", "appet", "pe", "ane"]
    else:
        targets = [column]

    copy_df = df.copy()
    for name in targets:
        # A fresh scaler per column keeps the fits independent.
        scaler = MinMaxScaler(feature_range=f_range)
        scaled = scaler.fit_transform(copy_df[[name]])
        # fit_transform yields a one-column 2-D array; flatten before assigning.
        copy_df[name] = scaled.ravel()
    return copy_df

In [58]:
# Scale every feature column of the imputed data into the range (0, 3).
normalization_df = normalization(missing_handling_df, column="all", f_range=(0,3))
normalization_df

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,...,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane
0,1,1.405063,2.0,2.250000,0.750000,0.0,0.859551,0.750000,0.500000,1.860594,...,0.251497,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,1.518987,2.0,0.750000,1.500000,0.0,0.606742,0.461538,0.625000,1.860594,...,0.000000,0.000000,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0
5,6,1.860759,3.0,1.500000,2.250000,0.0,0.067416,0.432692,0.437500,2.040000,...,0.251497,0.302752,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0
13,14,2.164557,1.0,2.164847,0.343886,0.0,0.471910,2.192308,2.625000,1.200000,...,0.251497,0.302752,0.0,0.0,0.0,0.0,3.0,3.0,3.0,0.0
16,17,1.367089,1.0,1.500000,1.500000,0.0,0.488764,1.038462,1.125000,1.560000,...,0.251497,0.000000,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0
17,18,1.367089,2.0,2.164847,0.343886,0.0,0.741573,2.221154,3.000000,1.680000,...,0.251497,0.302752,0.0,0.0,0.0,3.0,0.0,3.0,0.0,0.0
19,20,1.936709,0.0,1.500000,0.750000,0.0,0.505618,0.605769,0.750000,1.860594,...,0.251497,3.000000,3.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0
23,24,0.379747,1.0,0.750000,0.000000,0.0,0.779568,0.818613,0.556854,1.860594,...,0.251497,0.000000,0.0,0.0,3.0,3.0,0.0,3.0,0.0,3.0
25,26,1.898734,0.0,3.000000,0.000000,0.0,0.640449,1.875000,0.937500,1.920000,...,0.251497,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
26,27,2.430380,2.0,1.500000,0.000000,0.0,1.449438,1.009615,1.250000,1.800000,...,0.251497,0.000000,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0
