# I. PEMAHAMAN DOMAIN DAN TUJUAN

## 1.1. Pemahaman Penyakit Ginjal Kronis

Penyakit Ginjal Kronis (PGK) adalah suatu proses patofisiologis dengan etiologi yang beragam yang mengakibatkan penurunan fungsi ginjal secara progresif. Penurunan fungsi ini bersifat kronis dan ireversibel.

Data didapatkan dari UCI Machine Learning Repository. Dataset berisi 400 data dengan 25 atribut, yang terdiri dari 24 atribut prediktor dan 1 atribut kelas target.

## 1.2. Tujuan

Mengidentifikasi PGK berdasarkan atribut yang paling berpengaruh.

# II. PEMBUATAN DATASET DAN TARGET

## 2.1 Data UCI

Data didapat dari: https://archive.ics.uci.edu/ml/datasets/chronic_kidney_disease

Data tersebut diimpor ke dalam MariaDB.

In [1]:
import numpy as np
import pandas as pd
import pymysql as pskl
import matplotlib.pyplot as plt

# Menampilkan semua array
# np.set_printoptions(threshold=np.nan)

In [2]:
def dataset(retrieve="numeric", id=0, target=0):  # returns a DataFrame
    """
    Load the table ``ckd_preprocessing3`` from the local ``knn`` database.

    parameters
    ----------
    retrieve: ("all", "numeric", "polinom")
        all    : every column
        numeric: numeric columns only
        polinom: nominal (polynominal) columns only
    id: (0, 1)
        0: drop the "id" column
        1: keep the "id" column
        (the name shadows the builtin ``id``; it is kept because callers
        pass it by keyword)
    target: (0, 1)
        0: drop the "class" column
        1: keep the "class" column

    output
    ------
    DataFrame with the selected columns; missing values are NaN.

    raises
    ------
    ValueError: if ``retrieve`` is not one of the three supported names.
    """
    numeric_cols = ["age", "bp", "sg", "al", "su", "bgr", "bu", "sc", "sod",
                    "pot", "hemo", "pcv", "wbcc", "rbcc"]
    nominal_cols = ["rbc", "pc", "pcc", "ba", "htn", "dm", "cad", "appet",
                    "pe", "ane"]

    if retrieve == "all":
        selected = numeric_cols + nominal_cols
    elif retrieve == "numeric":
        selected = numeric_cols
    elif retrieve == "polinom":
        selected = nominal_cols
    else:
        # Fail before touching the database instead of running the raw
        # argument as SQL and then crashing on an undefined column list.
        raise ValueError(
            "retrieve must be 'all', 'numeric' or 'polinom', got %r" % (retrieve,)
        )

    # "id" and "class" are always fetched and dropped afterwards on request.
    cols = ["id"] + selected + ["class"]
    query = "SELECT " + ", ".join(cols) + " FROM ckd_preprocessing3"

    connection = pskl.connect(host="localhost", user="root", passwd="", database="knn")
    try:
        cursor = connection.cursor()
        try:
            cursor.execute(query)
            rows = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # Always release the connection, even when the query fails.
        connection.close()

    # Passing the column names to the constructor also handles an empty result set.
    data = pd.DataFrame(list(rows), columns=cols)
    if id == 0:
        data = data.drop(["id"], axis=1)
    if target == 0:
        data = data.drop(["class"], axis=1)
    # Normalise missing values (e.g. SQL NULL returned as None) to NaN.
    data = data.fillna(value=np.nan)
    return data

In [3]:
# Preview the first five rows of every column; with the default id=0 and
# target=0 the "id" and "class" columns are dropped.
dataset(retrieve="all").head(5)

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,...,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane
0,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,,,...,,normal,notpresent,notpresent,yes,yes,no,good,no,no
1,7.0,50.0,1.02,4.0,0.0,,18.0,0.8,,,...,,normal,notpresent,notpresent,no,no,no,good,no,no
2,62.0,80.0,1.01,2.0,3.0,423.0,53.0,1.8,,,...,normal,normal,notpresent,notpresent,no,yes,no,poor,no,yes
3,48.0,70.0,1.005,4.0,0.0,117.0,56.0,3.8,111.0,2.5,...,normal,abnormal,present,notpresent,yes,no,no,poor,yes,yes
4,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,,,...,normal,normal,notpresent,notpresent,no,no,no,good,no,no


## 2.2. Menentukan Atribut Target

Atribut "class" sebagai target dataset

# III. DATA CLEANING DAN PREPROCESSING

## 3.1 Menghilangkan Outliers

referensi: http://digilib.unila.ac.id/20585/4/II.%20TINJAUAN%20PUSTAKA.pdf

In [4]:
def outliers(df, k1=0.25, k3=0.75):
    """
    Flag the values that fall outside the quantile-based bounds of their column.

    parameters:
    -----------
    df: DataFrame, numeric columns only
    k1: level of the lower quantile (default 0.25)
    k3: level of the upper quantile (default 0.75)

    output:
    -------
    Boolean DataFrame shaped like ``df``; True marks a value that is
        below  Q(k1) - (Q(k3) - Q(k1)) * 1.5   (lower bound), or
        above  Q(k3) + (Q(k3) - Q(k1)) * 1.5   (upper bound),
    where Q(k) is the k-quantile of the value's own column.
    """
    def flag_column(values):
        # Bounds are derived from the column of df that shares this name.
        column = df[values.name]
        lower_q = column.quantile(k1)
        upper_q = column.quantile(k3)
        margin = (upper_q - lower_q) * 1.5
        too_low = values < lower_q - margin
        too_high = values > upper_q + margin
        return too_low | too_high

    return df.apply(flag_column, axis=0)

In [5]:
# Load the numeric columns only (dataset() defaults to retrieve="numeric").
df = dataset()
# Boolean frame marking, per cell, the values outliers() considers out of bounds.
pencilan = outliers(df)
pencilan.head()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,True,True,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,True,True,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,True,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [14]:
def outliers_removing(df, pencilan):
    """
    Drop every row that contains at least one flagged outlier.

    parameters:
    -----------
    df: DataFrame to filter
    pencilan: boolean DataFrame, True where a value is an outlier

    output:
    -------
    DataFrame holding only the rows of ``df`` for which no column of
    ``pencilan`` is True.
    """
    row_has_outlier = pencilan.any(axis=1)
    keep_rows = ~row_has_outlier
    return df[keep_rows]

In [15]:
# Reload every column, this time keeping the "id" column (id=1).
df = dataset(retrieve="all", id=1)

# Drop the rows flagged in `pencilan`, which was computed in an earlier cell
# from the numeric columns.
filtered_df = outliers_removing(df, pencilan)
filtered_df.head()

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,...,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane
0,1,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,,...,,normal,notpresent,notpresent,yes,yes,no,good,no,no
4,5,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,,...,normal,normal,notpresent,notpresent,no,no,no,good,no,no
5,6,60.0,90.0,1.015,3.0,0.0,74.0,25.0,1.1,142.0,...,,,notpresent,notpresent,yes,yes,no,good,yes,no
13,14,68.0,70.0,,,,98.0,86.0,4.6,135.0,...,,,notpresent,notpresent,yes,yes,yes,poor,yes,no
16,17,47.0,70.0,1.015,2.0,0.0,99.0,46.0,2.2,138.0,...,,normal,notpresent,notpresent,no,no,no,good,no,no


# IV. TRANSFORMASI DATA

## 4.1 Data Nominal Menjadi Numeric

Mengubah atribut / kolom polinom menjadi numerik

Karena semua kolom nominal hanya memiliki 2 kelas, nilainya dapat ditransformasi menjadi 0 dan 1.

In [16]:
def encode(df, column):
    """
    Encode one nominal column as integer codes, leaving missing values as NaN.

    parameters:
    ----------
    df: input DataFrame (not modified)
    column: string, name of the nominal column to turn into numbers

    output:
    ----------
    A copy of ``df`` in which each distinct non-missing value of ``column``
    is replaced by 0, 1, 2, ... in order of first appearance. Missing
    values stay NaN; all other columns are unchanged.
    """
    # Work on a copy so the caller's DataFrame is left untouched.
    encoded_df = df.copy()

    # dropna() skips every kind of missing value (NaN, None), and unique()
    # keeps the remaining labels in order of first appearance.
    categories = encoded_df[column].dropna().unique()
    codes = {label: code for code, label in enumerate(categories)}

    # map() builds a numeric column directly, so the result does not depend
    # on replace() implicitly downcasting an object column.
    encoded_df[column] = encoded_df[column].map(codes)

    return encoded_df

In [22]:
# Encode the nominal "ba" column as numeric codes; the other columns are untouched.
encoded_df = encode(filtered_df, "ba")
encoded_df.head()

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,...,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane
0,1,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,,...,,normal,notpresent,0.0,yes,yes,no,good,no,no
4,5,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,,...,normal,normal,notpresent,0.0,no,no,no,good,no,no
5,6,60.0,90.0,1.015,3.0,0.0,74.0,25.0,1.1,142.0,...,,,notpresent,0.0,yes,yes,no,good,yes,no
13,14,68.0,70.0,,,,98.0,86.0,4.6,135.0,...,,,notpresent,0.0,yes,yes,yes,poor,yes,no
16,17,47.0,70.0,1.015,2.0,0.0,99.0,46.0,2.2,138.0,...,,normal,notpresent,0.0,no,no,no,good,no,no


In [19]:
# encode() works on a copy, so "ba" in filtered_df should still hold the original labels.
filtered_df.head()

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,...,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane
0,1,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,,...,,normal,notpresent,notpresent,yes,yes,no,good,no,no
4,5,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,,...,normal,normal,notpresent,notpresent,no,no,no,good,no,no
5,6,60.0,90.0,1.015,3.0,0.0,74.0,25.0,1.1,142.0,...,,,notpresent,notpresent,yes,yes,no,good,yes,no
13,14,68.0,70.0,,,,98.0,86.0,4.6,135.0,...,,,notpresent,notpresent,yes,yes,yes,poor,yes,no
16,17,47.0,70.0,1.015,2.0,0.0,99.0,46.0,2.2,138.0,...,,normal,notpresent,notpresent,no,no,no,good,no,no


In [20]:
# Show the frame loaded before outlier removal, for comparison with filtered_df.
df.head()

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,...,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane
0,1,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,,...,,normal,notpresent,notpresent,yes,yes,no,good,no,no
1,2,7.0,50.0,1.02,4.0,0.0,,18.0,0.8,,...,,normal,notpresent,notpresent,no,no,no,good,no,no
2,3,62.0,80.0,1.01,2.0,3.0,423.0,53.0,1.8,,...,normal,normal,notpresent,notpresent,no,yes,no,poor,no,yes
3,4,48.0,70.0,1.005,4.0,0.0,117.0,56.0,3.8,111.0,...,normal,abnormal,present,notpresent,yes,no,no,poor,yes,yes
4,5,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,,...,normal,normal,notpresent,notpresent,no,no,no,good,no,no
