# I. PEMAHAMAN DOMAIN DAN TUJUAN

## 1.1. Pemahaman Penyakit Ginjal Kronis

Penyakit Ginjal Kronis (PGK) adalah suatu proses patofisiologis dengan etiologi yang beragam yang mengakibatkan penurunan fungsi ginjal secara progresif. Penurunan fungsi ini bersifat kronis dan ireversibel (tidak dapat pulih kembali).

Data didapatkan dari UCI Machine Learning Repository. Dataset berisi 400 baris data dan 25 atribut, yang terdiri dari 24 atribut prediktor dan 1 atribut kelas target.

## 1.2. Tujuan

Identifikasi PGK dengan atribut yang paling berpengaruh

# II. PEMBUATAN DATASET DAN TARGET

## 2.1 Data UCI

Data didapat dari: https://archive.ics.uci.edu/ml/datasets/chronic_kidney_disease

Data tersebut diimpor ke dalam MariaDB.

In [1]:
import numpy as np
import pandas as pd
import pymysql as pskl
import matplotlib.pyplot as plt

# Menampilkan semua array
# np.set_printoptions(threshold=np.nan)

In [2]:
def dataset(retrieve="numeric", id=0, target=0): # result is a DataFrame
    """
    Read the table ``ckd_preprocessing3`` from the local "knn" database
    and return the selected columns as a pandas DataFrame.

    parameters
    ----------
    retrieve: (all, numeric, polinom)
        all    : every column
        numeric: numeric columns only
        polinom: nominal (polinom) columns only
    id: (0, 1)
        0: without the id column
        1: with the id column
    target: (0, 1)
        0: without the class column
        1: with the class column

    returns
    -------
    DataFrame whose missing values have been passed through
    ``fillna(value=np.nan)``.

    raises
    ------
    ValueError: when ``retrieve`` is not one of the three supported choices.
    """
    numeric_cols = ["age", "bp", "sg", "al", "su", "bgr", "bu", "sc", "sod", "pot", "hemo", "pcv", "wbcc", "rbcc"]
    polinom_cols = ["rbc", "pc", "pcc", "ba", "htn", "dm", "cad", "appet", "pe", "ane"]
    column_sets = {
        "all": ["id"] + numeric_cols + polinom_cols + ["class"],
        "numeric": ["id"] + numeric_cols + ["class"],
        "polinom": ["id"] + polinom_cols + ["class"],
    }
    # Validate before touching the database so a typo fails fast with a clear message.
    if retrieve not in column_sets:
        raise ValueError(
            "retrieve must be one of %s, got %r" % (sorted(column_sets), retrieve)
        )
    cols = column_sets[retrieve]
    # The query is derived from the same list used for the column labels,
    # so the two can never drift apart.
    query = "SELECT " + ", ".join(cols) + " FROM ckd_preprocessing3"

    connection = pskl.connect(host="localhost", user="root", passwd="", database="knn")
    try:
        cursor = connection.cursor()
        try:
            cursor.execute(query)
            rows_tuple = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # Always release the connection, even when the query fails.
        connection.close()

    # Passing columns= keeps the labels in place even when the table is empty.
    data = pd.DataFrame(list(rows_tuple), columns=cols)
    if id == 0:
        data = data.drop(["id"], axis=1)
    if target == 0:
        data = data.drop(["class"], axis=1)
    data = data.fillna(value=np.nan) # turn missing values into NaN
    return data

In [3]:
dataset(retrieve="all").head(10)

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,...,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane
0,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,,,...,,normal,notpresent,notpresent,yes,yes,no,good,no,no
1,7.0,50.0,1.02,4.0,0.0,,18.0,0.8,,,...,,normal,notpresent,notpresent,no,no,no,good,no,no
2,62.0,80.0,1.01,2.0,3.0,423.0,53.0,1.8,,,...,normal,normal,notpresent,notpresent,no,yes,no,poor,no,yes
3,48.0,70.0,1.005,4.0,0.0,117.0,56.0,3.8,111.0,2.5,...,normal,abnormal,present,notpresent,yes,no,no,poor,yes,yes
4,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,,,...,normal,normal,notpresent,notpresent,no,no,no,good,no,no
5,60.0,90.0,1.015,3.0,0.0,74.0,25.0,1.1,142.0,3.2,...,,,notpresent,notpresent,yes,yes,no,good,yes,no
6,68.0,70.0,1.01,0.0,0.0,100.0,54.0,24.0,104.0,4.0,...,,normal,notpresent,notpresent,no,no,no,good,no,no
7,24.0,,1.015,2.0,4.0,410.0,31.0,1.1,,,...,normal,abnormal,notpresent,notpresent,no,yes,no,good,yes,no
8,52.0,100.0,1.015,3.0,0.0,138.0,60.0,1.9,,,...,normal,abnormal,present,notpresent,yes,yes,no,good,no,yes
9,53.0,90.0,1.02,2.0,0.0,70.0,107.0,7.2,114.0,3.7,...,abnormal,abnormal,present,notpresent,yes,yes,no,poor,no,yes


## 2.2. Menentukan Atribut Target

Atribut "class" sebagai target dataset

# III. DATA CLEANING DAN PREPROCESSING

## 3.1 Menghilangkan Outliers

referensi: http://digilib.unila.ac.id/20585/4/II.%20TINJAUAN%20PUSTAKA.pdf

In [4]:
def outliers(df, k1=0.25, k3=0.75):
    """
    Flag, column by column, the values that fall outside the quantile fences.

    parameters:
    -----------
    df: input DataFrame, numeric columns only
    k1: lower quantile level (default 0.25)
    k3: upper quantile level (default 0.75)

    With q1 and q3 being the k1- and k3-quantiles of a column:
        lower fence = q1 - (q3 - q1) * 1.5
        upper fence = q3 + (q3 - q1) * 1.5

    returns the result of applying the check to every column: True where a
    value is below the lower fence or above the upper fence.
    """
    def _outside_fences(column):
        # Quantiles are looked up on the frame by column name.
        source = df[column.name]
        q_low = source.quantile(k1)
        q_high = source.quantile(k3)
        lower_fence = q_low - ((q_high - q_low) * 1.5)
        upper_fence = q_high + ((q_high - q_low) * 1.5)
        too_small = column < lower_fence
        too_large = column > upper_fence
        return too_small | too_large

    flags = df.apply(_outside_fences, axis=0)
    return flags

In [5]:
df = dataset()
pencilan = outliers(df)
pencilan.head()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,True,True,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,True,True,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,True,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [6]:
def outliers_removing(df, pencilan):
    """
    Drop every row of df that is flagged in at least one column of pencilan.

    parameters:
    ----------
    df: input DataFrame, returned in its original layout minus the dropped rows
    pencilan: boolean DataFrame of outlier flags (True marks an outlier)

    returns the rows of df for which no column of pencilan is True
    """
    row_has_outlier = pencilan.any(axis=1)
    keep_row = ~row_has_outlier
    return df[keep_row]

In [7]:
df = dataset(retrieve="all", id=1)

filtered_df = outliers_removing(df, pencilan)
filtered_df.head()

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,...,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane
0,1,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,,...,,normal,notpresent,notpresent,yes,yes,no,good,no,no
4,5,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,,...,normal,normal,notpresent,notpresent,no,no,no,good,no,no
5,6,60.0,90.0,1.015,3.0,0.0,74.0,25.0,1.1,142.0,...,,,notpresent,notpresent,yes,yes,no,good,yes,no
13,14,68.0,70.0,,,,98.0,86.0,4.6,135.0,...,,,notpresent,notpresent,yes,yes,yes,poor,yes,no
16,17,47.0,70.0,1.015,2.0,0.0,99.0,46.0,2.2,138.0,...,,normal,notpresent,notpresent,no,no,no,good,no,no


# IV. TRANSFORMASI DATA

## 4.1 Data Nominal Menjadi Numeric

Mengubah atribut / kolom polinom menjadi numerik

Karena semua kolom nominal hanya memiliki 2 kelas, kolom-kolom tersebut dapat ditransformasi menjadi 0 dan 1.

In [8]:
from sklearn.preprocessing import LabelEncoder

In [9]:
filtered_df["rbc"].value_counts()

normal      153
abnormal     14
Name: rbc, dtype: int64

In [10]:
filtered_df["pc"].value_counts()

normal      196
abnormal     22
Name: pc, dtype: int64

In [11]:
filtered_df["pcc"].value_counts()

notpresent    236
present        11
Name: pcc, dtype: int64

In [12]:
filtered_df["ba"].value_counts()

notpresent    241
present         6
Name: ba, dtype: int64

In [13]:
filtered_df["htn"].value_counts()

no     200
yes     49
Name: htn, dtype: int64

In [14]:
filtered_df["dm"].value_counts()

no     207
yes     42
Name: dm, dtype: int64

In [15]:
filtered_df["cad"].value_counts()

no     241
yes      8
Name: cad, dtype: int64

In [16]:
filtered_df["appet"].value_counts()

good    224
poor     26
Name: appet, dtype: int64

In [17]:
filtered_df["pe"].value_counts()

no     220
yes     30
Name: pe, dtype: int64

In [18]:
filtered_df["ane"].value_counts()

no     235
yes     15
Name: ane, dtype: int64

In [19]:
filtered_df.dtypes

id         int64
age      float64
bp       float64
sg       float64
al       float64
su       float64
bgr      float64
bu       float64
sc       float64
sod      float64
pot      float64
hemo     float64
pcv      float64
wbcc     float64
rbcc     float64
rbc       object
pc        object
pcc       object
ba        object
htn       object
dm        object
cad       object
appet     object
pe        object
ane       object
dtype: object

In [20]:
filtered_df["pcv"].dtypes

dtype('float64')

In [21]:
len(filtered_df.columns)

25

In [22]:
filtered_df.iloc[:,0].dtypes

dtype('int64')

In [23]:
# Print the dtype of every column, in column order.
for column_dtype in filtered_df.dtypes:
    print(column_dtype)

int64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
object
object
object
object
object
object
object
object
object
object


In [24]:
filtered_df.dtypes

id         int64
age      float64
bp       float64
sg       float64
al       float64
su       float64
bgr      float64
bu       float64
sc       float64
sod      float64
pot      float64
hemo     float64
pcv      float64
wbcc     float64
rbcc     float64
rbc       object
pc        object
pcc       object
ba        object
htn       object
dm        object
cad       object
appet     object
pe        object
ane       object
dtype: object

In [25]:
encode = LabelEncoder()
filtered_df

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,...,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane
0,1,48.0,80.0,1.020,1.0,0.0,121.0,36.0,1.20,,...,,normal,notpresent,notpresent,yes,yes,no,good,no,no
4,5,51.0,80.0,1.010,2.0,0.0,106.0,26.0,1.40,,...,normal,normal,notpresent,notpresent,no,no,no,good,no,no
5,6,60.0,90.0,1.015,3.0,0.0,74.0,25.0,1.10,142.0,...,,,notpresent,notpresent,yes,yes,no,good,yes,no
13,14,68.0,70.0,,,,98.0,86.0,4.60,135.0,...,,,notpresent,notpresent,yes,yes,yes,poor,yes,no
16,17,47.0,70.0,1.015,2.0,0.0,99.0,46.0,2.20,138.0,...,,normal,notpresent,notpresent,no,no,no,good,no,no
17,18,47.0,80.0,,,,114.0,87.0,5.20,139.0,...,,,notpresent,notpresent,yes,no,no,poor,no,no
19,20,62.0,60.0,1.015,1.0,0.0,100.0,31.0,1.60,,...,,abnormal,present,notpresent,yes,no,yes,good,no,no
23,24,21.0,70.0,1.010,0.0,0.0,,,,,...,,normal,notpresent,notpresent,no,no,no,poor,no,yes
25,26,61.0,60.0,1.025,0.0,0.0,108.0,75.0,1.90,141.0,...,,normal,notpresent,notpresent,yes,yes,no,good,no,yes
26,27,75.0,80.0,1.015,0.0,0.0,156.0,45.0,2.40,140.0,...,,normal,notpresent,notpresent,yes,yes,no,poor,no,no


In [26]:
filtered_df = filtered_df.replace({"rbc": {"normal": 0, "abnormal": 1}})

In [27]:
filtered_df.head(15)

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,...,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane
0,1,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,,...,,normal,notpresent,notpresent,yes,yes,no,good,no,no
4,5,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,,...,0.0,normal,notpresent,notpresent,no,no,no,good,no,no
5,6,60.0,90.0,1.015,3.0,0.0,74.0,25.0,1.1,142.0,...,,,notpresent,notpresent,yes,yes,no,good,yes,no
13,14,68.0,70.0,,,,98.0,86.0,4.6,135.0,...,,,notpresent,notpresent,yes,yes,yes,poor,yes,no
16,17,47.0,70.0,1.015,2.0,0.0,99.0,46.0,2.2,138.0,...,,normal,notpresent,notpresent,no,no,no,good,no,no
17,18,47.0,80.0,,,,114.0,87.0,5.2,139.0,...,,,notpresent,notpresent,yes,no,no,poor,no,no
19,20,62.0,60.0,1.015,1.0,0.0,100.0,31.0,1.6,,...,,abnormal,present,notpresent,yes,no,yes,good,no,no
23,24,21.0,70.0,1.01,0.0,0.0,,,,,...,,normal,notpresent,notpresent,no,no,no,poor,no,yes
25,26,61.0,60.0,1.025,0.0,0.0,108.0,75.0,1.9,141.0,...,,normal,notpresent,notpresent,yes,yes,no,good,no,yes
26,27,75.0,80.0,1.015,0.0,0.0,156.0,45.0,2.4,140.0,...,,normal,notpresent,notpresent,yes,yes,no,poor,no,no


In [28]:
filtered_df["pc"].dtypes

dtype('O')

In [29]:
filtered_df.pc.unique()

array(['normal', nan, 'abnormal'], dtype=object)

In [30]:
filtered_df["pc"].unique()

array(['normal', nan, 'abnormal'], dtype=object)

In [31]:
a = filtered_df["pc"]
a.head()

0     normal
4     normal
5        NaN
13       NaN
16    normal
Name: pc, dtype: object

In [32]:
for x in a.unique():
    print(x)

normal
nan
abnormal


In [33]:
c = a.unique()
type(c)

numpy.ndarray

In [34]:
d = c.tolist()
d

['normal', nan, 'abnormal']

In [35]:
e = [x for x in d if str(x) != 'nan']
e

['normal', 'abnormal']

In [36]:
import math
# Give each non-NaN category in e a consecutive integer code: 0, 1, ...
b = [code for code, _ in enumerate(e)]
count = len(b)
b

[0, 1]

In [37]:
filtered_df.pc.unique()

array(['normal', nan, 'abnormal'], dtype=object)

In [38]:
def encode(df, column):
    """
    Replace the categories of a nominal column with integer codes, in place.

    Codes are assigned in order of first appearance (0, 1, ...). Missing
    values (NaN or None) are not treated as a category and stay missing.

    parameters
    ----------
    df: DataFrame to modify; the column is overwritten on this object
    column: name of the column to encode

    returns
    -------
    the same df object, with df[column] holding the integer codes
    """
    # dropna() skips NaN as well as None, and does not mistake a literal
    # "nan" string for a missing value the way a str(x) comparison would.
    categories = df[column].dropna().unique().tolist()
    codes = {category: code for code, category in enumerate(categories)}

    # map() yields a numeric column directly; missing values have no key
    # in the mapping and therefore come out as NaN.
    df[column] = df[column].map(codes)

    return df
encode(filtered_df, "ba")

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,...,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane
0,1,48.0,80.0,1.020,1.0,0.0,121.0,36.0,1.20,,...,,normal,notpresent,0.0,yes,yes,no,good,no,no
4,5,51.0,80.0,1.010,2.0,0.0,106.0,26.0,1.40,,...,0.0,normal,notpresent,0.0,no,no,no,good,no,no
5,6,60.0,90.0,1.015,3.0,0.0,74.0,25.0,1.10,142.0,...,,,notpresent,0.0,yes,yes,no,good,yes,no
13,14,68.0,70.0,,,,98.0,86.0,4.60,135.0,...,,,notpresent,0.0,yes,yes,yes,poor,yes,no
16,17,47.0,70.0,1.015,2.0,0.0,99.0,46.0,2.20,138.0,...,,normal,notpresent,0.0,no,no,no,good,no,no
17,18,47.0,80.0,,,,114.0,87.0,5.20,139.0,...,,,notpresent,0.0,yes,no,no,poor,no,no
19,20,62.0,60.0,1.015,1.0,0.0,100.0,31.0,1.60,,...,,abnormal,present,0.0,yes,no,yes,good,no,no
23,24,21.0,70.0,1.010,0.0,0.0,,,,,...,,normal,notpresent,0.0,no,no,no,poor,no,yes
25,26,61.0,60.0,1.025,0.0,0.0,108.0,75.0,1.90,141.0,...,,normal,notpresent,0.0,yes,yes,no,good,no,yes
26,27,75.0,80.0,1.015,0.0,0.0,156.0,45.0,2.40,140.0,...,,normal,notpresent,0.0,yes,yes,no,poor,no,no
