# I. PEMAHAMAN DOMAIN DAN TUJUAN

## 1.1. Pemahaman Penyakit Ginjal Kronis

>Penyakit Ginjal Kronis (PGK) adalah suatu proses patofisiologis dengan etiologi yang beragam yang mengakibatkan penurunan fungsi ginjal secara progresif. Penurunan fungsi ini bersifat kronis dan ireversibel.

>Data didapatkan dari UCI Machine Learning Repository. Dataset berisi 400 baris data dengan 25 atribut, terdiri dari 24 atribut prediktor dan 1 atribut kelas target.

## 1.2. Tujuan

>Identifikasi PGK dengan atribut yang paling berpengaruh

# II. PEMBUATAN DATASET DAN TARGET

## 2.1 Data UCI

Data didapat dari: https://archive.ics.uci.edu/ml/datasets/chronic_kidney_disease

Data tersebut diimpor ke dalam MariaDB.

In [1]:
import numpy as np
import pandas as pd
import pymysql as pskl
import matplotlib.pyplot as plt

# Menampilkan semua array
# np.set_printoptions(threshold=np.nan)

In [2]:
def dataset(retrieve="all", id=0, target=0):
    """Load the CKD table from the local database into a DataFrame.

    Parameters
    ----------
    retrieve : {"all", "numeric", "polinom"}
        all     : every attribute column
        numeric : only the numeric attribute columns
        polinom : only the nominal (categorical) attribute columns
    id : {0, 1}
        0 drops the "id" column from the result, 1 keeps it.
    target : {0, 1}
        0 drops the "class" column from the result, 1 keeps it.

    Returns
    -------
    pandas.DataFrame
        One row per record of table ``ckd_preprocessing3``; missing values
        are represented as NaN.

    Raises
    ------
    ValueError
        If ``retrieve`` is not one of the supported selections.
    """
    numeric_cols = ["age", "bp", "sg", "al", "su", "bgr", "bu", "sc", "sod", "pot", "hemo", "pcv", "wbcc", "rbcc"]
    nominal_cols = ["rbc", "pc", "pcc", "ba", "htn", "dm", "cad", "appet", "pe", "ane"]
    selections = {
        "all": numeric_cols + nominal_cols,
        "numeric": numeric_cols,
        "polinom": nominal_cols,
    }
    # Validate before touching the database so a typo fails fast with a clear message.
    if retrieve not in selections:
        raise ValueError(
            "retrieve must be one of %s, got %r" % (sorted(selections), retrieve)
        )

    # "id" and "class" are always queried; they are dropped afterwards on request.
    cols = ["id"] + selections[retrieve] + ["class"]
    query = "SELECT " + ", ".join(cols) + " FROM ckd_preprocessing3"

    connection = pskl.connect(host="localhost", user="root", passwd="", database="knn")
    try:
        cursor = connection.cursor()
        try:
            cursor.execute(query)
            rows = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # Always release the connection, even when the query fails.
        connection.close()

    # Passing the column names here also keeps an empty result set well-formed.
    data = pd.DataFrame(list(rows), columns=cols)
    if id == 0:
        data = data.drop(["id"], axis=1)
    if target == 0:
        data = data.drop(["class"], axis=1)
    # Turn missing values (None) into NaN.
    data = data.fillna(value=np.nan)
    return data

In [3]:
# Load only the numeric attributes; with the defaults id=0 and target=0 the
# "id" and "class" columns are dropped, leaving the 14 numeric features.
numeric_df = dataset(retrieve="numeric")
numeric_df.head()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc
0,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,,,15.4,44.0,7800.0,5.2
1,7.0,50.0,1.02,4.0,0.0,,18.0,0.8,,,11.3,38.0,6000.0,
2,62.0,80.0,1.01,2.0,3.0,423.0,53.0,1.8,,,9.6,31.0,7500.0,
3,48.0,70.0,1.005,4.0,0.0,117.0,56.0,3.8,111.0,2.5,11.2,32.0,6700.0,3.9
4,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,,,11.6,35.0,7300.0,4.6


In [4]:
numeric_df.shape

(400, 14)

## 2.2. Menentukan Atribut Target

>Atribut "class" sebagai target dataset

Keluaran tahap ini adalah atribut target.

# III. DATA CLEANING DAN PREPROCESSING

## 3.1 Menghilangkan Outliers

>referensi: http://digilib.unila.ac.id/20585/4/II.%20TINJAUAN%20PUSTAKA.pdf

In [5]:
def outliers(df, k1=0.25, k3=0.75):
    """Flag the values of a numeric DataFrame that lie outside the IQR fences.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric columns only.
    k1, k3 : float
        Quantile levels used as the lower and upper quartile.

    Returns
    -------
    pandas.DataFrame
        Boolean frame shaped like ``df``; True marks a value below
        ``Q(k1) - 1.5 * (Q(k3) - Q(k1))`` or above
        ``Q(k3) + 1.5 * (Q(k3) - Q(k1))`` of its own column.
    """
    def _flag_column(column):
        lower_quartile = column.quantile(k1)
        upper_quartile = column.quantile(k3)
        whisker = (upper_quartile - lower_quartile) * 1.5
        below = column < lower_quartile - whisker
        above = column > upper_quartile + whisker
        return below | above

    return df.apply(_flag_column, axis=0)

In [6]:
# Boolean mask shaped like numeric_df: True where a value falls outside the
# 1.5 * IQR fences of its column (default quartiles 0.25 / 0.75).
pencilan = outliers(numeric_df)
pencilan.head()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,True,True,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,True,True,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,True,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [7]:
def outliers_removing(df, pencilan):
    """Drop every row of ``df`` that contains at least one flagged outlier.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to filter.
    pencilan : pandas.DataFrame
        Boolean frame; True marks an outlier value.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` for which no column of ``pencilan`` is True.
    """
    row_has_outlier = pencilan.any(axis=1)
    rows_to_keep = ~row_has_outlier
    return df[rows_to_keep]

In [8]:
# Reload the full table (keeping "id" and "class") and drop every row that
# has at least one flagged value in `pencilan`.
# NOTE(review): `pencilan` comes from a separate query without ORDER BY; this
# assumes both queries return the rows in the same order — confirm.
df = dataset(retrieve="all", id=1, target=1)

filtered_df = outliers_removing(df, pencilan)
filtered_df.head()

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,...,pc,pcc,ba,htn,dm,cad,appet,pe,ane,class
0,1,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,,...,normal,notpresent,notpresent,yes,yes,no,good,no,no,ckd
4,5,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,,...,normal,notpresent,notpresent,no,no,no,good,no,no,ckd
5,6,60.0,90.0,1.015,3.0,0.0,74.0,25.0,1.1,142.0,...,,notpresent,notpresent,yes,yes,no,good,yes,no,ckd
13,14,68.0,70.0,,,,98.0,86.0,4.6,135.0,...,,notpresent,notpresent,yes,yes,yes,poor,yes,no,ckd
16,17,47.0,70.0,1.015,2.0,0.0,99.0,46.0,2.2,138.0,...,normal,notpresent,notpresent,no,no,no,good,no,no,ckd


In [9]:
filtered_df.shape

(251, 26)

# IV. TRANSFORMASI DATA

## 4.1 Data Nominal Menjadi Numeric

>Mengubah atribut / kolom polinom menjadi numerik

>Dikarenakan semua kolom nominal hanya memiliki 2 kelas (binary), hasil transformasi menjadi 0 dan 1

In [10]:
def encoding(df, column="all_nominal"):
    """Encode nominal columns as integer codes, leaving missing values as NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    column : str
        Name of the nominal column to encode, or "all_nominal" to encode
        every nominal attribute plus "class".

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which each selected column holds the codes
        0, 1, 2, ... assigned to its categories in order of first
        appearance. Missing entries stay NaN, so a column that contains
        NaN comes back as float and a complete column as int.
    """
    # Work on a copy so the caller's frame is left untouched.
    copy_df = df.copy()

    if column == "all_nominal":
        targets = ["rbc", "pc", "pcc", "ba", "htn", "dm", "cad", "appet", "pe", "ane", "class"]
    else:
        targets = [column]

    for col in targets:
        categories = [value for value in copy_df[col].unique().tolist() if pd.notna(value)]
        codes = {category: code for code, category in enumerate(categories)}
        # An explicit mapping yields a numeric column directly; a list-to-list
        # replace() on an object column depends on implicit downcasting.
        copy_df[col] = copy_df[col].map(codes)

    return copy_df

In [11]:
# Encode every nominal column (and "class") of the outlier-free data as
# numeric codes; missing values remain NaN.
encoded_df = encoding(filtered_df, "all_nominal")
encoded_df.head()

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,...,pc,pcc,ba,htn,dm,cad,appet,pe,ane,class
0,1,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,5,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0
5,6,60.0,90.0,1.015,3.0,0.0,74.0,25.0,1.1,142.0,...,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
13,14,68.0,70.0,,,,98.0,86.0,4.6,135.0,...,,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0
16,17,47.0,70.0,1.015,2.0,0.0,99.0,46.0,2.2,138.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0


In [12]:
encoded_df.shape

(251, 26)

## 4.2 Penanganan Data yang Hilang

Mengganti nilai NaN menggunakan strategi tertentu; dalam hal ini NaN diganti dengan rata-rata dari setiap kolom.

In [13]:
from sklearn.preprocessing import Imputer

In [14]:
def missing_handling(df, column="all", method="mean"):
    """Replace missing values (NaN) with a per-column statistic.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    column : str
        Name of the numeric column to impute, or "all" to impute every
        attribute column plus "class".
    method : {"mean", "median", "most_frequent"}
        Statistic computed from the non-missing values of each column and
        used as the fill value.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which the selected columns are float-valued
        and have their NaN entries filled.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported strategies.
    """
    def _most_frequent(values):
        # mode() ignores NaN and returns ties sorted ascending; take the smallest.
        modes = values.mode()
        return modes.iloc[0] if len(modes) else np.nan

    statistics = {
        "mean": lambda values: values.mean(),
        "median": lambda values: values.median(),
        "most_frequent": _most_frequent,
    }
    if method not in statistics:
        raise ValueError(
            "method must be one of %s, got %r" % (sorted(statistics), method)
        )

    # Work on a copy so the caller's frame is left untouched.
    copy_df = df.copy()

    if column == "all":
        targets = ["age", "bp", "sg", "al", "su", "bgr", "bu", "sc", "sod", "pot", "hemo", "pcv", "wbcc", "rbcc", "rbc", "pc", "pcc", "ba", "htn", "dm", "cad", "appet", "pe", "ane", "class"]
    else:
        targets = [column]

    for col in targets:
        # Imputed columns are always float, including complete integer columns.
        values = copy_df[col].astype(float)
        copy_df[col] = values.fillna(statistics[method](values))

    return copy_df

In [15]:
# Fill every NaN with the mean of its column (default method="mean").
# NOTE(review): the "all" column list also contains the encoded nominal
# columns and "class", so binary columns can receive fractional fill values
# (e.g. 0.100917 for "pc" in the output below) — confirm this is intended.
missing_handling_df = missing_handling(encoded_df, "all")
missing_handling_df.head()

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,...,pc,pcc,ba,htn,dm,cad,appet,pe,ane,class
0,1,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,140.50495,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,140.50495,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
5,6,60.0,90.0,1.015,3.0,0.0,74.0,25.0,1.1,142.0,...,0.100917,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
13,14,68.0,70.0,1.019432,0.458515,0.0,98.0,86.0,4.6,135.0,...,0.100917,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
16,17,47.0,70.0,1.015,2.0,0.0,99.0,46.0,2.2,138.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0


In [16]:
missing_handling_df.shape

(251, 26)

## 4.3 Normalisasi

>Melakukan normalisasi data, agar range tiap atribut sama dan tidak ada yang dominan saat perhitungan

$$X_{norm} = \frac{X - \min(X)}{\max(X) - \min(X)}$$

In [17]:
from sklearn.preprocessing import MinMaxScaler

In [18]:
def normalizing(df, column="all", f_range=(0,1)):
    """Min-max scale attribute columns into ``f_range``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    column : str
        Name of the column to scale, or "all" to scale every attribute
        column (the "id" and "class" columns are left as they are).
    f_range : tuple
        Target (min, max) range handed to MinMaxScaler.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the selected columns rescaled, each column
        fitted independently.
    """
    result = df.copy()

    if column == "all":
        targets = ["age", "bp", "sg", "al", "su", "bgr", "bu", "sc", "sod", "pot", "hemo", "pcv", "wbcc", "rbcc", "rbc", "pc", "pcc", "ba", "htn", "dm", "cad", "appet", "pe", "ane"]
    else:
        targets = [column]

    for name in targets:
        # A fresh scaler per column: each one is fitted on that column alone.
        scaler = MinMaxScaler(feature_range=f_range)
        scaled = scaler.fit_transform(result[[name]])
        result[name] = scaled.ravel()

    return result

In [24]:
# Rescale every attribute column of the imputed data into [0, 1].
normalized_df = normalizing(missing_handling_df, column="all", f_range=(0,1))
normalized_df.head()

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,...,pc,pcc,ba,htn,dm,cad,appet,pe,ane,class
0,1,0.468354,0.666667,0.75,0.25,0.0,0.286517,0.25,0.166667,0.620198,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,0.506329,0.666667,0.25,0.5,0.0,0.202247,0.153846,0.208333,0.620198,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
5,6,0.620253,1.0,0.5,0.75,0.0,0.022472,0.144231,0.145833,0.68,...,0.100917,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
13,14,0.721519,0.333333,0.721616,0.114629,0.0,0.157303,0.730769,0.875,0.4,...,0.100917,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
16,17,0.455696,0.333333,0.5,0.5,0.0,0.162921,0.346154,0.375,0.52,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0


In [27]:
normalized_df.shape

(251, 26)

In [248]:
# Export the preprocessed frame to CSV (by default to_csv also writes the row
# index as the first column).
# NOTE(review): the target directory is a hard-coded, user-specific absolute
# path, while the later pd.read_csv("preprocessing.csv") reads relative to the
# working directory — confirm both refer to the same file.
path=r"C:\Users\Ikhsan\Desktop"
import os
normalized_df.to_csv(os.path.join(path,r'preprocessing.csv'))

# V. PENENTUAN TUGAS DATA MINING

>Tugas Data Mining yang dilakukan adalah klasifikasi

# VI. IMPLEMENTASI ALGORITMA DAN METODE

## 6.1 Klasifikasi

>Klasifikasi dengan algoritma kNN

In [34]:
import MyDir as my

## 6.2 Seleksi Atribut

Seleksi atribut dengan Backward Elimination

# VII. Pengembangan Algoritma Data Mining

In [177]:
# Select the feature attributes: every column except the first ("id") and
# the last ("class").

x = normalized_df.iloc[:,1:-1]
x.head()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,...,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane
0,0.468354,0.666667,0.75,0.25,0.0,0.286517,0.25,0.166667,0.620198,0.36474,...,0.083832,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.506329,0.666667,0.25,0.5,0.0,0.202247,0.153846,0.208333,0.620198,0.36474,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
5,0.620253,1.0,0.5,0.75,0.0,0.022472,0.144231,0.145833,0.68,0.0,...,0.083832,0.100917,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
13,0.721519,0.333333,0.721616,0.114629,0.0,0.157303,0.730769,0.875,0.4,0.064516,...,0.083832,0.100917,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
16,0.455696,0.333333,0.5,0.5,0.0,0.162921,0.346154,0.375,0.52,0.290323,...,0.083832,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0


In [178]:
normalized_df["class"].value_counts()

1.0    150
0.0    101
Name: class, dtype: int64

In [179]:
# Select the target: the encoded "class" column.

y = normalized_df["class"]
y.head()

0     0.0
4     0.0
5     0.0
13    0.0
16    0.0
Name: class, dtype: float64

In [180]:
from sklearn.model_selection import train_test_split

In [241]:
# Hold out 20% of the rows for testing. The seed is fixed so that the split,
# and therefore the accuracy reported below, is reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [242]:
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(200, 24) (200,) (51, 24) (51,)


In [243]:
# Convert the training split to plain NumPy arrays
# (presumably the form my.knn_predict expects — TODO confirm).
x_train = np.array(x_train)
y_train = np.array(y_train)

In [244]:
# Convert the test split to plain NumPy arrays as well, so rows can be
# iterated directly and compared element-wise with the predictions.
x_test = np.array(x_test)
y_test = np.array(y_test)

In [245]:
# Predict a label for every test row with the project's own kNN routine.
# NOTE(review): the last argument (5) is presumably k, the number of
# neighbours — confirm against MyDir.knn_predict.
my_predictions = np.array([my.knn_predict(p, x_train, y_train, 5) for p in x_test])

In [246]:
# Accuracy in percent: share of test rows whose prediction equals the true label.
akurasi_my_predictions = np.mean(my_predictions == y_test)*100
akurasi_my_predictions

98.0392156862745

In [258]:
# Reload the exported CSV from the working directory and drop its first
# column (presumably the row index written by to_csv — confirm).
a= pd.read_csv("preprocessing.csv")
a = a.iloc[:,1:]
a.head()

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,...,pc,pcc,ba,htn,dm,cad,appet,pe,ane,class
0,1,0.468354,0.666667,0.75,0.25,0.0,0.286517,0.25,0.166667,0.620198,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5,0.506329,0.666667,0.25,0.5,0.0,0.202247,0.153846,0.208333,0.620198,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
2,6,0.620253,1.0,0.5,0.75,0.0,0.022472,0.144231,0.145833,0.68,...,0.100917,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,14,0.721519,0.333333,0.721616,0.114629,0.0,0.157303,0.730769,0.875,0.4,...,0.100917,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
4,17,0.455696,0.333333,0.5,0.5,0.0,0.162921,0.346154,0.375,0.52,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0


In [259]:
from sklearn.model_selection import KFold