In [8]:
import pandas as pd
from IPython.display import display
from scipy import stats



In [2]:
# Read the raw CSV, then promote its `id` column to the row index.
df = pd.read_csv('kidney_disease.csv').set_index('id')

# Summarise column dtypes and non-null counts to see where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             391 non-null    float64
 1   bp              388 non-null    float64
 2   sg              353 non-null    float64
 3   al              354 non-null    float64
 4   su              351 non-null    float64
 5   rbc             248 non-null    object 
 6   pc              335 non-null    object 
 7   pcc             396 non-null    object 
 8   ba              396 non-null    object 
 9   bgr             356 non-null    float64
 10  bu              381 non-null    float64
 11  sc              383 non-null    float64
 12  sod             313 non-null    float64
 13  pot             312 non-null    float64
 14  hemo            348 non-null    float64
 15  pcv             330 non-null    object 
 16  wc              295 non-null    object 
 17  rc              270 non-null    obj

In [3]:
# Impute missing values in the numeric columns.
#
# Forward fill copies the last observed value down into each gap. It cannot
# fill a gap at the very top of a column, because nothing precedes it.
ffill_only_cols = ['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'hemo']
ffill_then_mean_cols = ['sod', 'pot']

# Series.ffill() is the supported spelling; fillna(method="ffill") is deprecated.
for col in ffill_only_cols + ffill_then_mean_cols:
    df[col] = df[col].ffill()

# Forward fill still leaves empty rows in `sod` and `pot`, so the remainder is
# filled with the column mean (computed after the forward fill).
# The result is assigned back instead of calling fillna(..., inplace=True) on
# df[col]: an in-place call on a column selection may operate on a temporary
# copy and leave `df` unchanged (pandas Copy-on-Write).
for col in ffill_then_mean_cols:
    df[col] = df[col].fillna(df[col].mean())

In [4]:
# `pcv`, `wc` and `rc` are loaded as object columns. Convert them to numbers;
# errors='coerce' turns any entry that cannot be parsed into NaN, and those
# NaNs are then forward filled together with the original gaps.
# Series.ffill() is used because fillna(method="ffill") is deprecated.
for col in ['pcv', 'wc', 'rc']:
    df[col] = pd.to_numeric(df[col], errors='coerce').ffill()

In [5]:
# Impute missing values in the categorical columns: forward fill first, then
# use the literal 'normal' for anything still missing (i.e. gaps at the top of
# a column that forward fill cannot reach).
#
# Each result is assigned back to df[col]. Calling fillna(..., inplace=True)
# on a column selection may act on a temporary copy and leave `df` unchanged
# (pandas Copy-on-Write), and fillna(method='ffill') is deprecated in favour
# of Series.ffill().
#
# NOTE(review): 'normal' is kept as the fallback for every column to preserve
# the existing results, but it does not look like a valid label for all of
# them (e.g. `dm` and `cad` are compared against "yes"/"no" further down).
# Consider a per-column fallback such as the column mode -- TODO confirm.
categorical_cols = ['rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']

for col in categorical_cols:
    df[col] = df[col].ffill().fillna('normal')

# Re-check the non-null counts after imputation.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             400 non-null    float64
 1   bp              400 non-null    float64
 2   sg              400 non-null    float64
 3   al              400 non-null    float64
 4   su              400 non-null    float64
 5   rbc             400 non-null    object 
 6   pc              400 non-null    object 
 7   pcc             400 non-null    object 
 8   ba              400 non-null    object 
 9   bgr             400 non-null    float64
 10  bu              400 non-null    float64
 11  sc              400 non-null    float64
 12  sod             400 non-null    float64
 13  pot             400 non-null    float64
 14  hemo            400 non-null    float64
 15  pcv             400 non-null    float64
 16  wc              400 non-null    float64
 17  rc              400 non-null    flo

In [6]:
# Clean up labels that carry stray whitespace, e.g. "ckd\t", "\tyes", "\tno".
# Stripping is vectorised (no row-by-row .loc loop) and also catches variants
# such as a leading space, which an exact match on "\tno" would miss.
for col in ['classification', 'dm', 'cad']:
    df[col] = df[col].str.strip()

# `dm` is reduced to a binary label: every value that is not exactly "no" is
# treated as "yes". Stripping first matters here -- otherwise a value such as
# " no" would be silently relabelled "yes" by this rule.
df['dm'] = df['dm'].where(df['dm'] == 'no', 'yes')

# Sanity check: only the cleaned class labels should remain.
df['classification'].value_counts()

ckd       250
notckd    150
Name: classification, dtype: int64

## Feature Extraction

Gunakan uji Chi-Square untuk menentukan kolom kategorikal mana yang penting, yaitu yang berhubungan secara signifikan dengan kolom klasifikasi.

In [20]:
# Chi-square test of independence between every object-typed feature and the
# target column `classification`.
categorical_features = (
    df.drop('classification', axis=1)
      .select_dtypes(include=['object'])
      .columns
)

for feature in categorical_features:
    # Contingency table: feature categories (rows) x class labels (columns).
    contingency = pd.crosstab(df[feature], df['classification'])

    # chi2_contingency returns (statistic, p-value, dof, expected); keep the p-value.
    p_value = stats.chi2_contingency(contingency)[1]
    print("P-value of %s : %E"%(feature, p_value))

    # Reject the null hypothesis of independence only when p is below 0.05.
    if not p_value < 0.05:
        print("%s is not significant"%(feature))
    else:
        print("%s is significant"%(feature))

  



P-value of rbc : 7.575018E-21
rbc is significant
P-value of pc : 1.191701E-16
pc is significant
P-value of pcc : 2.779587E-07
pcc is significant
P-value of ba : 4.465875E-04
ba is significant
P-value of htn : 1.250081E-31
htn is significant
P-value of dm : 1.708208E-28
dm is significant
P-value of cad : 5.716852E-06
cad is significant
P-value of appet : 1.002805E-14
appet is significant
P-value of pe : 1.687786E-13
pe is significant
P-value of ane : 1.974729E-10
ane is significant


Karena uji Chi-Square menunjukkan bahwa setiap kolom kategorikal tidak independen terhadap kolom klasifikasi (p-value < 0,05), kita bisa memasukkan semuanya dalam klasifikasi Naive Bayes.