In [26]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer

In [31]:
# Load the raw CKD table; the first column of the CSV becomes the row index.
df = pd.read_csv('data/ckd.csv', index_col=0)

# Overwrite the header names taken from the file with descriptive ones.
# The list is positional: it must contain exactly one name per non-index
# column, in file order, or the assignment below raises.
column_names = [
    # numeric measurements and ordinal lab scores
    'age',
    'blood_pressure',
    'specific_gravity',
    'albumin',
    'sugar',
    # urine microscopy findings
    'abnormal_red_blood_cells',
    'abnormal_pus_cell',
    'pus_cell_clumps',
    'bacteria',
    # blood chemistry and blood counts
    'blood_glucose_random',
    'blood_urea',
    'serum_creatinine',
    'sodium',
    'potassium',
    'haemoglobin',
    'packed_cell_volume',
    'white_blood_cell_count',
    'red_blood_cell_count',
    # yes/no style history and symptom flags
    'hypertension',
    'diabetes_mellitus',
    'coronary_artery_disease',
    'poor_appetite',
    'peda_edema',
    'aanemia',
    # target label
    'class',
]
df.columns = column_names
df.head()

Unnamed: 0_level_0,age,blood_pressure,specific_gravity,albumin,sugar,abnormal_red_blood_cells,abnormal_pus_cell,pus_cell_clumps,bacteria,blood_glucose_random,blood_urea,serum_creatinine,sodium,potassium,haemoglobin,packed_cell_volume,white_blood_cell_count,red_blood_cell_count,hypertension,diabetes_mellitus,coronary_artery_disease,poor_appetite,peda_edema,aanemia,class
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,36.0,1.2,,,15.4,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,18.0,0.8,,,11.3,38,6000,,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,53.0,1.8,,,9.6,31,7500,,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,56.0,3.8,111.0,2.5,11.2,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,26.0,1.4,,,11.6,35,7300,4.6,no,no,no,good,no,no,ckd


In [32]:
# Clean the raw frame so that every column ends up numeric.

# Parse the three blood-count columns as numbers. errors='coerce' turns any
# entry that cannot be parsed into NaN instead of raising, so unparseable
# cells are treated as missing from here on.
for column in ['packed_cell_volume', 'white_blood_cell_count', 'red_blood_cell_count']:
    df[column] = pd.to_numeric(df[column], errors='coerce')

# Normalise label variants that carry stray tabs/spaces.
# The result is assigned back to the column rather than using
# `df[col].replace(..., inplace=True)`: an inplace call on the intermediate
# `df[col]` object is chained assignment, which warns in recent pandas and
# does not update `df` at all when Copy-on-Write is enabled.
df['diabetes_mellitus'] = df['diabetes_mellitus'].replace(
    to_replace={'\tno': 'no', '\tyes': 'yes', ' yes': 'yes'})
df['coronary_artery_disease'] = df['coronary_artery_disease'].replace(to_replace='\tno', value='no')
df['class'] = df['class'].replace(to_replace={'ckd\t': 'ckd', 'notckd': 'not ckd'})

# Columns holding two-valued text labels (including the target `class`).
categorical_columns = ['abnormal_red_blood_cells', 'abnormal_pus_cell', 'pus_cell_clumps',
                       'bacteria', 'hypertension', 'diabetes_mellitus', 'coronary_artery_disease',
                       'poor_appetite', 'peda_edema', 'aanemia', 'class']

# Shared label -> 0/1 encoding applied to every categorical column.
replace_dict = {'normal': 0, 'abnormal': 1, 'notpresent': 0, 'present': 1, 'no': 0, 'yes': 1,
               'good': 0, 'poor': 1, 'not ckd': 0, 'ckd': 1}

for column in categorical_columns:
    # infer_objects() converts an object-dtype result that now holds only
    # numbers (and NaN) to a numeric dtype explicitly; relying on replace()
    # to downcast implicitly is deprecated. Labels missing from replace_dict
    # are left unchanged, exactly as before.
    df[column] = df[column].replace(replace_dict).infer_objects()

In [35]:
# Fill missing feature values with KNN imputation on min-max scaled data,
# then map the result back to the original units.

# The target is dropped so the labels do not influence the imputed features.
# get_dummies only expands columns that are still object/category typed;
# columns that are already numeric pass through unchanged.
df_knn = pd.get_dummies(df.drop(columns=['class']), dummy_na=True)

# Scale every feature to [0, 1] so that no single column dominates the
# distance used to find neighbours. NaN entries are kept as NaN by the scaler.
scaler = MinMaxScaler(feature_range=(0, 1))
df_knn = pd.DataFrame(scaler.fit_transform(df_knn), columns=df_knn.columns)

# NOTE(review): the neighbour count is tied to the number of feature columns;
# confirm this is intentional rather than a tuned value.
knn_imputer = KNNImputer(n_neighbors=len(df_knn.columns))

# Impute in scaled space and undo the scaling in one step.
df_knn_imputed = pd.DataFrame(
    scaler.inverse_transform(knn_imputer.fit_transform(df_knn)),
    columns=df_knn.columns,
)

# Re-attach the target by position. `df_knn_imputed` has a fresh 0..n-1 index,
# whereas `df` is indexed by the first CSV column; assigning the Series
# directly would align on index labels and produce NaN or mismatched labels
# whenever those labels are not exactly 0..n-1. Row order is unchanged since
# `df`, so positional assignment is the correct pairing.
df_knn_imputed['class'] = df['class'].to_numpy()

# Imputed values are neighbour averages; round the 0/1-encoded columns back
# to whole numbers.
df_knn_imputed[categorical_columns] = df_knn_imputed[categorical_columns].round(0)
df_knn_imputed

Unnamed: 0,age,blood_pressure,specific_gravity,albumin,sugar,abnormal_red_blood_cells,abnormal_pus_cell,pus_cell_clumps,bacteria,blood_glucose_random,blood_urea,serum_creatinine,sodium,potassium,haemoglobin,packed_cell_volume,white_blood_cell_count,red_blood_cell_count,hypertension,diabetes_mellitus,coronary_artery_disease,poor_appetite,peda_edema,aanemia,class
0,48.0,80.0,1.02,1.0,0.0,0.0,0.0,0.0,0.0,121.0,36.0,1.2,137.708333,4.391667,15.4,44.0,7800.0,5.2,1.0,1.0,0.0,0.0,0.0,0.0,1
1,7.0,50.0,1.02,4.0,0.0,0.0,0.0,0.0,0.0,110.958333,18.0,0.8,140.5,4.1625,11.3,38.0,6000.0,5.329167,0.0,0.0,0.0,0.0,0.0,0.0,1
2,62.0,80.0,1.01,2.0,3.0,0.0,0.0,0.0,0.0,423.0,53.0,1.8,134.291667,4.5125,9.6,31.0,7500.0,3.9875,0.0,1.0,0.0,1.0,0.0,1.0,1
3,48.0,70.0,1.005,4.0,0.0,0.0,1.0,1.0,0.0,117.0,56.0,3.8,111.0,2.5,11.2,32.0,6700.0,3.9,1.0,0.0,0.0,1.0,1.0,1.0,1
4,51.0,80.0,1.01,2.0,0.0,0.0,0.0,0.0,0.0,106.0,26.0,1.4,140.0,4.175,11.6,35.0,7300.0,4.6,0.0,0.0,0.0,0.0,0.0,0.0,1
5,60.0,90.0,1.015,3.0,0.0,0.0,0.0,0.0,0.0,74.0,25.0,1.1,142.0,3.2,12.2,39.0,7800.0,4.4,1.0,1.0,0.0,0.0,1.0,0.0,1
6,68.0,70.0,1.01,0.0,0.0,0.0,0.0,0.0,0.0,100.0,54.0,24.0,104.0,4.0,12.4,36.0,8183.333333,5.179167,0.0,0.0,0.0,0.0,0.0,0.0,1
7,24.0,82.5,1.015,2.0,4.0,0.0,1.0,0.0,0.0,410.0,31.0,1.1,136.958333,5.55,12.4,44.0,6900.0,5.0,0.0,1.0,0.0,0.0,1.0,0.0,1
8,52.0,100.0,1.015,3.0,0.0,0.0,1.0,1.0,0.0,138.0,60.0,1.9,133.291667,6.508333,10.8,33.0,9600.0,4.0,1.0,1.0,0.0,0.0,0.0,1.0,1
9,53.0,90.0,1.02,2.0,0.0,1.0,1.0,1.0,0.0,70.0,107.0,7.2,114.0,3.7,9.5,29.0,12100.0,3.7,1.0,1.0,0.0,1.0,0.0,1.0,1


In [36]:
# Persist the imputed table. index=False leaves the row index out of the
# file, so the CSV contains only the data columns.
df_knn_imputed.to_csv(path_or_buf='data/ckd_clean.csv', index=False)

In [37]:
# Read the saved file back to check the round trip; with no index column
# specified, pandas assigns a default integer index.
df_clean = pd.read_csv(filepath_or_buffer='data/ckd_clean.csv')
df_clean

Unnamed: 0,age,blood_pressure,specific_gravity,albumin,sugar,abnormal_red_blood_cells,abnormal_pus_cell,pus_cell_clumps,bacteria,blood_glucose_random,blood_urea,serum_creatinine,sodium,potassium,haemoglobin,packed_cell_volume,white_blood_cell_count,red_blood_cell_count,hypertension,diabetes_mellitus,coronary_artery_disease,poor_appetite,peda_edema,aanemia,class
0,48.0,80.0,1.02,1.0,0.0,0.0,0.0,0.0,0.0,121.0,36.0,1.2,137.708333,4.391667,15.4,44.0,7800.0,5.2,1.0,1.0,0.0,0.0,0.0,0.0,1
1,7.0,50.0,1.02,4.0,0.0,0.0,0.0,0.0,0.0,110.958333,18.0,0.8,140.5,4.1625,11.3,38.0,6000.0,5.329167,0.0,0.0,0.0,0.0,0.0,0.0,1
2,62.0,80.0,1.01,2.0,3.0,0.0,0.0,0.0,0.0,423.0,53.0,1.8,134.291667,4.5125,9.6,31.0,7500.0,3.9875,0.0,1.0,0.0,1.0,0.0,1.0,1
3,48.0,70.0,1.005,4.0,0.0,0.0,1.0,1.0,0.0,117.0,56.0,3.8,111.0,2.5,11.2,32.0,6700.0,3.9,1.0,0.0,0.0,1.0,1.0,1.0,1
4,51.0,80.0,1.01,2.0,0.0,0.0,0.0,0.0,0.0,106.0,26.0,1.4,140.0,4.175,11.6,35.0,7300.0,4.6,0.0,0.0,0.0,0.0,0.0,0.0,1
5,60.0,90.0,1.015,3.0,0.0,0.0,0.0,0.0,0.0,74.0,25.0,1.1,142.0,3.2,12.2,39.0,7800.0,4.4,1.0,1.0,0.0,0.0,1.0,0.0,1
6,68.0,70.0,1.01,0.0,0.0,0.0,0.0,0.0,0.0,100.0,54.0,24.0,104.0,4.0,12.4,36.0,8183.333333,5.179167,0.0,0.0,0.0,0.0,0.0,0.0,1
7,24.0,82.5,1.015,2.0,4.0,0.0,1.0,0.0,0.0,410.0,31.0,1.1,136.958333,5.55,12.4,44.0,6900.0,5.0,0.0,1.0,0.0,0.0,1.0,0.0,1
8,52.0,100.0,1.015,3.0,0.0,0.0,1.0,1.0,0.0,138.0,60.0,1.9,133.291667,6.508333,10.8,33.0,9600.0,4.0,1.0,1.0,0.0,0.0,0.0,1.0,1
9,53.0,90.0,1.02,2.0,0.0,1.0,1.0,1.0,0.0,70.0,107.0,7.2,114.0,3.7,9.5,29.0,12100.0,3.7,1.0,1.0,0.0,1.0,0.0,1.0,1


In [38]:
# Flag each row of `df` (the frame before imputation) that has at least one
# missing value, then keep only those rows for inspection.
has_missing_value = df.isna().any(axis='columns')
rows_with_nan = df.loc[has_missing_value]
rows_with_nan

Unnamed: 0_level_0,age,blood_pressure,specific_gravity,albumin,sugar,abnormal_red_blood_cells,abnormal_pus_cell,pus_cell_clumps,bacteria,blood_glucose_random,blood_urea,serum_creatinine,sodium,potassium,haemoglobin,packed_cell_volume,white_blood_cell_count,red_blood_cell_count,hypertension,diabetes_mellitus,coronary_artery_disease,poor_appetite,peda_edema,aanemia,class
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
0,48.0,80.0,1.02,1.0,0.0,,0.0,0.0,0.0,121.0,36.0,1.2,,,15.4,44.0,7800.0,5.2,1.0,1.0,0.0,0.0,0.0,0.0,1
1,7.0,50.0,1.02,4.0,0.0,,0.0,0.0,0.0,,18.0,0.8,,,11.3,38.0,6000.0,,0.0,0.0,0.0,0.0,0.0,0.0,1
2,62.0,80.0,1.01,2.0,3.0,0.0,0.0,0.0,0.0,423.0,53.0,1.8,,,9.6,31.0,7500.0,,0.0,1.0,0.0,1.0,0.0,1.0,1
4,51.0,80.0,1.01,2.0,0.0,0.0,0.0,0.0,0.0,106.0,26.0,1.4,,,11.6,35.0,7300.0,4.6,0.0,0.0,0.0,0.0,0.0,0.0,1
5,60.0,90.0,1.015,3.0,0.0,,,0.0,0.0,74.0,25.0,1.1,142.0,3.2,12.2,39.0,7800.0,4.4,1.0,1.0,0.0,0.0,1.0,0.0,1
6,68.0,70.0,1.01,0.0,0.0,,0.0,0.0,0.0,100.0,54.0,24.0,104.0,4.0,12.4,36.0,,,0.0,0.0,0.0,0.0,0.0,0.0,1
7,24.0,,1.015,2.0,4.0,0.0,1.0,0.0,0.0,410.0,31.0,1.1,,,12.4,44.0,6900.0,5.0,0.0,1.0,0.0,0.0,1.0,0.0,1
8,52.0,100.0,1.015,3.0,0.0,0.0,1.0,1.0,0.0,138.0,60.0,1.9,,,10.8,33.0,9600.0,4.0,1.0,1.0,0.0,0.0,0.0,1.0,1
10,50.0,60.0,1.01,2.0,4.0,,1.0,1.0,0.0,490.0,55.0,4.0,,,9.4,28.0,,,1.0,1.0,0.0,0.0,0.0,1.0,1
12,68.0,70.0,1.015,3.0,1.0,,0.0,1.0,0.0,208.0,72.0,2.1,138.0,5.8,9.7,28.0,12200.0,3.4,1.0,1.0,1.0,1.0,1.0,0.0,1
