In [1]:
import pandas as pd
import numpy as np
import statistics as st

In [2]:
# Read the ARFF file as plain text
file_path = "chronic_kidney_disease_1.arff"

with open(file_path, 'r') as f:
    arff_text = f.read()

# Extract the attribute section
col_start = arff_text.index("@relation") + len("@relation") + 1
col_end = arff_text.index("@data") + len("@data") + 1
column_str = arff_text[col_start:col_end]

# Get the attribue that will be used as column names in pandas dataframe
c = column_str.split("\n")
col = [i.split(" ")[1] for i in c if len(i.split(" ")) > 1]
col =[elem.replace("'", "") for elem in col]

# Split the data into rows and convert to a list of lists
data_start = arff_text.index("@data") + len("@data") + 1
data_str = arff_text[data_start:]
data = data_str.split("\n")
data = [i.split(",") for i in data]
data = data[1:401]

df = pd.DataFrame(data, columns= col)

# Print the DataFrame
df.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48,80,1.02,1,0,?,normal,notpresent,notpresent,121,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,7,50,1.02,4,0,?,normal,notpresent,notpresent,?,...,38,6000,?,no,no,no,good,no,no,ckd
2,62,80,1.01,2,3,normal,normal,notpresent,notpresent,423,...,31,7500,?,no,yes,no,poor,no,yes,ckd
3,48,70,1.005,4,0,normal,abnormal,present,notpresent,117,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,51,80,1.01,2,0,normal,normal,notpresent,notpresent,106,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [3]:
##check for null values
df.isnull().sum()

age      0
bp       0
sg       0
al       0
su       0
rbc      0
pc       0
pcc      0
ba       0
bgr      0
bu       0
sc       0
sod      0
pot      0
hemo     0
pcv      0
wbcc     0
rbcc     0
htn      0
dm       0
cad      0
appet    0
pe       0
ane      0
class    0
dtype: int64

In [4]:
#replace ? with null to get an overview of the null values
df.replace('?', np.nan,inplace=True)

In [5]:
#check the amount of null values
df.isnull().sum()

age        9
bp        12
sg        47
al        46
su        49
rbc      152
pc        65
pcc        4
ba         4
bgr       44
bu        19
sc        17
sod       87
pot       88
hemo      52
pcv       71
wbcc     106
rbcc     131
htn        2
dm         2
cad        2
appet      1
pe         1
ane        1
class      0
dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   age     391 non-null    object
 1   bp      388 non-null    object
 2   sg      353 non-null    object
 3   al      354 non-null    object
 4   su      351 non-null    object
 5   rbc     248 non-null    object
 6   pc      335 non-null    object
 7   pcc     396 non-null    object
 8   ba      396 non-null    object
 9   bgr     356 non-null    object
 10  bu      381 non-null    object
 11  sc      383 non-null    object
 12  sod     313 non-null    object
 13  pot     312 non-null    object
 14  hemo    348 non-null    object
 15  pcv     329 non-null    object
 16  wbcc    294 non-null    object
 17  rbcc    269 non-null    object
 18  htn     398 non-null    object
 19  dm      398 non-null    object
 20  cad     398 non-null    object
 21  appet   399 non-null    object
 22  pe      399 non-null    ob

In [7]:
##replace numerical columns that have null values to zero
def numericalNan_to_zero(df, columns):
    df[columns] = df[columns].replace(np.nan, 0)
    return df

In [8]:
##convert numerical columns to floats
def convert_columns_to_floats(df, columns):
    df[columns] = df[columns].astype(float)
    return df

In [9]:
columns_to_convert = ['age', 'bp', 'sg' , 'al' , 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wbcc', 'rbcc']

df = numericalNan_to_zero(df, columns_to_convert)

In [10]:
df[df['age']==0]

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
30,0,70,0.0,0,0,,,notpresent,notpresent,93,...,0,0,0.0,yes,yes,no,good,no,no,ckd
73,0,100,1.015,2,0,abnormal,abnormal,notpresent,notpresent,129,...,14,6300,0.0,yes,no,no,good,yes,yes,ckd
112,0,60,1.015,3,0,abnormal,abnormal,notpresent,notpresent,0,...,33,0,0.0,no,no,no,good,no,no,ckd
116,0,70,1.015,4,0,abnormal,normal,notpresent,notpresent,104,...,0,0,0.0,no,no,no,good,yes,no,ckd
117,0,70,1.02,0,0,,,notpresent,notpresent,219,...,37,9800,4.4,no,no,no,good,no,no,ckd
169,0,70,1.01,0,2,,normal,notpresent,notpresent,220,...,27,0,0.0,yes,yes,no,good,no,yes,ckd
191,0,70,1.01,3,0,normal,normal,notpresent,notpresent,110,...,26,9200,3.4,yes,yes,no,poor,no,no,ckd
203,0,90,0.0,0,0,,,notpresent,notpresent,207,...,0,0,0.0,yes,yes,no,good,no,yes,ckd
268,0,80,0.0,0,0,,,notpresent,notpresent,100,...,53,8500,4.9,no,no,no,good,no,no,notckd


In [11]:
columns_to_convert = ['age', 'bp', 'sg' , 'al' , 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wbcc', 'rbcc']

df = convert_columns_to_floats(df, columns_to_convert)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     400 non-null    float64
 1   bp      400 non-null    float64
 2   sg      400 non-null    float64
 3   al      400 non-null    float64
 4   su      400 non-null    float64
 5   rbc     248 non-null    object 
 6   pc      335 non-null    object 
 7   pcc     396 non-null    object 
 8   ba      396 non-null    object 
 9   bgr     400 non-null    float64
 10  bu      400 non-null    float64
 11  sc      400 non-null    float64
 12  sod     400 non-null    float64
 13  pot     400 non-null    float64
 14  hemo    400 non-null    float64
 15  pcv     400 non-null    float64
 16  wbcc    400 non-null    float64
 17  rbcc    400 non-null    float64
 18  htn     398 non-null    object 
 19  dm      398 non-null    object 
 20  cad     398 non-null    object 
 21  appet   399 non-null    object 
 22  pe

In [13]:
df.describe()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,50.325,74.175,0.897863,0.9,0.395,131.7525,54.698,2.941875,107.61625,3.60925,10.898,31.9825,6178.5,3.16575
std,18.616951,18.766989,0.328075,1.31313,1.040038,87.995906,50.780641,5.651664,57.548745,3.410949,5.016848,16.962799,4490.489839,2.36621
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,40.75,70.0,1.01,0.0,0.0,93.0,25.0,0.9,124.0,3.275,9.375,26.0,0.0,0.0
50%,54.0,75.0,1.015,0.0,0.0,114.5,40.0,1.2,136.0,4.05,12.0,37.0,6900.0,4.0
75%,64.0,80.0,1.02,2.0,0.0,150.0,61.75,2.725,141.0,4.8,14.625,44.0,9400.0,5.1
max,90.0,180.0,1.025,5.0,5.0,490.0,391.0,76.0,163.0,47.0,17.8,54.0,26400.0,8.0


In [14]:
##write a function to replace the numerical columns with a value of 0 with the mean of the column
def replace_zero_with_mean(df):
    numerical_columns = df.select_dtypes(include=np.number).columns
    for column in numerical_columns:
        df[column] = df[column].replace(0, df[column].mean())

    return df

In [15]:
df = replace_zero_with_mean(df)

In [16]:
df

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.020,1.0,0.395,,normal,notpresent,notpresent,121.0000,...,44.0,7800.0,5.20000,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.020,4.0,0.395,,normal,notpresent,notpresent,131.7525,...,38.0,6000.0,3.16575,no,no,no,good,no,no,ckd
2,62.0,80.0,1.010,2.0,3.000,normal,normal,notpresent,notpresent,423.0000,...,31.0,7500.0,3.16575,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.395,normal,abnormal,present,notpresent,117.0000,...,32.0,6700.0,3.90000,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.010,2.0,0.395,normal,normal,notpresent,notpresent,106.0000,...,35.0,7300.0,4.60000,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0.9,0.395,normal,normal,notpresent,notpresent,140.0000,...,47.0,6700.0,4.90000,no,no,no,good,no,no,notckd
396,42.0,70.0,1.025,0.9,0.395,normal,normal,notpresent,notpresent,75.0000,...,54.0,7800.0,6.20000,no,no,no,good,no,no,notckd
397,12.0,80.0,1.020,0.9,0.395,normal,normal,notpresent,notpresent,100.0000,...,49.0,6600.0,5.40000,no,no,no,good,no,no,notckd
398,17.0,60.0,1.025,0.9,0.395,normal,normal,notpresent,notpresent,114.0000,...,51.0,7200.0,5.90000,no,no,no,good,no,no,notckd


In [17]:
df.isnull().sum()

age        0
bp         0
sg         0
al         0
su         0
rbc      152
pc        65
pcc        4
ba         4
bgr        0
bu         0
sc         0
sod        0
pot        0
hemo       0
pcv        0
wbcc       0
rbcc       0
htn        2
dm         2
cad        2
appet      1
pe         1
ane        1
class      0
dtype: int64

In [18]:
##write a function to replace the nan non numerical columns with the mode of the column
def replace_nan_with_mode(df):
    non_numerical_columns = df.select_dtypes(exclude=np.number).columns
    for column in non_numerical_columns:
        df[column] = df[column].replace(np.nan, st.mode(df[column]))

    return df

In [19]:
df = replace_nan_with_mode(df)

In [20]:
##round the numerical values to 2 decimal places
def round_numerical_to_two(df):
    numerical_columns = df.select_dtypes(include=np.number).columns
    for column in numerical_columns:
        df[column] = df[column].round(2)

    return df    

df = round_numerical_to_two(df)

In [21]:
df.isnull().sum()

age      0
bp       0
sg       0
al       0
su       0
rbc      0
pc       0
pcc      0
ba       0
bgr      0
bu       0
sc       0
sod      0
pot      0
hemo     0
pcv      0
wbcc     0
rbcc     0
htn      0
dm       0
cad      0
appet    0
pe       0
ane      0
class    0
dtype: int64

In [24]:
##save to csv
df.to_csv('ckd.csv', index=False)

In [25]:
df['sod']

0      107.62
1      107.62
2      107.62
3      111.00
4      107.62
        ...  
395    150.00
396    141.00
397    137.00
398    135.00
399    141.00
Name: sod, Length: 400, dtype: float64