In [70]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
data = pd.read_csv('kidney_disease.csv')
numeric_columns = data.select_dtypes(include=['number']).columns  
print(data.head())

   id   age    bp     sg   al   su     rbc        pc         pcc          ba  \
0   0  48.0  80.0  1.020  1.0  0.0     NaN    normal  notpresent  notpresent   
1   1   7.0  50.0  1.020  4.0  0.0     NaN    normal  notpresent  notpresent   
2   2  62.0  80.0  1.010  2.0  3.0  normal    normal  notpresent  notpresent   
3   3  48.0  70.0  1.005  4.0  0.0  normal  abnormal     present  notpresent   
4   4  51.0  80.0  1.010  2.0  0.0  normal    normal  notpresent  notpresent   

   ...  pcv    wc   rc  htn   dm  cad appet   pe  ane classification  
0  ...   44  7800  5.2  yes  yes   no  good   no   no            ckd  
1  ...   38  6000  NaN   no   no   no  good   no   no            ckd  
2  ...   31  7500  NaN   no  yes   no  poor   no  yes            ckd  
3  ...   32  6700  3.9  yes   no   no  poor  yes  yes            ckd  
4  ...   35  7300  4.6   no   no   no  good   no   no            ckd  

[5 rows x 26 columns]


In [73]:
print(data.info)

<bound method DataFrame.info of       id   age    bp     sg   al   su     rbc        pc         pcc  \
0      0  48.0  80.0  1.020  1.0  0.0     NaN    normal  notpresent   
1      1   7.0  50.0  1.020  4.0  0.0     NaN    normal  notpresent   
2      2  62.0  80.0  1.010  2.0  3.0  normal    normal  notpresent   
3      3  48.0  70.0  1.005  4.0  0.0  normal  abnormal     present   
4      4  51.0  80.0  1.010  2.0  0.0  normal    normal  notpresent   
..   ...   ...   ...    ...  ...  ...     ...       ...         ...   
395  395  55.0  80.0  1.020  0.0  0.0  normal    normal  notpresent   
396  396  42.0  70.0  1.025  0.0  0.0  normal    normal  notpresent   
397  397  12.0  80.0  1.020  0.0  0.0  normal    normal  notpresent   
398  398  17.0  60.0  1.025  0.0  0.0  normal    normal  notpresent   
399  399  58.0  80.0  1.025  0.0  0.0  normal    normal  notpresent   

             ba  ...  pcv    wc   rc  htn   dm  cad appet   pe  ane  \
0    notpresent  ...   44  7800  5.2  yes  y

In [74]:
print(data.dtypes)

id                  int64
age               float64
bp                float64
sg                float64
al                float64
su                float64
rbc                object
pc                 object
pcc                object
ba                 object
bgr               float64
bu                float64
sc                float64
sod               float64
pot               float64
hemo              float64
pcv                object
wc                 object
rc                 object
htn                object
dm                 object
cad                object
appet              object
pe                 object
ane                object
classification     object
dtype: object


In [75]:
data[['pcv', 'wc', 'rc']] = data[['pcv', 'wc', 'rc']].apply(pd.to_numeric, errors='coerce')

In [76]:
data_cleaned = data
numeric_columns = ['age', 'bp','bgr','bu','sc','sod','pot','hemo','pcv','wc','rc']
data_cleaned[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].mean())
non_numeric_columns = ['sg', 'al','su','rbc','pc','pcc','ba','htn','dm','cad','appet','pe','ane'] 
for column in non_numeric_columns:
    data_cleaned[column] = data[column].fillna(data[column].mode()[0])

print(data_cleaned.isnull().sum())

id                0
age               0
bp                0
sg                0
al                0
su                0
rbc               0
pc                0
pcc               0
ba                0
bgr               0
bu                0
sc                0
sod               0
pot               0
hemo              0
pcv               0
wc                0
rc                0
htn               0
dm                0
cad               0
appet             0
pe                0
ane               0
classification    0
dtype: int64


In [78]:
print(data.describe())

               id         age          bp          sg         al          su  \
count  400.000000  400.000000  400.000000  400.000000  400.00000  400.000000   
mean   199.500000   51.483376   76.469072    1.017712    0.90000    0.395000   
std    115.614301   16.974966   13.476298    0.005434    1.31313    1.040038   
min      0.000000    2.000000   50.000000    1.005000    0.00000    0.000000   
25%     99.750000   42.000000   70.000000    1.015000    0.00000    0.000000   
50%    199.500000   54.000000   78.234536    1.020000    0.00000    0.000000   
75%    299.250000   64.000000   80.000000    1.020000    2.00000    0.000000   
max    399.000000   90.000000  180.000000    1.025000    5.00000    5.000000   

              bgr          bu          sc         sod         pot        hemo  \
count  400.000000  400.000000  400.000000  400.000000  400.000000  400.000000   
mean   148.036517   57.425722    3.072454  137.528754    4.627244   12.526437   
std     74.782634   49.285887    5.6

In [82]:
# Drop rows with missing values
data_edited = data.dropna(inplace=True)


In [83]:
data_edited = data.drop_duplicates(inplace=True)

In [None]:
data = data.drop(['id'], axis = 1)

In [86]:
unique_classes = data['classification'].unique()
print("Unique classifications:", unique_classes)

Unique classifications: ['ckd' 'notckd']


In [None]:
data.replace('ckd\t', 'ckd', inplace = True)
unique_classes = data['classification'].unique()
print("Unique classifications:", unique_classes)