In [23]:
import pandas as pd
import numpy as np
from univariate import univar
# Load the kidney-disease table from the CSV file in the working directory.
dataset = pd.read_csv(filepath_or_buffer="kidney_disease.csv")

In [24]:
# Count the missing values in every column of the freshly loaded table.
dataset.isna().sum()

id                  0
age                 9
bp                 12
sg                 47
al                 46
su                 49
rbc               152
pc                 65
pcc                 4
ba                  4
bgr                44
bu                 19
sc                 17
sod                87
pot                88
hemo               52
pcv                70
wc                105
rc                130
htn                 2
dm                  2
cad                 2
appet               1
pe                  1
ane                 1
classification      0
dtype: int64

In [25]:
# Split the column names of `dataset` into two lists: quantitative (`quan`)
# and qualitative (`qual`).
# NOTE(review): the classification rule lives in the local `univariate`
# module - presumably dtype-based; confirm there.
quan, qual  = univar.quanqual(dataset)

In [26]:
# Show the column names that univar.quanqual classified as quantitative.
quan

['id', 'age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo']

In [27]:
# Simple approach: replace the missing values of each quantitative column
# with that column's mean.
# The filled column is assigned back to `dataset` instead of calling
# fillna(..., inplace=True) on dataset[column]: the chained in-place form acts
# on an intermediate object, raises a FutureWarning in recent pandas 2.x
# releases and leaves `dataset` unchanged once Copy-on-Write is enabled.
for column in quan:
    dataset[column] = dataset[column].fillna(dataset[column].mean())

In [28]:
# Confirm that no quantitative column has missing values left.
dataset[quan].isna().sum()

id      0
age     0
bp      0
sg      0
al      0
su      0
bgr     0
bu      0
sc      0
sod     0
pot     0
hemo    0
dtype: int64

In [29]:
# Show the column names that univar.quanqual classified as qualitative.
qual

['rbc',
 'pc',
 'pcc',
 'ba',
 'pcv',
 'wc',
 'rc',
 'htn',
 'dm',
 'cad',
 'appet',
 'pe',
 'ane',
 'classification']

In [32]:
# Simple approach: replace the missing values of each qualitative column
# with that column's mode (mode()[0] is the first, i.e. most frequent, value).
# The filled column is assigned back to `dataset` instead of calling
# fillna(..., inplace=True) on dataset[col]: the chained in-place form acts
# on an intermediate object, raises a FutureWarning in recent pandas 2.x
# releases and leaves `dataset` unchanged once Copy-on-Write is enabled.
for col in qual:
    dataset[col] = dataset[col].fillna(dataset[col].mode()[0])

In [33]:
# Confirm that no qualitative column has missing values left.
dataset[qual].isna().sum()

rbc               0
pc                0
pcc               0
ba                0
pcv               0
wc                0
rc                0
htn               0
dm                0
cad               0
appet             0
pe                0
ane               0
classification    0
dtype: int64

In [6]:
# Alternative to filling the `quan` and `qual` columns one by one: use
# scikit-learn's SimpleImputer to impute a whole group of columns in one call.
# Missing quantitative values are replaced with the column mean (the columns
# are treated as independent of each other, so a per-column mean is used).
# Only quantitative data is handled here because a mean cannot be computed
# for qualitative data.
# NOTE(review): on a top-to-bottom run `dataset` has already been filled by
# the fillna cells above, so this imputer finds nothing to replace - confirm
# which imputation path is the intended one.
from sklearn.impute import SimpleImputer
imp = SimpleImputer(strategy="mean")
# fit_transform returns a plain array by default, so the column names are
# restored here; the original row index is passed as well to keep the result
# aligned with `dataset`.
quan_imputed = pd.DataFrame(
    imp.fit_transform(dataset[quan]), columns=quan, index=dataset.index
)
quan_imputed

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo
0,0.0,48.0,80.0,1.020,1.0,0.0,121.000000,36.0,1.2,137.528754,4.627244,15.4
1,1.0,7.0,50.0,1.020,4.0,0.0,148.036517,18.0,0.8,137.528754,4.627244,11.3
2,2.0,62.0,80.0,1.010,2.0,3.0,423.000000,53.0,1.8,137.528754,4.627244,9.6
3,3.0,48.0,70.0,1.005,4.0,0.0,117.000000,56.0,3.8,111.000000,2.500000,11.2
4,4.0,51.0,80.0,1.010,2.0,0.0,106.000000,26.0,1.4,137.528754,4.627244,11.6
...,...,...,...,...,...,...,...,...,...,...,...,...
395,395.0,55.0,80.0,1.020,0.0,0.0,140.000000,49.0,0.5,150.000000,4.900000,15.7
396,396.0,42.0,70.0,1.025,0.0,0.0,75.000000,31.0,1.2,141.000000,3.500000,16.5
397,397.0,12.0,80.0,1.020,0.0,0.0,100.000000,26.0,0.6,137.000000,4.400000,15.8
398,398.0,17.0,60.0,1.025,0.0,0.0,114.000000,50.0,1.0,135.000000,4.900000,14.2


In [7]:
# Check whether any empty/null values are still present in the quantitative data.
quan_imputed.isna().sum()

id      0
age     0
bp      0
sg      0
al      0
su      0
bgr     0
bu      0
sc      0
sod     0
pot     0
hemo    0
dtype: int64

In [8]:
# Fill the empty/null values with the mode (i.e. the most frequent value),
# since this is categorical / qualitative data.
# NOTE(review): `pcv`, `wc` and `rc` are in `qual` although their displayed
# values look numeric - presumably they are stored as strings in the CSV;
# verify before treating them as categories.
imp1 = SimpleImputer(strategy="most_frequent")
# fit_transform returns a plain array by default, so the column names are
# restored here; the original row index is passed as well to keep the result
# aligned with `dataset`.
qual_imputed = pd.DataFrame(
    imp1.fit_transform(dataset[qual]), columns=qual, index=dataset.index
)
qual_imputed

Unnamed: 0,rbc,pc,pcc,ba,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,normal,normal,notpresent,notpresent,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,normal,normal,notpresent,notpresent,38,6000,5.2,no,no,no,good,no,no,ckd
2,normal,normal,notpresent,notpresent,31,7500,5.2,no,yes,no,poor,no,yes,ckd
3,normal,abnormal,present,notpresent,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,normal,normal,notpresent,notpresent,35,7300,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,normal,normal,notpresent,notpresent,47,6700,4.9,no,no,no,good,no,no,notckd
396,normal,normal,notpresent,notpresent,54,7800,6.2,no,no,no,good,no,no,notckd
397,normal,normal,notpresent,notpresent,49,6600,5.4,no,no,no,good,no,no,notckd
398,normal,normal,notpresent,notpresent,51,7200,5.9,no,no,no,good,no,no,notckd


In [9]:
# For the quantitative data, build the descriptive-statistics table; it is
# used in the following cells to find the columns that contain outliers and
# to replace those outliers.
descriptive = univar.univariate_table(quan_imputed, quan)
descriptive

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo
Mean,199.5,51.4834,76.4691,1.01741,1.01695,0.450142,148.037,57.4257,3.07245,137.529,4.62724,12.5264
Median,199.5,54.0,78.2345,1.01741,1.0,0.0,126.0,44.0,1.4,137.529,4.62724,12.5264
Mode,0.0,60.0,80.0,1.02,0.0,0.0,148.037,57.4257,1.2,137.529,4.62724,12.5264
Q1:25%,99.75,42.0,70.0,1.015,0.0,0.0,101.0,27.0,0.9,135.0,4.0,10.875
Q2:50%,199.5,54.0,78.2345,1.01741,1.0,0.0,126.0,44.0,1.4,137.529,4.62724,12.5264
Q3:75%,299.25,64.0,80.0,1.02,2.0,0.450142,150.0,61.75,3.07245,141.0,4.8,14.625
99%,395.01,80.01,110.0,1.025,4.0,4.0,425.22,235.06,18.159,150.0,6.501,17.601
Q4:100%,399.0,90.0,180.0,1.025,5.0,5.0,490.0,391.0,76.0,163.0,47.0,17.8
IQR,199.5,22.0,10.0,0.005,2.0,0.450142,49.0,34.75,2.17245,6.0,0.8,3.75
1.5Rule,299.25,33.0,15.0,0.0075,3.0,0.675214,73.5,52.125,3.25868,9.0,1.2,5.625


In [10]:
# From the descriptive table, collect the names of the columns that have
# outliers on the low side (lesser_outliers) and on the high side
# (greater_outliers).
# NOTE(review): presumably judged against Q1 - 1.5*IQR and Q3 + 1.5*IQR -
# confirm in the local `univariate` module.
lesser_outliers, greater_outliers = univar.find_outliers_columns(quan, descriptive)

In [11]:
# Show the columns flagged as having low-side outliers.
lesser_outliers

['age', 'bp', 'sg', 'bgr', 'sod', 'pot', 'hemo']

In [12]:
# Show the columns flagged as having high-side outliers.
greater_outliers

['bp', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot']

In [13]:
# Replace the outliers in the flagged columns of quan_imputed.
# NOTE(review): no return value is captured, so this presumably modifies
# quan_imputed in place (the re-check that follows finds no outliers left);
# the replacement rule itself is defined in the local `univariate` module -
# confirm there.
univar.replace_outliers(quan_imputed, descriptive, lesser_outliers, greater_outliers)

In [14]:
# Cross-verify by repeating the same steps to confirm that all outliers have
# been removed: rebuild the descriptive table from the updated quan_imputed
# and search for outlier columns again.
descriptive = univar.univariate_table(quan_imputed, quan)
lesser_outliers, greater_outliers = univar.find_outliers_columns(quan, descriptive)

In [15]:
# Show the low-side outlier columns found by the re-check.
lesser_outliers

[]

In [16]:
# Show the high-side outlier columns found by the re-check.
greater_outliers

[]