## Data Cleaning

### Importing essential libraries:

In [60]:
import pandas as pd 
import numpy as np

### Sample data:

In [61]:
# Read the hepatitis dataset from a CSV file given by a relative path.
df = pd.read_csv(filepath_or_buffer='hepatitis_csv.csv')
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology,class
0,30,male,False,False,False,False,False,False,False,False,False,False,False,1.0,85.0,18.0,4.0,,False,live
1,50,female,False,False,True,False,False,False,False,False,False,False,False,0.9,135.0,42.0,3.5,,False,live
2,78,female,True,False,True,False,False,True,False,False,False,False,False,0.7,96.0,32.0,4.0,,False,live
3,31,female,,True,False,False,False,True,False,False,False,False,False,0.7,46.0,52.0,4.0,80.0,False,live
4,34,female,True,False,False,False,False,True,False,False,False,False,False,1.0,,200.0,4.0,,False,live


### Checking for null values:

In [62]:
# Count the missing entries in every column.
df.isna().sum()

age                 0
sex                 0
steroid             1
antivirals          0
fatigue             1
malaise             1
anorexia            1
liver_big          10
liver_firm         11
spleen_palpable     5
spiders             5
ascites             5
varices             5
bilirubin           6
alk_phosphate      29
sgot                4
albumin            16
protime            67
histology           0
class               0
dtype: int64

### Dropping null values from certain columns:

In [63]:
# Discard every row that lacks a value in at least one of the listed columns.
df = df.dropna(
    how='any',
    subset=[
        'steroid', 'fatigue', 'malaise', 'anorexia', 'liver_big',
        'liver_firm', 'spleen_palpable', 'spiders', 'ascites', 'varices',
    ],
)

In [64]:
# Recount the missing entries per column after dropping incomplete rows.
df.isna().sum()

age                 0
sex                 0
steroid             0
antivirals          0
fatigue             0
malaise             0
anorexia            0
liver_big           0
liver_firm          0
spleen_palpable     0
spiders             0
ascites             0
varices             0
bilirubin           4
alk_phosphate      24
sgot                2
albumin            13
protime            57
histology           0
class               0
dtype: int64

### Replacing the remaining null values with each column's mean:

In [65]:
# Fill the remaining gaps in these float columns with each column's own mean.
for col in ['bilirubin', 'alk_phosphate', 'sgot', 'albumin']:
    # mean() skips NaN by default, so the average covers the observed values only.
    df[col] = df[col].fillna(df[col].mean())

In [66]:
# Recount the missing entries per column after the mean imputation.
df.isna().sum()

age                 0
sex                 0
steroid             0
antivirals          0
fatigue             0
malaise             0
anorexia            0
liver_big           0
liver_firm          0
spleen_palpable     0
spiders             0
ascites             0
varices             0
bilirubin           0
alk_phosphate       0
sgot                0
albumin             0
protime            57
histology           0
class               0
dtype: int64

### As we can see, the 'protime' column has many null values (57 of the 142 remaining rows), so including it would bias the analysis. It is therefore better to remove the 'protime' column for a cleaner analysis.

In [67]:
# Summary statistics for 'protime'. `include` is ignored when describing a
# Series, so the argument is left out.
df['protime'].describe()

count     85.000000
mean      62.176471
std       22.915568
min        0.000000
25%       46.000000
50%       62.000000
75%       76.000000
max      100.000000
Name: protime, dtype: float64

### Removing the 'protime' column:

In [68]:
# Remove the 'protime' column; every other column is kept.
df = df.drop(labels='protime', axis='columns')

### Now, the data is clean without null values:

In [69]:
# Final per-column count of missing entries.
df.isna().sum()

age                0
sex                0
steroid            0
antivirals         0
fatigue            0
malaise            0
anorexia           0
liver_big          0
liver_firm         0
spleen_palpable    0
spiders            0
ascites            0
varices            0
bilirubin          0
alk_phosphate      0
sgot               0
albumin            0
histology          0
class              0
dtype: int64

### Several boolean columns were read in as 'object' instead of 'bool':

In [71]:
# Inspect each column's dtype to spot columns stored with an unexpected type.
df.dtypes

age                  int64
sex                 object
steroid             object
antivirals            bool
fatigue             object
malaise             object
anorexia            object
liver_big           object
liver_firm          object
spleen_palpable     object
spiders             object
ascites             object
varices             object
bilirubin          float64
alk_phosphate      float64
sgot               float64
albumin            float64
histology             bool
class               object
dtype: object

### Correcting those data types:

In [72]:
# Flag columns that df.dtypes reports as 'object' and that should be boolean.
bool_cols = [
    'steroid', 'fatigue', 'malaise', 'anorexia', 'liver_big',
    'liver_firm', 'spleen_palpable', 'spiders', 'ascites', 'varices',
]

# astype(bool) maps NaN and every non-empty string to True, so refuse to cast
# unless each value in these columns is already a genuine True/False.
assert df[bool_cols].isin([True, False]).all().all(), \
    'non-boolean or missing values found in the flag columns'

df[bool_cols] = df[bool_cols].astype(bool)

In [73]:
# Re-inspect the dtypes to confirm the cast to bool took effect.
df.dtypes

age                  int64
sex                 object
steroid               bool
antivirals            bool
fatigue               bool
malaise               bool
anorexia              bool
liver_big             bool
liver_firm            bool
spleen_palpable       bool
spiders               bool
ascites               bool
varices               bool
bilirubin          float64
alk_phosphate      float64
sgot               float64
albumin            float64
histology             bool
class               object
dtype: object

### Saving the cleaned data as a 'csv' file:

In [74]:
# Write the cleaned frame to disk, leaving out the row index.
df.to_csv(path_or_buf='cleaned_hepatitis.csv', index=False)