# Importing the required libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline
import warnings 
warnings.filterwarnings('ignore')

# Reading data present in the CSV file

In [2]:
df_liver = pd.read_csv("indian_liver_patient.csv") #data of indian_liver_patient.csv (CSV File) is read into a DataFrame named as liver_df

In [3]:
df_liver.head() #extracting first five rows with all the columns

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [4]:
df_liver.info() #extracting details such as column name, number of rows, data type of a dataframe

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Age                         583 non-null    int64  
 1   Gender                      583 non-null    object 
 2   Total_Bilirubin             583 non-null    float64
 3   Direct_Bilirubin            583 non-null    float64
 4   Alkaline_Phosphotase        583 non-null    int64  
 5   Alamine_Aminotransferase    583 non-null    int64  
 6   Aspartate_Aminotransferase  583 non-null    int64  
 7   Total_Protiens              583 non-null    float64
 8   Albumin                     583 non-null    float64
 9   Albumin_and_Globulin_Ratio  579 non-null    float64
 10  Dataset                     583 non-null    int64  
dtypes: float64(5), int64(5), object(1)
memory usage: 50.2+ KB


In [5]:
df_liver.columns #retrieving columns of a dataframe

Index(['Age', 'Gender', 'Total_Bilirubin', 'Direct_Bilirubin',
       'Alkaline_Phosphotase', 'Alamine_Aminotransferase',
       'Aspartate_Aminotransferase', 'Total_Protiens', 'Albumin',
       'Albumin_and_Globulin_Ratio', 'Dataset'],
      dtype='object')

In [6]:
df_liver #viewing data present in dataframe

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.90,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.00,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.40,1
...,...,...,...,...,...,...,...,...,...,...,...
578,60,Male,0.5,0.1,500,20,34,5.9,1.6,0.37,2
579,40,Male,0.6,0.1,98,35,31,6.0,3.2,1.10,1
580,52,Male,0.8,0.2,245,48,49,6.4,3.2,1.00,1
581,31,Male,1.3,0.5,184,29,32,6.8,3.4,1.00,1


## Creating a copy of the dataframe

In [7]:
liver_df = df_liver #creating a seperate dataframe before carrying out the data cleaning process

# Data Cleaning

## Checking for the null values

In [8]:
liver_df.isnull() #Checking if the dataframe comprises of any null values

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
578,False,False,False,False,False,False,False,False,False,False,False
579,False,False,False,False,False,False,False,False,False,False,False
580,False,False,False,False,False,False,False,False,False,False,False
581,False,False,False,False,False,False,False,False,False,False,False


In [9]:
liver_df.isnull().sum() #Displaying the total null values in a column

Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    4
Dataset                       0
dtype: int64

In [10]:
liver_df[liver_df['Albumin_and_Globulin_Ratio'].isna()] #Viewing data with null values

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
209,45,Female,0.9,0.3,189,23,33,6.6,3.9,,1
241,51,Male,0.8,0.2,230,24,46,6.5,3.1,,1
253,35,Female,0.6,0.2,180,12,15,5.2,2.7,,2
312,27,Male,1.3,0.6,106,25,54,8.5,4.8,,2


In [11]:
liver_df = liver_df.dropna() #Dropping the null values
liver_df

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.90,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.00,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.40,1
...,...,...,...,...,...,...,...,...,...,...,...
578,60,Male,0.5,0.1,500,20,34,5.9,1.6,0.37,2
579,40,Male,0.6,0.1,98,35,31,6.0,3.2,1.10,1
580,52,Male,0.8,0.2,245,48,49,6.4,3.2,1.00,1
581,31,Male,1.3,0.5,184,29,32,6.8,3.4,1.00,1


In [12]:
liver_df.isnull().sum() #Displaying the total null values in a column

Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    0
Dataset                       0
dtype: int64

## Checking for negative values

In [13]:
(liver_df.drop('Gender', axis=1) < 0).sum() #Viewing total negative values in a column

Age                           0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    0
Dataset                       0
dtype: int64

## Checking for duplicate values

### Viewing the duplicate values

In [14]:
liver_df.duplicated().sum() #Viewing total duplicate rows in the dataframe

13

In [15]:
liver_df[liver_df.duplicated()] #Viewing all the duplicate data present in the dataframe

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
19,40,Female,0.9,0.3,293,232,245,6.8,3.1,0.8,1
26,34,Male,4.1,2.0,289,875,731,5.0,2.7,1.1,1
34,38,Female,2.6,1.2,410,59,57,5.6,3.0,0.8,2
55,42,Male,8.9,4.5,272,31,61,5.8,2.0,0.5,1
62,58,Male,1.0,0.5,158,37,43,7.2,3.6,1.0,1
106,36,Male,5.3,2.3,145,32,92,5.1,2.6,1.0,2
108,36,Male,0.8,0.2,158,29,39,6.0,2.2,0.5,2
138,18,Male,0.8,0.2,282,72,140,5.5,2.5,0.8,1
143,30,Male,1.6,0.4,332,84,139,5.6,2.7,0.9,1
158,72,Male,0.7,0.1,196,20,35,5.8,2.0,0.5,1


### Removing duplicate values

In [16]:
liver_df.drop_duplicates(inplace=True) #Removing all the duplicate values

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  liver_df.drop_duplicates(inplace=True) #Removing all the duplicate values


In [17]:
liver_df.reset_index(drop=True, inplace=True) #Resetting the index of the DataFrame, and using the default one instead.

In [18]:
liver_df #Viewing the dataframe

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.90,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.00,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.40,1
...,...,...,...,...,...,...,...,...,...,...,...
561,60,Male,0.5,0.1,500,20,34,5.9,1.6,0.37,2
562,40,Male,0.6,0.1,98,35,31,6.0,3.2,1.10,1
563,52,Male,0.8,0.2,245,48,49,6.4,3.2,1.00,1
564,31,Male,1.3,0.5,184,29,32,6.8,3.4,1.00,1


In [19]:
liver_df.shape #Viewing the shape of the dataframe

(566, 11)

In [20]:
liver_df.columns #Viewing the columns of the dataframe

Index(['Age', 'Gender', 'Total_Bilirubin', 'Direct_Bilirubin',
       'Alkaline_Phosphotase', 'Alamine_Aminotransferase',
       'Aspartate_Aminotransferase', 'Total_Protiens', 'Albumin',
       'Albumin_and_Globulin_Ratio', 'Dataset'],
      dtype='object')

# Converting Dataset of 1 and 2 into Patient with liver disease and Patient with no liver disease respectively

<b> Dataset Column </b>
<ul>
    <li> 1 - Patient with liver disease</li>
    <li> 2 - Patient with no disease</li>
</ul>

In [21]:
liver_df['Dataset_Details'] = liver_df['Dataset'].apply(lambda x:"Patient with liver disease" if x==1 else "Patient with no liver disease") #using lambda function, if x=="1" then return value as Patient with liver disease else return Patient with no liver disease.
liver_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  liver_df['Dataset_Details'] = liver_df['Dataset'].apply(lambda x:"Patient with liver disease" if x==1 else "Patient with no liver disease") #using lambda function, if x=="1" then return value as Patient with liver disease else return Patient with no liver disease.


Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset,Dataset_Details
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1,Patient with liver disease
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1,Patient with liver disease
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1,Patient with liver disease
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1,Patient with liver disease
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1,Patient with liver disease


# Converting Gender into F:0 and M:1 and creating a new column called Gender_Binary

In [22]:
liver_df['Gender_Binary'] = liver_df['Gender'].apply(lambda x:0 if x=="Female" else 1) #using lambda function, if x=="Female" then return value as 1 else return 0.
liver_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  liver_df['Gender_Binary'] = liver_df['Gender'].apply(lambda x:0 if x=="Female" else 1) #using lambda function, if x=="Female" then return value as 1 else return 0.


Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset,Dataset_Details,Gender_Binary
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1,Patient with liver disease,0
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1,Patient with liver disease,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1,Patient with liver disease,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1,Patient with liver disease,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1,Patient with liver disease,1
