# Data Cleaning

In [40]:
#Importing necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [41]:
#Load the datadset

df = pd.read_csv(r'IRIS.csv')

In [42]:
#Initial Data Exploration

df.head(10)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [43]:
print(df.columns.tolist())

['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species']


In [44]:
#Data Cleaning 

#Missing values Handling
df.drop(columns=['SepalLengthCm', 'SepalWidthCm'], inplace=True)
df['Id'].fillna(df['Id'].median(), inplace=True)
categorical_cols=df.select_dtypes(include='object').columns
for col in categorical_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)
print(df.isnull().sum())

Id               0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Id'].fillna(df['Id'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [33]:
#Handling Duplicate Values

duplicate_count=df.duplicated().sum()
printf(f"Number of duplicate rows: {duplicate_count}")
df.drop_duplicate(inplace=True)
print(f"Data shape after removing duplicates: {df.shape}")

Number of duplicate rows: 0
Data shape after removing duplicates: (150, 2)


In [29]:
#Handling the null values
print("Null values before dropping:\n",df.isnull().sum())
df.dropna(inplace=True)
print(f"Shape after dropping nulls: {df.shape}")
print("Null values after dropping:\n", df.isnull().sum())

Null values before dropping:
 Id         0
Species    0
dtype: int64
Shape after dropping nulls: (150, 2)
Null values after dropping:
 Id         0
Species    0
dtype: int64


In [30]:
#Handling the outlier

Q1=df['Id'].quantile(0.25)
Q3=df['Id'].quantile(0.75)
IQR=Q3-Q1
lower_bound=Q1-1.5*IQR
upper_bound=Q3+1.5*IQR
df=df[(df['Id']>=lower_bound)&(df['Id']<=upper_bound)]
print(f"Data shape after removing outliers: {df.shape}")

Data shape after removing outliers: (150, 2)
