In [3]:
 import pandas as pd
 import numpy as np

In [4]:
# Build a small, deliberately messy employee table for the cleaning steps below:
# it contains NaN gaps, text placeholders inside numeric columns, and a repeated ID.
ids = [1, 2, 3, 4, 5, 6, 7, 7]
names = ['Alice', 'Bob', 'Charlie', np.nan, 'Eve', 'Frank', 'Grace', 'Grace']
ages = [25, np.nan, 30, 45, 'unknown', 35, 30, 30]
salaries = [50000, 60000, np.nan, 70000, 80000, 'not available', 70000, 70000]
departments = ['HR', 'Finance', 'HR', 'IT', np.nan, 'Finance', 'IT', 'IT']

# Column order in the frame follows the key order of this dict
data = {
    'ID': ids,
    'Name': names,
    'Age': ages,
    'Salary': salaries,
    'Department': departments,
}
df = pd.DataFrame(data)
df

Unnamed: 0,ID,Name,Age,Salary,Department
0,1,Alice,25,50000,HR
1,2,Bob,,60000,Finance
2,3,Charlie,30,,HR
3,4,,45,70000,IT
4,5,Eve,unknown,80000,
5,6,Frank,35,not available,Finance
6,7,Grace,30,70000,IT
7,7,Grace,30,70000,IT


In [5]:
 # Summary statistics for the numeric columns. Only ID shows up in the output because
 # Age and Salary still contain text placeholders at this point and are not numeric yet.
 df.describe()

Unnamed: 0,ID
count,8.0
mean,4.375
std,2.263846
min,1.0
25%,2.75
50%,4.5
75%,6.25
max,7.0


In [6]:
# Count the NaN entries in each column
missing_values = df.isna().sum()
missing_values

ID            0
Name          1
Age           1
Salary        1
Department    1
dtype: int64

In [8]:
# Replace missing names with the placeholder 'Unknown'.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on df['Name']: an inplace call on a column selection is
# chained assignment, which recent pandas versions warn about and which does not
# update df once Copy-on-Write is enabled.
df['Name'] = df['Name'].fillna('Unknown')

# Display the DataFrame after handling the missing names
df.head(8)

Unnamed: 0,ID,Name,Age,Salary,Department
0,1,Alice,25,50000,HR
1,2,Bob,,60000,Finance
2,3,Charlie,30,,HR
3,4,Unknown,45,70000,IT
4,5,Eve,unknown,80000,
5,6,Frank,35,not available,Finance
6,7,Grace,30,70000,IT
7,7,Grace,30,70000,IT


In [9]:
# Parse Age as numbers; entries that cannot be parsed are coerced to NaN
age_numeric = pd.to_numeric(df['Age'], errors='coerce')
df['Age'] = age_numeric
df

Unnamed: 0,ID,Name,Age,Salary,Department
0,1,Alice,25.0,50000,HR
1,2,Bob,,60000,Finance
2,3,Charlie,30.0,,HR
3,4,Unknown,45.0,70000,IT
4,5,Eve,,80000,
5,6,Frank,35.0,not available,Finance
6,7,Grace,30.0,70000,IT
7,7,Grace,30.0,70000,IT


In [10]:
# Parse Salary as numbers; entries that cannot be parsed are coerced to NaN
salary_numeric = pd.to_numeric(df['Salary'], errors='coerce')
df['Salary'] = salary_numeric
df

Unnamed: 0,ID,Name,Age,Salary,Department
0,1,Alice,25.0,50000.0,HR
1,2,Bob,,60000.0,Finance
2,3,Charlie,30.0,,HR
3,4,Unknown,45.0,70000.0,IT
4,5,Eve,,80000.0,
5,6,Frank,35.0,,Finance
6,7,Grace,30.0,70000.0,IT
7,7,Grace,30.0,70000.0,IT


In [11]:
# Replace missing departments with the placeholder 'Unknown'.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on df['Department']: an inplace call on a column
# selection is chained assignment, which recent pandas versions warn about and
# which does not update df once Copy-on-Write is enabled.
df['Department'] = df['Department'].fillna('Unknown')
df

Unnamed: 0,ID,Name,Age,Salary,Department
0,1,Alice,25.0,50000.0,HR
1,2,Bob,,60000.0,Finance
2,3,Charlie,30.0,,HR
3,4,Unknown,45.0,70000.0,IT
4,5,Eve,,80000.0,Unknown
5,6,Frank,35.0,,Finance
6,7,Grace,30.0,70000.0,IT
7,7,Grace,30.0,70000.0,IT


#### Convert Data Types

In [13]:
# Make sure Age and Salary hold numeric values; anything unparseable becomes NaN
for column in ('Age', 'Salary'):
    df[column] = pd.to_numeric(df[column], errors='coerce')
df

Unnamed: 0,ID,Name,Age,Salary,Department
0,1,Alice,25.0,50000.0,HR
1,2,Bob,,60000.0,Finance
2,3,Charlie,30.0,,HR
3,4,Unknown,45.0,70000.0,IT
4,5,Eve,,80000.0,Unknown
5,6,Frank,35.0,,Finance
6,7,Grace,30.0,70000.0,IT
7,7,Grace,30.0,70000.0,IT


In [14]:
# Display the dtype of every column to confirm that Age and Salary are now numeric
df.dtypes

ID              int64
Name           object
Age           float64
Salary        float64
Department     object
dtype: object

#### Remove Duplicates

In [15]:
# Collapse repeated IDs: for each ID only its first occurrence is kept
# (keep='first' is the pandas default, spelled out here for clarity).
df = df.drop_duplicates(subset=['ID'], keep='first')

# Preview the de-duplicated frame (up to 10 rows)
df.head(10)

Unnamed: 0,ID,Name,Age,Salary,Department
0,1,Alice,25.0,50000.0,HR
1,2,Bob,,60000.0,Finance
2,3,Charlie,30.0,,HR
3,4,Unknown,45.0,70000.0,IT
4,5,Eve,,80000.0,Unknown
5,6,Frank,35.0,,Finance
6,7,Grace,30.0,70000.0,IT


#### Handle Outliers

In [23]:
from scipy import stats

# Z-score of every salary. Missing salaries are replaced by the column mean first,
# so those rows receive a z-score of 0.
# NOTE(review): mean-filling adds zero-deviation points, which lowers the standard
# deviation and inflates the z-scores of the observed salaries — confirm this is intended.
salary_filled = df['Salary'].fillna(df['Salary'].mean())

# Attach the scores with assign(), which returns a new frame. df comes from a row
# filter (drop_duplicates) in an earlier cell, and writing a new column into such a
# frame with df.loc[:, ...] = ... can raise SettingWithCopyWarning.
df = df.assign(Salary_zscore=stats.zscore(salary_filled))

# Rows whose absolute z-score exceeds this cutoff are treated as outliers.
# NOTE(review): with the population standard deviation used by stats.zscore (ddof=0),
# |z| cannot exceed sqrt(n - 1); for the 7 rows here that is about 2.45, so a cutoff
# of 3 can never flag a row at this sample size.
threshold = 3

In [26]:
# Absolute z-score of every salary, compared against the cutoff chosen earlier
abs_zscore = df['Salary_zscore'].abs()
beyond_cutoff = abs_zscore > threshold
within_cutoff = abs_zscore <= threshold

# Rows beyond the cutoff are reported as outliers; rows within it form the cleaned set
df_outliers = df.loc[beyond_cutoff]
df_cleaned = df.loc[within_cutoff]

# Show both frames together
df_outliers, df_cleaned

(Empty DataFrame
 Columns: [ID, Name, Age, Salary, Department, Salary_zscore]
 Index: [],
    ID     Name   Age   Salary Department  Salary_zscore
 0   1    Alice  25.0  50000.0         HR      -1.856382
 1   2      Bob   NaN  60000.0    Finance      -0.696143
 2   3  Charlie  30.0      NaN         HR       0.000000
 3   4  Unknown  45.0  70000.0         IT       0.464095
 4   5      Eve   NaN  80000.0    Unknown       1.624334
 5   6    Frank  35.0      NaN    Finance       0.000000
 6   7    Grace  30.0  70000.0         IT       0.464095)