In [13]:
import pandas as pd
from scipy.stats import zscore

# Small in-memory sample dataset; 'Age' and 'Salary' each contain one
# missing entry (None) for the cleaning steps below to handle.
data = {
    'ID': [1, 2, 3, 4, 5],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    'Age': [25, 32, None, 28, 22],
    'Salary': [50000, 60000, 75000, None, 45000],
    'City': ['new York', 'San Francisco', 'Los Argeles', 'chicago', 'Miami'],
}

# Task 1: build the DataFrame and preview the first rows.
df = pd.DataFrame(data)
print("Task 1: Loaded Dataset")
print(df.head())

# Task 2: find the columns that contain at least one null and report the
# share of null rows in each of them as a percentage.
missing_cols = df.columns[df.isnull().any()]
missing_percentage = df[missing_cols].isnull().mean() * 100
print("\nTask 2: Missing Values")
print("Columms with missing values:", list(missing_cols))
print("Percentage of missing values: \n", missing_percentage)

# Imputation step (no header is printed for it; presumably "Task 3").
# Age is filled with the column mean, Salary with the column median.
# The filled Series is assigned back to the column instead of calling
# fillna(inplace=True) on `df[col]`: the in-place call operates on an
# intermediate object, which pandas warns about and which leaves `df`
# unchanged when Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Salary'] = df['Salary'].fillna(df['Salary'].median())

# Task 4: drop fully duplicated rows (original index labels are kept).
df = df.drop_duplicates()

print("\nTask 4: Removed Duplicate Entries")
print(df)

# Task 5: normalise dtypes. NOTE: astype(int) truncates the fractional
# part, so a mean-imputed age such as 26.75 is stored as 26.
df['Age'] = df['Age'].astype(int)
df['Salary'] = df['Salary'].astype(float)
print("\nTask 5: Converted Data Types")
print(df.dtypes)

# Task 6: capture the distinct city labels, then store the column as a
# pandas Categorical.
unique_cities = df['City'].unique()
df['City'] = pd.Categorical(df['City'])
print("\nTask 6: Explored Categorical Data")
print("Unique Cities:", unique_cities)

# Task 7: flag rows where any numeric column has |z-score| above the
# threshold, working on a copy so `df` itself stays untouched.
df_outliers = df.copy()
numeric_Columns = ['Age', 'Salary']
z_scores = zscore(df_outliers[numeric_Columns])
threshold = 3
outliers = (abs(z_scores) > threshold).any(axis=1)

# Keep only the rows that were not flagged.
df_outliers = df_outliers[~outliers]
print("\nTask 7: Handling Outliers using Z-score")
print("Rows with outliers removed:")
print(df_outliers)

# Task 8: persist the cleaned frame without the index column.
df_outliers.to_csv('cleaned_dataset.csv', index=False)
print("\nTask 8: Saved the Cleaned Dataset to 'cleaned_dataset.csv'")



Task 1: Loaded Dataset
   ID     Name   Age   Salary           City
0   1    Alice  25.0  50000.0       new York
1   2      Bob  32.0  60000.0  San Francisco
2   3  Charlie   NaN  75000.0    Los Argeles
3   4    David  28.0      NaN        chicago
4   5      Eva  22.0  45000.0          Miami

Task 2: Missing Values
Columms with missing values: ['Age', 'Salary']
Percentage of missing values: 
 Age       20.0
Salary    20.0
dtype: float64

Task 4: Removed Duplicate Entries
   ID     Name    Age   Salary           City
0   1    Alice  25.00  50000.0       new York
1   2      Bob  32.00  60000.0  San Francisco
2   3  Charlie  26.75  75000.0    Los Argeles
3   4    David  28.00  55000.0        chicago
4   5      Eva  22.00  45000.0          Miami

Task 5: Converted Data Types
ID          int64
Name       object
Age         int32
Salary    float64
City       object
dtype: object

Task 6: Explored Categorical Data
Unique Cities: ['new York' 'San Francisco' 'Los Argeles' 'chicago' 'Miami']

Ta