In [None]:
# ----------------------------------------------
# DATA SCIENCE ASSIGNMENT - WEEK 2
# Data Cleaning & Preprocessing
# ----------------------------------------------

# Step 1: Import necessary libraries
import pandas as pd

# Helpers for Steps 4 and 5. They are plain functions so the cleaning rules
# can be reused and checked on small frames without needing the CSV on disk.
def fill_missing_values(frame):
    """Return a copy of *frame* with missing values imputed per column.

    Numeric columns are filled with the column mean; every other column is
    filled with its most frequent value (the first mode). The input frame is
    not modified.
    """
    filled = frame.copy()
    for col in filled.columns:
        # Decide by "is numeric" rather than "dtype == 'object'" so that text
        # columns stored with a non-object dtype are never sent to .mean().
        if pd.api.types.is_numeric_dtype(filled[col]):
            filled[col] = filled[col].fillna(filled[col].mean())
            continue
        modes = filled[col].mode()
        # mode() is empty when every value in the column is missing; there is
        # nothing to impute from, so leave the column as-is instead of
        # failing on the first-element lookup.
        if not modes.empty:
            filled[col] = filled[col].fillna(modes.iloc[0])
    return filled


def remove_iqr_outliers(frame, columns, whisker=1.5):
    """Return the rows of *frame* that pass the IQR rule for every column.

    For each column in *columns*, rows outside
    ``[Q1 - whisker * IQR, Q3 + whisker * IQR]`` are dropped. Columns are
    processed one after another and the quartiles are computed on the rows
    that survived the previous columns, so the result depends on the column
    order. The input frame is not modified.
    """
    kept = frame
    for col in columns:
        q1 = kept[col].quantile(0.25)
        q3 = kept[col].quantile(0.75)
        iqr = q3 - q1
        lower = q1 - whisker * iqr
        upper = q3 + whisker * iqr
        kept = kept[(kept[col] >= lower) & (kept[col] <= upper)]
    return kept


# In a notebook cell or when run as a script, __name__ is "__main__", so the
# steps below execute exactly as before; the guard only keeps the file from
# reading the CSV if it is imported as a module.
if __name__ == "__main__":
    # Step 2: Load the dataset
    df = pd.read_csv('/content/students.csv', sep=';')
    print("---- BEFORE CLEANING ----")
    print(f"Shape: {df.shape}")
    print(f"Duplicate Rows: {df.duplicated().sum()}")
    print(f"Total Missing Values: {df.isnull().sum().sum()}")

    # Record the row count before anything is dropped so the final tally
    # covers duplicates as well as outliers, as its label says.
    initial_rows = df.shape[0]

    # Step 3: Remove duplicate rows
    df = df.drop_duplicates()

    # Step 4: Handle missing values (numeric = mean, categorical = mode)
    df = fill_missing_values(df)

    # Step 5: Remove outliers using IQR method
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    df = remove_iqr_outliers(df, numeric_cols)

    rows_after = df.shape[0]

    # Step 6: After cleaning summary
    print("\n---- AFTER CLEANING ----")
    print(f"Shape: {df.shape}")
    print(f"Duplicate Rows: {df.duplicated().sum()}")
    print(f"Total Missing Values: {df.isnull().sum().sum()}")
    print(f"Rows Removed (outliers + duplicates): {initial_rows - rows_after}")



---- BEFORE CLEANING ----
Shape: (649, 33)
Duplicate Rows: 0
Total Missing Values: 0

---- AFTER CLEANING ----
Shape: (396, 33)
Duplicate Rows: 0
Total Missing Values: 0
Rows Removed (outliers + duplicates): 253

✅ Cleaned dataset saved as 'StudentsPerformance_Cleaned.csv'
