<a href="https://colab.research.google.com/github/kirthana729/EDA-DA1/blob/main/Module2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np

# Load the dataset
data_path = '/content/elem94_95.csv'
df = pd.read_csv(data_path)

# --- Data Transformation Module ---

# 1. Data Deduplication
print("\n--- Data Deduplication ---")
print("Original shape:", df.shape)
df = df.drop_duplicates()
print("Shape after deduplication:", df.shape)

# 2. Replacing Values
print("\n--- Replacing Values ---")
# Example: Replace negative values in 'lunch' column with NaN (if applicable)
if 'lunch' in df.columns:
    df['lunch'] = df['lunch'].apply(lambda x: np.nan if x < 0 else x)
    print("Values in 'lunch' column replaced where negative values are found.")

# 3. Discretization and Binning
print("\n--- Discretization and Binning ---")
# Example: Binning 'lunch' column into categories
if 'lunch' in df.columns:
    bins = [0, 25, 50, 75, 100]  # Define bin edges
    labels = ['Low', 'Medium', 'High', 'Very High']  # Define bin labels
    df['lunch_bin'] = pd.cut(df['lunch'], bins=bins, labels=labels)
    print(df[['lunch', 'lunch_bin']].head())

# 4. Handling Missing Data
print("\n--- Handling Missing Data ---")
# Checking for missing data
print("Missing data before handling:")
print(df.isnull().sum())

# Traditional Methods: Fill missing values with column mean
print("\n--- Filling Missing Data with Column Mean ---")
# Select only numerical columns for calculating the mean
numeric_cols = df.select_dtypes(include=np.number).columns  # Get list of numerical columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())  # Apply fillna only to numerical columns
print("Missing data after handling:")
print(df.isnull().sum())




--- Data Deduplication ---
Original shape: (1848, 15)
Shape after deduplication: (1848, 15)

--- Replacing Values ---
Values in 'lunch' column replaced where negative values are found.

--- Discretization and Binning ---
       lunch lunch_bin
0  48.299999    Medium
1  51.500000      High
2  51.200001      High
3  33.700001    Medium
4  11.900000       Low

--- Handling Missing Data ---
Missing data before handling:
rownames      0
distid        0
schid         0
lunch         0
enrol         0
staff         0
exppp         0
avgsal        0
avgben        0
math4         0
story4        0
bs            0
lavgsal       0
lenrol        0
lstaff        0
lunch_bin    48
dtype: int64

--- Filling Missing Data with Column Mean ---
Missing data after handling:
rownames      0
distid        0
schid         0
lunch         0
enrol         0
staff         0
exppp         0
avgsal        0
avgben        0
math4         0
story4        0
bs            0
lavgsal       0
lenrol        0
lstaff    