In [2]:
# 2_Data_Cleaning_Template.ipynb

# ==========================
# Data Cleaning Workflow
# ==========================
# WHEN TO USE:
# After EDA, before modeling. Cleaning ensures data is consistent,
# free of missing values/duplicates, and ready for ML.

import pandas as pd
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the raw CSV into the working DataFrame used by every later cell.
raw_csv_path = "your_dataset.csv"
df = pd.read_csv(raw_csv_path)

# Show the (rows, columns) shape before any cleaning step runs.
print("Before Cleaning:", df.shape)

In [None]:
# --------------------------
# Missing Values
# --------------------------
# WHEN TO USE:
# If dataset has gaps. Filling strategy depends on variable type.
# Median for float columns, Mode for everything else = safe baseline approach.
missing_report = df.isnull().mean() * 100
print("Missing Value %:\n", missing_report)

for col in df.columns:
    column = df[col]

    # Columns without gaps need no imputation; skipping them also avoids
    # computing a median/mode that would never be used.
    if not column.isnull().any():
        continue

    if pd.api.types.is_float_dtype(column):
        # Any float width (float16/32/64, nullable Float) gets the median.
        # NumPy integer columns cannot hold NaN, so they never reach this point.
        fill_value = column.median()
    else:
        # Text, categorical, boolean and nullable-integer columns get the mode,
        # which is always a value the column's dtype can store.
        modes = column.mode()
        # mode() is empty when every value in the column is missing.
        fill_value = modes.iloc[0] if not modes.empty else None

    # An all-missing column has no usable median/mode: leave it untouched
    # instead of raising, so it stays visible in missing_report.
    if fill_value is None or pd.isna(fill_value):
        continue

    df[col] = column.fillna(fill_value)

In [None]:
# --------------------------
# Duplicates
# --------------------------
# WHEN TO USE:
# Scraped or merged datasets often repeat rows; repeated rows over-weight
# those observations and bias downstream statistics.
# Rows are compared across all columns and the first occurrence is kept.
df = df.drop_duplicates(keep="first")

In [None]:
# --------------------------
# Column Names
# --------------------------
# WHEN TO USE:
# Uniform names (trimmed, lowercase, underscore-separated) avoid errors
# in SQL/ML later.
normalized_names = df.columns.str.strip().str.lower()

# Swap each separator character for an underscore, as a literal replacement.
for separator in (" ", "-"):
    normalized_names = normalized_names.str.replace(separator, "_", regex=False)

df.columns = normalized_names

In [None]:
# --------------------------
# Categorical Encoding
# --------------------------
# WHEN TO USE:
# Needed before ML — algorithms need numbers, not text.
# NOTE: the "category" dtype only marks a column as categorical; its values
# are still the original labels, and to_csv writes them back out as text.
# When a model needs numeric inputs, convert at modeling time with
# df[col].cat.codes (ordinal codes) or pd.get_dummies(df) (one-hot columns).

# Select both classic object columns and columns stored with the pandas
# string dtype, so text columns are converted regardless of how they were loaded.
text_columns = df.select_dtypes(include=["object", "string"]).columns

for col in text_columns:
    df[col] = df[col].astype("category")

In [None]:

# --------------------------
# Feature Scaling (optional)
# --------------------------
# WHEN TO USE:
# Only for algorithms sensitive to scale (e.g., SVM, KNN, Gradient Descent models).
# Work on a copy so df keeps the original, unscaled values.
scaled_df = df.copy()
scaler = StandardScaler()

# Select every numeric dtype (not just int64/float64) so narrower columns
# such as int32 or float32 are scaled as well.
numeric_cols = scaled_df.select_dtypes(include="number").columns

# StandardScaler cannot be fitted on zero features, so skip the step when
# the dataset has no numeric columns.
if len(numeric_cols) > 0:
    # Fit once on all numeric columns. Each column is still standardized
    # independently, and `scaler` keeps the statistics of every column, so it
    # can be reused afterwards to transform new data.
    scaled_values = scaler.fit_transform(scaled_df[numeric_cols])

    # Replace each column outright so integer columns become float columns.
    for position, col in enumerate(numeric_cols):
        scaled_df[col] = scaled_values[:, position]

In [None]:
# --------------------------
# Save Outputs
# --------------------------
# Write the cleaned frame and its scaled copy to CSV, leaving out the row index.
output_frames = {
    "cleaned_dataset.csv": df,
    "scaled_dataset.csv": scaled_df,
}
for csv_path, frame in output_frames.items():
    frame.to_csv(csv_path, index=False)

# Final shape of the cleaned frame, then a confirmation message.
print("After Cleaning:", df.shape)
print("✅ Cleaned and Scaled datasets saved")