In [7]:
import os
import sys

# Make the sibling "project" directory importable so that `src.cleaning`
# resolves. The relative path is resolved against the kernel's current
# working directory at the time this cell runs.
PROJECT_ROOT = os.path.abspath("../project")

# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

import pandas as pd
from src.cleaning import fill_missing_median, drop_missing, normalize_data


In [8]:
# Relative location of the raw CSV (resolved against the kernel's working directory).
RAW_PATH = "../project/data/instructor_dirty.csv"

# Load the raw file; `df` is the untouched input that later cells read from.
df = pd.read_csv(filepath_or_buffer=RAW_PATH)

# Report (rows, columns) and then preview the first five rows as the cell output.
raw_shape = df.shape
print("Original dataset shape:", raw_shape)
df.head(5)

Original dataset shape: (5, 5)


Unnamed: 0,numeric_col,category_col,price,date_str,category
0,10.0,A,$100,2025-08-01,Electronics
1,,B,$200,2025-08-02,Furniture
2,40.0,A,$150,,Toys
3,55.0,B,,2025-08-04,Clothing
4,70.0,C,$250,2025-08-05,


In [9]:
# Two alternative treatments of missing values are produced from the same
# raw frame so their effect on the dataset size can be compared:
#   A) fill_missing_median(df) -> df_filled
#   B) drop_missing(df)        -> df_dropped
# The helpers live in src.cleaning; their exact column handling is not
# visible from this notebook.
df_filled = fill_missing_median(df)
df_dropped = drop_missing(df)

# Normalization is applied to the filled frame (strategy A), not the dropped one.
# NOTE(review): the cleaned summary printed further down still reports
# count 4.0 for `price`, `date_str` and `category`, and `price` is still a
# "$"-prefixed string -- confirm that leaving those columns as-is is intended.
df_clean = normalize_data(df_filled)

# Shape report for each stage, in pipeline order.
for label, frame in (
    ("Original shape:", df),
    ("After filling missing values:", df_filled),
    ("After dropping missing values:", df_dropped),
    ("After normalization:", df_clean),
):
    print(label, frame.shape)

Original shape: (5, 5)
After filling missing values: (5, 5)
After dropping missing values: (1, 5)
After normalization: (5, 5)


In [10]:
# Descriptive statistics for every column (numeric and non-numeric alike),
# transposed so each original column becomes one row of the summary table.
summary_raw = df.describe(include="all").transpose()
summary_clean = df_clean.describe(include="all").transpose()

# Show the raw summary first, then the cleaned one, each under its heading.
for heading, table in (
    ("Original data summary:", summary_raw),
    ("\nCleaned data summary:", summary_clean),
):
    print(heading)
    display(table)

Original data summary:


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
numeric_col,4.0,,,,43.75,25.617377,10.0,32.5,47.5,58.75,70.0
category_col,5.0,3.0,A,2.0,,,,,,,
price,4.0,4.0,$100,1.0,,,,,,,
date_str,4.0,4.0,2025-08-01,1.0,,,,,,,
category,4.0,4.0,Electronics,1.0,,,,,,,



Cleaned data summary:


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
numeric_col,5.0,,,,0.575,0.37081,0.0,0.5,0.625,0.75,1.0
category_col,5.0,3.0,A,2.0,,,,,,,
price,4.0,4.0,$100,1.0,,,,,,,
date_str,4.0,4.0,2025-08-01,1.0,,,,,,,
category,4.0,4.0,Electronics,1.0,,,,,,,


In [11]:
# Destination for the cleaned dataset (relative to the kernel's working directory).
# NOTE(review): the raw file is read from "../project/data/..." while this
# writes under "../data/..." -- confirm the output location is intended.
OUTPUT_PATH = "../data/processed/cleaned_dataset.csv"

# Ensure the target folder exists; exist_ok avoids an error on re-runs.
output_dir = os.path.dirname(OUTPUT_PATH)
os.makedirs(output_dir, exist_ok=True)

# Write without the index column, then confirm where the file went.
df_clean.to_csv(path_or_buf=OUTPUT_PATH, index=False)
print("✅ Cleaned dataset saved to:", OUTPUT_PATH)

✅ Cleaned dataset saved to: ../data/processed/cleaned_dataset.csv
