# Homework - Stage 6: Data Preprocessing
Use this notebook to apply your cleaning functions and save processed data.

In [1]:
import os, sys, numpy as np, pandas as pd
from src import cleaning
csv_path = './data/raw/sample_data.csv'
os.makedirs('data/raw', exist_ok=True)

In [2]:
if not os.path.exists(csv_path):
    # Create Sample dataset
    np.random.seed(42)
    df = pd.DataFrame({
        'age': [34, 45, 29, 50, 38, np.nan, 41],
        'income': [55000, np.nan, 42000, 58000, np.nan, np.nan, 49000],
        'score': [0.82, 0.91, np.nan, 0.76, 0.88, 0.65, 0.79],
        'zipcode': ['90210', '10001', '60614', '94103', '73301', '12345', '94105'],
        'city': ['Beverly', 'New York', 'Chicago', 'SF', 'Austin', 'Unknown', 'San Francisco'],
        'extra_data': [np.nan, 42, np.nan, np.nan, np.nan, 5, np.nan]
    })
    df.to_csv(csv_path, index=False)
    print(f"Sample dataset created at {csv_path}")
else:
    print(f"Sample dataset already exists at {csv_path}")
    df = pd.read_csv(csv_path)

df  # show the DataFrame

Sample dataset already exists at ./data/raw/sample_data.csv


Unnamed: 0,age,income,score,zipcode,city,extra_data
0,34.0,55000.0,0.82,90210,Beverly,
1,45.0,,0.91,10001,New York,42.0
2,29.0,42000.0,,60614,Chicago,
3,50.0,58000.0,0.76,94103,SF,
4,38.0,,0.88,73301,Austin,
5,,,0.65,12345,Unknown,5.0
6,41.0,49000.0,0.79,94105,San Francisco,


In [3]:
# Drop rows with <50% non-missing across all columns
df = cleaning.drop_missing(df, threshold=0.5)
df  # show

Unnamed: 0,age,income,score,zipcode,city,extra_data
0,34.0,55000.0,0.82,90210,Beverly,
1,45.0,,0.91,10001,New York,42.0
2,29.0,42000.0,,60614,Chicago,
3,50.0,58000.0,0.76,94103,SF,
4,38.0,,0.88,73301,Austin,
5,,,0.65,12345,Unknown,5.0
6,41.0,49000.0,0.79,94105,San Francisco,


In [4]:
# Fill numeric columns with medians
df = cleaning.fill_missing_median(df)
df  # show

Unnamed: 0,age,income,score,zipcode,city,extra_data
0,34.0,55000.0,0.82,90210,Beverly,23.5
1,45.0,52000.0,0.91,10001,New York,42.0
2,29.0,42000.0,0.805,60614,Chicago,23.5
3,50.0,58000.0,0.76,94103,SF,23.5
4,38.0,52000.0,0.88,73301,Austin,23.5
5,39.5,52000.0,0.65,12345,Unknown,5.0
6,41.0,49000.0,0.79,94105,San Francisco,23.5


In [5]:
# Min-max normalize numeric columns to [0,1]
df = cleaning.normalize_data(df)
df  # show

Unnamed: 0,age,income,score,zipcode,city,extra_data
0,0.238095,0.8125,0.653846,0.953688,Beverly,0.5
1,0.761905,0.625,1.0,0.0,New York,1.0
2,0.0,0.0,0.596154,0.601791,Chicago,0.5
3,1.0,1.0,0.423077,0.999976,SF,0.5
4,0.428571,0.625,0.884615,0.75264,Austin,0.5
5,0.5,0.625,0.0,0.02787,Unknown,0.0
6,0.571429,0.4375,0.538462,1.0,San Francisco,0.5


In [6]:
proc_data_location = './data/processed'
os.makedirs(proc_data_location, exist_ok=True)
df.to_csv(os.path.join(proc_data_location, 'sample_data_cleaned.csv'), index=False)
print("Saved:", os.path.join(proc_data_location, 'sample_data_cleaned.csv'))

Saved: ./data/processed/sample_data_cleaned.csv
