# Setup: Generate Sample Dataset

This cell creates the required folder structure (`data/raw/` and `data/processed/`) relative to the notebook, and generates the sample CSV dataset with missing values. 
This ensures the dataset is ready for cleaning functions and saves it to `data/raw/sample_data.csv`.

In [None]:
import os
import pandas as pd
import numpy as np

# Define folder paths relative to this notebook
raw_dir = '../data/raw'
processed_dir = '../data/processed'

# Create folders if they don't exist
os.makedirs(raw_dir, exist_ok=True)
os.makedirs(processed_dir, exist_ok=True)

# Define the sample data
data = {
    'age': [34, 45, 29, 50, 38, np.nan, 41],
    'income': [55000, np.nan, 42000, 58000, np.nan, np.nan, 49000],
    'score': [0.82, 0.91, np.nan, 0.76, 0.88, 0.65, 0.79],
    'zipcode': ['90210', '10001', '60614', '94103', '73301', '12345', '94105'],
    'city': ['Beverly', 'New York', 'Chicago', 'SF', 'Austin', 'Unknown', 'San Francisco'],
    'extra_data': [np.nan, 42, np.nan, np.nan, np.nan, 5, np.nan]
}

# Create DataFrame
df = pd.DataFrame(data)

# Save to CSV in raw data folder
csv_path = os.path.join(raw_dir, 'sample_data.csv')
if not os.path.exists(csv_path):
    df.to_csv(csv_path, index=False)
    print(f'Sample dataset created and saved to {csv_path}')
else:
    print(f'File already exists at {csv_path}. Skipping CSV creation to avoid overwrite.')


# Homework Starter — Stage 6: Data Preprocessing
Use this notebook to apply your cleaning functions and save processed data.

In [None]:
import pandas as pd
from src import cleaning

## Load Raw Dataset

In [None]:
df = pd.read_csv('../data/raw/sample_data.csv')
df.head()

## Apply Cleaning Functions

In [None]:
# TODO: Apply your functions here
# Example:
# df = cleaning.fill_missing_median(df, ['col1','col2'])
# df = cleaning.drop_missing(df, threshold=0.5)
# df = cleaning.normalize_data(df, ['col1','col2'])

## Save Cleaned Dataset

In [None]:
# df.to_csv('../data/processed/sample_data_cleaned.csv', index=False)

In [1]:
import pandas as pd
from src import cleaning

In [2]:
# Example just to test
data = {
    "age": [20, 25, None, 30, None],
    "salary": [50000, None, 60000, 55000, None]
}
df = pd.DataFrame(data)

print("Original Data:")
print(df)

# Fill missing values
df_filled = cleaning.fill_missing_median(df)
print("\nAfter filling missing with median:")
print(df_filled)

# Drop columns with too many missing values
df_dropped = cleaning.drop_missing(df, col_threshold=0.5)
print("\nAfter dropping missing-heavy columns:")
print(df_dropped)

# Normalize
df_normalized = cleaning.normalize_data(df_filled)
print("\nAfter normalization:")
print(df_normalized)

Original Data:
    age   salary
0  20.0  50000.0
1  25.0      NaN
2   NaN  60000.0
3  30.0  55000.0
4   NaN      NaN

After filling missing with median:
    age   salary
0  20.0  50000.0
1  25.0  55000.0
2  25.0  60000.0
3  30.0  55000.0
4  25.0  55000.0

After dropping missing-heavy columns:
    age   salary
0  20.0  50000.0
1  25.0      NaN
2   NaN  60000.0
3  30.0  55000.0
4   NaN      NaN

After normalization:
   age  salary
0  0.0     0.0
1  0.5     0.5
2  0.5     1.0
3  1.0     0.5
4  0.5     0.5


In [3]:
import os, glob, pandas as pd
from pathlib import Path

# 1) Locate the repo root by walking up until we find a "data" folder
here = Path.cwd()
candidates = [here, here.parent, here.parent.parent, here.parent.parent.parent]
repo_root = None
for c in candidates:
    if (c / "data").exists():
        repo_root = c
        break

if repo_root is None:
    raise FileNotFoundError("Couldn't find a 'data' folder. Make sure you're running this inside your repo.")

raw_dir = repo_root / "data" / "raw"
processed_dir = repo_root / "data" / "processed"
processed_dir.mkdir(parents=True, exist_ok=True)

# 2) Pick a CSV in data/raw
csvs = sorted(glob.glob(str(raw_dir / "*.csv")))
if not csvs:
    raise FileNotFoundError(f"No CSV files found in {raw_dir}. Put your raw CSV there and run again.")

raw_path = Path(csvs[0])
print(f"Using raw file: {raw_path}")

# 3) Load and preview
df = pd.read_csv(raw_path)
print("Shape:", df.shape)
print("\nFirst 5 rows:")
display(df.head())

# 4) Quick missing-value summary
na_pct = df.isna().mean().sort_values(ascending=False)
print("\nMissing % by column (top 15):")
display(na_pct.head(15))

Using raw file: /Users/ivysingal/bootcamp_ivy_singal/data/raw/api_yfinance_AAPL_20250820-2218.csv
Shape: (126, 7)

First 5 rows:


Unnamed: 0,Date,Adj Close,Close,High,Low,Open,Volume
0,,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL
1,2025-02-21,244.95042419433594,245.5500030517578,248.69000244140625,245.22000122070312,245.9499969482422,53197400
2,2025-02-24,246.49664306640625,247.10000610351562,248.86000061035156,244.4199981689453,244.92999267578125,51326400
3,2025-02-25,246.43678283691406,247.0399932861328,250.0,244.91000366210938,248.0,48013300
4,2025-02-26,239.77308654785156,240.36000061035156,244.97999572753906,239.1300048828125,244.3300018310547,44433600



Missing % by column (top 15):


Date         0.007937
Adj Close    0.000000
Close        0.000000
High         0.000000
Low          0.000000
Open         0.000000
Volume       0.000000
dtype: float64

In [3]:
# Make sure we can import from your repo
import sys
from pathlib import Path

# Walk up to find the repo that contains "src/"
here = Path.cwd()
for c in [here, here.parent, here.parent.parent, here.parent.parent.parent]:
    if (c / "src").exists():
        sys.path.insert(0, str(c))  # add repo root to Python path
        break

import pandas as pd
from src import cleaning
print("Imported cleaning from:", cleaning.__file__)

Imported cleaning from: /Users/ivysingal/bootcamp_ivy_singal/src/cleaning.py


In [5]:
import glob
from pathlib import Path
import pandas as pd

# find repo root (folder that has "data")
here = Path.cwd()
repo_root = None
for c in [here, here.parent, here.parent.parent, here.parent.parent.parent]:
    if (c / "data").exists():
        repo_root = c
        break
assert repo_root is not None, "Can't find repo root with a 'data' folder."

raw_dir = repo_root / "data" / "raw"
csvs = sorted(glob.glob(str(raw_dir / "*.csv")))
assert csvs, f"No CSVs found in {raw_dir}"
raw_path = Path(csvs[0])

print("Using:", raw_path)
df = pd.read_csv(raw_path)
print("Shape:", df.shape)
df.head()

Using: /Users/ivysingal/bootcamp_ivy_singal/data/raw/api_yfinance_AAPL_20250820-2218.csv
Shape: (126, 7)


Unnamed: 0,Date,Adj Close,Close,High,Low,Open,Volume
0,,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL
1,2025-02-21,244.95042419433594,245.5500030517578,248.69000244140625,245.22000122070312,245.9499969482422,53197400
2,2025-02-24,246.49664306640625,247.10000610351562,248.86000061035156,244.4199981689453,244.92999267578125,51326400
3,2025-02-25,246.43678283691406,247.0399932861328,250.0,244.91000366210938,248.0,48013300
4,2025-02-26,239.77308654785156,240.36000061035156,244.97999572753906,239.1300048828125,244.3300018310547,44433600


In [6]:
# 1) Fill missing values with medians
df_clean = cleaning.fill_missing_median(df)
print("✅ Missing values filled")

# 2) Drop columns >=50% missing + rows >=70% missing
df_clean = cleaning.drop_missing(df_clean, col_threshold=0.5, row_threshold=0.7)
print("✅ Dropped missing-heavy columns/rows")

# 3) Normalize numeric columns
df_clean = cleaning.normalize_data(df_clean)
print("✅ Normalized numeric columns")

# Preview cleaned data
df_clean.head()

✅ Missing values filled
✅ Dropped missing-heavy columns/rows
✅ Normalized numeric columns


Unnamed: 0,Date,Adj Close,Close,High,Low,Open,Volume
0,,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL
1,2025-02-21,244.95042419433594,245.5500030517578,248.69000244140625,245.22000122070312,245.9499969482422,53197400
2,2025-02-24,246.49664306640625,247.10000610351562,248.86000061035156,244.4199981689453,244.92999267578125,51326400
3,2025-02-25,246.43678283691406,247.0399932861328,250.0,244.91000366210938,248.0,48013300
4,2025-02-26,239.77308654785156,240.36000061035156,244.97999572753906,239.1300048828125,244.3300018310547,44433600


In [7]:
# Ensure processed directory exists
processed_dir = repo_root / "data" / "processed"
processed_dir.mkdir(parents=True, exist_ok=True)

# Save cleaned dataset
out_path = processed_dir / "sample_data_cleaned.csv"
df_clean.to_csv(out_path, index=False)

print(f"✅ Cleaned dataset saved to: {out_path}")

✅ Cleaned dataset saved to: /Users/ivysingal/bootcamp_ivy_singal/data/processed/sample_data_cleaned.csv
