# Homework Starter — Stage 05: Data Storage
Name: 
Date: 

Objectives:
- Env-driven paths to `data/raw/` and `data/processed/`
- Save CSV and Parquet; reload and validate
- Abstract IO with utility functions; document choices

In [None]:
import os, pathlib, datetime as dt
import pandas as pd
from dotenv import load_dotenv

# Load variables from a .env file (if one is found) before reading the env.
load_dotenv()

# Storage locations come from the environment, with repo-relative fallbacks.
RAW = pathlib.Path(os.environ.get('DATA_DIR_RAW', 'data/raw'))
PROC = pathlib.Path(os.environ.get('DATA_DIR_PROCESSED', 'data/processed'))

# Create both folders up front so later writes do not fail on a missing directory.
for _storage_dir in (RAW, PROC):
    _storage_dir.mkdir(parents=True, exist_ok=True)

# Show the absolute locations actually in use.
print('RAW ->', RAW.resolve())
print('PROC ->', PROC.resolve())

## 1) Create or Load a Sample DataFrame
You may reuse data from prior stages or create a small synthetic dataset.

In [None]:
import numpy as np
# Synthetic daily price series: 20 days of a random walk starting near 150.
n_rows = 20
dates = pd.date_range('2024-01-01', periods=n_rows, freq='D')
random_walk = np.random.randn(n_rows).cumsum()
df = pd.DataFrame({
    'date': dates,
    'ticker': ['AAPL'] * n_rows,
    'price': 150 + random_walk,
})
df.head()

## 2) Save CSV to data/raw/ and Parquet to data/processed/ (TODO)
- Use timestamped filenames.
- Handle missing Parquet engine gracefully.

In [None]:
def ts():
    """Return the current local time as a ``YYYYMMDD-HHMMSS`` string for filenames."""
    stamp_format = '%Y%m%d-%H%M%S'
    now = dt.datetime.now()
    return now.strftime(stamp_format)

# Save CSV: the timestamped name keeps earlier snapshots from being overwritten.
csv_path = RAW / f"sample_{ts()}.csv"
df.to_csv(csv_path, index=False)
print('Saved CSV ->', csv_path)

# Save Parquet: needs an optional engine (pyarrow or fastparquet).
# pq_path is reset to None on any failure so later cells can skip the reload.
pq_path = PROC / f"sample_{ts()}.parquet"
try:
    df.to_parquet(pq_path)
except ImportError:
    # pandas raises ImportError when no usable Parquet engine can be imported.
    print('Parquet engine not available. Install pyarrow or fastparquet to complete this step.')
    pq_path = None
except Exception as e:
    # Any other failure (permissions, unsupported dtype, ...) is reported as-is
    # instead of being mislabelled as a missing engine.
    print('Parquet write failed:', e)
    pq_path = None
pq_path

## 3) Reload and Validate (TODO)
- Compare shapes and key dtypes.

In [None]:
def validate_loaded(original, reloaded):
    """Compare a reloaded frame against the original after a storage round trip.

    Returns a dict of three booleans:
      - 'shape_equal': both frames have the same shape.
      - 'date_is_datetime': `reloaded` has a 'date' column with a datetime64 dtype.
      - 'price_is_numeric': `reloaded` has a 'price' column with a numeric dtype.
    A missing column makes its check False rather than raising.
    """
    same_shape = original.shape == reloaded.shape

    date_ok = False
    if 'date' in reloaded.columns:
        date_ok = pd.api.types.is_datetime64_any_dtype(reloaded['date'])

    price_ok = False
    if 'price' in reloaded.columns:
        price_ok = pd.api.types.is_numeric_dtype(reloaded['price'])

    return {
        'shape_equal': same_shape,
        'date_is_datetime': date_ok,
        'price_is_numeric': price_ok,
    }

# CSV stores dates as text, so ask the reader to parse the 'date' column back.
date_columns = ['date']
df_csv = pd.read_csv(csv_path, parse_dates=date_columns)
validate_loaded(original=df, reloaded=df_csv)

In [None]:
# Reload the Parquet copy only if the save step produced one (pq_path is None otherwise).
if pq_path is not None:
    try:
        df_pq = pd.read_parquet(pq_path)
    except Exception as e:
        print('Parquet read failed:', e)
    else:
        # An expression nested in if/try is not auto-displayed by the notebook,
        # so print the validation result explicitly.
        print(validate_loaded(df, df_pq))

## 4) Utilities (TODO)
- Implement `detect_format`, `write_df`, `read_df`.
- Route on the file suffix; create parent directories if needed; raise friendly errors when Parquet I/O fails (e.g., no engine installed).

In [None]:
import typing as t, pathlib

def detect_format(path: t.Union[str, pathlib.Path]):
    """Map a file path to 'csv' or 'parquet' based on its (case-insensitive) ending.

    Recognised endings: '.csv' -> 'csv'; '.parquet', '.pq', '.parq' -> 'parquet'.

    Raises:
        ValueError: if the path ends with none of the recognised suffixes.
    """
    lowered = str(path).lower()
    endings_by_format = (
        ('csv', ('.csv',)),
        ('parquet', ('.parquet', '.pq', '.parq')),
    )
    for fmt, endings in endings_by_format:
        # str.endswith accepts a tuple, so one call tests every alias.
        if lowered.endswith(endings):
            return fmt
    raise ValueError('Unsupported format: ' + lowered)

def write_df(df: pd.DataFrame, path: t.Union[str, pathlib.Path]):
    """Write `df` to `path` as CSV or Parquet, chosen by the file suffix.

    Missing parent directories are created. CSV is written without the index.

    Args:
        df: frame to persist.
        path: destination; its suffix is resolved through `detect_format`.

    Returns:
        The destination as a `pathlib.Path`.

    Raises:
        ValueError: propagated from `detect_format` for an unsupported suffix.
        RuntimeError: if the Parquet write fails. The message distinguishes a
            missing engine from any other failure; the original exception is
            chained as the cause.
    """
    p = pathlib.Path(path)
    # Validate the suffix first so an unsupported path creates no directories.
    fmt = detect_format(p)
    p.parent.mkdir(parents=True, exist_ok=True)

    if fmt == 'csv':
        df.to_csv(p, index=False)
        return p

    try:
        df.to_parquet(p)
    except ImportError as e:
        # pandas raises ImportError when neither pyarrow nor fastparquet is usable.
        raise RuntimeError('Parquet engine not available. Install pyarrow or fastparquet.') from e
    except Exception as e:
        # Keep RuntimeError for existing callers, but report the real cause.
        raise RuntimeError(f'Failed to write Parquet file {p}: {e}') from e
    return p

def read_df(path: t.Union[str, pathlib.Path]):
    """Read a CSV or Parquet file into a DataFrame, chosen by the file suffix.

    For CSV, a column named 'date' (if present) is parsed as datetimes.

    Args:
        path: source file; its suffix is resolved through `detect_format`.

    Returns:
        The loaded `pandas.DataFrame`.

    Raises:
        ValueError: propagated from `detect_format` for an unsupported suffix.
        RuntimeError: if the Parquet read fails. The message distinguishes a
            missing engine from any other failure (e.g. a missing file); the
            original exception is chained as the cause.
    """
    p = pathlib.Path(path)
    fmt = detect_format(p)

    if fmt == 'csv':
        # Peek at the header only (nrows=0) to decide whether to parse dates.
        header = pd.read_csv(p, nrows=0).columns
        if 'date' in header:
            return pd.read_csv(p, parse_dates=['date'])
        return pd.read_csv(p)

    try:
        return pd.read_parquet(p)
    except ImportError as e:
        # pandas raises ImportError when neither pyarrow nor fastparquet is usable.
        raise RuntimeError('Parquet engine not available. Install pyarrow or fastparquet.') from e
    except Exception as e:
        # Keep RuntimeError for existing callers, but report the real cause.
        raise RuntimeError(f'Failed to read Parquet file {p}: {e}') from e

# Demo: round-trip the sample frame through the utilities in both formats.
p_csv = RAW / f"util_{ts()}.csv"
p_pq  = PROC / f"util_{ts()}.parquet"

# CSV needs no optional dependency, so it runs unguarded.
# write_df returns the path it wrote, which feeds straight into read_df.
csv_preview = read_df(write_df(df, p_csv)).head()

# The Parquet utilities raise RuntimeError on failure; skip the demo in that case.
try:
    pq_preview = read_df(write_df(df, p_pq)).head()
except RuntimeError as e:
    print('Skipping Parquet util demo:', e)

## 5) Documentation (TODO)
- Update README with a **Data Storage** section (folders, formats, env usage).
- Summarize validation checks and any assumptions.

In [1]:
import os
from pathlib import Path

# 1) If the notebook was launched from a folder named "notebooks", step up one level
cwd = Path.cwd()
launched_in_notebooks = cwd.name == "notebooks"
if launched_in_notebooks:
    os.chdir(cwd.parent)
status = (
    "Moved working directory up to repo root:"
    if launched_in_notebooks
    else "Already at repo root:"
)
print(status)
print("CWD =", Path.cwd())

# 2) Create the data folders relative to the (possibly updated) working directory;
#    exist_ok makes this a no-op when they are already there
for p in map(Path, ("data/raw", "data/processed")):
    p.mkdir(parents=True, exist_ok=True)
    print("✓ ensured folder:", p, "exists =", p.exists())

Moved working directory up to repo root:
CWD = /Users/ivysingal/bootcamp_ivy_singal
✓ ensured folder: data/raw exists = True
✓ ensured folder: data/processed exists = True


In [4]:
!pip install --upgrade pandas pyarrow fastparquet python-dotenv -q
print("Installed/updated pandas, pyarrow, fastparquet, python-dotenv")

Installed/updated pandas, pyarrow, fastparquet, python-dotenv


In [2]:
import os
from pathlib import Path
import pandas as pd
from dotenv import load_dotenv

load_dotenv()  # loads variables from a .env file, if one is found

# Resolve directories from env with safe defaults
# (`or` also falls back when the variable is set but empty)
raw_dir = Path(os.getenv("DATA_DIR_RAW") or "data/raw")
processed_dir = Path(os.getenv("DATA_DIR_PROCESSED") or "data/processed")

# Env-provided directories may not exist yet; create them before writing
for folder in (raw_dir, processed_dir):
    folder.mkdir(parents=True, exist_ok=True)

# tiny sample dataframe
df = pd.DataFrame({"id":[1,2,3], "name":["Alice","Bob","Charlie"]})

csv_path = raw_dir / "example.csv"
parquet_path = processed_dir / "example.parquet"

# CSV is straightforward
df.to_csv(csv_path, index=False)

# Parquet: try pyarrow, else fastparquet
# (if fastparquet fails too, its exception propagates and stops the cell)
try:
    df.to_parquet(parquet_path, index=False, engine="pyarrow")
except Exception as e1:
    print("pyarrow failed, trying fastparquet…", type(e1).__name__, "-", e1)
    df.to_parquet(parquet_path, index=False, engine="fastparquet")

print("Files saved:")
print(" -", csv_path.resolve())
print(" -", parquet_path.resolve())

Files saved:
 - /Users/ivysingal/bootcamp_ivy_singal/data/raw/example.csv
 - /Users/ivysingal/bootcamp_ivy_singal/data/processed/example.parquet


In [3]:
# Read both copies back from disk.
csv_back = pd.read_csv(csv_path)
parq_back = pd.read_parquet(parquet_path)

# Report the shapes first, then preview each frame, in CSV-then-Parquet order.
reloaded = (("CSV", csv_back), ("Parquet", parq_back))
for label, frame in reloaded:
    print(label + " shape:", frame.shape)
for _, frame in reloaded:
    display(frame.head())

CSV shape: (3, 2)
Parquet shape: (3, 2)


Unnamed: 0,id,name
0,1,Alice
1,2,Bob
2,3,Charlie


Unnamed: 0,id,name
0,1,Alice
1,2,Bob
2,3,Charlie


In [4]:
# ---- Stage 05 • Robust reload of example.csv/parquet ----
from pathlib import Path
import os, importlib.util
import pandas as pd

# 1) Detect project root: the nearest of cwd, its parent, or its grandparent
#    that contains a /data folder (stays at cwd when none of them does)
ROOT = Path.cwd()
for candidate in (ROOT, ROOT.parent, ROOT.parent.parent):
    if (candidate / "data").exists():
        ROOT = candidate
        break
print("Using ROOT:", ROOT)

# 2) Resolve env-driven dirs relative to ROOT
#    (an absolute path in the env var replaces ROOT, per pathlib's join rules)
DATA_DIR_RAW = ROOT / os.getenv("DATA_DIR_RAW", "data/raw")
DATA_DIR_PROCESSED = ROOT / os.getenv("DATA_DIR_PROCESSED", "data/processed")

csv_path = DATA_DIR_RAW / "example.csv"
parquet_path = DATA_DIR_PROCESSED / "example.parquet"

print("CSV path:", csv_path)
print("Parquet path:", parquet_path)
print("Exists? CSV:", csv_path.exists(), " Parquet:", parquet_path.exists())

# 3) Helpful hint if a file isn't found locally (e.g., not pulled from Git).
#    Both inputs are checked, since both are read below.
missing = [str(p) for p in (csv_path, parquet_path) if not p.exists()]
if missing:
    raise FileNotFoundError(
        f"Couldn't find {', '.join(missing)}. If you see it on GitHub, make sure "
        f"your local repo has the latest changes (e.g., run `git pull` in the repo)."
    )

# 4) Read CSV and Parquet
df_csv = pd.read_csv(csv_path)

# Prefer pyarrow, fall back to fastparquet; find_spec only checks importability
engine = "pyarrow" if importlib.util.find_spec("pyarrow") else (
    "fastparquet" if importlib.util.find_spec("fastparquet") else None
)
if engine is None:
    raise RuntimeError("Install pyarrow or fastparquet:  %pip install pyarrow")

df_parquet = pd.read_parquet(parquet_path, engine=engine)

print("Loaded shapes -> CSV:", df_csv.shape, " Parquet:", df_parquet.shape)

# 5) Quick equality check (ignore column order & dtype)
from pandas.testing import assert_frame_equal
try:
    assert_frame_equal(
        df_csv.reindex(sorted(df_csv.columns), axis=1).reset_index(drop=True),
        df_parquet.reindex(sorted(df_parquet.columns), axis=1).reset_index(drop=True),
        check_like=True,
        check_dtype=False
    )
    print("✅ Data matches between CSV and Parquet.")
except AssertionError as e:
    print("❌ Data mismatch:\n", e)

display(df_csv.head())
display(df_parquet.head())

Using ROOT: /Users/ivysingal/bootcamp_ivy_singal
CSV path: /Users/ivysingal/bootcamp_ivy_singal/data/raw/example.csv
Parquet path: /Users/ivysingal/bootcamp_ivy_singal/data/processed/example.parquet
Exists? CSV: True  Parquet: True
Loaded shapes -> CSV: (3, 2)  Parquet: (3, 2)
✅ Data matches between CSV and Parquet.


Unnamed: 0,id,name
0,1,Alice
1,2,Bob
2,3,Charlie


Unnamed: 0,id,name
0,1,Alice
1,2,Bob
2,3,Charlie


In [5]:
# ---- Stage 05 • Compression & Size Comparison ----
from pathlib import Path
import os, importlib.util
import pandas as pd

# Locate the repo root: stay at cwd unless only the parent has a /data folder.
ROOT = Path.cwd()
if not (ROOT / "data").exists() and (ROOT.parent / "data").exists():
    ROOT = ROOT.parent

# Env-driven storage folders, resolved against ROOT.
DATA_DIR_RAW = ROOT / os.environ.get("DATA_DIR_RAW", "data/raw")
DATA_DIR_PROCESSED = ROOT / os.environ.get("DATA_DIR_PROCESSED", "data/processed")

# Uncompressed inputs and their compressed counterparts.
csv_path = DATA_DIR_RAW / "example.csv"
parquet_path = DATA_DIR_PROCESSED / "example.parquet"
csv_gz_path = DATA_DIR_RAW / "example.csv.gz"
parquet_snappy_path = DATA_DIR_PROCESSED / "example.snappy.parquet"

# The CSV is the source frame for both compressed copies.
df_csv = pd.read_csv(csv_path)

# First importable Parquet engine wins: pyarrow, then fastparquet.
engine = next(
    (name for name in ("pyarrow", "fastparquet") if importlib.util.find_spec(name)),
    None,
)
if engine is None:
    raise RuntimeError("Install 'pyarrow' or 'fastparquet' and re-run this cell.")

# Write the gzip CSV and the snappy Parquet variants.
df_csv.to_csv(csv_gz_path, index=False, compression="gzip")
df_csv.to_parquet(
    parquet_snappy_path,
    index=False,
    engine=engine,
    compression="snappy",
)

def size_kb(p: Path) -> float:
    """Return the size of the file at `p` in kilobytes (bytes / 1024)."""
    byte_count = os.stat(p).st_size
    return byte_count / 1024

def _display_path(p: Path) -> str:
    """Return `p` relative to ROOT when it lies under ROOT, else the full path."""
    try:
        return str(p.relative_to(ROOT))
    except ValueError:
        # Happens when an env var points the data dir outside ROOT (absolute path).
        return str(p)

# NOTE: example.parquet was written earlier without a compression argument, and
# pandas' to_parquet defaults to snappy. The "Parquet" row is therefore already
# snappy-compressed, which is why it matches the explicit snappy variant.
rows = [
    ("CSV", _display_path(csv_path), size_kb(csv_path)),
    ("CSV (gzip)", _display_path(csv_gz_path), size_kb(csv_gz_path)),
    ("Parquet", _display_path(parquet_path), size_kb(parquet_path)),
    ("Parquet (snappy)", _display_path(parquet_snappy_path), size_kb(parquet_snappy_path)),
]

print("File sizes (KB):")
for kind, path, kb in rows:
    print(f"{kind:17s} {kb:8.2f}  -  {path}")

# quick takeaway (the max(..., 1e-9) guards against dividing by a zero-byte file)
default_ratio = size_kb(csv_path) / max(size_kb(parquet_path), 1e-9)
snappy_ratio = size_kb(csv_gz_path) / max(size_kb(parquet_snappy_path), 1e-9)
print("\nTakeaways:")
print(f"- CSV / Parquet (default snappy) size ratio: {default_ratio:.2f}x")
print(f"- CSV.gz / Parquet.snappy size ratio: {snappy_ratio:.2f}x")

File sizes (KB):
CSV                   0.03  -  data/raw/example.csv
CSV (gzip)            0.07  -  data/raw/example.csv.gz
Parquet               1.62  -  data/processed/example.parquet
Parquet (snappy)      1.62  -  data/processed/example.snappy.parquet

Takeaways:
- CSV / Parquet (plain) size ratio: 0.02x
- CSV.gz / Parquet.snappy size ratio: 0.04x
