# Stage 05 — Data Storage
Date: Aug 15 (Fri) (B)

**Focus:** reproducible save/load; env-driven paths; CSV vs Parquet; raw vs processed.

_Brief cloud note_: S3/data lakes support large-scale storage and efficient querying (partitioning, column pruning). No deep dive today.

In [None]:
import os, pathlib, datetime as dt
import pandas as pd
from dotenv import load_dotenv

# Pull any settings defined in a .env file into the process environment first.
load_dotenv()

# Storage roots are environment-driven, falling back to repo-relative defaults.
RAW_DIR = pathlib.Path(os.environ.get("DATA_DIR_RAW", "data/raw"))
PROC_DIR = pathlib.Path(os.environ.get("DATA_DIR_PROCESSED", "data/processed"))

# Create both roots up front (idempotent) so later writes find them in place.
for _storage_dir in (RAW_DIR, PROC_DIR):
    _storage_dir.mkdir(parents=True, exist_ok=True)

print("RAW_DIR:", RAW_DIR.resolve())
print("PROC_DIR:", PROC_DIR.resolve())

## Create a sample DataFrame

In [None]:
import numpy as np
# Ten consecutive calendar days of synthetic prices for a single ticker.
dates = pd.date_range(start="2024-01-01", periods=10, freq="D")
df = pd.DataFrame(
    {
        "date": dates,
        "ticker": ["AAPL" for _ in range(10)],
        # Random walk around 150: cumulative sum of standard-normal steps.
        "price": np.random.randn(10).cumsum() + 150,
    }
)
df.info()  # prints dtypes and non-null counts as a quick sanity check

## Save to CSV (raw) and Parquet (processed)

In [None]:
def ts():
    """Return the current local time as a ``YYYYMMDD-HHMMSS`` string.

    Used as a filename suffix so each save gets its own timestamped file.
    """
    now = dt.datetime.now()
    return f"{now:%Y%m%d-%H%M%S}"

# Raw layer: plain CSV under a timestamped name (one-second resolution).
csv_path = RAW_DIR.joinpath("prices_" + ts() + ".csv")
df.to_csv(csv_path, index=False)
print("Saved CSV →", csv_path)

# Processed layer: Parquet depends on an optional engine, so a failure here
# is reported and the rest of the notebook carries on without the file.
parq_path = PROC_DIR.joinpath("prices_" + ts() + ".parquet")
try:
    df.to_parquet(parq_path)  # pandas picks whichever engine is installed
    print("Saved Parquet →", parq_path)
except Exception as err:
    print("Parquet save failed (engine missing?). Skipping Parquet demo.")
    print("Error:", err)

## Reload & Validate

In [None]:
def validate_loaded(original: pd.DataFrame, reloaded: pd.DataFrame, cols=('date','ticker','price')):
    """Run basic round-trip checks on a reloaded DataFrame.

    Args:
        original: The frame that was saved; only its shape is compared.
        reloaded: The frame read back from disk.
        cols: Column names that must all be present in ``reloaded``.

    Returns:
        A dict of check name -> bool. ``shape_equal`` and ``cols_present`` are
        always included; ``price_is_numeric`` and ``date_is_datetime`` appear
        only when the corresponding column exists in ``reloaded``.
    """
    loaded_cols = reloaded.columns
    missing = [name for name in cols if name not in loaded_cols]
    report = {
        'shape_equal': original.shape == reloaded.shape,
        'cols_present': not missing,
    }

    # dtype sanity checks: (column, result key, dtype predicate)
    dtype_checks = (
        ('price', 'price_is_numeric', pd.api.types.is_numeric_dtype),
        ('date', 'date_is_datetime', pd.api.types.is_datetime64_any_dtype),
    )
    for column, key, predicate in dtype_checks:
        if column in loaded_cols:
            report[key] = predicate(reloaded[column])
    return report

# CSV keeps dates as text, so the 'date' column is parsed explicitly on load.
df_csv = pd.read_csv(csv_path, parse_dates=['date'])
print('CSV validation:', validate_loaded(df, df_csv))

# The Parquet file only exists if the earlier save succeeded.
if not parq_path.exists():
    print('Parquet file not present (skipped earlier).')
else:
    try:
        df_parq = pd.read_parquet(parq_path)
        print('Parquet validation:', validate_loaded(df, df_parq))
    except Exception as err:
        print('Parquet read failed:', err)

## IO Utilities (suffix-based)

In [None]:
from typing import Union

def ensure_dir(path: pathlib.Path):
    """Create the directory that will contain *path*, including missing ancestors.

    Only ``path.parent`` is created; ``path`` itself is not touched. Calling
    this when the directory already exists is a no-op.
    """
    parent_dir = path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)

def detect_format(path: Union[str, pathlib.Path]):
    """Infer the storage format from the end of *path* (case-insensitive).

    Returns:
        ``'csv'`` for ``.csv``; ``'parquet'`` for ``.parquet``, ``.pq`` or ``.parq``.

    Raises:
        ValueError: If the path ends with none of the recognised suffixes.
    """
    lowered = str(path).lower()
    known_formats = (
        ('csv', ('.csv',)),
        ('parquet', ('.parquet', '.pq', '.parq')),
    )
    for fmt, endings in known_formats:
        # str.endswith accepts a tuple and matches any of its members.
        if lowered.endswith(endings):
            return fmt
    raise ValueError('Unsupported format for: ' + str(path))

def write_df(df: pd.DataFrame, path: Union[str, pathlib.Path]) -> pathlib.Path:
    """Write *df* to *path*, choosing CSV or Parquet from the file suffix.

    Args:
        df: Frame to persist. CSV output omits the index.
        path: Destination file; its parent directory is created if missing.

    Returns:
        The destination as a ``pathlib.Path``.

    Raises:
        ValueError: If the suffix is not a supported format (from ``detect_format``).
        RuntimeError: If the Parquet write fails. The message distinguishes a
            missing engine from any other failure; the original exception is
            chained as ``__cause__``.
    """
    path = pathlib.Path(path)
    # Validate the format before touching the filesystem so a rejected path
    # does not leave stray directories behind.
    fmt = detect_format(path)
    ensure_dir(path)
    if fmt == 'csv':
        df.to_csv(path, index=False)
    elif fmt == 'parquet':
        try:
            df.to_parquet(path)
        except ImportError as e:
            # pandas raises ImportError when neither pyarrow nor fastparquet is usable.
            raise RuntimeError('Parquet engine not available. Install pyarrow or fastparquet.') from e
        except Exception as e:
            # Still a RuntimeError for callers, but report the real cause
            # instead of blaming a missing engine.
            raise RuntimeError(f'Failed to write Parquet file {path}: {e}') from e
    return path

def read_df(path: Union[str, pathlib.Path]) -> pd.DataFrame:
    """Load a DataFrame from *path*, choosing the reader from the file suffix.

    For CSV files the header row is inspected first; if a ``date`` column is
    present it is parsed as datetimes, otherwise the file is read as-is.

    Raises:
        ValueError: If the suffix is not a supported format (from ``detect_format``).
        RuntimeError: If the Parquet read fails. The message distinguishes a
            missing engine from any other failure (e.g. a missing file); the
            original exception is chained as ``__cause__``.
    """
    path = pathlib.Path(path)
    fmt = detect_format(path)
    if fmt == 'csv':
        # nrows=0 reads only the header, which is enough to see the column names.
        header = pd.read_csv(path, nrows=0).columns
        if 'date' in header:
            return pd.read_csv(path, parse_dates=['date'])
        return pd.read_csv(path)

    # detect_format only returns 'csv' or 'parquet', so this is the Parquet path.
    try:
        return pd.read_parquet(path)
    except ImportError as e:
        # pandas raises ImportError when neither pyarrow nor fastparquet is usable.
        raise RuntimeError('Parquet engine not available. Install pyarrow or fastparquet.') from e
    except Exception as e:
        # Still a RuntimeError for callers, but report the real cause
        # instead of blaming a missing engine.
        raise RuntimeError(f'Failed to read Parquet file {path}: {e}') from e

# Demo: round-trip the sample frame through the suffix-based helpers.
csv2 = RAW_DIR.joinpath("prices_util_" + ts() + ".csv")
pq2 = PROC_DIR.joinpath("prices_util_" + ts() + ".parquet")

# write_df returns the path it wrote, so it can feed read_df directly.
df2 = read_df(write_df(df, csv2))
print('Reloaded CSV via util, shape:', df2.shape)

# The helpers raise RuntimeError on Parquet failures; treat that as "skip".
try:
    df3 = read_df(write_df(df, pq2))
    print('Reloaded Parquet via util, shape:', df3.shape)
except RuntimeError as err:
    print('Parquet util demo skipped:', err)

### Summary
- Env-driven paths → portable IO.
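- CSV vs Parquet → tradeoffs: CSV is plain text and universally readable but loses dtypes (dates must be re-parsed on load); Parquet is columnar and preserves dtypes but needs an engine (`pyarrow` or `fastparquet`).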
- Utilities abstract away format details.
- Next: document storage plan in README.