# Homework 5: Data Formats and Storage

This notebook demonstrates working with different data formats (CSV and Parquet) and environment-driven configuration.

## Setup and Imports

In [1]:
import os
from pathlib import Path
import datetime as dt
import pandas as pd

from dotenv import load_dotenv
load_dotenv()

# Storage locations are read from the environment (load_dotenv() was called
# above), with relative defaults when the variables are unset.
DATA_DIR_RAW = Path(os.environ.get("DATA_DIR_RAW", "data/raw"))
DATA_DIR_PROCESSED = Path(os.environ.get("DATA_DIR_PROCESSED", "data/processed"))

# Create both directories up front so later cells can write into them directly.
for _p in [DATA_DIR_RAW, DATA_DIR_PROCESSED]:
    _p.mkdir(parents=True, exist_ok=True)

def safe_stamp():
    """Return the current local time formatted as ``YYYYMMDD-HHMM``.

    The result contains only digits and a hyphen, and has minute resolution:
    two calls within the same minute return the same string.
    """
    now = dt.datetime.now()
    return f"{now:%Y%m%d-%H%M}"

## Data Validation Function

In [2]:
def validate_df(df: pd.DataFrame, expect_cols: dict, name: str = "df",
                min_valid_frac: float = 0.9) -> dict:
    """
    Report missing columns and columns whose values do not fit a dtype hint.

    Parameters
    ----------
    df : frame to inspect; it is not modified.
    expect_cols : {'col_name': 'dtype_hint', ...}
      dtype_hint one of: 'datetime', 'float', 'int', 'string', or pandas dtype string
    name : label copied into the report under "name".
    min_valid_frac : for 'datetime', 'float' and 'int', the fraction of values
      that must parse successfully (strictly greater than). Defaults to 0.9.

    Returns
    -------
    dict with keys:
      "name"         -> the ``name`` argument
      "missing_cols" -> expected columns absent from ``df``
      "bad_dtypes"   -> {col: "expected <hint>, got <dtype>"} for failed checks
      "shape"        -> ``df.shape``

    Notes
    -----
    'datetime', 'float' and 'int' are value-based checks (the column is parsed
    with ``errors="coerce"``), so e.g. numeric strings satisfy 'float'.
    'int' additionally requires every parsed value to be a whole number.
    Any other hint is compared literally against ``str(column.dtype)``.
    """

    def _mostly_valid(parsed: pd.Series) -> bool:
        # An empty column has no values that could contradict the hint. Without
        # this guard the mean of an empty mask is NaN and the check always fails.
        if len(parsed) == 0:
            return True
        return bool(parsed.notna().mean() > min_valid_frac)

    report = {"name": name, "missing_cols": [], "bad_dtypes": {}, "shape": df.shape}
    for col, hint in expect_cols.items():
        if col not in df.columns:
            report["missing_cols"].append(col)
            continue

        s = df[col]
        if hint == "datetime":
            try:
                ok = _mostly_valid(pd.to_datetime(s, errors="coerce"))
            except Exception:
                # Some inputs make to_datetime raise even with errors="coerce";
                # treat that as "not a datetime column".
                ok = False
        elif hint == "float":
            ok = _mostly_valid(pd.to_numeric(s, errors="coerce"))
        elif hint == "int":
            numeric = pd.to_numeric(s, errors="coerce")
            # Cast to float so the modulo below behaves the same for every
            # numeric dtype to_numeric may hand back.
            present = numeric.dropna().astype("float64")
            # Casting to int64 always yields an integer dtype, so it cannot be
            # used as the test; check that the values themselves are whole.
            ok = _mostly_valid(numeric) and bool((present % 1 == 0).all())
        elif hint == "string":
            ok = s.dtype == "object" or pd.api.types.is_string_dtype(s)
        else:
            ok = str(s.dtype) == hint

        if not ok:
            report["bad_dtypes"][col] = f"expected {hint}, got {s.dtype}"
    return report

## Create Sample Data

In [3]:
# Eight consecutive daily rows for a single ticker; the scalar symbol is
# broadcast to every row.
df = pd.DataFrame(
    {
        "date": pd.date_range(start="2024-11-01", periods=8, freq="D"),
        "symbol": "AAPL",
        "adj_close": [182.1, 183.9, 184.2, 181.7, 185.0, 186.3, 185.8, 187.1],
    }
)
df

Unnamed: 0,date,symbol,adj_close
0,2024-11-01,AAPL,182.1
1,2024-11-02,AAPL,183.9
2,2024-11-03,AAPL,184.2
3,2024-11-04,AAPL,181.7
4,2024-11-05,AAPL,185.0
5,2024-11-06,AAPL,186.3
6,2024-11-07,AAPL,185.8
7,2024-11-08,AAPL,187.1


## Save Data in Different Formats

In [4]:
# Take the timestamp once so the CSV and Parquet files always share the same
# stamp, even if the clock crosses a minute boundary while this cell runs.
stamp = safe_stamp()
csv_path = DATA_DIR_RAW / f"sample_{stamp}.csv"
parquet_path = DATA_DIR_PROCESSED / f"sample_{stamp}.parquet"

df.to_csv(csv_path, index=False)

# Try each Parquet engine in order of preference and remember which one worked.
parquet_engine = None
last_err = None
for engine in ("pyarrow", "fastparquet"):
    try:
        df.to_parquet(parquet_path, index=False, engine=engine)
        parquet_engine = engine
        break
    except Exception as e:
        # Keep the most recent failure so it can be reported if no engine works.
        last_err = e

if parquet_engine is None:
    # Chain the underlying error so its traceback is not lost.
    raise RuntimeError(
        "Parquet save failed. Please install one engine:\n"
        "  pip install pyarrow   (recommended)\n"
        "  or pip install fastparquet\n"
        f"Last error: {last_err}"
    ) from last_err

csv_path, parquet_path, parquet_engine

(PosixPath('data/raw/sample_20250818-0400.csv'),
 PosixPath('data/processed/sample_20250818-0400.parquet'),
 'pyarrow')

## Load and Validate Data

In [5]:
# Read both files back; CSV carries no type information, so the date column
# has to be parsed explicitly.
df_parq = pd.read_parquet(parquet_path)
df_csv = pd.read_csv(csv_path, parse_dates=["date"])

print("Shapes:", df.shape, df_csv.shape, df_parq.shape)
for _frame in (df_csv, df_parq):
    print(_frame.dtypes)

# Run the same expectations against the in-memory frame and both round trips.
expect = {"date": "datetime", "symbol": "string", "adj_close": "float"}
for _label, _tag, _frame in (
    ("original", "orig", df),
    ("csv", "csv", df_csv),
    ("parquet", "parquet", df_parq),
):
    print(f"Validate {_label}:", validate_df(_frame, expect, _tag))

Shapes: (8, 3) (8, 3) (8, 3)
date         datetime64[ns]
symbol               object
adj_close           float64
dtype: object
date         datetime64[ns]
symbol               object
adj_close           float64
dtype: object
Validate original: {'name': 'orig', 'missing_cols': [], 'bad_dtypes': {}, 'shape': (8, 3)}
Validate csv: {'name': 'csv', 'missing_cols': [], 'bad_dtypes': {}, 'shape': (8, 3)}
Validate parquet: {'name': 'parquet', 'missing_cols': [], 'bad_dtypes': {}, 'shape': (8, 3)}


## Utility Functions for File I/O

In [6]:
def write_df(df: pd.DataFrame, path: "Path | str", index: bool = False) -> dict:
    """Write ``df`` to ``path`` as CSV or Parquet, chosen by the file suffix.

    Parent directories are created if they do not exist. For Parquet, the
    ``pyarrow`` engine is tried first and ``fastparquet`` second.

    Parameters
    ----------
    df : frame to write.
    path : destination; the suffix (case-insensitive) must be ``.csv`` or
        ``.parquet``.
    index : whether to write the frame's index. Defaults to False.

    Returns
    -------
    dict with "path" (as a string) and "format"; Parquet writes also include
    the "engine" that succeeded.

    Raises
    ------
    ValueError : the suffix is neither ``.csv`` nor ``.parquet``.
    RuntimeError : every Parquet engine failed; the last underlying exception
        is attached as ``__cause__``.
    """
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    suf = path.suffix.lower()

    if suf == ".csv":
        df.to_csv(path, index=index)
        return {"path": str(path), "format": "csv"}

    if suf == ".parquet":
        last_err = None
        for engine in ("pyarrow", "fastparquet"):
            try:
                df.to_parquet(path, index=index, engine=engine)
                return {"path": str(path), "format": "parquet", "engine": engine}
            except Exception as e:
                # Remember the failure and fall through to the next engine.
                last_err = e
        # Chain the cause so the real failure (missing package, unsupported
        # dtype, permissions, ...) stays visible in the traceback.
        raise RuntimeError(
            f"Failed to write parquet at {path}. "
            "Install an engine: `pip install pyarrow` (recommended) or `pip install fastparquet`. "
            f"Last error: {last_err}"
        ) from last_err

    raise ValueError(f"Unsupported suffix: {suf} (use .csv or .parquet)")

def read_df(path: "Path | str", parse_dates=("date",)) -> pd.DataFrame:
    """Read a CSV or Parquet file into a DataFrame, chosen by the file suffix.

    Parameters
    ----------
    path : file to read; the suffix (case-insensitive) must be ``.csv`` or
        ``.parquet``.
    parse_dates : column names to parse as dates when reading CSV. Names that
        are not present in the file are ignored. Ignored for Parquet.
        Defaults to ``("date",)``.

    Returns
    -------
    The loaded DataFrame.

    Raises
    ------
    FileNotFoundError : ``path`` does not exist.
    ValueError : the suffix is neither ``.csv`` nor ``.parquet``.
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"Missing file: {path}")
    suf = path.suffix.lower()

    if suf == ".csv":
        # Read only the header to see which date columns are present. This
        # replaces a catch-all retry that hid the reason for the fallback and
        # read the whole file twice.
        header = pd.read_csv(path, nrows=0).columns
        date_cols = [c for c in parse_dates if c in header]
        # Pass False rather than an empty list when there is nothing to parse.
        return pd.read_csv(path, parse_dates=date_cols or False)

    if suf == ".parquet":
        return pd.read_parquet(path)

    raise ValueError(f"Unsupported suffix: {suf} (use .csv or .parquet)")

## Test Utility Functions

In [7]:
# Take the timestamp once so both files share the same stamp, even if the
# clock crosses a minute boundary while this cell runs.
stamp2 = safe_stamp()
p_csv2 = DATA_DIR_RAW / f"sample2_{stamp2}.csv"
p_par2 = DATA_DIR_PROCESSED / f"sample2_{stamp2}.parquet"

print(write_df(df, p_csv2))
print(write_df(df, p_par2))

df_csv2 = read_df(p_csv2)
df_par2 = read_df(p_par2)

# One expectation dict for both round trips, so the two checks cannot drift apart.
expect_sample2 = {"date": "datetime", "symbol": "string", "adj_close": "float"}

print("Shapes:", df.shape, df_csv2.shape, df_par2.shape)
print("Validate csv2:", validate_df(df_csv2, expect_sample2))
print("Validate par2:", validate_df(df_par2, expect_sample2))

{'path': 'data/raw/sample2_20250818-0400.csv', 'format': 'csv'}
{'path': 'data/processed/sample2_20250818-0400.parquet', 'format': 'parquet', 'engine': 'pyarrow'}
Shapes: (8, 3) (8, 3) (8, 3)
Validate csv2: {'name': 'df', 'missing_cols': [], 'bad_dtypes': {}, 'shape': (8, 3)}
Validate par2: {'name': 'df', 'missing_cols': [], 'bad_dtypes': {}, 'shape': (8, 3)}
