# 00_data_test
Downloads SPY daily data, computes realized volatility, and validates that the data pipeline works.

### imports -- RUN THIS FIRST!!!!!!!

In [3]:
# data handling
import pandas as pd
import numpy as np
from pathlib import Path

# data source
import yfinance as yf

# visualization
import matplotlib.pyplot as plt

# apply one matplotlib style sheet to every figure drawn in this notebook
plt.style.use("seaborn-v0_8")
# when printing DataFrames: show up to 20 columns and wrap at 120 characters
pd.set_option("display.max_columns", 20)
pd.set_option("display.width", 120)

### constants and path setup

In [7]:
# what to download: ticker symbol plus the start/end of the date window
TICKER = "SPY"
START_DATE = "2015-01-01"
END_DATE = "2025-01-01"

# where things live on disk; ROOT is the parent of the current working
# directory, so this cell assumes the notebook runs from a sub-folder
ROOT = Path("..").resolve()
RAW_DIR = ROOT.joinpath("data", "raw")
RAW_DIR.mkdir(parents=True, exist_ok=True)  # no error if it already exists
PRICES_PATH = RAW_DIR.joinpath(f"prices_{TICKER}.parquet")

# echo the resolved locations so a wrong working directory is easy to spot
print(
    f"Project root: {ROOT}",
    f"Raw data dir: {RAW_DIR}",
    f"Prices parquet: {PRICES_PATH}",
    sep="\n",
)

Project root: /Users/helzeiah/Desktop/quantProjects/vol-arbitrage-ml
Raw data dir: /Users/helzeiah/Desktop/quantProjects/vol-arbitrage-ml/data/raw
Prices parquet: /Users/helzeiah/Desktop/quantProjects/vol-arbitrage-ml/data/raw/prices_SPY.parquet


### helper functions (the code)

In [6]:
def normalize_price_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Purpose:   standardize and clean the Yahoo Finance price data.
    Details:   - flattens (field, ticker) MultiIndex columns for a single ticker
               - resets index so date is a column
               - renames columns to snake_case
               - falls back to close when no adjusted-close column is present
               - ensures correct types and order
               - sorts by date and drops duplicates (last row per date wins)
    :param df: the pd.DataFrame to be normalized; it is not modified
    :return:   the cleaned DataFrame with columns
               date, open, high, low, close, adj_close, volume
    :raises KeyError:   if a required price column cannot be found
    :raises ValueError: if MultiIndex columns hold more than one ticker
    """
    rename_map = {
        "Date": "date",
        "Open": "open",
        "High": "high",
        "Low": "low",
        "Close": "close",
        "Adj Close": "adj_close",
        "Volume": "volume",
    }
    cols = ["date", "open", "high", "low", "close", "adj_close", "volume"]
    numeric_cols = cols[1:]

    df = df.copy()

    # some downloads come back with two column levels (price field, ticker);
    # keep whichever level carries the field names
    if isinstance(df.columns, pd.MultiIndex):
        for level in range(df.columns.nlevels):
            labels = df.columns.get_level_values(level)
            if labels.isin(list(rename_map)).any():
                df.columns = labels.rename(None)
                break
        else:
            raise KeyError("no column level contains price field names")
        if df.columns.duplicated().any():
            raise ValueError("expected price data for a single ticker")

    # a datetime index becomes the date column whatever it is called
    # ("Date", "Datetime", or unnamed), unless a date column already exists
    has_date_column = bool({"date", "Date"}.intersection(df.columns))
    if isinstance(df.index, pd.DatetimeIndex) and not has_date_column:
        df.index = df.index.rename("date")

    # reset the index and map names to snake_case
    df = df.reset_index().rename(columns=rename_map)

    # without a separate adjusted-close column, reuse close so downstream
    # code can always rely on adj_close being there
    if "adj_close" not in df.columns and "close" in df.columns:
        df["adj_close"] = df["close"]

    missing = [name for name in cols if name not in df.columns]
    if missing:
        raise KeyError(f"missing required price columns: {missing}")

    # enforce proper column order; copy so the assignments below write to an
    # independent frame rather than a slice of the input
    df = df[cols].copy()

    # ensure timezone-naive dates and numeric data types
    df["date"] = pd.to_datetime(df["date"]).dt.tz_localize(None)
    for column in numeric_cols:
        df[column] = pd.to_numeric(df[column], errors="coerce")

    # a stable sort keeps duplicates in their original order, so keep="last"
    # always picks the row that appeared last in the input
    df = (
        df.sort_values("date", kind="mergesort")
        .drop_duplicates(subset=["date"], keep="last")
        .reset_index(drop=True)
    )

    return df


def compute_log_returns(adj_close: pd.Series) -> pd.Series:
    """
    Purpose:   compute one-period log returns, ln(P_t / P_{t-1}).
    Details:   - the first value is always NaN (no previous price)
               - zero or negative prices have no logarithm, so they are
                 treated as missing; the return into and out of such a
                 price is NaN instead of an enormous fake move
    :param adj_close: price series (adjusted close), ordered oldest first
    :return:   series of log returns with the same index as the input
    """
    prices = adj_close.astype(float)
    # mask invalid prices instead of clipping them to a tiny positive number
    prices = prices.where(prices > 0)
    return np.log(prices).diff()

def compute_forward_realized_vol(
    log_returns: pd.Series,
    horizon: int = 10,
    annualize: bool = True,
    periods_per_year: float = 252,
) -> pd.Series:
    """
    Purpose:   forward-looking realized volatility: at each row, the sample
               standard deviation of the NEXT `horizon` returns.
    Details:   - the value at row t uses returns t+1 .. t+horizon, so it is
                 a prediction target, not a feature known at time t
               - the last `horizon` rows are NaN (not enough future data)
               - any NaN inside a window makes that window's value NaN
    :param log_returns: series of per-period log returns, oldest first
    :param horizon: number of future periods in each window (at least 2)
    :param annualize: if True, scale by sqrt(periods_per_year)
    :param periods_per_year: periods in one year; 252 for daily trading data
    :return:   series named "rv{horizon}_ann" when annualized, else
               "rv{horizon}", on the same index as the input
    :raises ValueError: if horizon is smaller than 2
    """
    if horizon < 2:
        # a sample standard deviation (ddof=1) needs at least two points
        raise ValueError("horizon must be at least 2")

    # trailing std at row t covers returns t-horizon+1 .. t
    backward_std = log_returns.rolling(window=horizon, min_periods=horizon).std(ddof=1)

    # pull each trailing window back by `horizon` rows so that row t holds
    # the std of the returns that come after it
    forward_std = backward_std.shift(-horizon)

    if annualize:
        forward_std = forward_std * np.sqrt(periods_per_year)

    return forward_std.rename(f"rv{horizon}" if not annualize else f"rv{horizon}_ann")

In [None]:
# download the raw daily bars; auto_adjust=False asks yfinance for unadjusted
# prices plus a separate "Adj Close" column, which normalize_price_columns
# maps to adj_close
raw = yf.download(TICKER, start=START_DATE, end=END_DATE, auto_adjust=False)

# an empty frame here means the download did not return any rows; stop
# now instead of saving an empty file later
if raw.empty:
    raise RuntimeError(f"no price data downloaded for {TICKER}")

# one call does all the cleaning steps: reset index, snake_case column
# names, numeric types, sort by date, drop duplicate dates
df = normalize_price_columns(raw)

# quick look at the result: first rows, then column types and row count
print(df.head())
df.info()

In [None]:
# TODO: save the cleaned prices to PRICES_PATH as parquet, e.g.
#   df.to_parquet(PRICES_PATH, index=False)   (needs pyarrow or fastparquet installed)

In [None]:
# TODO: compute log returns, i.e. ln(price today / price yesterday), from the
# adjusted close, e.g.
#   df["log_ret"] = compute_log_returns(df["adj_close"])