# Data Preparation

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np

# Directory holding the raw "<name>_data.csv" input files. The path is relative,
# so it resolves against the current working directory of the kernel.
DATA_DIR = Path("../data")
# Show the working directory (presumably to help confirm that DATA_DIR
# resolves to the intended location).
print(Path.cwd())


def read_csv_series(name: str) -> pd.Series:
    """Load ``<name>_data.csv`` from ``DATA_DIR`` as a date-indexed float series.

    The CSV must contain a ``Date`` column; the first remaining column is
    used as the value column (layout: ``Date,<value>``).

    If a date appears more than once, the row that comes last in the file
    is the one that is kept.

    Args:
        name: File stem prefix; also used as the name of the returned series.

    Returns:
        A float series named ``name``, indexed by ``Date`` in ascending
        order, with unique timestamps.
    """
    df = pd.read_csv(DATA_DIR / f"{name}_data.csv", parse_dates=["Date"])
    df = df.set_index("Date")

    # Drop duplicate timestamps BEFORE sorting. sort_index() uses a
    # non-stable sort by default, so once the frame is sorted the "last"
    # row among equal timestamps is no longer guaranteed to be the last
    # row of the file. De-duplicating in file order makes the result
    # deterministic.
    df = df[~df.index.duplicated(keep="last")].sort_index()

    s = df.iloc[:, 0].astype(float)
    s.name = name
    return s


def to_monthly_mean_end(s: pd.Series) -> pd.Series:
    """Daily -> monthly average"""
    return s.resample("ME").mean()


def align_monthly_to_month_end(s: pd.Series) -> pd.Series:
    # If series is already monthly but on month-start/other day, this snaps it to month-end.
    return s.resample("ME").last()


def log_diff(s: pd.Series) -> pd.Series:
    """Return the first difference of the natural log of ``s``.

    This approximates the continuously compounded return / growth rate
    between consecutive observations. The first element is NaN because it
    has no predecessor.
    """
    log_levels = np.log(s)
    return log_levels.diff()


# --- 1. Load the raw series --------------------------------------------------
# Market series. ETH is currently excluded; to bring it back, add "eth" here
# and mirror it in the monthly / log-return / concat steps below.
btc, sp500, usdindex = (
    read_csv_series(key) for key in ("btc", "sp500", "usdindex")
)

# Macro series.
cpi, m2, fedfunds = (
    read_csv_series(key) for key in ("cpi", "m2", "fedfunds")
)

# --- 2. Put everything on a common month-end index ---------------------------
# Market series are averaged within each month.
btc_m, sp500_m, usdindex_m = map(to_monthly_mean_end, (btc, sp500, usdindex))

# Macro series are only relabelled to month-end timestamps.
cpi_m, m2_m, fedfunds_m = map(align_monthly_to_month_end, (cpi, m2, fedfunds))
fedfunds_m = fedfunds_m.rename("fedfunds")

# --- 3. Transform levels into changes ----------------------------------------
# Log returns for the market series.
btc_logret, sp500_logret, usdindex_logret = (
    log_diff(monthly).rename(label)
    for monthly, label in (
        (btc_m, "btc_logret"),
        (sp500_m, "sp500_logret"),
        (usdindex_m, "usdindex_logret"),
    )
)

# Log growth rates for the price level and the money supply.
inflation = log_diff(cpi_m).rename("inflation")
m2_growth = log_diff(m2_m).rename("m2_growth")

# The policy rate is kept in levels (fedfunds_m) and as a plain first difference.
fedfunds_diff = fedfunds_m.diff().rename("fedfunds_diff")

# --- 4. Assemble the final table ---------------------------------------------
market_data_raw = pd.concat(
    [
        btc_logret,
        sp500_logret,
        usdindex_logret,
        inflation,
        m2_growth,
        fedfunds_m,
        fedfunds_diff,
    ],
    axis="columns",
)

# Keep only the months for which every column has a value.
market_data = market_data_raw.dropna()

print("\nFinal shape:", market_data.shape)
print("\nHead:\n", market_data.head())
print("\nTail:\n", market_data.tail())

In [None]:
# Persist the cleaned monthly table, creating the output directory if needed.
OUT_DIR = Path("../processed")
OUT_DIR.mkdir(parents=True, exist_ok=True)

out_file = OUT_DIR / "market_data.csv"
# index=True writes the date index as the first CSV column.
market_data.to_csv(out_file, index=True)
print(f"Saved {out_file}")