In [1]:
# =========================
# Setup
# =========================
from pathlib import Path
import datetime as dt
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

# Display options: widen the console rendering of DataFrames
for _opt_name, _opt_value in (("display.width", 160), ("display.max_columns", 80)):
    pd.set_option(_opt_name, _opt_value)

# Project layout: ROOT is the parent of the current working directory
ROOT = Path("..")
DATA = ROOT.joinpath("data")

# Features input (from 07_build_features.ipynb)
PROC_FEATURES = DATA.joinpath("processed", "features")
FEATURES_PATH = PROC_FEATURES.joinpath("daily_features.parquet")

# Outputs: coefficient tables go to OUT_MODELS, text summaries to OUT_TABLES
OUT_MODELS = ROOT.joinpath("reports", "models")
OUT_TABLES = ROOT.joinpath("reports", "tables", "models")
for _out_dir in (OUT_MODELS, OUT_TABLES):
    # create the full directory chain; re-running the cell is harmless
    _out_dir.mkdir(parents=True, exist_ok=True)

# Event dates used by the DiD / event-study cells
EVENT_V3_LAUNCH = dt.date(year=2021, month=5, day=5)
EVENT_FTX = dt.date(year=2022, month=11, day=10)

In [2]:
# =========================
# Helpers
# =========================
def write_model_outputs(res, name: str) -> pd.DataFrame:
    """
    Persist a fitted model's results under the file stem `name`.

    Writes two files:
      - OUT_TABLES / "<name>_summary.txt"   : res.summary() rendered as text (UTF-8)
      - OUT_MODELS / "<name>_coefs.parquet" : one row per model term

    Returns the coefficient table with columns
    term, estimate, std_error, t_value, p_value.
    """
    # Text summary
    summary_path = OUT_TABLES / f"{name}_summary.txt"
    summary_path.write_text(res.summary().as_text(), encoding="utf-8")

    # Tidy coefficient table; every statistic is taken positionally (.values),
    # so rows follow the order of res.params
    stat_series = {
        "estimate": res.params,
        "std_error": res.bse,
        "t_value": res.tvalues,
        "p_value": res.pvalues,
    }
    table = {"term": res.params.index}
    for col, series in stat_series.items():
        table[col] = series.values
    coefs = pd.DataFrame(table)

    coefs.to_parquet(OUT_MODELS / f"{name}_coefs.parquet", index=False)
    return coefs


def fit_fe_ols(formula: str, data: pd.DataFrame, name: str, cluster_col: str = "label"):
    """
    Fit a formula-based OLS with cluster-robust standard errors and save its outputs.

    Fixed effects are NOT absorbed: they are estimated as explicit categorical
    dummies, so the caller writes them into the formula, e.g. C(label) and
    C(date) for two-way FE.

    Parameters
      formula     : patsy-style formula string passed to smf.ols.
      data        : source frame; it is not modified (a NA-filtered copy is used).
      name        : file stem handed to write_model_outputs.
      cluster_col : column whose values define the clusters (default: "label").

    Returns
      (res, coefs): the fitted results object and the tidy coefficient table.

    Raises
      KeyError   : cluster_col is not a column of `data`.
      ValueError : no rows remain after dropping NAs on the used columns.
    """
    import re  # local import: only needed for the formula scan below

    if cluster_col not in data.columns:
        raise KeyError(f"cluster_col {cluster_col!r} is not a column of data")

    # Collect every identifier in the formula that is not a function call
    # (C(...), I(...), np.log(...), ...). Scanning identifiers instead of
    # splitting on a fixed operator list also catches columns joined by
    # '-', '/', '|', ',' or '**', or written without surrounding spaces.
    # Rows must be NA-dropped here on *all* used columns: if the formula engine
    # dropped extra rows itself, `groups` below would no longer match the model rows.
    identifiers = re.findall(r"(?<![\w.])[A-Za-z_][\w.]*(?![\w.]|\s*\()", formula)
    referenced = {tok for tok in identifiers if tok in data.columns}

    # label/date are always NA-checked when present, as is the cluster column
    always = {cluster_col, "label", "date"}.intersection(data.columns)
    use_cols = always | referenced

    d = data.dropna(subset=sorted(use_cols)).copy()
    if d.empty:
        raise ValueError(f"No rows left after NA-drop for variables: {sorted(use_cols)}")

    model = smf.ols(formula, data=d)
    res = model.fit(cov_type="cluster", cov_kwds={"groups": d[cluster_col]})
    coefs = write_model_outputs(res, name)
    print(f"[OK] {name}  n={int(res.nobs)}  R2={res.rsquared:.3f}  → {name}_coefs.parquet, {name}_summary.txt")
    return res, coefs


def add_event_time(df: pd.DataFrame, event_date: dt.date, col_name: str = "tau") -> pd.DataFrame:
    """
    Return a copy of `df` with an integer event-time column `col_name`.

    Event time is the whole-day part of (date - event_date), read from the
    "date" column: negative before the event, 0 on the event date.
    The input frame is left untouched.
    """
    out = df.copy()
    anchor = pd.to_datetime(event_date)
    elapsed = pd.to_datetime(out["date"]) - anchor
    out[col_name] = elapsed.dt.days
    return out


def make_event_study_dummies(df: pd.DataFrame, tau_col: str, k_leads: int, k_lags: int, ref: int = -1):
    """
    Add one 0/1 indicator column per event-time offset k in [-k_leads, k_lags], skipping `ref`.

    Column names are "D_tau_m<k>" for negative offsets and "D_tau_p<k>" for
    zero/positive ones (e.g. D_tau_m2, D_tau_p0, D_tau_p1). Rows whose event time
    lies outside the window, or equals `ref`, are 0 in every indicator.

    Returns (copy_of_df_with_indicators, list_of_indicator_names); the input frame
    is not modified.
    """
    def _column_name(offset: int) -> str:
        # 'm' marks minus, 'p' marks plus (zero counts as plus)
        prefix = "m" if offset < 0 else "p"
        return f"D_tau_{prefix}{abs(offset)}"

    out = df.copy()
    offsets = [k for k in range(-k_leads, k_lags + 1) if k != ref]
    names = [_column_name(k) for k in offsets]
    for offset, column in zip(offsets, names):
        out[column] = (out[tau_col] == offset).astype(int)
    return out, names

In [3]:
# =========================
# Load features
# =========================
assert FEATURES_PATH.exists(), f"Missing features: {FEATURES_PATH}"
df = pd.read_parquet(FEATURES_PATH)
print(f"[loaded] {FEATURES_PATH} rows={len(df):,} from {df['date'].min()} to {df['date'].max()}")

# Safe numerics: coerce the model variables to numeric dtype; values that cannot
# be parsed become NaN (errors="coerce"). Columns absent from the file are skipped.
_numeric_candidates = (
    "volumeUSD", "log_volumeUSD", "ret", "abs_ret", "v3_share", "v3_share_filled",
    "eth_median_effective_gas_price_gwei",
    "proxy_chl", "proxy_cs", "proxy_amihud", "proxy_roll",
    "proxy_amihud_dex", "proxy_roll_dex",
)
for c in (name for name in _numeric_candidates if name in df.columns):
    df[c] = pd.to_numeric(df[c], errors="coerce")

# Backfill a couple of convenience columns if missing
_present = set(df.columns)
if "volumeUSD" in _present and "log_volumeUSD" not in _present:
    # volumes below 1.0 are clipped up to 1.0, so the log is never negative or -inf
    df["log_volumeUSD"] = np.log(df["volumeUSD"].clip(lower=1.0))
if "ret" in _present and "abs_ret" not in _present:
    df["abs_ret"] = df["ret"].abs()

# Snapshot
print("== Snapshot ==")
print(f"labels: {df['label'].nunique()} | venues: {df['venue'].nunique()} | venue_types: {sorted(df['venue_type'].dropna().unique().tolist())}")
print(f"date range: {df['date'].min()} → {df['date'].max()}")

[loaded] ..\data\processed\features\daily_features.parquet rows=67,910 from 2021-03-01 to 2023-02-28
== Snapshot ==
labels: 20 | venues: 9 | venue_types: ['CEX', 'DEX']
date range: 2021-03-01 → 2023-02-28


In [4]:
# =========================
# FE panels (two–way FE)
# =========================
results_registry = {}

# Control included in every specification of this cell.
# NOTE(review): if this gas price is a single market-wide daily series, it is
# collinear with the C(date) dummies and its coefficient is not separately
# identified; confirm against the feature build.
_GAS = "eth_median_effective_gas_price_gwei"


def _run_fe_spec(panel, lhs, rhs_var, name):
    """
    Fit `lhs ~ rhs_var + gas + C(label) + C(date)` on `panel` via fit_fe_ols and
    record nobs / R2 in results_registry under `name`.
    Prints a "[skip]" line instead when a required column is missing or no
    complete-case row exists.
    """
    required = {lhs, rhs_var, _GAS, "label", "date"}
    if not required.issubset(panel.columns) or panel.dropna(subset=list(required)).empty:
        print(f"[skip] {name}: missing vars or empty after NA drop.")
        return
    fitted, _ = fit_fe_ols(f"{lhs} ~ {rhs_var} + {_GAS} + C(label) + C(date)", panel, name)
    results_registry[name] = {"nobs": int(fitted.nobs), "r2": float(fitted.rsquared)}


# Venue-type sub-panels
dex = df.loc[df["venue_type"] == "DEX"].copy()
cex = df.loc[df["venue_type"] == "CEX"].copy()

# 1) DEX: log volume ~ v3_share + gas + FE(label,date)
_run_fe_spec(dex, "log_volumeUSD", "v3_share_filled", "fe_logvol_v3share")

# 2) CEX: |ret| ~ v3_share + gas + FE(label,date)  (tests cross-venue spillover)
_run_fe_spec(cex, "abs_ret", "v3_share_filled", "fe_absret_v3share_cex")

# 3) CEX proxies ~ log volume + gas + FE(label,date)
#    (one regression per proxy; a proxy missing from the panel prints a skip line)
for prox in ("proxy_roll", "proxy_chl", "proxy_cs", "proxy_amihud"):
    _run_fe_spec(cex, prox, "log_volumeUSD", f"fe_cex_{prox}")

# 4) DEX proxies (Amihud/Roll) ~ log volume + gas + FE(label,date)
#    (a proxy column absent from the DEX panel is passed over silently)
for prox in ("proxy_amihud", "proxy_roll", "proxy_amihud_dex", "proxy_roll_dex"):
    if prox in dex.columns:
        _run_fe_spec(dex, prox, "log_volumeUSD", f"fe_dex_{prox}")

print("\n[FE registry]")
print(results_registry)



[OK] fe_logvol_v3share  n=39987  R2=0.355  → fe_logvol_v3share_coefs.parquet, fe_logvol_v3share_summary.txt




[OK] fe_absret_v3share_cex  n=15766  R2=0.347  → fe_absret_v3share_cex_coefs.parquet, fe_absret_v3share_cex_summary.txt




[OK] fe_cex_proxy_roll  n=10461  R2=0.614  → fe_cex_proxy_roll_coefs.parquet, fe_cex_proxy_roll_summary.txt




[OK] fe_cex_proxy_chl  n=8110  R2=0.484  → fe_cex_proxy_chl_coefs.parquet, fe_cex_proxy_chl_summary.txt




[OK] fe_cex_proxy_cs  n=15817  R2=0.590  → fe_cex_proxy_cs_coefs.parquet, fe_cex_proxy_cs_summary.txt




[OK] fe_cex_proxy_amihud  n=15817  R2=0.173  → fe_cex_proxy_amihud_coefs.parquet, fe_cex_proxy_amihud_summary.txt




[OK] fe_dex_proxy_amihud  n=39535  R2=0.018  → fe_dex_proxy_amihud_coefs.parquet, fe_dex_proxy_amihud_summary.txt




[OK] fe_dex_proxy_roll  n=24173  R2=0.040  → fe_dex_proxy_roll_coefs.parquet, fe_dex_proxy_roll_summary.txt

[FE registry]
{'fe_logvol_v3share': {'nobs': 39987, 'r2': 0.35450832395772824}, 'fe_absret_v3share_cex': {'nobs': 15766, 'r2': 0.34668082008608136}, 'fe_cex_proxy_roll': {'nobs': 10461, 'r2': 0.6143700634752773}, 'fe_cex_proxy_chl': {'nobs': 8110, 'r2': 0.48421312958486684}, 'fe_cex_proxy_cs': {'nobs': 15817, 'r2': 0.5898203139778881}, 'fe_cex_proxy_amihud': {'nobs': 15817, 'r2': 0.1728997020480274}, 'fe_dex_proxy_amihud': {'nobs': 39535, 'r2': 0.018190514886500364}, 'fe_dex_proxy_roll': {'nobs': 24173, 'r2': 0.039907348873024384}}


In [5]:
# =========================
# DiD #1 — Uniswap v3 launch (2021-05-05)
# =========================

# A) DEX vs CEX: log volume ~ DEX*Post + gas + FE(label,date)
# NOTE(review): post_v3 is a function of date alone, so its main effect is
# collinear with the C(date) dummies; read the is_dex:post_v3 interaction, not
# the post_v3 main effect. Whether is_dex is likewise spanned by C(label)
# depends on labels being venue-specific; confirm against the feature build.
_v3_launch_ts = pd.to_datetime(EVENT_V3_LAUNCH)
d = df.copy()
d["is_dex"] = d["venue_type"].eq("DEX").astype(int)
d["post_v3"] = pd.to_datetime(d["date"]).ge(_v3_launch_ts).astype(int)

need = {"log_volumeUSD","eth_median_effective_gas_price_gwei","is_dex","post_v3","label","date"}
if not need.issubset(d.columns) or d.dropna(subset=list(need)).empty:
    print("[skip] did1_v3launch_logvol_dex_vs_cex: missing vars or empty after NA drop.")
else:
    formula = "log_volumeUSD ~ is_dex * post_v3 + eth_median_effective_gas_price_gwei + C(label) + C(date)"
    res, coefs = fit_fe_ols(formula, d, "did1_v3launch_logvol_dex_vs_cex", cluster_col="label")

# B) Event study (DEX only): drop daily FE to avoid perfect collinearity with event bins
# The frame is not trimmed to the +/-K window, so days outside it have every
# indicator equal to 0 and sit in the omitted baseline together with tau = -1.
K = 30
dex_es = add_event_time(df.loc[df["venue_type"] == "DEX"].copy(), EVENT_V3_LAUNCH, col_name="tau_v3")
dex_es, dummies = make_event_study_dummies(dex_es, "tau_v3", k_leads=K, k_lags=K, ref=-1)

need = set(dummies).union({"log_volumeUSD","eth_median_effective_gas_price_gwei","label"})
if not need.issubset(dex_es.columns) or dex_es.dropna(subset=list(need)).empty:
    print("[skip] did1_v3launch_eventstudy_logvol_dex_only: missing vars or empty after NA drop.")
else:
    rhs = " + ".join(dummies) + " + eth_median_effective_gas_price_gwei + C(label)"
    formula = f"log_volumeUSD ~ {rhs}"
    res, coefs = fit_fe_ols(formula, dex_es, "did1_v3launch_eventstudy_logvol_dex_only", cluster_col="label")



[OK] did1_v3launch_logvol_dex_vs_cex  n=55804  R2=0.494  → did1_v3launch_logvol_dex_vs_cex_coefs.parquet, did1_v3launch_logvol_dex_vs_cex_summary.txt
[OK] did1_v3launch_eventstudy_logvol_dex_only  n=39987  R2=0.324  → did1_v3launch_eventstudy_logvol_dex_only_coefs.parquet, did1_v3launch_eventstudy_logvol_dex_only_summary.txt




In [6]:
# =========================
# DiD #2 — FTX collapse (2022-11-10)
# =========================

# A) DEX vs CEX: log volume ~ DEX*Post + gas + FE(label,date)
# NOTE(review): post_ftx is derived from date only, hence collinear with the
# C(date) dummies; the is_dex:post_ftx interaction is the term to interpret.
d2 = df.assign(
    is_dex=(df["venue_type"] == "DEX").astype(int),
    post_ftx=(pd.to_datetime(df["date"]) >= pd.to_datetime(EVENT_FTX)).astype(int),
)

need = {"log_volumeUSD","eth_median_effective_gas_price_gwei","is_dex","post_ftx","label","date"}
_did2_ready = need.issubset(d2.columns) and not d2.dropna(subset=list(need)).empty
if _did2_ready:
    formula = "log_volumeUSD ~ is_dex * post_ftx + eth_median_effective_gas_price_gwei + C(label) + C(date)"
    res, coefs = fit_fe_ols(formula, d2, "did2_ftx_logvol_dex_vs_cex", cluster_col="label")
else:
    print("[skip] did2_ftx_logvol_dex_vs_cex: missing vars or empty after NA drop.")

# B) Event study (DEX only): drop daily FE to identify event-time bins
# Days outside the +/-K window keep all indicators at 0, so they share the
# omitted baseline with tau = -1 (the frame is not restricted to the window).
K = 30
_dex_rows = df["venue_type"] == "DEX"
dex_es2 = df.loc[_dex_rows].copy()
dex_es2 = add_event_time(dex_es2, EVENT_FTX, col_name="tau_ftx")
dex_es2, dummies2 = make_event_study_dummies(dex_es2, "tau_ftx", k_leads=K, k_lags=K, ref=-1)

need = {"log_volumeUSD","eth_median_effective_gas_price_gwei","label", *dummies2}
_es2_ready = need.issubset(dex_es2.columns) and not dex_es2.dropna(subset=list(need)).empty
if _es2_ready:
    rhs = " + ".join(dummies2) + " + eth_median_effective_gas_price_gwei + C(label)"
    formula = f"log_volumeUSD ~ {rhs}"
    res, coefs = fit_fe_ols(formula, dex_es2, "did2_ftx_eventstudy_logvol_dex_only", cluster_col="label")
else:
    print("[skip] did2_ftx_eventstudy_logvol_dex_only: missing vars or empty after NA drop.")



[OK] did2_ftx_logvol_dex_vs_cex  n=55804  R2=0.493  → did2_ftx_logvol_dex_vs_cex_coefs.parquet, did2_ftx_logvol_dex_vs_cex_summary.txt
[OK] did2_ftx_eventstudy_logvol_dex_only  n=39987  R2=0.319  → did2_ftx_eventstudy_logvol_dex_only_coefs.parquet, did2_ftx_eventstudy_logvol_dex_only_summary.txt




In [7]:
# =========================
# Done
# =========================
# Final pointer to the two output locations (one line each)
print(
    f"[Done] Models estimated. Coefficients saved in: {OUT_MODELS}",
    f"Summaries saved in: {OUT_TABLES}",
    sep="\n",
)

[Done] Models estimated. Coefficients saved in: ..\reports\models
Summaries saved in: ..\reports\tables\models
