In [1]:
# =========================
# 12_export_replication_package.ipynb
# Goal: Build a clean replication bundle (artifacts + manifest + env) and update README section
# =========================

import datetime as dt
import hashlib
import json
import shutil
import subprocess
import sys
from pathlib import Path

import numpy as np
import pandas as pd

# Display defaults (consistent with earlier notebooks)
pd.set_option("display.width", 160)
pd.set_option("display.max_columns", 60)

# -------------------------
# Paths
# -------------------------
# ROOT is the parent of the current working directory, so the notebook is
# expected to be run from a folder directly under the repository root.
ROOT = Path("..").resolve()
DATA = ROOT / "data"
PROC = DATA / "processed"
COMPARE = PROC / "compare"
FEATURES = PROC / "features"

REPORTS = ROOT / "reports"
MODELS  = REPORTS / "models"
TABLES  = REPORTS / "tables"
FIGS    = REPORTS / "figures"

# Output folders are created up front so later cells can write into them.
SYSTEM  = REPORTS / "system"
SYSTEM.mkdir(parents=True, exist_ok=True)

DIST = ROOT / "dist"
DIST.mkdir(parents=True, exist_ok=True)

README = ROOT / "README.md"

# UTC timestamp (minute resolution) shared by every file name of this export.
# `datetime.utcnow()` is deprecated since Python 3.12; the timezone-aware call
# below yields the same formatted string.
stamp = dt.datetime.now(dt.timezone.utc).strftime("%Y%m%d_%H%M")

In [2]:
# -------------------------
# Helpers
# -------------------------
def sha256sum(p: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at `p`.

    The file is opened in binary mode and consumed in 8192-byte reads, so the
    whole content is never held in memory at once.
    """
    digest = hashlib.sha256()
    with p.open("rb") as stream:
        while True:
            piece = stream.read(8192)
            if not piece:  # empty bytes object marks end of file
                break
            digest.update(piece)
    return digest.hexdigest()

def list_files(globs) -> list[Path]:
    """Expand each glob pattern relative to ROOT and return the matching regular files.

    Matches are sorted within each pattern and concatenated in the order the
    patterns are given; directories and other non-file matches are dropped.
    """
    matched = [hit for pattern in globs for hit in sorted(ROOT.glob(pattern))]
    return [hit for hit in matched if hit.is_file()]

def write_text(p: Path, s: str) -> None:
    """Write `s` to `p` as UTF-8 text, creating missing parent directories first."""
    folder = p.parent
    folder.mkdir(parents=True, exist_ok=True)
    with p.open("w", encoding="utf-8") as handle:
        handle.write(s)

def run_and_capture(cmd: list[str]) -> str:
    """Run `cmd` and return its combined stdout/stderr as text.

    This is a best-effort helper: it never raises. On any failure it returns a
    string that starts with "[WARN] command failed: <cmd>" followed by the
    error description, so callers can detect failure by that prefix.

    When the command runs but exits non-zero, the output it produced (stderr is
    merged into stdout) is appended to the warning so the real cause is visible.
    """
    # str() each part so a non-string argument cannot break the warning message itself.
    shown = " ".join(map(str, cmd))
    try:
        return subprocess.check_output(cmd, stderr=subprocess.STDOUT, text=True)
    except subprocess.CalledProcessError as e:
        details = f"{e}\n{e.output}" if e.output else f"{e}"
        return f"[WARN] command failed: {shown}\n{details}"
    except Exception as e:
        # e.g. executable not found on PATH
        return f"[WARN] command failed: {shown}\n{e}"

def copy_into_stage(src: Path, stage_root: Path, rel_under_root: Path | None = None) -> Path:
    """
    Copy `src` into the staging folder, preserving the repo-relative path.
    Ensures parent directories exist. Returns destination path.
    If `rel_under_root` is provided, it is used as the relative path under stage.
    """
    if rel_under_root is None:
        rel_under_root = src.relative_to(ROOT)
    dst = stage_root / rel_under_root
    dst.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy2(src, dst)
    return dst

In [3]:
# -------------------------
# 1) Freeze environment
# -------------------------
# Use the kernel's own interpreter: a bare "python" resolves to whatever is first
# on PATH, which may be a different environment than the one running this notebook.
pip_freeze = run_and_capture([sys.executable, "-m", "pip", "freeze"])
conda_env = run_and_capture(["conda", "env", "export"])

# run_and_capture reports failures as a "[WARN] command failed: ..." string instead
# of raising. Writing that text into a .txt/.yml snapshot would ship a bogus
# environment file, so failed snapshots are reported and not written.
for snapshot_path, snapshot_text in (
    (SYSTEM / f"pip_freeze_{stamp}.txt", pip_freeze),
    (SYSTEM / f"conda_env_{stamp}.yml", conda_env),
):
    if snapshot_text.startswith("[WARN] command failed"):
        print(snapshot_text)
        print(f"[WARN] {snapshot_path.name} not written")
    else:
        write_text(snapshot_path, snapshot_text)

In [4]:
# -------------------------
# 2) Collect deliverables
# -------------------------
# Glob patterns (relative to ROOT) describing everything that goes into the bundle.
globs = [
    # processed panels and features needed to run the models
    "data/processed/compare/daily_panel.parquet",
    "data/processed/compare/daily_panel_agg.parquet",
    "data/processed/features/daily_features.parquet",
    # coefficient tables, model summaries, robustness tables
    "reports/models/*_coefs.parquet",
    "reports/tables/models/*_summary.txt",
    "reports/tables/robustness/*",
    # every figure, in both raster and vector form
    "reports/figures/**/*.png",
    "reports/figures/**/*.svg",
    # environment snapshots written by this export run
    f"reports/system/pip_freeze_{stamp}.txt",
    f"reports/system/conda_env_{stamp}.yml",
]

files = list_files(globs)
print(f"[info] files to package: {len(files)}")
# Show only the first ten matches to keep the cell output short.
for p in files[:10]:
    print(f" - {p.relative_to(ROOT)}")
if files[10:]:
    print(" ...")

# The export is pointless without the core panel and feature tables: stop here if absent.
must_have = [COMPARE / "daily_panel.parquet", FEATURES / "daily_features.parquet"]
missing = list(filter(lambda required: not required.exists(), must_have))
if len(missing) > 0:
    raise FileNotFoundError(f"Missing required artifacts: {missing}")

[info] files to package: 57
 - data\processed\compare\daily_panel.parquet
 - data\processed\compare\daily_panel_agg.parquet
 - data\processed\features\daily_features.parquet
 - reports\models\did1_v3launch_eventstudy_logvol_dex_only_coefs.parquet
 - reports\models\did1_v3launch_logvol_dex_vs_cex_coefs.parquet
 - reports\models\did2_ftx_eventstudy_logvol_dex_only_coefs.parquet
 - reports\models\did2_ftx_logvol_dex_vs_cex_coefs.parquet
 - reports\models\fe_absret_v3share_cex_coefs.parquet
 - reports\models\fe_cex_proxy_amihud_coefs.parquet
 - reports\models\fe_cex_proxy_chl_coefs.parquet
 ...


In [5]:
# -------------------------
# 3) Manifest (hashes + sizes)
# -------------------------
# One manifest row per packaged file: ROOT-relative POSIX path, size, SHA-256, UTC mtime.
MANIFEST_COLUMNS = ["path", "size_bytes", "sha256", "mtime_iso"]

rows = []
for p in files:
    rel = p.relative_to(ROOT).as_posix()
    try:
        info = p.stat()  # single stat call so size and mtime come from the same snapshot
        # Timezone-aware conversion (`utcfromtimestamp` is deprecated since Python 3.12);
        # tzinfo is dropped again so the text keeps the original "...Z" format.
        mtime_utc = dt.datetime.fromtimestamp(info.st_mtime, tz=dt.timezone.utc).replace(tzinfo=None)
        rows.append(dict(
            path=rel,
            size_bytes=info.st_size,
            sha256=sha256sum(p),
            mtime_iso=mtime_utc.isoformat() + "Z",
        ))
    except Exception as e:
        # Best effort: an unreadable file is reported and left out of the manifest.
        print(f"[WARN] skipped {rel}: {e}")

# Explicit columns keep the frame well-formed (and sortable) even when `rows` is empty.
manifest = pd.DataFrame(rows, columns=MANIFEST_COLUMNS).sort_values("path").reset_index(drop=True)
display(manifest.head(10))
MANIFEST_DIR = ROOT / "reports" / "manifests"
MANIFEST_DIR.mkdir(parents=True, exist_ok=True)
manifest_csv  = MANIFEST_DIR / f"manifest_{stamp}.csv"
manifest_json = MANIFEST_DIR / f"manifest_{stamp}.json"
manifest.to_csv(manifest_csv, index=False)
manifest.to_json(manifest_json, orient="records", lines=False, indent=2)
print(f"[OK] manifest written: {manifest_csv.name}, {manifest_json.name}")

Unnamed: 0,path,size_bytes,sha256,mtime_iso
0,data/processed/compare/daily_panel.parquet,2568641,12d4ede0d7c29b429a0d4224745d1c416527da65e3b3ec...,2025-09-13T07:14:53.836082Z
1,data/processed/compare/daily_panel_agg.parquet,2688713,ae09958efafb91e36fd014bf7677cab903d7c4ec573688...,2025-09-13T07:14:54.199145Z
2,data/processed/features/daily_features.parquet,4392719,1dbcce0371dfa1b0c6712bc265883f6958da20d68e1345...,2025-09-14T03:40:42.704580Z
3,reports/figures/descriptives/01_v3_share_daily...,111407,2cae60bce5d6c3202fbd2cbea74078b0a7ab78d9d39900...,2025-09-14T10:22:11.183809Z
4,reports/figures/descriptives/01_v3_share_daily...,60170,8e82897460d90a1068a091d10a1c0a0a0ef45e2e35f12c...,2025-09-14T10:22:11.342134Z
5,reports/figures/descriptives/02_v3_share_by_la...,70793,f0862d7506064dfc6c88fa17ef25f7577c6b2fb5838357...,2025-09-14T10:22:11.849685Z
6,reports/figures/descriptives/02_v3_share_by_la...,54728,a3916f6765462e11126774f3b2c96e92a5cba099d181e5...,2025-09-14T10:22:11.992053Z
7,reports/figures/descriptives/03_eth_median_eff...,126863,29286cf6956660a12dc46578ce7ea6663b26c77dcbc2c7...,2025-09-14T10:22:12.619736Z
8,reports/figures/descriptives/03_eth_median_eff...,62038,37596e46c9a276ceb49d3e1e111144d91004260cbb4e06...,2025-09-14T10:22:12.778224Z
9,reports/figures/descriptives/04_dex_volume_vs_...,192993,f45bb91edf6915d46e768dda2604330012943beac44de2...,2025-09-14T10:22:08.956874Z


[OK] manifest written: manifest_20250914_2046.csv, manifest_20250914_2046.json


In [6]:
# -------------------------
# 4) Update README section (between markers)
# -------------------------
# README block written between the REPL_EXPORT markers.
# - Plain (non-f) string: the text has no placeholders.
# - The folder tree sits inside a fenced code block, where Markdown does not process
#   escapes, so identifiers are written without backslashes (the former `\_` / `\*`
#   were also invalid Python escape sequences).
# NOTE(review): `$CondaEnv` below looks like an unexpanded template placeholder —
# confirm the real environment name.
section = """
<!-- REPL_EXPORT_START -->
# ssrn-3984897-replication

Replication of SSRN 3984897.

## Folder layout

```

├── data
│   ├── raw                  <- Vendor downloads and raw extracts (not tracked)
│   │   ├── cex/ohlcv
│   │   ├── ethereum/gas_fees
│   │   ├── uniswap_v2/pair_day_data
│   │   └── uniswap_v3/pool_day_data
│   ├── interim              <- Cleaned / intermediate artifacts (not tracked)
│   │   ├── cex/proxies      <- Daily CEX microstructure proxies (Roll/CHL/CS/Amihud)
│   │   └── dex/proxies      <- Daily DEX proxies (Amihud/Roll) by v3 fee-tier & v2
│   └── processed            <- Final analysis tables (not tracked)
│       ├── compare          <- Daily comparison panels
│       │   ├── daily_panel.parquet
│       │   └── daily_panel_agg.parquet   <- adds aggregated v3 ("uniswap_v3_all")
│       └── features         <- Modeling features
│           └── daily_features.parquet
├── notebooks
│   ├── 01_* … 06_*          <- Data collection & panel build (up to daily panel)
│   ├── 07_build_features.ipynb
│   ├── 08_descriptives.ipynb
│   ├── 09_models_fe_did.ipynb
│   ├── 10_robustness_checks.ipynb
│   ├── 11_visualization_and_interpretation.ipynb
│   └── 12_export_replication_package.ipynb
├── src
│   └── replication
│       ├── build_daily_comparison_panel.py
│       ├── cex_proxies_from_ohlcv.py
│       └── dex_proxies_from_daily.py
├── reports
│   ├── figures
│   │   ├── descriptives
│   │   ├── models
│   │   ├── robustness
│   │   └── interpretation
│   ├── models               <- Tidy coefficient tables (*.parquet)
│   ├── tables
│   │   ├── models           <- statsmodels summaries (*.txt)
│   │   └── robustness
│   ├── manifests            <- File inventories (auto-created)
│   └── system               <- Environment snapshots (pip/conda)
├── dist                     <- Zipped replication bundles
└── config                   <- Credentials templates & settings

```

## Environment

Conda env: `$CondaEnv` (Python 3.11). See `environment.yml`.
We also snapshot the environment during export into `reports/system/`.

---

## Research plan (what we replicate and why)

**Focus:** market microstructure under limited (daily) data. We proxy liquidity on CEX and DEX and study how Uniswap v3 and major shocks shift activity/liquidity.

**Events:**
* Uniswap v3 launch — 2021-05-05
* FTX collapse — 2022-11-10

**Main analyses (A1–A8):**
* **A1 — DEX activity vs v3 adoption (TWFE)**  
  *H1:* Higher v3 penetration within a pair increases DEX volume.  
  `log(DEX volume) ~ v3_share + ETH gas + FE(label) + FE(date)`
* **A2 — Spillovers to CEX volatility (TWFE)**  
  *H2:* Higher v3 share (DEX) lowers CEX volatility (abs returns).  
  `|CEX ret| ~ v3_share + ETH gas + FE(label) + FE(date)`
* **A3 — CEX microstructure (TWFE, per proxy)**  
  *H3:* CEX spreads/illiquidity (Roll, CHL, CS, Amihud) improve with volume and deteriorate with congestion.  
  `proxy_cex ~ log(CEX volume) + ETH gas + FEs`
* **A4 — DEX microstructure (TWFE, per proxy)**  
  *H4:* DEX Amihud/Roll decrease with DEX volume; increase with gas.  
  `proxy_dex ~ log(DEX volume) + ETH gas + FEs`
* **A5 — DiD #1: v3 launch (DEX vs CEX)**  
  *H5:* Post-launch, DEX volume rises relative to CEX.  
  `log(volume) ~ is_dex×Post_v3 + gas + FEs`
* **A6 — Event study: v3 (DEX only)**  
  *H6:* Flat pre-trends; post-event increase in DEX activity.
* **A7 — DiD #2: FTX (DEX vs CEX)**  
  *H7:* Post-FTX, DEX activity rises relative to CEX.
* **A8 — Event study: FTX (DEX only)**  
  *H8:* Persistent reallocation toward DEX post-FTX.

**Proxies used**
* **CEX (from OHLCV):** Roll (1984), CHL (Abdi–Ranaldo 2017), Corwin–Schultz (2012), Amihud (2002).
* **DEX (from daily OHLC + volume):** Amihud, Roll (adapted to daily).

---

## Core datasets (minimal to run models)

* `data/processed/compare/daily_panel.parquet` — CEX + DEX (v2 + v3 fee-tiers) + ETH gas.
* `data/processed/compare/daily_panel_agg.parquet` — adds aggregated v3 tier “uniswap_v3_all”.
* `data/processed/features/daily_features.parquet` — modeling features (e.g., `v3_share`, flags, logs, proxies).

---

## How to reproduce (quick)

1. Create the conda env (`environment.yml`) or use the snapshots in `reports/system/`.
2. Run or verify up to the processed panel (`build_daily_comparison_panel.py` / `06_*`).
3. Run `07_build_features.ipynb`.
4. Run `08_descriptives.ipynb` (optional figures).
5. Run `09_models_fe_did.ipynb` (FE + DiD) and `10_robustness_checks.ipynb`.
6. Inspect tables in `reports/tables/` and coefs in `reports/models/`.
7. (Optional) `11_visualization_and_interpretation.ipynb` for consolidated takeaways.
8. `12_export_replication_package.ipynb` to build a clean ZIP + README insert.

---

## Notes & limitations

* We work at **daily** frequency; true microstructure moments (trade/quote) are approximated by **proxies**. Some paper results cannot be exactly reproduced without high-freq data.
* Uniswap v3 fee-tier activity is aggregated to a **volume-weighted v3 share** per pair/day.
* Warnings like “covariance of constraints does not have full rank” arise from many FE dummies; clustered SEs are still computed but some dummy covariances are singular (expected under two-way FE with many fixed effects).

---

## Replication package

Run `12_export_replication_package.ipynb` to:
* snapshot the environment (pip/conda) into `reports/system/`,
* write a file manifest with hashes to `reports/manifests/`,
* zip essential artifacts into `dist/replication_package_*.zip`,
* update this section in place.

Artifacts included: processed panels, features, model coefs, model summaries, robustness tables, figures, and manifests (raw vendor downloads are excluded).
<!-- REPL_EXPORT_END -->
""".strip()

def replace_block(text: str, block: str, start="<!-- REPL_EXPORT_START -->", end="<!-- REPL_EXPORT_END -->"):
    """Return `text` with the marker-delimited region replaced by `block`.

    `block` is expected to contain its own start/end markers. If `text` holds
    `start` followed (at its first occurrence) by `end`, everything from the
    start marker through the end marker is replaced and the text after the end
    marker is kept exactly as it was. Otherwise `block` is appended after the
    existing content, separated by one blank line.

    The operation is idempotent: applying it again with the same `block`
    returns the same text, so repeated exports do not accumulate blank lines.
    """
    start_at = text.find(start)
    end_at = text.find(end)
    if start_at != -1 and end_at != -1 and start_at < end_at:
        head = text[:start_at]
        tail = text[end_at + len(end):]
        # Keep the tail untouched; only guarantee a final newline when the end
        # marker was the very last thing in the file.
        return head + block + (tail if tail else "\n")

    # No usable marker pair: append the block at the end.
    existing = text.rstrip()
    if not existing:
        return block + "\n"  # empty/whitespace-only text: no leading blank lines
    return existing + "\n\n" + block + "\n"

# Best-effort README refresh: any failure is reported and the export continues.
try:
    if README.exists():
        current = README.read_text(encoding="utf-8")
        start_marker = "<!-- REPL_EXPORT_START -->"
        end_marker = "<!-- REPL_EXPORT_END -->"
        # Same condition replace_block uses to decide between in-place replace and append,
        # evaluated here so the status message describes what actually happened.
        has_marker_pair = (
            start_marker in current
            and end_marker in current
            and current.index(start_marker) < current.index(end_marker)
        )
        updated = replace_block(current, section)
        if updated == current:
            print("[info] README unchanged (section already up to date).")
        else:
            README.write_text(updated, encoding="utf-8")
            if has_marker_pair:
                print("[OK] README updated between marker blocks.")
            else:
                print("[OK] README updated (no marker pair found; section appended).")
    else:
        print("[info] README.md not found; skipping README update.")
except Exception as e:
    print(f"[WARN] README update skipped: {e}")

[OK] README updated between marker blocks.


In [7]:
# -------------------------
# 5) Create ZIP package under dist/
# -------------------------
pkg_name = f"replication_package_{stamp}.zip"
pkg_path = DIST / pkg_name

# Build a staging dir to control the structure inside the ZIP.
# A leftover folder from an earlier run with the same stamp is removed first.
STAGE = ROOT / f"_stage_export_{stamp}"
if STAGE.exists():
    shutil.rmtree(STAGE)
STAGE.mkdir(parents=True, exist_ok=True)

howto_text = """How to use this package
-----------------------
1) Create an environment using the files in reports/system/
2) Place folder contents at the repo root or adjust paths accordingly
3) See README.md (in the repo) for the full pipeline and notebooks
"""

try:
    # Copy files preserving their ROOT-relative paths (parents ensured)
    for p in files:
        copy_into_stage(p, STAGE)

    # Add manifests, and a lightweight README for the ZIP root
    copy_into_stage(manifest_csv,  STAGE)
    copy_into_stage(manifest_json, STAGE)
    write_text(STAGE / "HOWTO.txt", howto_text)

    # make_archive appends ".zip" itself, so pass the path without the suffix.
    shutil.make_archive(str(pkg_path.with_suffix("")), "zip", root_dir=str(STAGE))
    print(f"[OK] ZIP created: {pkg_path}")
finally:
    # Always remove the staging folder, even when a copy or the archive step fails,
    # so a broken run does not leave a stray directory in the repo root.
    shutil.rmtree(STAGE, ignore_errors=True)

[OK] ZIP created: C:\Users\jazzn\Downloads\thesis\ssrn-3984897-replication\dist\replication_package_20250914_2046.zip


In [8]:
# -------------------------
# 6) Final checklist
# -------------------------
print("\n== Export summary ==")
print(f"ZIP: {pkg_path}")

# Guard against an empty manifest (iloc[0] would raise IndexError).
hash_sample = manifest["sha256"].iloc[0][:12] + "..." if len(manifest) else "n/a"
print(f"Manifest rows: {len(manifest)} | hash sample: {hash_sample}")

# Verify each claim on disk instead of printing check marks unconditionally.
env_ok = all(
    snapshot.is_file()
    for snapshot in (SYSTEM / f"pip_freeze_{stamp}.txt", SYSTEM / f"conda_env_{stamp}.yml")
)
try:
    readme_ok = README.exists() and "<!-- REPL_EXPORT_START -->" in README.read_text(encoding="utf-8")
except (OSError, UnicodeError):
    readme_ok = False
ready = pkg_path.is_file() and len(manifest) > 0

print("✓ Env snapshots saved" if env_ok else "✗ Env snapshots incomplete (check reports/system/)")
print("✓ README section written (or appended)" if readme_ok else "✗ README section not written")
print("✓ Ready to distribute" if ready else "✗ Not ready: ZIP missing or manifest empty")


== Export summary ==
ZIP: C:\Users\jazzn\Downloads\thesis\ssrn-3984897-replication\dist\replication_package_20250914_2046.zip
Manifest rows: 57 | hash sample: 12d4ede0d7c2...
✓ Env snapshots saved
✓ README section written (or appended)
✓ Ready to distribute
