# dsfb-add Colab Sweep Notebook

This notebook loads Rust-generated CSVs from an `output-dsfb-add/<timestamp>/` run directory and writes Plotly figures as PNG files for the AET, IWLT, TCP, and RLT sweeps.

It is designed to fail closed if the notebook copy is stale relative to the repository `main` branch, or if the installed Colab packages do not match the pinned versions.
For Colab reliability, it uses the self-contained `kaleido==0.2.1` export path instead of Chrome-managed Kaleido v1.

Recommended workflow:

1. Open the notebook from the repository Colab link on the `main` branch.
2. Use `Runtime -> Restart session and run all` when Colab prompts after the install cell.
3. By default the notebook clones the repository, runs `cargo run -p dsfb-add --bin dsfb_add_sweep`, and uses the fresh timestamped output.
4. Only set `OUTPUT_DIR` explicitly if you intentionally want to analyze a specific existing run directory.
5. The generated PNGs are written back into the same timestamped directory as the CSVs.


In [None]:
%pip install -q --upgrade "plotly==6.1.1" "kaleido==0.2.1" "ripser==0.6.12"


In [None]:
import json
import sys
from importlib.metadata import version
from urllib.request import urlopen

# Version stamp of this notebook copy and the URL of the copy published on main.
NOTEBOOK_VERSION = "2026-03-01-8"
NOTEBOOK_RAW_URL = "https://raw.githubusercontent.com/infinityabundance/dsfb/main/crates/dsfb-add/dsfb_add_sweep.ipynb"
# Distribution name -> exact version the environment check requires.
EXPECTED_PACKAGE_VERSIONS = dict(plotly="6.1.1", kaleido="0.2.1")

# Evict already-imported copies of these packages (the top-level module and
# every dotted submodule) so later imports in this session load them afresh.
# Iterating over a snapshot keeps the deletion safe while sys.modules shrinks.
for name in tuple(sys.modules):
    if name.partition(".")[0] in ("plotly", "kaleido", "ripser"):
        del sys.modules[name]

# Download the notebook as published on main and compare its version stamp
# with this copy's. The response is closed deterministically via the context
# manager, and the timeout keeps a stalled connection from hanging the cell.
# Network or JSON errors propagate, so the check still fails closed.
with urlopen(NOTEBOOK_RAW_URL, timeout=30) as response:
    remote_nb = json.load(response)
remote_version = remote_nb.get("metadata", {}).get("dsfb_add_notebook_version")
if remote_version != NOTEBOOK_VERSION:
    raise RuntimeError(
        f"Stale notebook copy detected. This notebook is {NOTEBOOK_VERSION}, but main has {remote_version}. "
        "Reopen the notebook from the repository Colab link."
    )

from importlib.metadata import PackageNotFoundError

# Look up the installed version of every pinned distribution. A distribution
# that is not installed at all is recorded as None so that it is reported
# through the same guided error below instead of a raw PackageNotFoundError.
installed_versions = {}
for name in EXPECTED_PACKAGE_VERSIONS:
    try:
        installed_versions[name] = version(name)
    except PackageNotFoundError:
        installed_versions[name] = None

# name -> (installed, expected) for every pin that is not satisfied exactly.
mismatches = {
    name: (installed_versions[name], expected)
    for name, expected in EXPECTED_PACKAGE_VERSIONS.items()
    if installed_versions[name] != expected
}
if mismatches:
    mismatch_text = "\n".join(
        f" - {name}: "
        + ("not installed" if installed is None else f"installed {installed}")
        + f", expected {expected}"
        for name, (installed, expected) in mismatches.items()
    )
    raise RuntimeError(
        "Notebook environment is stale. Re-run the install cell, then restart the Colab runtime. "
        "Version mismatches:\n" + mismatch_text
    )

print("Notebook freshness check passed:", NOTEBOOK_VERSION)
print("Pinned packages:", installed_versions)


In [None]:
from pathlib import Path

# OUTPUT_DIR = None (the default) selects a run freshly generated in Colab.
# To analyze an existing run instead, point it at that directory, for example:
# OUTPUT_DIR = Path("/content/output-dsfb-add/2026-03-01T12-00-00Z")
OUTPUT_DIR = None
RUN_RUST_SWEEP_IN_COLAB = True
REPO_URL = "https://github.com/infinityabundance/dsfb.git"
REPO_DIR = Path("/content/dsfb")
CARGO_BIN_DIR = Path("/root/.cargo/bin")

# Candidate roots that may contain timestamped run directories, in priority order.
OUTPUT_ROOT_CANDIDATES = [REPO_DIR.joinpath("output-dsfb-add")] + [
    Path(root)
    for root in (
        "/content/output-dsfb-add",
        "/content/dsfb/output-dsfb-add",
        "/content/drive/MyDrive/output-dsfb-add",
        "output-dsfb-add",
    )
]

OUTPUT_DIR


In [None]:
import pandas as pd
import numpy as np
import plotly
import plotly.graph_objects as go
import plotly.io as pio
from ripser import ripser

# The module actually imported must report the pinned plotly version; a
# different version means an older copy was imported, so stop here.
if not plotly.__version__ == EXPECTED_PACKAGE_VERSIONS["plotly"]:
    raise RuntimeError(
        f"Imported stale plotly module {plotly.__version__}; expected {EXPECTED_PACKAGE_VERSIONS['plotly']}. "
        "Restart the Colab runtime and run all cells again."
    )

# Session-wide plotly defaults: no built-in template, "notebook" renderer.
pio.templates.default = "none"
pio.renderers.default = "notebook"


In [None]:
def require_file(path: Path) -> Path:
    """Return ``path`` unchanged when it exists on disk.

    Raises:
        FileNotFoundError: with ``path`` as its sole argument when nothing
            exists at that location.
    """
    if path.exists():
        return path
    raise FileNotFoundError(path)

def latest_timestamped_dir(root: Path):
    """Return the subdirectory of ``root`` that sorts last, or ``None``.

    ``None`` is returned when ``root`` is missing, is not a directory, or has
    no subdirectories. Ordering is plain path comparison, so the "latest" run
    is the one whose name sorts highest.
    """
    if not root.is_dir():
        return None
    subdirs = [entry for entry in root.iterdir() if entry.is_dir()]
    return max(subdirs, default=None)

def resolve_output_dir(explicit_dir, candidate_roots):
    """Pick the run directory to analyze.

    Args:
        explicit_dir: A user-configured directory, or ``None`` to auto-detect.
        candidate_roots: Roots to search, in order, when auto-detecting.

    Returns:
        ``Path(explicit_dir)`` when it is an existing directory; otherwise the
        latest subdirectory of the first candidate root that has one.

    Raises:
        FileNotFoundError: if the explicit directory does not exist, or if no
            candidate root yields a run directory.
    """
    if explicit_dir is None:
        # First root that yields a run directory wins; later roots are not probed.
        discovered = (latest_timestamped_dir(root) for root in candidate_roots)
        match = next((found for found in discovered if found is not None), None)
        if match is not None:
            return match

        searched = "\n".join(f" - {root}" for root in candidate_roots)
        raise FileNotFoundError(
            "No output-dsfb-add run directory was found. Upload or mount the Rust-generated "
            "output folder, or set OUTPUT_DIR explicitly. Searched:\n" + searched
        )

    explicit_dir = Path(explicit_dir)
    if not explicit_dir.is_dir():
        raise FileNotFoundError(
            f"Configured OUTPUT_DIR does not exist: {explicit_dir}. Upload or mount your run folder first."
        )
    return explicit_dir

def save_png(fig, filename: str, width: int = 1400, height: int = 900, scale: int = 2) -> Path:
    """Export ``fig`` to ``OUTPUT_DIR / filename`` and return that path.

    ``width``, ``height`` and ``scale`` are forwarded unchanged to the
    figure's ``write_image`` method.
    """
    destination = OUTPUT_DIR.joinpath(filename)
    export_options = {"width": width, "height": height, "scale": scale}
    fig.write_image(destination, **export_options)
    return destination

def make_line_figure(
    df,
    x_col: str,
    y_col: str,
    title: str,
    y_title: str,
    color: str = "#1f77b4",
    x_title: str = "lambda",
):
    """Build a single-trace line-and-marker figure from two columns of ``df``.

    Args:
        df: Table providing the data; indexed as ``df[x_col]`` and ``df[y_col]``.
        x_col: Column plotted on the x axis.
        y_col: Column plotted on the y axis; also used as the trace name.
        title: Figure title.
        y_title: Y-axis title.
        color: Color applied to both the line and the markers.
        x_title: X-axis title. Defaults to ``"lambda"``, the value that was
            previously hard-coded, so existing callers are unaffected.

    Returns:
        The configured ``go.Figure`` (white background, light grid, no legend).
    """
    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=df[x_col],
            y=df[y_col],
            mode="lines+markers",
            line={"width": 3, "color": color},
            marker={"size": 6, "color": color},
            name=y_col,
        )
    )
    fig.update_layout(
        title=title,
        paper_bgcolor="white",
        plot_bgcolor="white",
        font={"size": 16, "color": "#222222"},
        margin={"l": 80, "r": 40, "t": 90, "b": 70},
        showlegend=False,
    )
    # Both axes share the same grid styling; only the titles differ.
    fig.update_xaxes(title=x_title, showgrid=True, gridcolor="#d9d9d9", zeroline=False)
    fig.update_yaxes(title=y_title, showgrid=True, gridcolor="#d9d9d9", zeroline=False)
    return fig



In [None]:
import os
import shutil
import subprocess

def unique_paths(paths):
    """Return ``paths`` as a list with duplicates removed, keeping first occurrences.

    Two entries are duplicates when their ``str()`` forms are equal; the
    object kept is the first one encountered, and input order is preserved.
    """
    first_by_text = {}
    for candidate in paths:
        # setdefault keeps the earliest object for each textual form.
        first_by_text.setdefault(str(candidate), candidate)
    return list(first_by_text.values())

def cargo_env():
    """Return a copy of the process environment with cargo's bin dir first on PATH.

    ``os.environ`` itself is not modified. When PATH is unset or empty the
    result contains only ``CARGO_BIN_DIR``: this avoids a ``KeyError`` and
    avoids a trailing separator, which would otherwise add an empty PATH entry.
    """
    env = os.environ.copy()
    existing_path = env.get("PATH", "")
    if existing_path:
        env["PATH"] = f"{CARGO_BIN_DIR}{os.pathsep}{existing_path}"
    else:
        env["PATH"] = str(CARGO_BIN_DIR)
    return env

def run_cmd(args, cwd=None, env=None):
    """Echo a command line and run it, raising on a non-zero exit status.

    Args:
        args: Command and arguments; every element is converted with ``str``.
        cwd: Working directory for the child, or a falsy value to inherit.
        env: Environment mapping for the child, or ``None`` to inherit.

    Raises:
        subprocess.CalledProcessError: if the command exits non-zero.
    """
    argv = list(map(str, args))
    print("+", " ".join(argv))
    workdir = str(cwd) if cwd else None
    subprocess.run(argv, cwd=workdir, env=env, check=True)

def ensure_cargo_installed():
    """Install a minimal Rust toolchain via rustup unless ``cargo`` is already found.

    ``cargo`` is looked up on the PATH produced by ``cargo_env()``, both before
    and after the installation.

    Raises:
        subprocess.CalledProcessError: if the download or the installer fails.
        RuntimeError: if ``cargo`` still cannot be found after installation.
    """
    if shutil.which("cargo", path=cargo_env()["PATH"]):
        return
    run_cmd([
        "bash",
        "-lc",
        # pipefail: without it a failed curl is masked by `sh` exiting 0 on
        # empty input. The --proto/--tlsv1.2 flags restrict the download to
        # HTTPS with TLS >= 1.2, as the rustup install instructions recommend.
        "set -o pipefail; "
        "curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --profile minimal",
    ])
    if not shutil.which("cargo", path=cargo_env()["PATH"]):
        raise RuntimeError("cargo is unavailable after rustup installation")

def ensure_repo_checkout():
    """Shallow-clone ``REPO_URL`` into ``REPO_DIR`` unless a checkout already exists.

    An existing ``REPO_DIR/.git`` is taken as proof of a checkout and nothing
    is done. A non-empty ``REPO_DIR`` without ``.git`` is refused rather than
    overwritten.

    Raises:
        RuntimeError: if ``REPO_DIR`` exists, is non-empty, and is not a git checkout.
    """
    git_marker = REPO_DIR / ".git"
    if git_marker.exists():
        return

    REPO_DIR.parent.mkdir(parents=True, exist_ok=True)
    holds_foreign_content = REPO_DIR.exists() and any(REPO_DIR.iterdir())
    if holds_foreign_content:
        raise RuntimeError(
            f"Repo directory exists but is not a git checkout: {REPO_DIR}. Remove it or set OUTPUT_DIR explicitly."
        )

    clone_command = ["git", "clone", "--depth", "1", REPO_URL, REPO_DIR]
    run_cmd(clone_command)

def generate_fresh_output_if_requested():
    """Run the Rust sweep inside the repository checkout when a fresh run is wanted.

    A fresh run is wanted only when ``OUTPUT_DIR`` is ``None`` and
    ``RUN_RUST_SWEEP_IN_COLAB`` is truthy; otherwise this is a no-op.
    """
    wants_fresh_run = OUTPUT_DIR is None and RUN_RUST_SWEEP_IN_COLAB
    if not wants_fresh_run:
        return

    ensure_cargo_installed()
    ensure_repo_checkout()
    sweep_command = ["cargo", "run", "-p", "dsfb-add", "--bin", "dsfb_add_sweep"]
    run_cmd(sweep_command, cwd=REPO_DIR, env=cargo_env())

# Order matters here: the sweep (if requested) must run before the output
# directory is resolved, so that a freshly written run directory can be found.
generate_fresh_output_if_requested()
# Drop repeated candidate roots so each root is searched and reported once.
OUTPUT_ROOT_CANDIDATES = unique_paths(OUTPUT_ROOT_CANDIDATES)
# From here on OUTPUT_DIR is a concrete directory; resolve_output_dir raises otherwise.
OUTPUT_DIR = resolve_output_dir(OUTPUT_DIR, OUTPUT_ROOT_CANDIDATES)
# Trailing expression: shows the resolved directory as the cell output.
OUTPUT_DIR


In [None]:
# Load the AET sweep table; require_file raises FileNotFoundError if the CSV is absent.
aet = pd.read_csv(require_file(OUTPUT_DIR / "aet_sweep.csv"))

# Plot echo_slope against lambda, write the PNG into OUTPUT_DIR, then display inline.
fig_aet = make_line_figure(aet, "lambda", "echo_slope", "AET Echo Slope vs Lambda", "echo_slope")
save_png(fig_aet, "fig_aet_echo_slope_vs_lambda.png")
fig_aet


In [None]:
# Load the IWLT sweep table; require_file raises FileNotFoundError if the CSV is absent.
iwlt = pd.read_csv(require_file(OUTPUT_DIR / "iwlt_sweep.csv"))

# Plot entropy_density against lambda, write the PNG into OUTPUT_DIR, then display inline.
fig_iwlt = make_line_figure(iwlt, "lambda", "entropy_density", "IWLT Entropy Density vs Lambda", "entropy_density", color="#d95f02")
save_png(fig_iwlt, "fig_iwlt_entropy_density_vs_lambda.png")
fig_iwlt


In [None]:
# Load the RLT sweep table; require_file raises FileNotFoundError if the CSV is absent.
rlt = pd.read_csv(require_file(OUTPUT_DIR / "rlt_sweep.csv"))

# Two figures come from the same table: escape_rate and expansion_ratio vs lambda.
fig_rlt_escape = make_line_figure(rlt, "lambda", "escape_rate", "RLT Escape Rate vs Lambda", "escape_rate", color="#7570b3")
save_png(fig_rlt_escape, "fig_rlt_escape_rate_vs_lambda.png")

fig_rlt_expansion = make_line_figure(rlt, "lambda", "expansion_ratio", "RLT Expansion Ratio vs Lambda", "expansion_ratio", color="#1b9e77")
save_png(fig_rlt_expansion, "fig_rlt_expansion_ratio_vs_lambda.png")

# Only the escape-rate figure is the cell's display value; both PNGs are saved above.
fig_rlt_escape


## TCP persistent-homology figure

The Rust summary already contains coarse `betti0`/`betti1` proxies. The cell below optionally recomputes an H1 count with `ripser` from a subsample of the exported point clouds in `tcp_points/`, counting only finite H1 features whose persistence (death minus birth) exceeds 0.05.


In [None]:
# Load the TCP sweep table; require_file raises FileNotFoundError if the CSV is absent.
tcp = pd.read_csv(require_file(OUTPUT_DIR / "tcp_sweep.csv"))

# Order the point-cloud files by the integer index at the end of the file stem,
# so that un-padded names (..._2 vs ..._10) are not misordered lexicographically.
point_files = sorted(
    (OUTPUT_DIR / "tcp_points").glob("points_lambda_*.csv"),
    key=lambda path: int(path.stem.split("_")[-1]),
)
if not point_files:
    # Without this guard the empty result table fails later with an opaque KeyError.
    raise FileNotFoundError(
        f"No TCP point-cloud CSVs matching points_lambda_*.csv were found in {OUTPUT_DIR / 'tcp_points'}."
    )

TCP_PH_MAX_FILES = 24
if len(point_files) > TCP_PH_MAX_FILES:
    # Keep exactly TCP_PH_MAX_FILES evenly spaced files, including the first and
    # the last. A floor-division stride does not enforce the cap (47 files give
    # stride 1, i.e. all 47 are kept).
    keep = np.linspace(0, len(point_files) - 1, TCP_PH_MAX_FILES).round().astype(int)
    point_files = [point_files[i] for i in keep]

ph_rows = []
for point_file in point_files:
    # The file's trailing index is used as a row label into the sweep table.
    idx = int(point_file.stem.split("_")[-1])
    points = pd.read_csv(point_file)[["x", "y"]].to_numpy()
    diagrams = ripser(points, maxdim=1)["dgms"]
    h1 = diagrams[1] if len(diagrams) > 1 else np.empty((0, 2))
    # Count H1 features with a finite death value and persistence above 0.05.
    persistent = h1[np.isfinite(h1[:, 1]) & ((h1[:, 1] - h1[:, 0]) > 0.05)] if len(h1) else np.empty((0, 2))
    ph_rows.append({"lambda": float(tcp.loc[idx, "lambda"]), "betti1_count": int(len(persistent))})

tcp_ph = pd.DataFrame(ph_rows).sort_values("lambda")
fig_tcp = make_line_figure(tcp_ph, "lambda", "betti1_count", "TCP Betti-1 Count vs Lambda (ripser subset)", "betti1_count", color="#e7298a")
save_png(fig_tcp, "fig_tcp_betti1_vs_lambda.png")
fig_tcp


In [None]:
# List, in sorted order, the names of all PNG files directly inside OUTPUT_DIR.
pngs = sorted(png_path.name for png_path in OUTPUT_DIR.glob("*.png"))
# One print call: the header line, then one " - <name>" line per file.
print("Saved PNGs:", *(f" - {name}" for name in pngs), sep="\n")
