# Compare Bayes runs: distance and English covariates on vs off

This notebook is intentionally thin.

- It imports the typed pipeline in `src/`.
- It writes run artifacts into `data/runs/`.
- Afterwards, launch the `bayes_streamlit.py` app and use **Compare two runs** to see the deltas between any two of the runs written here.



In [None]:
import os
import sys

import asyncio
import pandas as pd

# Ensure repo root is on sys.path so `import src...` works even when Jupyter's cwd is `notebooks/`.
sys.path.insert(0, os.path.abspath(".."))

from src.pipeline import (
    DatasetBuildConfig,
    build_dataset,
    run_bayes,
    run_heuristic,
    write_run_artifacts,
)

# -----------------
# Settings
# -----------------
# Prefix shared by every run label written from this notebook.
LABEL_BASE = "nb-negbin-27-censored-eng1-expect0s-fixed-config-data"

# Artifacts always land in <repo-root>/data/runs; the path is resolved relative to the
# parent of the current working directory, so it works when the notebook cwd is `notebooks/`.
RUNS_DIR = os.path.abspath(os.path.join(os.pardir, "data", "runs"))

# Sampler settings. Raise DRAWS / TUNE for a "proper" run (the notebook will take longer).
DRAWS = 2000
TUNE = 2000
TARGET_ACCEPT = 0.95
HDI_PROB = 0.9

# One full run is executed per seed; several seeds help check stability.
# Start small (2-3 seeds) and grow from there.
SEEDS = [0, 1, 2]

# Censored supervision (lower bounds) comes from src/config.py:
# - config.CENSORED_COUNTRIES_LOWER_BOUNDS
# It is optional and should be A/B-tested.
USE_CENSORING = True

# Count likelihood:
# - "poisson" is simple but can be brittle with heavy-tailed counts
# - "negbin" (Negative Binomial) often behaves better here
LIKELIHOOD = "negbin"  # or "poisson"

# Variants: the full 2x2 ablation grid over (use_distance, use_english), in the order
# dist1_eng1, dist1_eng0, dist0_eng1, dist0_eng0.
_ALL_VARIANTS = [
    {
        "use_distance": with_distance,
        "use_english": with_english,
        "tag": f"dist{int(with_distance)}_eng{int(with_english)}",
    }
    for with_distance in (True, False)
    for with_english in (True, False)
]

# Only the tags listed here are actually run; add more tags to widen the ablation.
_ENABLED_TAGS = {"dist0_eng1"}
VARIANTS = [variant for variant in _ALL_VARIANTS if variant["tag"] in _ENABLED_TAGS]

# -----------------
# Build dataset once
# -----------------
# Build with BOTH features present so the model can ablate covariates cleanly.
# (When a covariate is disabled, the Bayes model simply doesn't use that column.)
df = await build_dataset(
        DatasetBuildConfig(dataset_csv=None, use_distance=True, use_english=True)
    )


# Heuristic output is useful to keep in the CSV for comparing against Bayes.
heur = run_heuristic(
    df,
    use_language_factor=False,
    language_english_factor=1.25,
    language_euro_latin_factor=1.0,
    language_other_factor=0.75,
)

# -----------------
# Run grid
# -----------------
def _first_valid(frame, column):
    """Return a run-level value stored redundantly in every row of ``frame``.

    The run CSV comes from a *left* merge of the heuristic frame with the Bayes
    frame, so rows without a Bayes match carry NaN in the ``bayes_*`` columns.
    Reading row 0 blindly can therefore report NaN for a run that did produce
    the value. This helper returns the first non-null entry instead.

    Parameters
    ----------
    frame : pandas.DataFrame
        The run CSV loaded back from disk.
    column : str
        Name of the column holding the run-level value.

    Returns
    -------
    The first non-null value of ``column``; the row-0 value (NaN) if the column
    exists but is entirely null; ``None`` if the column is absent or the frame
    has no rows.
    """
    if column not in frame.columns or frame.empty:
        return None
    values = frame[column]
    non_null = values.dropna()
    return values.iloc[0] if non_null.empty else non_null.iloc[0]


rows = []

# One Bayes fit per (variant, seed); each fit is written out as its own run.
for v in VARIANTS:
    for seed in SEEDS:
        label = f"{LABEL_BASE}-{v['tag']}-usecensored{int(USE_CENSORING)}-seed{seed}"

        bayes = run_bayes(
            df,
            use_distance=bool(v["use_distance"]),
            use_english=bool(v["use_english"]),
            use_censoring=bool(USE_CENSORING),
            likelihood=str(LIKELIHOOD),
            draws=int(DRAWS),
            tune=int(TUNE),
            target_accept=float(TARGET_ACCEPT),
            seed=int(seed),
            hdi_prob=float(HDI_PROB),
        )

        # Left merge: keep every heuristic row, attach Bayes columns where `alpha_3` matches.
        out = pd.merge(heur, bayes, on="alpha_3", how="left")
        csv_path, meta_path = write_run_artifacts(
            out,
            runs_dir=RUNS_DIR,
            label=label,
            meta={
                "kind": "bayes_run",
                "source": "notebook",
                "variant": v,
                "seed": int(seed),
                "use_censoring": bool(USE_CENSORING),
                "likelihood": str(LIKELIHOOD),
                "draws": int(DRAWS),
                "tune": int(TUNE),
                "target_accept": float(TARGET_ACCEPT),
                "hdi_prob": float(HDI_PROB),
            },
        )

        # Pull a few run-level summaries from the CSV (coefficients are duplicated per-row).
        # Reading the file back means the summary reflects what was actually written.
        df_run = pd.read_csv(csv_path)

        rows.append(
            {
                "label": label,
                "csv": csv_path,
                "meta": meta_path,
                "use_distance": bool(v["use_distance"]),
                "use_english": bool(v["use_english"]),
                "seed": int(seed),
                "beta_distance": _first_valid(df_run, "bayes_beta_log1p_uk_distance_km_mean"),
                "beta_english": _first_valid(df_run, "bayes_beta_english_speakers_rate_mean"),
                "rhat_max": _first_valid(df_run, "bayes_rhat_max"),
                "ess_bulk_min": _first_valid(df_run, "bayes_ess_bulk_min"),
                "train_n": _first_valid(df_run, "bayes_train_n"),
            }
        )

print(f"Wrote {len(rows)} runs into: {RUNS_DIR}")

# One row per run; the sorted view is the cell's displayed output (`summary` itself stays unsorted).
summary = pd.DataFrame(rows)
summary.sort_values(["use_distance", "use_english", "seed"]).reset_index(drop=True)


Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [alpha, beta, nb_alpha]
Sampling 4 chains for 2_000 tune and 2_000 draw iterations (8_000 + 8_000 draws total) took 11 seconds.
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [alpha, beta, nb_alpha]
Sampling 4 chains for 2_000 tune and 2_000 draw iterations (8_000 + 8_000 draws total) took 11 seconds.
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [alpha, beta, nb_alpha]
Sampling 4 chains for 2_000 tune and 2_000 draw iterations (8_000 + 8_000 draws total) took 11 seconds.


Wrote 3 runs into: /Users/joshuamason/git/renc/data/runs


Unnamed: 0,label,csv,meta,use_distance,use_english,seed,beta_distance,beta_english,rhat_max,ess_bulk_min,train_n
0,nb-negbin-27-censored-eng1-expect0s-dist0_eng1...,/Users/joshuamason/git/renc/data/runs/nb-negbi...,/Users/joshuamason/git/renc/data/runs/nb-negbi...,False,True,0,,0.441976,1.0,3109.0,23.0
1,nb-negbin-27-censored-eng1-expect0s-dist0_eng1...,/Users/joshuamason/git/renc/data/runs/nb-negbi...,/Users/joshuamason/git/renc/data/runs/nb-negbi...,False,True,1,,0.450121,1.0,2910.0,23.0
2,nb-negbin-27-censored-eng1-expect0s-dist0_eng1...,/Users/joshuamason/git/renc/data/runs/nb-negbi...,/Users/joshuamason/git/renc/data/runs/nb-negbi...,False,True,2,,0.438197,1.0,3188.0,23.0


Need 2+ seeds for the selected variant to compute per-country instability.


In [None]:
# Rank the rows of `merged` by the `p1_std_across_seeds` column, largest first, and show the top 25.
# NOTE(review): `merged` is not assigned in any cell visible in this notebook — confirm the cell
# that builds it still exists; otherwise this raises NameError after Restart Kernel -> Run All.
merged.sort_values("p1_std_across_seeds", ascending=False).head(25)
