In [2]:
"""
Jupyter notebook cell — Transport Simulation V0 (length-only network)

Batch-aware runner:
- Scans every dataset under `dataset_root` (folders that contain a top-level `nodes.csv`)
- For each solution under `<dataset>/solutions/<solution_name>/` that has `edges.csv`
  (or `cost.csv`), runs the **length-only transport simulation** if not already done.
- Saves per-solution results under `<solution>/sim_v0/`, and updates a global
  comparison table at `<dataset_root>/transport_sim_v0_overview.csv`.

Model (high level):
- Demand purposes: work, recreation, family, vacation, tourism, emergency
- Annual **per-capita intercity trip rates** vary by city size (small/medium/large),
  derived from population quantiles (50% / 85% thresholds).
- **Destination choice** uses a gravity form:
    weight_j ∝ pop_j^α / (euclid_dist_ij + 1)^β_p   (i ≠ j)
- Travel time uses **network shortest-path distance** / purpose speed_kmh
  (network built from solution edges; no capacity limits ⇒ all trips complete).
- Metrics: total trips, total person-hours, avg time (min), avg distance (km),
  weighted detour ratio (network_dist / euclid_dist), and a scalar **EfficiencyScore**
  = trips_year / person_hours (higher is better, ≈ trips per travel-hour).

Edit `SimV0Config.dataset_root` and run.
"""
from __future__ import annotations

import os
import json
import math
import time
from dataclasses import dataclass
from typing import Dict, Any, List, Tuple, Iterable

import numpy as np
import pandas as pd


# ------------------------------
# Config
# ------------------------------
@dataclass
class SimV0Config:
    """Configuration for the batch transport simulation V0.

    Holds the dataset location, the trip purposes, and the model parameter
    tables. The three dictionary fields default to ``None``; ``_default_params``
    fills any that are still ``None`` with built-in tables and leaves
    caller-supplied dictionaries untouched.
    """
    # Root containing many datasets (each has a top-level nodes.csv)
    dataset_root: str = r"C:\Users\User\Documents\Code\traffic-optimization\00_Datasets"
    # Subfolder that holds solutions in each dataset
    solutions_subdir: str = "solutions"
    # File name of the global comparison CSV, written directly under dataset_root
    global_summary_csv: str = "transport_sim_v0_overview.csv"

    # Purpose set; every purpose listed here is looked up by name in the
    # beta / speed / rate tables below
    purposes: Tuple[str, ...] = ("work", "recreation", "family", "vacation", "tourism", "emergency")

    # Gravity parameters
    alpha_pop: float = 1.05  # destination attraction exponent on population
    beta_by_purpose: Dict[str, float] = None  # distance-decay exponents (None -> defaults)

    # Speeds by purpose (km/h) — coarse averages (None -> defaults)
    speed_kmh_by_purpose: Dict[str, float] = None

    # Annual per-capita intercity trip rates by city "size" (small/medium/large)
    # These are *expected* intercity trips per person per year for each purpose.
    # Outer key: size class, inner key: purpose (None -> defaults).
    rates_annual_by_class: Dict[str, Dict[str, float]] = None

    # Seed handed to np.random.seed() by main(); no random draws are visible in
    # the simulation code itself
    seed: int = 42


def _default_params(cfg: SimV0Config) -> None:
    """Fill every parameter table on *cfg* that is still ``None`` with built-in defaults.

    *cfg* is modified in place; tables already supplied by the caller are kept.
    A fresh set of dictionaries is built on every call, so two configs never
    share the same default objects.
    """
    fallback_tables = {
        # Distance-decay exponent per purpose
        "beta_by_purpose": {
            "work": 1.6,
            "recreation": 1.2,
            "family": 1.1,
            "vacation": 0.9,
            "tourism": 0.8,
            "emergency": 1.3,
        },
        # Average travel speed per purpose (km/h)
        "speed_kmh_by_purpose": {
            "work": 80.0,
            "recreation": 70.0,
            "family": 75.0,
            "vacation": 85.0,
            "tourism": 75.0,
            "emergency": 90.0,
        },
        # Rough intercity frequencies (per person per year), by city size class
        "rates_annual_by_class": {
            "small":   {"work": 6.0,  "recreation": 3.0,  "family": 2.0, "vacation": 1.2, "tourism": 1.5, "emergency": 0.05},
            "medium":  {"work": 8.0,  "recreation": 3.5,  "family": 2.3, "vacation": 1.5, "tourism": 1.8, "emergency": 0.06},
            "large":   {"work": 10.0, "recreation": 4.0,  "family": 2.6, "vacation": 1.8, "tourism": 2.0, "emergency": 0.07},
        },
    }
    for attr_name, table in fallback_tables.items():
        if getattr(cfg, attr_name) is None:
            setattr(cfg, attr_name, table)


# ------------------------------
# IO helpers
# ------------------------------
def _iter_datasets_with_nodes(dataset_root: str, solutions_subdir: str) -> Iterable[str]:
    """Yield each directory under *dataset_root* that directly contains ``nodes.csv``.

    Directories named *solutions_subdir* are never entered, so files inside a
    dataset's solutions folder cannot be mistaken for a dataset. Sub-directories
    are visited in sorted name order, which makes the order of the yielded
    datasets reproducible across runs and machines.
    """
    for root, dirs, files in os.walk(dataset_root):
        # Assigning to dirs[:] edits the list os.walk recurses into: drop the
        # solutions folder and fix the visiting order (os.walk itself returns
        # entries in arbitrary, filesystem-dependent order).
        dirs[:] = sorted(d for d in dirs if d != solutions_subdir)
        if "nodes.csv" in files:
            yield root


def _iter_solutions(dataset_dir: str, solutions_subdir: str) -> Iterable[str]:
    """Yield the sub-directories of ``<dataset_dir>/<solutions_subdir>`` in name order.

    Yields nothing when the solutions folder does not exist; plain files inside
    it are ignored.
    """
    base = os.path.join(dataset_dir, solutions_subdir)
    if os.path.isdir(base):
        candidates = (os.path.join(base, entry) for entry in sorted(os.listdir(base)))
        yield from filter(os.path.isdir, candidates)


def _solution_has_required_files(solution_dir: str) -> bool:
    """Return True when *solution_dir* contains ``edges.csv`` or ``cost.csv``."""
    accepted_inputs = ("edges.csv", "cost.csv")
    return any(
        os.path.exists(os.path.join(solution_dir, file_name))
        for file_name in accepted_inputs
    )


def _simulation_done(solution_dir: str) -> bool:
    """Return True when ``<solution_dir>/sim_v0/summary.json`` already exists."""
    done_marker = os.path.join(solution_dir, "sim_v0", "summary.json")
    return os.path.exists(done_marker)


def _load_nodes(nodes_csv_path: str) -> pd.DataFrame:
    """Read ``nodes.csv`` and return it sorted by id and indexed by a dense 0..N-1 id.

    The file must provide the columns ``id``, ``x_km``, ``y_km`` and ``pop``.
    When the sorted ids are not exactly 0..N-1 they are replaced by 0..N-1 and
    the original values are preserved in a new leading ``orig_id`` column.

    Raises:
        ValueError: if any required column is missing.
    """
    table = pd.read_csv(nodes_csv_path)

    req = {"id", "x_km", "y_km", "pop"}
    if not req.issubset(table.columns):
        raise ValueError(f"nodes.csv missing columns: {req - set(table.columns)}")

    table = table.sort_values("id").reset_index(drop=True)

    ids_are_dense = np.array_equal(table["id"].to_numpy(), np.arange(len(table)))
    if not ids_are_dense:
        # Keep the ids from the file, then renumber rows by sorted position
        table.insert(0, "orig_id", table["id"].values)
        table["id"] = np.arange(len(table), dtype=int)

    # drop=False keeps "id" available both as the index and as a column
    return table.set_index("id", drop=False)


def _cost_matrix_from_solution(nodes: pd.DataFrame, solution_dir: str) -> np.ndarray:
    """Build the symmetric (n, n) direct-link cost matrix for one solution.

    ``edges.csv`` (columns ``u``, ``v``, ``length_km``) is preferred; each edge
    is treated as undirected. When the same pair appears more than once the
    shortest length is kept, and self-loops never replace the zero diagonal.
    Pairs without an edge get ``inf``. If ``edges.csv`` is absent, ``cost.csv``
    is read as a header-less numeric matrix and returned as is.

    Args:
        nodes: node table; only its length (number of nodes) is used here.
        solution_dir: folder holding ``edges.csv`` or ``cost.csv``.

    Returns:
        Float array of shape (n, n).

    Raises:
        ValueError: if an edge endpoint lies outside 0..n-1, or ``cost.csv``
            is not an n-by-n matrix.
        FileNotFoundError: if neither input file exists.
    """
    n = len(nodes)
    edges_csv = os.path.join(solution_dir, "edges.csv")
    cost_csv = os.path.join(solution_dir, "cost.csv")

    if os.path.exists(edges_csv):
        e = pd.read_csv(edges_csv)
        u = e["u"].to_numpy(dtype=int)
        v = e["v"].to_numpy(dtype=int)
        w = e["length_km"].to_numpy(dtype=float)

        # Negative ids would otherwise index from the end of the matrix and
        # silently attach the edge to the wrong node.
        endpoints = np.concatenate([u, v])
        if endpoints.size and (endpoints.min() < 0 or endpoints.max() >= n):
            raise ValueError(
                f"edges.csv references node ids outside 0..{n - 1}: "
                f"min={endpoints.min()}, max={endpoints.max()}"
            )

        C = np.full((n, n), np.inf, dtype=float)
        np.fill_diagonal(C, 0.0)
        # ufunc.at applies the minimum once per listed edge, so repeated pairs
        # resolve to the shortest link instead of "last row wins".
        np.minimum.at(C, (u, v), w)
        np.minimum.at(C, (v, u), w)
        return C

    if os.path.exists(cost_csv):
        C = pd.read_csv(cost_csv, header=None).to_numpy(dtype=float)
        if C.shape != (n, n):
            raise ValueError(
                f"cost.csv has shape {C.shape}, expected ({n}, {n}) to match nodes.csv"
            )
        return C

    raise FileNotFoundError("Neither edges.csv nor cost.csv found.")


# ------------------------------
# Math helpers
# ------------------------------
def _euclidean_matrix(nodes: pd.DataFrame) -> np.ndarray:
    """Return the (n, n) matrix of straight-line distances between all node pairs.

    Coordinates are read from the ``x_km`` and ``y_km`` columns.
    """
    xs = nodes["x_km"].to_numpy()
    ys = nodes["y_km"].to_numpy()
    # outer difference: entry (i, j) is coord[i] - coord[j]
    dx = np.subtract.outer(xs, xs)
    dy = np.subtract.outer(ys, ys)
    return np.sqrt(np.square(dx) + np.square(dy))


def _floyd_warshall(C: np.ndarray) -> np.ndarray:
    """Return all-pairs shortest-path costs for the direct-cost matrix *C*.

    Floyd-Warshall with one vectorised relaxation per intermediate node
    (O(N^3) overall; intended for non-negative costs and small N).
    The input matrix is not modified.
    """
    dist = C.copy()
    for via in range(dist.shape[0]):
        # Cost of every i -> via -> j route, built before dist is touched
        through_via = dist[:, via, None] + dist[None, via, :]
        np.minimum(dist, through_via, out=dist)
    return dist


# ------------------------------
# Demand model
# ------------------------------
def _city_classes(pop: np.ndarray) -> List[str]:
    """Label each city ``"small"``, ``"medium"`` or ``"large"`` by population.

    Thresholds are the 50th and 85th percentiles of *pop*: below the median is
    small, from the median up to (but excluding) the 85th percentile is medium,
    everything else is large.
    """
    median_cut = np.percentile(pop, 50)
    upper_cut = np.percentile(pop, 85)
    values = np.asarray(pop)
    labels = np.where(
        values < median_cut,
        "small",
        np.where(values < upper_cut, "medium", "large"),
    )
    return labels.tolist()


def _annual_trips_by_origin(nodes: pd.DataFrame, purposes: Tuple[str, ...], rates_by_class: Dict[str, Dict[str, float]]) -> np.ndarray:
    """Return array R (n, P): annual trips originating at city i for purpose p.

    Each entry is the city's population times the per-capita rate of the city's
    size class (as assigned by ``_city_classes``) for that purpose.
    """
    pop = nodes["pop"].to_numpy().astype(float)
    size_labels = _city_classes(pop)
    # One row of per-capita rates per city, in the same column order as `purposes`
    per_capita = np.array(
        [[rates_by_class[label][purpose] for purpose in purposes] for label in size_labels],
        dtype=float,
    ).reshape(len(nodes), len(purposes))
    return pop[:, None] * per_capita


def _od_share_weights(pop: np.ndarray, D_euclid: np.ndarray,
                      alpha: float,
                      beta: Dict[str, float],
                      purposes: Tuple[str, ...]) -> np.ndarray:
    """
    Destination-choice shares per purpose, shape (P, n, n).

    For purpose p the raw weight of destination j seen from origin i is
    pop_j^alpha / (dist_ij + 1)^beta_p, with the diagonal (i == j) forced to 0;
    each origin row is then normalised to sum to 1.

    An origin whose raw weights sum to 0 (or less) falls back to an equal share
    for every other city. With n <= 1 there is no destination, so the all-zero
    array is returned.
    """
    n_cities = len(pop)
    shares = np.zeros((len(purposes), n_cities, n_cities), dtype=float)
    if n_cities <= 1:
        return shares

    attraction = np.power(pop.astype(float), alpha)   # (n,)
    softened = D_euclid + 1.0                         # +1 keeps near-zero distances finite
    is_self = np.eye(n_cities, dtype=bool)

    for k, purpose in enumerate(purposes):
        gravity = attraction[None, :] / np.power(softened, beta[purpose])
        raw = np.where(is_self, 0.0, gravity)         # no trips to the origin itself
        row_total = raw.sum(axis=1, keepdims=True)

        dead = row_total[:, 0] <= 0.0
        if dead.any():
            # Uniform fallback across the other n-1 cities, for those rows only
            raw[dead] = np.where(is_self[dead], 0.0, 1.0 / (n_cities - 1))
            row_total = raw.sum(axis=1, keepdims=True)

        shares[k] = raw / row_total

    return shares



# ------------------------------
# Simulation core
# ------------------------------
def _masked_flow_sum(flows: np.ndarray, values: np.ndarray, pair_mask: np.ndarray) -> float:
    """Sum ``flows * values`` over the OD pairs selected by the (n, n) *pair_mask*.

    Pairs whose flow is exactly 0 are skipped, so an unreachable pair (infinite
    network distance) without demand contributes 0 instead of ``0 * inf = nan``.
    """
    active = pair_mask[None, :, :] & (flows != 0)
    values = np.broadcast_to(values, flows.shape)
    return float((flows[active] * values[active]).sum())


def _ratio_or_zero(numerator: float, denominator: float) -> float:
    """Return ``numerator / denominator``, or 0.0 when the denominator is 0."""
    return 0.0 if denominator == 0 else numerator / denominator


def simulate_solution(nodes: pd.DataFrame,
                      cost_net: np.ndarray,
                      purposes: Tuple[str, ...],
                      alpha: float,
                      beta: Dict[str, float],
                      speeds: Dict[str, float],
                      rates_by_class: Dict[str, Dict[str, float]]) -> Dict[str, Any]:
    """
    Deterministic expected-value simulation (no capacity limits):
    - Build all-pairs shortest path distances
    - Annual origin volumes by purpose (pop * rate_class)
    - Destination split by gravity weights
    - Travel time = distance / speed[purpose]

    Args:
        nodes: node table with ``pop``, ``x_km`` and ``y_km`` columns.
        cost_net: (n, n) direct-link cost matrix of the solution network.
        purposes: purpose names; each must be a key of *beta*, *speeds* and of
            every class entry in *rates_by_class*.
        alpha: population exponent of the gravity model.
        beta: distance-decay exponent per purpose.
        speeds: travel speed per purpose in km/h (floored at 1e-6).
        rates_by_class: annual per-capita trip rates per size class and purpose.

    Returns:
        Dict with ``n_nodes``, ``total_trips_year``, ``total_person_hours``,
        ``avg_time_min``, ``avg_dist_km``, ``avg_detour_ratio``,
        ``efficiency_score``, ``per_purpose`` (list of per-purpose dicts) and
        ``purposes``. Averages are 0.0 when there are no trips. If some demand
        cannot reach its destination, hours and distances are ``inf`` and the
        efficiency score is 0.0.
    """
    n = len(nodes)

    # Distances
    D_net = _floyd_warshall(cost_net)
    D_euclid = _euclidean_matrix(nodes)

    # Demand
    R = _annual_trips_by_origin(nodes, purposes, rates_by_class)  # (n, P)
    W = _od_share_weights(nodes["pop"].to_numpy(), D_euclid, alpha, beta, purposes)  # (P,n,n)

    # Annual trips per OD per purpose: flows[p,i,j] = R[i,p] * W[p,i,j]
    flows = R.T[:, :, None] * W

    # Travel time per purpose (hours); the floor avoids division by zero speed
    speed = np.array([max(1e-6, speeds[p]) for p in purposes], dtype=float)
    T_hours = D_net[None, :, :] / speed[:, None, None]

    # Aggregations (i == j is excluded everywhere)
    mask_offdiag = ~np.eye(n, dtype=bool)
    total_trips = float(flows[:, mask_offdiag].sum())
    total_person_hours = _masked_flow_sum(flows, T_hours, mask_offdiag)
    mean_time_min = _ratio_or_zero(total_person_hours, total_trips) * 60.0
    mean_dist_km = _ratio_or_zero(_masked_flow_sum(flows, D_net, mask_offdiag), total_trips)

    # Flow-weighted detour ratio vs straight line. Pairs with zero Euclidean
    # distance have no defined ratio, so they are left out of BOTH the weighted
    # sum and the normalising trip count.
    comparable = mask_offdiag & (D_euclid > 0)
    detour = np.divide(D_net, D_euclid,
                       out=np.zeros_like(D_net, dtype=float),
                       where=D_euclid > 0)
    comparable_trips = float(flows[:, comparable].sum())
    mean_detour = _ratio_or_zero(_masked_flow_sum(flows, detour, comparable), comparable_trips)

    # Purpose breakdown
    per_purpose = []
    for p_idx, p in enumerate(purposes):
        flows_p = flows[p_idx:p_idx + 1]
        trips_p = float(flows_p[:, mask_offdiag].sum())
        hours_p = _masked_flow_sum(flows_p, T_hours[p_idx:p_idx + 1], mask_offdiag)
        dist_p = _masked_flow_sum(flows_p, D_net, mask_offdiag)
        per_purpose.append({
            "purpose": p,
            "trips": trips_p,
            "person_hours": hours_p,
            "avg_time_min": _ratio_or_zero(hours_p, trips_p) * 60.0,
            "avg_dist_km": _ratio_or_zero(dist_p, trips_p),
        })

    # Scalar score: trips per travel-hour (higher is better). Equivalent to 1/mean_time_h.
    # max(1e-9, nan) would return 1e-9 and report a huge bogus score, so an
    # undefined hour total is passed through as NaN instead.
    if math.isnan(total_person_hours):
        efficiency_score = float("nan")
    else:
        efficiency_score = total_trips / max(1e-9, total_person_hours)

    return {
        "n_nodes": int(n),
        "total_trips_year": total_trips,
        "total_person_hours": total_person_hours,
        "avg_time_min": mean_time_min,
        "avg_dist_km": mean_dist_km,
        "avg_detour_ratio": mean_detour,
        "efficiency_score": efficiency_score,
        "per_purpose": per_purpose,
        "purposes": list(purposes),
    }


# ------------------------------
# Saving + comparison
# ------------------------------
def _save_solution_results(solution_dir: str, sim: Dict[str, Any], cfg: SimV0Config) -> str:
    """Write one solution's results into ``<solution_dir>/sim_v0/``.

    Files written: ``per_purpose.csv``, ``summary_flat.csv`` and
    ``summary.json`` (results plus the model parameters from *cfg*).

    ``summary.json`` is what ``_simulation_done`` checks, so it is written last
    and moved into place atomically: an interrupted run can never leave a
    "done" marker next to missing or half-written outputs.

    Returns:
        Path of the written ``summary.json``.
    """
    out_dir = os.path.join(solution_dir, "sim_v0")
    os.makedirs(out_dir, exist_ok=True)

    # CSV per-purpose
    pd.DataFrame(sim["per_purpose"]).to_csv(os.path.join(out_dir, "per_purpose.csv"), index=False)

    # Flat summary CSV for quick comparison (column order is part of the format)
    flat_keys = (
        "total_trips_year",
        "total_person_hours",
        "avg_time_min",
        "avg_dist_km",
        "avg_detour_ratio",
        "efficiency_score",
        "n_nodes",
    )
    flat = {key: sim[key] for key in flat_keys}
    pd.DataFrame([flat]).to_csv(os.path.join(out_dir, "summary_flat.csv"), index=False)

    # JSON summary: serialise to a temp file first, then rename over the target
    json_path = os.path.join(out_dir, "summary.json")
    tmp_path = json_path + ".tmp"
    payload = {
        "created_at_utc": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "config": {
            "alpha_pop": cfg.alpha_pop,
            "beta_by_purpose": cfg.beta_by_purpose,
            "speed_kmh_by_purpose": cfg.speed_kmh_by_purpose,
            "rates_annual_by_class": cfg.rates_annual_by_class,
        },
        "results": sim,
    }
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(payload, f, indent=2)
        os.replace(tmp_path, json_path)
    finally:
        # Only still present if serialisation or the rename failed
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    return json_path


def _update_global_overview(dataset_root: str,
                            dataset_dir: str,
                            solution_dir: str,
                            sim: Dict[str, Any],
                            overview_name: str) -> None:
    """Insert or refresh one (dataset, solution) row in the global overview CSV.

    The dataset is identified by its path relative to *dataset_root* (forward
    slashes), the solution by its folder name. An existing row with the same
    pair has its metric columns overwritten in place; otherwise a new row is
    appended. The file is created when it does not exist yet.
    """
    metric_cols = [
        "n_nodes",
        "total_trips_year",
        "total_person_hours",
        "avg_time_min",
        "avg_dist_km",
        "avg_detour_ratio",
        "efficiency_score",
    ]

    # Row identity first, then the metrics, so the CSV column order is stable
    record = {
        "dataset": os.path.relpath(dataset_dir, dataset_root).replace("\\", "/"),
        "solution": os.path.basename(solution_dir),
    }
    record.update((col, sim[col]) for col in metric_cols)

    target = os.path.join(dataset_root, overview_name)
    if not os.path.exists(target):
        table = pd.DataFrame([record])
    else:
        table = pd.read_csv(target)
        same_row = (table["dataset"] == record["dataset"]) & (table["solution"] == record["solution"])
        if same_row.any():
            # upsert: overwrite the metrics of the matching row(s)
            table.loc[same_row, metric_cols] = [record[col] for col in metric_cols]
        else:
            table = pd.concat([table, pd.DataFrame([record])], ignore_index=True)

    table.to_csv(target, index=False)


# ------------------------------
# Orchestration
# ------------------------------
def main(cfg: SimV0Config) -> None:
    """Run the length-only transport simulation for every pending solution.

    For each dataset found under ``cfg.dataset_root`` the node table is loaded
    once; every solution folder that has an input file and no existing
    ``sim_v0/summary.json`` is simulated, added to the global overview CSV and
    saved. Progress and a final tally are printed.
    """
    _default_params(cfg)
    np.random.seed(cfg.seed)

    datasets = list(_iter_datasets_with_nodes(cfg.dataset_root, cfg.solutions_subdir))
    if not datasets:
        print(f"No datasets with nodes.csv found under: {cfg.dataset_root}")
        return

    print(f"Datasets found: {len(datasets)}")
    created = 0
    skipped = 0

    for ds in datasets:
        nodes_csv = os.path.join(ds, "nodes.csv")
        nodes = _load_nodes(nodes_csv)

        for sol in _iter_solutions(ds, cfg.solutions_subdir):
            if not _solution_has_required_files(sol):
                continue
            if _simulation_done(sol):
                print(f"[SKIP] sim_v0 already exists → {os.path.relpath(sol, cfg.dataset_root)}")
                skipped += 1
                continue

            # Build network cost matrix
            C = _cost_matrix_from_solution(nodes, sol)

            # Run simulation
            sim_res = simulate_solution(
                nodes=nodes,
                cost_net=C,
                purposes=cfg.purposes,
                alpha=cfg.alpha_pop,
                beta=cfg.beta_by_purpose,
                speeds=cfg.speed_kmh_by_purpose,
                rates_by_class=cfg.rates_annual_by_class,
            )

            # Update the global comparison FIRST, then save the per-solution
            # files. Saving creates summary.json, which marks the solution as
            # done; if the overview write failed after that (e.g. the CSV is
            # open in another program) the solution would be skipped on every
            # later run and never reach the overview. In this order a failure
            # simply leaves the solution pending, and the upsert makes the
            # retry harmless.
            _update_global_overview(cfg.dataset_root, ds, sol, sim_res, cfg.global_summary_csv)
            _save_solution_results(sol, sim_res, cfg)
            print(f"[OK] sim_v0 created → {os.path.relpath(sol, cfg.dataset_root)}")
            created += 1

    print(f"\n[Transport Sim V0] Done. New simulations: {created} | Skipped (already existed): {skipped}")
    print(f"Global overview: {os.path.join(cfg.dataset_root, cfg.global_summary_csv)}")


# ------------------------------
# Run
# ------------------------------
# Build the run configuration and start the batch as soon as this cell executes.
# The three values passed here are the same as the SimV0Config field defaults;
# edit dataset_root to point at another dataset collection.
_cfg = SimV0Config(
    dataset_root=r"C:\Users\User\Documents\Code\traffic-optimization\00_Datasets",
    solutions_subdir="solutions",
    global_summary_csv="transport_sim_v0_overview.csv",
)
main(_cfg)


Datasets found: 10
[OK] sim_v0 created → sv1.2\dv0.1_ds01_seed42_tp3M_nc15\solutions\v0_random_tree_seed42
[OK] sim_v0 created → sv1.2\dv0.1_ds01_seed42_tp3M_nc15\solutions\v1_greedy_mst
[OK] sim_v0 created → sv1.2\dv0.1_ds02_seed43_tp3.5M_nc18\solutions\v0_random_tree_seed42
[OK] sim_v0 created → sv1.2\dv0.1_ds02_seed43_tp3.5M_nc18\solutions\v1_greedy_mst
[OK] sim_v0 created → sv1.2\dv0.1_ds03_seed44_tp4M_nc20\solutions\v0_random_tree_seed42
[OK] sim_v0 created → sv1.2\dv0.1_ds03_seed44_tp4M_nc20\solutions\v1_greedy_mst
[OK] sim_v0 created → sv1.2\dv0.1_ds04_seed45_tp2.5M_nc22\solutions\v0_random_tree_seed42
[OK] sim_v0 created → sv1.2\dv0.1_ds04_seed45_tp2.5M_nc22\solutions\v1_greedy_mst
[OK] sim_v0 created → sv1.2\dv0.1_ds05_seed46_tp5M_nc24\solutions\v0_random_tree_seed42
[OK] sim_v0 created → sv1.2\dv0.1_ds05_seed46_tp5M_nc24\solutions\v1_greedy_mst
[OK] sim_v0 created → sv1.2\dv0.1_ds06_seed47_tp5.5M_nc26\solutions\v0_random_tree_seed42
[OK] sim_v0 created → sv1.2\dv0.1_ds06_seed

In [7]:
# Transport Simulation V0 — Per-dataset grouped report (plus Best-by-dataset & Stats-by-algorithm)
# (tabulate-free version)

import os
import pandas as pd
import numpy as np

# --- Config (edit if needed) ---
# Input: overview_csv, the global comparison table read by this report.
# Outputs: one markdown report and two CSV tables, all written directly
# under dataset_root.
dataset_root = r"C:\Users\User\Documents\Code\traffic-optimization\00_Datasets"
overview_csv  = os.path.join(dataset_root, "transport_sim_v0_overview.csv")
grouped_md    = os.path.join(dataset_root, "transport_sim_v0_by_dataset.md")
best_csv      = os.path.join(dataset_root, "transport_sim_v0_best_by_dataset.csv")
stats_csv     = os.path.join(dataset_root, "transport_sim_v0_stats_by_algo.csv")

def _fmt_num(x):
    """Format a number compactly with a k / M / B suffix (two decimals).

    The suffix is chosen from the magnitude, so negative values are abbreviated
    like positive ones (``-2500`` -> ``"-2.50k"``). Values below 1000 in
    magnitude are rounded to a whole number. Anything that cannot be converted
    to ``float`` is returned as ``str(x)``; ``nan`` and infinities are printed
    without a suffix.
    """
    try:
        x = float(x)
    except (TypeError, ValueError, OverflowError):
        return str(x)
    if not np.isfinite(x):
        return f"{x:.0f}"
    magnitude = abs(x)
    for threshold, suffix in ((1e9, "B"), (1e6, "M"), (1e3, "k")):
        if magnitude >= threshold:
            return f"{x / threshold:.2f}{suffix}"
    return f"{x:.0f}"

def _algo_from_solution(name: str) -> str:
    """Map a solution folder name to a display label for its algorithm.

    Matching is case-insensitive and substring-based; rules are tried in order
    and anything unmatched is labelled ``"Other"``.
    """
    lowered = str(name).lower()
    # (version tag, keywords of which at least one must appear, label)
    rules = (
        ("v1", ("mst", "greedy"), "V1 Greedy MST"),
        ("v0", ("random", "tree"), "V0 Random Tree"),
    )
    for version_tag, keywords, label in rules:
        if version_tag in lowered and any(word in lowered for word in keywords):
            return label
    return "Other"

def _df_to_markdown(df: pd.DataFrame, float_decimals: int = 3) -> str:
    """Simple, dependency-free DataFrame → Markdown 'pipe' table.

    Floats are printed with *float_decimals* decimals, integers without any,
    and everything else via ``str``. Rows are read with ``itertuples`` so each
    column keeps its own type; ``iterrows`` would turn the integers of an
    all-numeric frame into floats and print a count as ``1.000``.

    Returns:
        The table as one string: header line, separator line, one line per row.
    """
    cols = [str(c) for c in df.columns]  # labels may be non-strings
    header = "| " + " | ".join(cols) + " |"
    sep    = "|" + "|".join(["---"] * len(cols)) + "|"

    def _cell(v) -> str:
        # bool is a subclass of int, so it must be handled before the int branch
        if isinstance(v, (bool, np.bool_)):
            return str(v)
        if isinstance(v, (float, np.floating)):
            return f"{v:.{float_decimals}f}"
        if isinstance(v, (int, np.integer)):
            return str(int(v))
        return str(v)

    rows = [
        "| " + " | ".join(_cell(v) for v in record) + " |"
        for record in df.itertuples(index=False, name=None)
    ]
    return "\n".join([header, sep] + rows)

def _metrics_text(r) -> str:
    """Format the metric part shared by the printed and the markdown report lines."""
    return (
        f"eff={r['efficiency_score']:.4f}, "
        f"avg_time={r['avg_time_min']:.1f} min, "
        f"avg_dist={r['avg_dist_km']:.1f} km, "
        f"detour={r['avg_detour_ratio']:.2f}, "
        f"trips={_fmt_num(r['total_trips_year'])}, "
        f"hours={_fmt_num(r['total_person_hours'])}"
    )


# Report driver: reads the overview CSV, prints three summaries and writes the
# markdown report plus the two CSV tables configured above.
if not os.path.exists(overview_csv):
    print(f"Overview file not found:\n  {overview_csv}\nRun the simulation first.")
else:
    df = pd.read_csv(overview_csv).copy()
    # (Re)derive algo label to be safe
    df["algo"] = df["solution"].map(_algo_from_solution)

    # ------- Grouped by dataset (printed + markdown) -------
    md = ["# Transport Simulation V0 — Grouped by Dataset", ""]
    # groupby iterates its groups in sorted key order
    for ds_idx, (ds_name, g) in enumerate(df.groupby("dataset", sort=True)):
        g = g.sort_values("efficiency_score", ascending=False).reset_index(drop=True)

        header = f"Dataset {ds_idx} — {ds_name}"
        print("\n" + header)
        print("-" * len(header))
        md.append(f"## Dataset {ds_idx} — `{ds_name}`")

        # One pass per row: the same metrics text feeds both outputs
        for _, r in g.iterrows():
            metrics = _metrics_text(r)
            print(f"- {r['algo']} ({r['solution']}): " + metrics)
            md.append(f"- **{r['algo']}** (`{r['solution']}`): " + metrics)
        md.append("")

    # ------- Best solution by dataset -------
    # Rows without a score cannot be ranked; dropping them first keeps idxmax
    # from failing on a dataset whose scores are all NaN (such a dataset is
    # then simply absent from the "best" table).
    ranked = df.dropna(subset=["efficiency_score"])
    idx = ranked.groupby("dataset")["efficiency_score"].idxmax()
    best = df.loc[idx].sort_values("dataset").reset_index(drop=True)
    best = best[[
        "dataset","solution","algo","n_nodes",
        "efficiency_score","avg_time_min","avg_dist_km","avg_detour_ratio",
        "total_trips_year","total_person_hours"
    ]]

    print("\nBest solution per dataset")
    print("-------------------------")
    print(best.to_string(index=False))

    md += [
        "---",
        "## Best Solution per Dataset",
        "",
        "| Dataset | Solution | Algo | Nodes | Eff. score | Avg time (min) | Avg dist (km) | Detour | Trips | Hours |",
        "|---|---|---:|---:|---:|---:|---:|---:|---:|---:|",
    ]
    for _, r in best.iterrows():
        md.append(
            f"| `{r['dataset']}` | `{r['solution']}` | {r['algo']} | {int(r['n_nodes'])} | "
            f"{r['efficiency_score']:.4f} | {r['avg_time_min']:.1f} | {r['avg_dist_km']:.1f} | "
            f"{r['avg_detour_ratio']:.2f} | {_fmt_num(r['total_trips_year'])} | {_fmt_num(r['total_person_hours'])} |"
        )
    md.append("")

    # Save CSV
    best.to_csv(best_csv, index=False)

    # ------- Stats by algorithm -------
    stats = (
        df.groupby("algo")
          .agg(
              n=("solution","count"),
              avg_eff=("efficiency_score","mean"),
              med_eff=("efficiency_score","median"),
              avg_time_min=("avg_time_min","mean"),
              avg_dist_km=("avg_dist_km","mean"),
              avg_detour=("avg_detour_ratio","mean"),
          )
          .sort_values("avg_eff", ascending=False)
    )

    print("\nStats by algorithm")
    print("------------------")
    print(stats.to_string())

    md += [
        "",
        "## Stats by Algorithm",
        "",
        _df_to_markdown(stats.reset_index(), float_decimals=3),
        ""
    ]
    stats.to_csv(stats_csv)

    # ------- Write markdown file -------
    with open(grouped_md, "w", encoding="utf-8") as f:
        f.write("\n".join(md))

    print("\nSaved files:")
    print(f" - Grouped markdown: {grouped_md}")
    print(f" - Best by dataset CSV: {best_csv}")
    print(f" - Stats by algorithm CSV: {stats_csv}")



Dataset 0 — sv1.2/dv0.1_ds01_seed42_tp3M_nc15
---------------------------------------------
- V1 Greedy MST (v1_greedy_mst): eff=0.6941, avg_time=86.4 min, avg_dist=110.7 km, detour=1.53, trips=53.74M, hours=77.43M
- V0 Random Tree (v0_random_tree_seed42): eff=0.1506, avg_time=398.3 min, avg_dist=511.2 km, detour=10.68, trips=53.74M, hours=356.81M

Dataset 1 — sv1.2/dv0.1_ds02_seed43_tp3.5M_nc18
-----------------------------------------------
- V1 Greedy MST (v1_greedy_mst): eff=0.9222, avg_time=65.1 min, avg_dist=83.5 km, detour=1.80, trips=66.73M, hours=72.36M
- V0 Random Tree (v0_random_tree_seed42): eff=0.2430, avg_time=246.9 min, avg_dist=317.0 km, detour=7.33, trips=66.73M, hours=274.61M

Dataset 2 — sv1.2/dv0.1_ds03_seed44_tp4M_nc20
---------------------------------------------
- V1 Greedy MST (v1_greedy_mst): eff=0.7533, avg_time=79.6 min, avg_dist=102.1 km, detour=1.70, trips=72.60M, hours=96.38M
- V0 Random Tree (v0_random_tree_seed42): eff=0.3144, avg_time=190.9 min, avg_di