## V0 — Uniform Synthetic Nodes

**Goal:** Minimal, reproducible node set in a rectangular region.
**Inputs:** seed, N, bbox (km), population range.
**Method:** Sample positions uniformly inside bbox; populations ~ Uniform[min, max] (integers, both bounds inclusive).
**Outputs:** `id, x_km, y_km, pop`.
**Validation:** N matches; inside bbox; pop in range.

In [None]:
"""
Jupyter notebook cell — Version 0 (Nodes only)
Minimal, reproducible node set in a rectangular region (planar coordinates in km).
- Placement: uniform within bbox
- Population: uniform in [pop_min, pop_max]
- Outputs: nodes.csv, meta.json, preview.png

Usage (in a single notebook cell): just run this cell. Edit `V0Config` as needed.
"""
from __future__ import annotations

import json
import os
import time
import hashlib
from dataclasses import asdict, dataclass
from typing import Tuple, Dict, Any

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import StrMethodFormatter


# ------------------------------
# Config
# ------------------------------
@dataclass
class V0Config:
    """Settings for one V0 (uniform placement) node-generation run."""

    # Seed applied to the random number generator before sampling.
    seed: int = 42
    # How many nodes to generate.
    n_nodes: int = 50
    # Rectangular region in km, ordered (minx, miny, maxx, maxy).
    bbox_km: Tuple[float, float, float, float] = (0.0, 0.0, 200.0, 200.0)
    # Lower and upper population bounds for sampled nodes.
    pop_min: int = 1_000
    pop_max: int = 500_000
    # Directory that receives the generated artifacts.
    out_dir: str = "maps/sv1.0/dv0.1_v0_uniform"
    # CRS label. Synthetic planar meters; coordinates are stored in km here
    # for simplicity.
    crs: str = "EPSG:3857"
    # Version of the node schema.
    schema_version: str = "1.0"
    # Version of the dataset/edition.
    dataset_version: str = "0.1"


# ------------------------------
# Core functions
# ------------------------------

def set_seed(seed: int) -> None:
    """Seed NumPy's global random state so later ``np.random`` draws repeat."""
    np.random.seed(seed=seed)


def generate_uniform_nodes(cfg: V0Config) -> pd.DataFrame:
    """Sample a node table uniformly inside the configured bounding box.

    Values are drawn from NumPy's global random state in a fixed order
    (x coordinates, then y coordinates, then populations), so seed it
    beforehand to get reproducible output.

    Args:
        cfg: Configuration; reads ``bbox_km``, ``n_nodes``, ``pop_min`` and
            ``pop_max``.

    Returns:
        DataFrame with columns ``id`` (0..n_nodes-1), ``x_km``, ``y_km`` and
        ``pop`` (integers in ``[pop_min, pop_max]``, both ends inclusive).

    Raises:
        ValueError: If the bbox is degenerate or inverted, ``n_nodes`` is
            negative, or ``pop_min`` exceeds ``pop_max``.
    """
    minx, miny, maxx, maxy = cfg.bbox_km
    if not (minx < maxx and miny < maxy):
        raise ValueError("Invalid bbox: must satisfy minx<maxx and miny<maxy")

    # Validate the remaining parameters up front so a bad config fails with a
    # clear message before any random numbers are consumed, instead of
    # surfacing as an opaque NumPy error part-way through sampling.
    if cfg.n_nodes < 0:
        raise ValueError(f"Invalid n_nodes: must be >= 0, got {cfg.n_nodes}")
    if cfg.pop_min > cfg.pop_max:
        raise ValueError(
            f"Invalid population range: pop_min ({cfg.pop_min}) "
            f"must be <= pop_max ({cfg.pop_max})"
        )

    # sample coordinates uniformly in km
    xs = np.random.uniform(minx, maxx, size=cfg.n_nodes)
    ys = np.random.uniform(miny, maxy, size=cfg.n_nodes)

    # sample populations uniformly (ints); randint's upper bound is exclusive,
    # hence the +1 to make pop_max attainable
    pops = np.random.randint(cfg.pop_min, cfg.pop_max + 1, size=cfg.n_nodes)

    return pd.DataFrame({
        "id": np.arange(cfg.n_nodes, dtype=int),
        "x_km": xs,
        "y_km": ys,
        "pop": pops,
    })


def validate_nodes(df: pd.DataFrame, cfg: V0Config) -> Dict[str, Any]:
    """Check a node table against its config and summarise it.

    Checks run in order: row count, bbox containment (bounds inclusive), then
    population range. The first failing check raises ``AssertionError``.

    Returns:
        Dict with ``n_nodes``, ``bbox``, ``pop_range_observed`` and
        ``pop_percentiles`` (5/25/50/75/95th, truncated to int).
    """
    minx, miny, maxx, maxy = cfg.bbox_km
    report: Dict[str, Any] = {}

    # --- node count ---
    row_count = len(df)
    if row_count != cfg.n_nodes:
        raise AssertionError(f"Node count mismatch: expected {cfg.n_nodes}, got {row_count}")
    report["n_nodes"] = row_count

    # --- bbox containment (closed interval on both axes) ---
    within = df["x_km"].between(minx, maxx) & df["y_km"].between(miny, maxy)
    violations = row_count - int(within.sum())
    if violations != 0:
        raise AssertionError(f"{violations} nodes fall outside bbox")
    report["bbox"] = {"minx": minx, "miny": miny, "maxx": maxx, "maxy": maxy}

    # --- population range ---
    populations = df["pop"]
    pmin = int(populations.min())
    pmax = int(populations.max())
    in_range = cfg.pop_min <= pmin and pmax <= cfg.pop_max
    if not in_range:
        raise AssertionError(
            f"Population out of range: observed [{pmin},{pmax}] vs cfg [{cfg.pop_min},{cfg.pop_max}]"
        )
    report["pop_range_observed"] = {"min": pmin, "max": pmax}

    # --- distribution summary ---
    percentiles: Dict[int, int] = {}
    for q in (5, 25, 50, 75, 95):
        percentiles[q] = int(np.percentile(df["pop"], q))
    report["pop_percentiles"] = percentiles

    return report

def preview_nodes(df: pd.DataFrame, cfg: V0Config, save_path: str) -> None:
    """Render a scatter preview of the nodes and save it to ``save_path``.

    Markers are sized by the square root of relative population and colored by
    population (matplotlib's default colormap) with a colorbar as legend. The
    three most populated nodes are labelled with their population values.

    Args:
        df: Node table with ``x_km``, ``y_km`` and ``pop`` columns.
        cfg: Configuration; only ``bbox_km`` is read, for the axis limits and
            the label offset.
        save_path: Destination image path, passed to ``Figure.savefig``.
    """
    minx, miny, maxx, maxy = cfg.bbox_km
    pop_vals = df["pop"].to_numpy(dtype=float)

    # Size scaling: sqrt to reduce dynamic range. Guard the division so an
    # empty table or an all-zero population column gets the minimum marker
    # size instead of NaN sizes from a divide-by-zero.
    max_pop = pop_vals.max() if pop_vals.size else 0.0
    if max_pop > 0:
        sizes = 10 + 90 * np.sqrt(pop_vals / max_pop)
    else:
        sizes = np.full(pop_vals.shape, 10.0)

    # Work on an explicit Figure/Axes rather than pyplot's implicit "current"
    # figure, and close it in ``finally`` so a failure while drawing or saving
    # does not leave the figure open.
    fig, ax = plt.subplots(figsize=(6, 6))
    try:
        sc = ax.scatter(df["x_km"], df["y_km"], s=sizes, c=pop_vals)

        # Colorbar as legend for population, with thousands separators.
        cbar = fig.colorbar(sc, ax=ax)
        cbar.set_label("Population")
        cbar.ax.yaxis.set_major_formatter(StrMethodFormatter("{x:,.0f}"))

        # Annotate top-3 by population, offset by a small fraction of the
        # bbox size so the label does not sit on top of its marker.
        top3 = df.nlargest(3, "pop")
        dx = 0.01 * (maxx - minx)
        dy = 0.01 * (maxy - miny)
        for x, y, pop in zip(top3["x_km"], top3["y_km"], top3["pop"]):
            ax.text(
                x + dx,
                y + dy,
                f"{int(pop):,}",
                fontsize=8,
                ha="left",
                va="bottom",
                bbox=dict(boxstyle="round,pad=0.2", fc="white", ec="none", alpha=0.7),
            )

        ax.set_title("Nodes — V0 (uniform placement) — color = population")
        ax.set_xlabel("x (km)")
        ax.set_ylabel("y (km)")
        ax.set_xlim(minx, maxx)
        ax.set_ylim(miny, maxy)
        ax.set_aspect("equal", adjustable="box")
        fig.tight_layout()
        fig.savefig(save_path, dpi=150)
    finally:
        plt.close(fig)

def compute_metrics_hash(metrics: Dict[str, Any]) -> str:
    """Return a 16-hex-character fingerprint of ``metrics``.

    The dict is serialised to JSON with sorted keys, so key insertion order
    does not affect the result, then hashed with SHA-256 and truncated.
    """
    canonical = json.dumps(metrics, sort_keys=True)
    hasher = hashlib.sha256()
    hasher.update(canonical.encode("utf-8"))
    full_digest = hasher.hexdigest()
    return full_digest[:16]


def save_artifacts(df: pd.DataFrame, cfg: V0Config, metrics: Dict[str, Any]) -> Dict[str, str]:
    """Write nodes.csv, preview.png and meta.json into ``cfg.out_dir``.

    The directory is created if missing. Files are written in the order
    nodes → preview → meta.

    Returns:
        Mapping of ``"nodes"``, ``"preview"`` and ``"meta"`` to the written
        file paths.
    """
    out_dir = cfg.out_dir
    os.makedirs(out_dir, exist_ok=True)

    paths = {
        "nodes": os.path.join(out_dir, "nodes.csv"),
        "preview": os.path.join(out_dir, "preview.png"),
        "meta": os.path.join(out_dir, "meta.json"),
    }

    # Node table, without the DataFrame index column.
    df.to_csv(paths["nodes"], index=False)

    # Scatter preview image.
    preview_nodes(df, cfg, paths["preview"])

    # Metadata: versions, generator parameters, metrics and a UTC timestamp.
    generator_params = {
        "n_nodes": cfg.n_nodes,
        "bbox_km": cfg.bbox_km,
        "pop_min": cfg.pop_min,
        "pop_max": cfg.pop_max,
    }
    meta = {
        "schema_version": cfg.schema_version,
        "dataset_version": cfg.dataset_version,
        "crs": cfg.crs,
        "seed": cfg.seed,
        "generator": {"name": "nodes_v0_uniform", "params": generator_params},
        "region_bbox": list(cfg.bbox_km),
        "metrics": metrics,
        "metrics_hash": compute_metrics_hash(metrics),
        "created_at_utc": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    }
    with open(paths["meta"], "w", encoding="utf-8") as fh:
        json.dump(meta, fh, indent=2)

    return paths


# ------------------------------
# Orchestration
# ------------------------------

def main(cfg: V0Config | None = None) -> pd.DataFrame:
    """Run the V0 pipeline: seed, generate, validate, save, print a summary.

    Args:
        cfg: Run configuration; a default ``V0Config()`` is used when omitted.

    Returns:
        The generated node table.
    """
    if not cfg:
        cfg = V0Config()

    set_seed(cfg.seed)
    nodes = generate_uniform_nodes(cfg)
    report = validate_nodes(nodes, cfg)
    written = save_artifacts(nodes, cfg, report)

    # Summary printout
    print("\n[Nodes V0] Build complete:\n" + "-" * 40)
    print(f"Nodes: {len(nodes)} | bbox: {cfg.bbox_km} | pop ∈ [{cfg.pop_min},{cfg.pop_max}]")
    print(f"Saved: nodes → {written['nodes']}\n       preview → {written['preview']}\n       meta → {written['meta']}")
    print(f"Metrics hash: {compute_metrics_hash(report)}")

    return nodes


# ------------------------------
# Run (executes when the cell runs)
# ------------------------------
# Configuration for this run. Relative to the V0Config defaults it changes
# seed (30), n_nodes (10) and pop_max (1_000_000); bbox_km, pop_min and
# out_dir are spelled out but equal the defaults.
_cfg = V0Config(
    seed=30,
    n_nodes=10,
    bbox_km=(0.0, 0.0, 200.0, 200.0),
    pop_min=1_000,
    pop_max=1_000_000,
    out_dir="maps/sv1.0/dv0.1_v0_uniform",
)

# Build and save the artifacts; the returned DataFrame is bound to `_`
# because it is not used further here.
_ = main(_cfg)



[Nodes V0] Build complete:
----------------------------------------
Nodes: 10 | bbox: (0.0, 0.0, 200.0, 200.0) | pop ∈ [1000,1000000]
Saved: nodes → maps/sv1.0/dv0.1_v0_uniform\nodes.csv
       preview → maps/sv1.0/dv0.1_v0_uniform\preview.png
       meta → maps/sv1.0/dv0.1_v0_uniform\meta.json
Metrics hash: 66fd5ac38ba48a3b
