## V1 — City Size Classes

**Goal:** Urban hierarchy.
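**Changes:** Assign `class ∈ {large, medium, small}` with quotas (e.g., 10/30/60%). Distinct lognormal per class. *Note:* the cell below implements a simplified variant — each node draws its class independently with those probabilities, and populations are uniform integers within a per-class range.
**Validation:** Class counts match; medians ordered `large > medium > small`. *Note:* because classes are drawn probabilistically in the cell below, class counts are only reported (not enforced), and the median ordering is recorded as a flag rather than a hard failure.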

In [14]:
"""
Jupyter notebook cell — Version 1 (Nodes only)
Probabilistic city classes + uniform population ranges per class.
- Placement: uniform within bbox (same as V0)
- Class assignment: per-node categorical draw with probabilities
- Population: uniform integer in [min, max] for the node’s class
- Outputs: nodes.csv, meta.json, preview.png (color = population; marker shape = class)

Usage: run this cell. Edit `V1Config` to change class probabilities or ranges.
"""
from __future__ import annotations

import json
import os
import time
import hashlib
from dataclasses import dataclass, field
from typing import Tuple, Dict, Any, List

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import StrMethodFormatter
from matplotlib.lines import Line2D


# ------------------------------
# Config
# ------------------------------
def _default_class_prob() -> Dict[str, float]:
    """Build a fresh copy of the default per-class draw probabilities."""
    return {"large": 0.10, "medium": 0.30, "small": 0.60}


def _default_class_ranges() -> Dict[str, Dict[str, int]]:
    """Build a fresh copy of the default per-class population ranges."""
    # Ranges are chosen non-overlapping so medians satisfy large > medium > small.
    return {
        "large": {"min": 600_000, "max": 1_500_000},
        "medium": {"min": 150_000, "max": 500_000},
        "small": {"min": 1_000, "max": 120_000},
    }


@dataclass
class V1Config:
    """Parameters for the V1 node generator (classes + uniform populations)."""

    # RNG seed passed to ``set_seed`` by ``main``.
    seed: int = 42
    # Number of nodes to generate.
    n_nodes: int = 120
    # Bounding box as (minx, miny, maxx, maxy), in km.
    bbox_km: Tuple[float, float, float, float] = (0.0, 0.0, 200.0, 200.0)

    # Per-class probabilities (normalized to sum to 1 when sampling).
    # Each node draws its class independently.
    class_prob: Dict[str, float] = field(default_factory=_default_class_prob)

    # Uniform population range per class; both "min" and "max" are inclusive.
    class_ranges: Dict[str, Dict[str, int]] = field(default_factory=_default_class_ranges)

    # Output location and metadata written to meta.json.
    out_dir: str = "maps/sv1.1/dv0.2_v1_classes_prob_uniform"
    crs: str = "EPSG:3857"  # Synthetic planar; coordinates stored in km for simplicity
    schema_version: str = "1.1"  # still includes optional `class` column
    dataset_version: str = "0.2"


# ------------------------------
# Core helpers
# ------------------------------

def set_seed(seed: int) -> None:
    """Seed NumPy's global random state.

    Every sampler in this cell draws through ``np.random.*``, so calling this
    once before generation makes a run repeatable for a given ``seed``.
    """
    np.random.seed(seed)


def generate_positions_uniform(bbox_km: Tuple[float, float, float, float], n: int) -> np.ndarray:
    """Sample ``n`` points uniformly inside an axis-aligned bounding box.

    Args:
        bbox_km: ``(minx, miny, maxx, maxy)`` of the box.
        n: number of points to draw.

    Returns:
        Array of shape ``(n, 2)`` with x in column 0 and y in column 1.

    Raises:
        ValueError: if the box does not satisfy ``minx < maxx`` and ``miny < maxy``.
    """
    x_lo, y_lo, x_hi, y_hi = bbox_km
    bbox_is_valid = x_lo < x_hi and y_lo < y_hi
    if not bbox_is_valid:
        raise ValueError("Invalid bbox: must satisfy minx<maxx and miny<maxy")

    # All x values are drawn before any y value; keeping this order keeps
    # seeded runs reproducible.
    x_coords = np.random.uniform(x_lo, x_hi, size=n)
    y_coords = np.random.uniform(y_lo, y_hi, size=n)
    return np.stack((x_coords, y_coords), axis=1)


def _sample_classes(n: int, class_prob: Dict[str, float]) -> List[str]:
    labels = list(class_prob.keys())
    probs = np.array([class_prob[k] for k in labels], dtype=float)
    probs = probs / probs.sum()
    draws = np.random.choice(labels, size=n, p=probs)
    return draws.tolist()


def _sample_uniform_int(low: int, high: int, size: int) -> np.ndarray:
    """Inclusive uniform integer sampling in [low, high]."""
    if high < low:
        raise ValueError(f"Invalid range: [{low},{high}]")
    return np.random.randint(low, high + 1, size=size, dtype=int)


def generate_nodes_v1(cfg: V1Config) -> pd.DataFrame:
    """Generate the V1 node table: position, class and population per node.

    Positions are drawn first, then classes, then populations class by class
    in the key order of ``cfg.class_prob``; this order fixes the random stream
    for a given seed.

    Args:
        cfg: generator configuration (``n_nodes``, ``bbox_km``, ``class_prob``,
            ``class_ranges`` are read).

    Returns:
        DataFrame with columns ``id``, ``x_km``, ``y_km``, ``class``, ``pop``.

    Raises:
        KeyError: if a class with positive probability has no entry in
            ``cfg.class_ranges`` (or the entry lacks ``"min"``/``"max"``).
        ValueError: if such a class has ``max < min``.
    """
    # Fail fast on configuration errors. Checking only the classes that happen
    # to be drawn would make a bad config pass or fail depending on the seed.
    # Classes with zero probability can never be drawn and are not checked.
    for cls, prob in cfg.class_prob.items():
        if not prob > 0:
            continue
        bounds = cfg.class_ranges.get(cls)
        if bounds is None:
            raise KeyError(f"Missing class range for '{cls}'")
        low, high = int(bounds["min"]), int(bounds["max"])
        if high < low:
            raise ValueError(f"Invalid range: [{low},{high}]")

    # positions
    pts = generate_positions_uniform(cfg.bbox_km, cfg.n_nodes)

    # classes (probabilistic per-node)
    classes = _sample_classes(cfg.n_nodes, cfg.class_prob)

    # populations per class (uniform within class range)
    # dtype=object keeps the comparison elementwise even when there are no nodes.
    labels = np.asarray(classes, dtype=object)
    pops = np.empty(cfg.n_nodes, dtype=int)
    for cls in cfg.class_prob:
        members = labels == cls
        count = int(members.sum())
        if count == 0:
            continue
        bounds = cfg.class_ranges[cls]
        pops[members] = _sample_uniform_int(
            int(bounds["min"]), int(bounds["max"]), size=count
        )

    return pd.DataFrame({
        "id": np.arange(cfg.n_nodes, dtype=int),
        "x_km": pts[:, 0],
        "y_km": pts[:, 1],
        "class": classes,
        "pop": pops,
    })


def validate_nodes(df: pd.DataFrame, cfg: V1Config) -> Dict[str, Any]:
    """Check the node table against the config and summarize it.

    Hard checks (raise ``AssertionError``): the row count equals
    ``cfg.n_nodes`` and every node lies inside ``cfg.bbox_km`` (bounds
    inclusive). The median ordering ``large > medium > small`` is a soft
    check: it is only recorded under ``"median_order_ok"``.

    Returns:
        Metrics dict with keys ``n_nodes``, ``bbox``, ``class_counts``,
        ``class_medians``, ``median_order_ok``, ``pop_range_observed`` and
        ``pop_percentiles`` (in that order).
    """
    x_lo, y_lo, x_hi, y_hi = cfg.bbox_km

    # Count
    observed = len(df)
    if observed != cfg.n_nodes:
        raise AssertionError(f"Node count mismatch: expected {cfg.n_nodes}, got {observed}")

    # Bounds (a NaN coordinate fails `between` and is counted as outside)
    in_bbox = df["x_km"].between(x_lo, x_hi) & df["y_km"].between(y_lo, y_hi)
    n_outside = int((~in_bbox).sum())
    if n_outside:
        raise AssertionError(f"{n_outside} nodes fall outside bbox")

    # Class counts are reported only; there is no quota to enforce.
    class_counts = df["class"].value_counts().to_dict()

    # Per-class medians; the ordering holds whenever the ranges do not overlap.
    medians = df.groupby("class")["pop"].median().to_dict()
    class_medians = {name: int(value) for name, value in medians.items()}
    try:
        order_ok = bool(medians["large"] > medians["medium"] > medians["small"])
    except KeyError:
        # A class with no nodes has no median, so the ordering cannot hold.
        order_ok = False

    # Global population range and percentiles
    pop = df["pop"]
    observed_range = {"min": int(pop.min()), "max": int(pop.max())}
    percentiles = {q: int(np.percentile(pop, q)) for q in (5, 25, 50, 75, 95)}

    # Key order is kept stable because this dict is serialized into meta.json.
    return {
        "n_nodes": observed,
        "bbox": {"minx": x_lo, "miny": y_lo, "maxx": x_hi, "maxy": y_hi},
        "class_counts": class_counts,
        "class_medians": class_medians,
        "median_order_ok": order_ok,
        "pop_range_observed": observed_range,
        "pop_percentiles": percentiles,
    }


def preview_nodes(df: pd.DataFrame, cfg: V1Config, save_path: str) -> None:
    """Render a scatter preview of the nodes and save it to ``save_path``.

    Marker area and color both encode population (with a colorbar); marker
    shape encodes class ({large: square, medium: triangle, small: circle}).
    The three most populated nodes are annotated with their population.

    Args:
        df: node table with columns ``x_km``, ``y_km``, ``class``, ``pop``.
        cfg: configuration; only ``bbox_km`` is read (axis limits and label offsets).
        save_path: destination image path passed to ``plt.savefig``.

    The figure is always closed, even if plotting or saving raises, so a
    failed run does not leave an open figure behind.
    """
    minx, miny, maxx, maxy = cfg.bbox_km

    vmax = df["pop"].max()
    vmin = df["pop"].min()
    # Marker sizes are normalized by the largest population; fall back to 1 so
    # an all-zero population column does not divide by zero.
    size_ref = vmax if vmax > 0 else 1

    class_order = ("large", "medium", "small")
    markers = {"large": "s", "medium": "^", "small": "o"}

    fig = plt.figure(figsize=(6, 6))
    try:
        sc = None
        for cls in class_order:
            sub = df[df["class"] == cls]
            if sub.empty:
                continue
            # One scatter call per class because the marker shape is per call.
            # vmin/vmax are shared so colors are comparable across classes.
            sc = plt.scatter(
                sub["x_km"],
                sub["y_km"],
                s=10 + 90 * np.sqrt(sub["pop"].values / size_ref),
                c=sub["pop"].values.astype(float),
                vmin=vmin,
                vmax=vmax,
                marker=markers.get(cls, "o"),
                label=cls.title(),
            )

        if sc is not None:
            cbar = plt.colorbar(sc)
            cbar.set_label("Population")
            try:
                cbar.ax.yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))
            except Exception:
                # Thousands separators are cosmetic; keep the default tick
                # labels rather than fail the preview.
                pass

        # Shape-only legend entries; the color scale is explained by the colorbar.
        handles = [Line2D([], [], marker=markers.get(cls, "o"), linestyle="None", label=cls.title())
                   for cls in class_order]
        plt.legend(handles=handles, title="Class", loc="best", framealpha=0.8)

        # Annotate top-3 by population, offset by 1% of the bbox extent.
        top3 = df.nlargest(3, "pop").copy()
        dx = 0.01 * (maxx - minx)
        dy = 0.01 * (maxy - miny)
        for _, row in top3.iterrows():
            label = f"{int(row['pop']):,}"
            plt.text(
                row["x_km"] + dx,
                row["y_km"] + dy,
                label,
                fontsize=8,
                ha="left",
                va="bottom",
                bbox=dict(boxstyle="round,pad=0.2", fc="white", ec="none", alpha=0.7),
            )

        plt.title("Nodes — V1 (probabilistic classes; uniform ranges)")
        plt.xlabel("x (km)")
        plt.ylabel("y (km)")
        plt.xlim(minx, maxx)
        plt.ylim(miny, maxy)
        plt.gca().set_aspect("equal", adjustable="box")
        plt.tight_layout()
        plt.savefig(save_path, dpi=150)
    finally:
        plt.close(fig)


def compute_metrics_hash(metrics: Dict[str, Any]) -> str:
    """Return a short fingerprint of ``metrics``.

    The metrics are serialized to JSON with sorted keys (so key insertion
    order does not matter), hashed with SHA-256, and the first 16 hex
    characters of the digest are returned.
    """
    canonical = json.dumps(metrics, sort_keys=True)
    digest = hashlib.sha256()
    digest.update(canonical.encode("utf-8"))
    return digest.hexdigest()[:16]


def save_artifacts(df: pd.DataFrame, cfg: V1Config, metrics: Dict[str, Any]) -> Dict[str, str]:
    """Write nodes.csv, preview.png and meta.json into ``cfg.out_dir``.

    The directory is created if needed. Files are written in that order, so
    the ``created_at_utc`` stamp in meta.json is taken after the preview has
    been rendered.

    Returns:
        Mapping with keys ``"nodes"``, ``"preview"`` and ``"meta"`` pointing
        at the written files.
    """
    os.makedirs(cfg.out_dir, exist_ok=True)

    artifact_files = (
        ("nodes", "nodes.csv"),
        ("preview", "preview.png"),
        ("meta", "meta.json"),
    )
    paths = {kind: os.path.join(cfg.out_dir, filename) for kind, filename in artifact_files}

    df.to_csv(paths["nodes"], index=False)
    preview_nodes(df, cfg, paths["preview"])

    # Generator parameters are recorded so the dataset can be regenerated.
    generator_info = {
        "name": "nodes_v1_classes_prob_uniform",
        "params": {
            "n_nodes": cfg.n_nodes,
            "bbox_km": cfg.bbox_km,
            "class_prob": cfg.class_prob,
            "class_ranges": cfg.class_ranges,
        },
    }
    meta = {
        "schema_version": cfg.schema_version,
        "dataset_version": cfg.dataset_version,
        "crs": cfg.crs,
        "seed": cfg.seed,
        "generator": generator_info,
        "region_bbox": list(cfg.bbox_km),
        "metrics": metrics,
        "metrics_hash": compute_metrics_hash(metrics),
        "created_at_utc": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    }
    with open(paths["meta"], "w", encoding="utf-8") as meta_file:
        json.dump(meta, meta_file, indent=2)

    return paths


# ------------------------------
# Orchestration
# ------------------------------

def main(cfg: V1Config | None = None) -> pd.DataFrame:
    """Run the full V1 build: seed, generate, validate, save, print a summary.

    Args:
        cfg: configuration to use; a default ``V1Config()`` is built when omitted.

    Returns:
        The generated node table.
    """
    cfg = cfg or V1Config()
    set_seed(cfg.seed)

    df = generate_nodes_v1(cfg)
    metrics = validate_nodes(df, cfg)
    paths = save_artifacts(df, cfg, metrics)

    # Build the report first, then emit it line by line.
    report_lines = [
        "\n[Nodes V1] Build complete:\n" + "-" * 40,
        f"Nodes: {len(df)} | bbox: {cfg.bbox_km}",
        f"Class counts: {metrics['class_counts']}",
        f"Class medians: {metrics['class_medians']} (order ok = {metrics.get('median_order_ok')})",
        f"Saved: nodes → {paths['nodes']}\n       preview → {paths['preview']}\n       meta → {paths['meta']}",
        f"Metrics hash: {compute_metrics_hash(metrics)}",
    ]
    for line in report_lines:
        print(line)
    return df


# ------------------------------
# Run (notebook-friendly)
# ------------------------------
# Demo run: a small 30-node map written to its own output directory.
# The medium/small population ranges here are narrower than the V1Config defaults.
_cfg = V1Config(
    out_dir="maps/sv1.1/dv0.2_v1_classes",
    seed=30,
    n_nodes=30,
    bbox_km=(0.0, 0.0, 200.0, 200.0),
    class_prob={
        "large": 0.10,
        "medium": 0.30,
        "small": 0.60,
    },
    class_ranges={
        "large": {"min": 600_000, "max": 1_500_000},
        "medium": {"min": 100_000, "max": 300_000},
        "small": {"min": 1_000, "max": 60_000},
    },
)

# Bind the result to `_` so the notebook does not echo the DataFrame.
_ = main(_cfg)



[Nodes V1] Build complete:
----------------------------------------
Nodes: 30 | bbox: (0.0, 0.0, 200.0, 200.0)
Class counts: {'small': 17, 'medium': 11, 'large': 2}
Class medians: {'large': 1296607, 'medium': 215837, 'small': 43342} (order ok = True)
Saved: nodes → maps/sv1.1/dv0.2_v1_classes\nodes.csv
       preview → maps/sv1.1/dv0.2_v1_classes\preview.png
       meta → maps/sv1.1/dv0.2_v1_classes\meta.json
Metrics hash: af5893ff0c1a3f51


### V1.1 — Single-Core Density Field

**Goal:** Spatial realism with a capital-like core.
**Changes:** Rejection sampling from a 2D Gaussian density. Parameters: `core_location`, `core_sigma_km`, `density_strength`.
**Outputs:** (optional) `core_dist_km`.
**Validation:** Mean distance to core below uniform baseline.