In [None]:
"""
T-Drive: timeslot routing reconstruction (bbox + cache, OSMnx 2.0.6)
+ average speed per timeslot
+ minute-within-slot metadata

- Reads all taxi .txt files from INPUT_DIR (format: taxi_id,timestamp,lon,lat).
- Buckets records into M_MINUTES slots starting at 00:00 (UTC).
- For each (date, slot), sorts by time, snaps points to nearest OSM node,
  computes shortest paths between consecutive points (NetworkX), and sums distances.
- Adds:
  - slot_size_min           : slot length in minutes (M_MINUTES)
  - slot_start_utc          : UTC timestamp of the slot’s start (date 00:00 + slot*M_MINUTES)
  - start_minute_in_slot    : minute offset (0..M_MINUTES-1) of the first point within the slot
  - start_offset_s          : exact seconds offset from slot start to first point
- Writes:
  - <output_dir>/<file>_slots.csv : per-slot summary
  - (optional) <output_dir>/<file>_paths.geojson : reconstructed polylines
"""

import os
import math
from typing import Tuple, List, Optional
import pandas as pd
import numpy as np

# tqdm is optional
try:
    from tqdm import tqdm
except ImportError:
    def tqdm(x): return x

import osmnx as ox
import networkx as nx
import geopandas as gpd
from shapely.geometry import LineString, box

# -------------------------- USER CONFIG ---------------------------------------

# Folder scanned by main() for taxi ".txt" files.
INPUT_DIR  = "tdrive"
# Folder that receives the per-file CSV summaries and optional GeoJSON paths.
OUTPUT_DIR = "tdrive_routes_speed_MINUTES"
# The cached GraphML road graph is stored next to the outputs.
GRAPH_CACHE_DIR = OUTPUT_DIR

M_MINUTES = 60  # slot size in minutes

# Use a bbox to avoid full-city queries: (min_lat, max_lat, min_lon, max_lon)
BBOX = (39.8, 40.05, 116.2, 116.5)  # for the T-Drive dataset
#BBOX = (37.60002136230469, 37.75, -122.39936828613281, -122.25)  # for the San Francisco dataset

# Weight for shortest path: "length" (meters) or "travel_time" (if you add speeds)
WEIGHT = "length"

# Export polylines for each slot (disable for speed while testing)
WRITE_GEOJSON = True

# Skip routing if consecutive point time gap exceeds this (minutes)
MAX_GAP_MIN = 60  # set None to disable

# -----------------------------------------------------------------------------

def _graph_cache_path(bbox: Tuple[float, float, float, float]) -> str:
    """
    Build the GraphML cache file path for ``bbox`` inside GRAPH_CACHE_DIR.

    The four bbox values are written into the file name with four decimals,
    in the order they appear in the tuple, so each bbox gets its own file.
    """
    south, north, west, east = bbox
    coords_tag = "_".join(f"{value:.4f}" for value in (south, north, west, east))
    filename = f"drive_bbox_{coords_tag}.graphml"
    return os.path.join(GRAPH_CACHE_DIR, filename)


def load_graph() -> nx.MultiDiGraph:
    """
    Return the drivable OSM graph for the configured BBOX.

    The graph is read from the GraphML cache file when it exists; otherwise it
    is downloaded for the bbox polygon (OSMnx 2.0.6, polygon-based), given
    edge lengths, and written to the cache for later runs.
    """
    os.makedirs(GRAPH_CACHE_DIR, exist_ok=True)
    ox.settings.use_cache = True
    ox.settings.log_console = False

    south, north, west, east = BBOX
    graphml_file = _graph_cache_path(BBOX)

    if not os.path.exists(graphml_file):
        # No cache yet: fetch the drive network inside the bbox rectangle.
        area = box(west, south, east, north)
        G = ox.graph_from_polygon(area, network_type="drive")
        G = ox.distance.add_edge_lengths(G)  # adds 'length' to edges
        ox.save_graphml(G, graphml_file)
    else:
        G = ox.load_graphml(graphml_file)

    # Safety net: if any edge lacks 'length', recompute and refresh the cache.
    lacks_length = any("length" not in attrs for _, _, attrs in G.edges(data=True))
    if lacks_length:
        G = ox.distance.add_edge_lengths(G)
        ox.save_graphml(G, graphml_file)
    return G


def read_tdrive_file(path: str) -> pd.DataFrame:
    """
    Load one header-less T-Drive file (taxi_id,timestamp,lon,lat).

    Timestamps are parsed as UTC; rows whose timestamp cannot be parsed or
    whose lon/lat is missing are dropped. Two columns are added:
      - date    : YYYY-MM-DD string of the timestamp
      - slot_id : index of the M_MINUTES-wide slot, counted from 00:00
    The result is sorted by timestamp.
    """
    column_types = {"taxi_id": str, "timestamp": str, "lon": float, "lat": float}
    raw = pd.read_csv(
        path,
        header=None,
        names=list(column_types),
        dtype=column_types,
    )
    raw["timestamp"] = pd.to_datetime(raw["timestamp"], errors="coerce", utc=True)
    valid = raw.dropna(subset=["timestamp", "lon", "lat"]).copy()

    stamps = valid["timestamp"]
    valid["date"] = stamps.dt.date.astype(str)
    minute_of_day = stamps.dt.hour * 60 + stamps.dt.minute
    valid["slot_id"] = (minute_of_day // M_MINUTES).astype(int)
    return valid.sort_values("timestamp")


def snap_points_to_nodes(G: nx.MultiDiGraph, df: pd.DataFrame) -> np.ndarray:
    """
    Find the nearest graph node for every row of ``df``.

    Uses the "lon" column as X and the "lat" column as Y and returns the node
    ids as a NumPy array, one per row.
    """
    longitudes = df["lon"].to_numpy()
    latitudes = df["lat"].to_numpy()
    return np.array(ox.distance.nearest_nodes(G, X=longitudes, Y=latitudes))


def path_between_nodes(G: nx.MultiDiGraph, u: int, v: int, weight: str = "length") -> Optional[List[int]]:
    """
    Return the shortest node path from ``u`` to ``v`` (NetworkX).

    A single-node list is returned when both ends coincide. ``None`` is
    returned when no path exists or a node is not in the graph.
    """
    if u == v:
        return [u]
    try:
        route = nx.shortest_path(G, u, v, weight=weight)
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        return None
    return route


def path_length_m(G: nx.MultiDiGraph, path: List[int]) -> float:
    """
    Total the 'length' attribute along consecutive node pairs of ``path``.

    For parallel edges the smallest 'length' is used. Pairs without an edge,
    or whose edges carry no finite 'length', contribute nothing.
    """
    if not path or len(path) < 2:
        return 0.0

    metres = 0.0
    for tail, head in zip(path, path[1:]):
        if not G.has_edge(tail, head):
            continue
        parallel = G.get_edge_data(tail, head)
        if not parallel:
            continue
        # Missing 'length' counts as infinity so it can never win the min().
        shortest = min(attrs.get("length", math.inf) for attrs in parallel.values())
        if math.isfinite(shortest):
            metres += shortest
    return metres


def build_linestring_from_path(G: nx.MultiDiGraph, path: List[int]) -> Optional[LineString]:
    """
    Turn a node path into a LineString built from each node's (x, y).

    Returns None for an empty path, for a path with fewer than two nodes, or
    when any node is missing its "x" or "y" attribute.
    """
    if not path:
        return None

    points = []
    for node_id in path:
        attrs = G.nodes[node_id]
        px = attrs.get("x")
        py = attrs.get("y")
        if px is None or py is None:
            return None
        points.append((px, py))

    return LineString(points) if len(points) >= 2 else None


def process_file(G: nx.MultiDiGraph, infile: str, outdir: str) -> None:
    """
    Process a single taxi file and write CSV (+ optional GeoJSON).

    Steps:
      1. Read the file with read_tdrive_file() and snap every point to a
         graph node.
      2. Group rows by (date, slot_id). For each group, route between
         consecutive points, sum the routed length and derive an average
         speed from the first-to-last point duration.
      3. Write "<stem>_slots.csv" to ``outdir`` with one row per group, and,
         when WRITE_GEOJSON is set and at least one polyline was built,
         "<stem>_paths.geojson".

    Args:
        G: road graph used for snapping and routing.
        infile: path of the taxi .txt file.
        outdir: output folder (created if missing).

    Nothing is written when the file has no valid rows.
    """
    basename = os.path.basename(infile)
    stem = os.path.splitext(basename)[0]

    df = read_tdrive_file(infile)
    if df.empty:
        print(f"[SKIP] {basename}: no valid rows")
        return

    df["node"] = snap_points_to_nodes(G, df)

    rows = []    # one summary dict per (date, slot) -> CSV
    geoms = []   # merged polyline per slot -> GeoJSON geometry
    gprops = []  # properties matching geoms, index for index

    for (date, slot), g in df.groupby(["date", "slot_id"], sort=True):
        g = g.sort_values("timestamp")
        npts = len(g)

        # Compute slot start instant in UTC (date 00:00 + slot*M_MINUTES)
        slot_start_utc = pd.to_datetime(f"{date} 00:00:00", utc=True)
        slot_start_utc = slot_start_utc + pd.Timedelta(minutes=int(slot) * M_MINUTES)

        # Record start/end times for duration
        start_time = g["timestamp"].iloc[0]
        end_time   = g["timestamp"].iloc[-1]
        duration_s = max(0.0, (end_time - start_time).total_seconds())
        duration_min = duration_s / 60.0

        # Offsets relative to the slot start
        start_offset_s = max(0.0, (start_time - slot_start_utc).total_seconds())
        start_minute_in_slot = int(start_offset_s // 60)  # 0 .. M_MINUTES-1

        # A single point cannot form a route: emit a zero-length row and move on.
        if npts < 2:
            rows.append({
                "file": basename,
                "date": date,
                "slot_id": slot,
                "slot_size_min": M_MINUTES,
                "slot_start_utc": slot_start_utc,
                "start_minute_in_slot": start_minute_in_slot,
                "start_offset_s": start_offset_s,
                "n_points": npts,
                "n_paths": 0,
                "total_length_m": 0.0,
                "duration_s": duration_s,
                "duration_min": duration_min,
                "avg_speed_kmh": 0.0,
                "start_time": start_time,
                "end_time": end_time,
            })
            continue

        times = g["timestamp"].to_numpy()
        nodes = g["node"].to_numpy()

        slot_paths: List[List[int]] = []
        slot_length = 0.0
        n_paths = 0

        # Route each consecutive pair of points; skipped legs add no length.
        for i in range(npts - 1):
            t0 = pd.Timestamp(times[i])
            t1 = pd.Timestamp(times[i + 1])
            if MAX_GAP_MIN is not None:
                gap_min = (t1 - t0).total_seconds() / 60.0
                if gap_min > MAX_GAP_MIN:
                    continue  # skip unrealistic jump

            u = int(nodes[i])
            v = int(nodes[i + 1])
            if u == v:
                continue  # no movement

            path = path_between_nodes(G, u, v, weight=WEIGHT)
            if path is None:
                continue

            slot_paths.append(path)
            slot_length += path_length_m(G, path)
            n_paths += 1

        # Average speed (km/h) over the slot, using total_length_m and slot duration
        if duration_s > 0:
            avg_speed_kmh = (slot_length / 1000.0) / (duration_s / 3600.0)
        else:
            avg_speed_kmh = 0.0

        rows.append({
            "file": basename,
            "date": date,
            "slot_id": slot,
            "slot_size_min": M_MINUTES,
            "slot_start_utc": slot_start_utc,
            "start_minute_in_slot": start_minute_in_slot,
            "start_offset_s": start_offset_s,
            "n_points": npts,
            "n_paths": n_paths,
            "total_length_m": slot_length,
            "duration_s": duration_s,
            "duration_min": duration_min,
            "avg_speed_kmh": avg_speed_kmh,
            "start_time": start_time,
            "end_time": end_time,
        })

        if WRITE_GEOJSON and n_paths > 0:
            # Merge paths by concatenating coordinates while avoiding duplicate joints
            # NOTE(review): dropping seg[0] assumes each path starts where the
            # previous one ended; after a skipped leg that may not hold — confirm.
            merged_coords = []
            for pth in slot_paths:
                ls = build_linestring_from_path(G, pth)
                if ls is None:
                    continue
                seg = list(ls.coords)
                if not merged_coords:
                    merged_coords.extend(seg)
                else:
                    merged_coords.extend(seg[1:])
            if len(merged_coords) >= 2:
                geoms.append(LineString(merged_coords))
                gprops.append({
                    "file": basename,
                    "date": date,
                    "slot_id": slot,
                    "slot_size_min": M_MINUTES,               # optional, for context
                    "start_minute_in_slot": start_minute_in_slot,  # optional
                    "n_points": npts,
                    "n_paths": n_paths,
                    "total_length_m": slot_length,
                    "duration_s": duration_s,
                    "avg_speed_kmh": avg_speed_kmh
                })

    # CSV summary
    os.makedirs(outdir, exist_ok=True)
    out_csv = os.path.join(outdir, f"{stem}_slots.csv")
    pd.DataFrame(rows).to_csv(out_csv, index=False)

    # GeoJSON polylines
    if WRITE_GEOJSON and geoms:
        gdf = gpd.GeoDataFrame(gprops, geometry=geoms, crs="EPSG:4326")
        out_geojson = os.path.join(outdir, f"{stem}_paths.geojson")
        gdf.to_file(out_geojson, driver="GeoJSON")

    print(f"[OK] {basename} → {os.path.relpath(out_csv, outdir)}"
          f"{' and paths.geojson' if WRITE_GEOJSON and geoms else ''}")


def main():
    """Load the road graph, then route every ".txt" file found in INPUT_DIR."""
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print(f"[1/3] Loading (or caching) road graph for bbox {BBOX} …")
    graph = load_graph()

    # For time-based paths, enable the next two lines and set WEIGHT = "travel_time".
    # (OSMnx 2.x keeps these helpers in ox.routing; OSMnx 1.x used ox.speed.)
    # graph = ox.routing.add_edge_speeds(graph)        # adds 'speed_kph'
    # graph = ox.routing.add_edge_travel_times(graph)  # adds 'travel_time'

    print("[2/3] Processing files…")
    txt_files = sorted(name for name in os.listdir(INPUT_DIR) if name.endswith(".txt"))
    if not txt_files:
        print("No .txt files found in INPUT_DIR.")
        return

    for name in tqdm(txt_files):
        source_path = os.path.join(INPUT_DIR, name)
        process_file(graph, source_path, OUTPUT_DIR)

    print("[3/3] Done.")


if __name__ == "__main__":
    main()


In [None]:
'''
------------------------------------------------------------------------------
DESCRIPTION
------------------------------------------------------------------------------
Purpose
-------
Post-process slot-wise routed paths (GeoJSON) into a grid-based, time-aware
traversal summary. Each reconstructed polyline is raster-walked across a
regular grid using a Bresenham-style line traversal, converting distance
traversed into cumulative "reaching time" per grid cell.

Terminology (as used in the paper/artifact)
-------------------------------------------
• "Big box"  := a cellular area (coarse grid cell). It aggregates K×K small boxes.
• "Small box": a subarea within a cellular area (fine grid cell). Map segments
                and their traversal are attributed at this finer granularity.

Inputs
------
• INPUT_DIR: folder containing per-slot routed polylines exported as
             "<stem>_paths.geojson" (EPSG:4326). Each feature must include:
    - geometry            : LineString of the routed path for that slot
    - total_length_m      : total routed length (meters) within the slot
    - duration_s          : slot traversal duration (seconds)
    - n_points, n_paths   : metadata for sanity/QA
    - file, date, slot_id : provenance keys
    - start_minute_in_slot (optional): minute offset (0..M_MINUTES-1) at which
                                       the slot’s first trajectory point occurs.

Configuration
-------------
• BBOX          : geographic bounding box (min_lat, max_lat, min_lon, max_lon).
• UTM_CRS       : projected CRS in meters (e.g., UTM zone 50N for Beijing).
• CELL_SIZE_X/Y : small-box (subarea) grid resolution in meters.
• K             : grouping factor; a big box = K×K small boxes (cellular area).

Processing pipeline
-------------------
1) Project the geographical BBOX to UTM_CRS to obtain metric bounds.
2) Construct a regular grid:
     - small-grid resolution: CELL_SIZE_X by CELL_SIZE_Y (meters)
     - big-grid resolution  : groups of K×K small cells (cellular areas)
3) For each LineString in <stem>_paths.geojson:
     a) Compute average speed (m/s) = total_length_m / duration_s.
     b) Convert the line to a sequence of small-grid indices via Bresenham,
        enumerating every crossed small cell along the path.
     c) For each Bresenham step, increment cumulative time t_cum by
        step_distance / average_speed, yielding the grid "reaching time".
     d) Emit labels "BIG_ID/SMALL_ID/REACH_TIME_SECONDS" in traversal order.
     e) Preserve 'start_minute_in_slot' if present (else -1).

Outputs
-------
• OUTPUT_DIR/<stem>_boxes_reach.csv with columns:
    - file, date, slot_id
    - start_minute_in_slot     : forwarded from input (or -1 if absent)
    - n_points, n_paths
    - total_length_m
    - box_labels               : semicolon-separated sequence of labels
                                 "BIG_ID/SMALL_ID/REACH_TIME_SECONDS"

Notes & Guarantees
------------------
• "Big box" (cellular area) IDs come from (I,J) = (i//K, j//K) where (i,j) are
  small-grid indices. "Small box" (subarea) IDs index each fine cell uniquely
  within the full grid.
• Bresenham traversal visits a connected sequence of small boxes in a
  reproducible order; it is not a supercover, so a box that the line only
  clips near a corner may be skipped.
• Reaching time starts at 0 s at the first encountered small box for that
  feature and accumulates along the rasterized path.
• Assumes inputs were generated by the prior routing pipeline and that GeoJSON
  features are valid LineStrings with required properties.
------------------------------------------------------------------------------
'''


import os
import math
import pandas as pd
import geopandas as gpd
from shapely.geometry import LineString

# ---------------- CONFIG ----------------
# Folder holding the "<stem>_paths.geojson" files to process.
INPUT_DIR  = "tdrive_routes_speed_MINUTES"
# Folder that receives the "<stem>_boxes_reach.csv" outputs.
OUTPUT_DIR = "tdrive_boxes_time_MINUTES"
# Bounding box in degrees, latitudes first
BBOX = (39.8, 40.05, 116.2, 116.5)  # (min_lat, max_lat, min_lon, max_lon)
# Grid cell size in meters
CELL_SIZE_X = 250
CELL_SIZE_Y = 250

# Grouping factor: how many small cells per big cell
K = 4  # (e.g., 4×4 small cells = 1 big cell)

# CRS for Beijing (EPSG:32650 = UTM zone 50N, units = meters)
UTM_CRS = 32650
# ----------------------------------------
# CRS for San Francisco (EPSG:32610 = UTM zone 10N, units = meters)
#UTM_CRS =  32610

def coord_to_grid(x, y, xmin, ymin, dx, dy):
    """
    Map a projected point to its grid cell.

    Returns (i, j): the column and row index obtained by floor-dividing the
    offset from the grid origin (xmin, ymin) by the cell size (dx, dy).
    """
    col = int((x - xmin) // dx)
    row = int((y - ymin) // dy)
    return (col, row)


def bresenham(i0, j0, i1, j1):
    """
    List the grid cells on the Bresenham line from (i0, j0) to (i1, j1).

    Both end cells are included; the first entry is the start cell and the
    last entry is the end cell.
    """
    span_i = abs(i1 - i0)
    span_j = abs(j1 - j0)
    dir_i = 1 if i0 < i1 else -1
    dir_j = 1 if j0 < j1 else -1

    error = span_i - span_j
    ci, cj = i0, j0
    visited = [(ci, cj)]

    while not (ci == i1 and cj == j1):
        doubled = 2 * error
        if doubled > -span_j:
            # Advance along the i axis.
            error -= span_j
            ci += dir_i
        if doubled < span_i:
            # Advance along the j axis.
            error += span_i
            cj += dir_j
        visited.append((ci, cj))
    return visited


def step_distance(i0, j0, i1, j1, dx, dy):
    """
    Length in meters of one step between two grid cells.

    A step that changes both indices is diagonal and costs the cell diagonal;
    a step that changes only i costs dx; any other step costs dy.
    """
    moved_i = i0 != i1
    moved_j = j0 != j1
    if moved_i and moved_j:
        return math.sqrt(dx**2 + dy**2)
    if moved_i:
        return dx
    return dy


def process_geojson(infile, outdir, xmin, ymin, xmax, ymax, dx, dy, nx, ny, k):
    """
    Process one _paths.geojson file with Bresenham grid traversal + reaching time.
    Also forwards the 'start_minute_in_slot' property (if present in the GeoJSON)
    into the output CSV as a dedicated column.

    Args:
        infile: path of the "<stem>_paths.geojson" file.
        outdir: folder for "<stem>_boxes_reach.csv" (created if missing).
        xmin, ymin: grid origin in the projected CRS.
        xmax, ymax: upper grid bounds; accepted but not read in this function.
        dx, dy: small-cell size in meters.
        nx, ny: number of small cells per axis.
        k: number of small cells per big-cell side.

    Features that are not LineStrings, or whose duration_s / total_length_m
    is missing or not positive, are skipped.
    """
    gdf = gpd.read_file(infile).to_crs(epsg=UTM_CRS)

    # Number of big cells horizontally and vertically
    nbx = (nx + k - 1) // k
    nby = (ny + k - 1) // k  # not used below

    rows = []

    for _, row in gdf.iterrows():
        geom = row.geometry
        if geom is None or geom.is_empty or geom.geom_type != "LineString":
            continue

        total_length_m = row["total_length_m"]
        duration_s = row.get("duration_s", None)
        if duration_s is None or duration_s <= 0 or total_length_m <= 0:
            continue

        # average speed (m/s) used to convert Bresenham step distances to time
        avg_speed_mps = total_length_m / duration_s

        # NEW: forward the starting minute within the slot if present in GeoJSON props
        # falls back to -1 if not available
        start_minute_in_slot = int(row.get("start_minute_in_slot", -1))

        coords = list(geom.coords)
        timeline = []  # ["big/small/reaching_time"]

        t_cum = 0.0  # reaching time starts at 0 (seconds)

        for m in range(len(coords) - 1):
            x0, y0 = coords[m]
            x1, y1 = coords[m + 1]
            i0, j0 = coord_to_grid(x0, y0, xmin, ymin, dx, dy)
            i1, j1 = coord_to_grid(x1, y1, xmin, ymin, dx, dy)
            cells = bresenham(i0, j0, i1, j1)

            # Label each step's starting cell; the segment's final cell is left
            # to the next segment (or to the closing block after the loop).
            for step in range(len(cells) - 1):
                iA, jA = cells[step]
                iB, jB = cells[step + 1]

                # Cells outside the grid are skipped entirely: no label is
                # emitted and t_cum is not advanced for this step.
                if iA < 0 or jA < 0 or iA >= nx or jA >= ny:
                    continue

                # IDs are 1-based and row-major.
                small_id = jA * nx + iA + 1
                I = iA // k
                J = jA // k
                big_id = J * nbx + I + 1

                label = f"{big_id}/{small_id}/{round(t_cum, 2)}"
                timeline.append(label)

                # advance time for next step
                d = step_distance(iA, jA, iB, jB, dx, dy)
                dt = d / avg_speed_mps
                t_cum += dt

        # Add last cell at its final reaching time
        # ("cells" here is the cell list of the last segment processed above)
        if timeline:
            iL, jL = cells[-1]
            if 0 <= iL < nx and 0 <= jL < ny:
                small_id = jL * nx + iL + 1
                I = iL // k
                J = jL // k
                big_id = J * nbx + I + 1
                label = f"{big_id}/{small_id}/{round(t_cum, 2)}"
                timeline.append(label)

        rows.append({
            "file": row["file"],
            "date": row["date"],
            "slot_id": row["slot_id"],
            "start_minute_in_slot": start_minute_in_slot,  # forwarded from the GeoJSON
            "n_points": row["n_points"],
            "n_paths": row["n_paths"],
            "total_length_m": total_length_m,
            "box_labels": ";".join(timeline),
        })

    # Save CSV
    os.makedirs(outdir, exist_ok=True)
    basename = os.path.basename(infile).replace("_paths.geojson", "_boxes_reach.csv")
    outfile = os.path.join(outdir, basename)
    pd.DataFrame(rows).to_csv(outfile, index=False)

    print(f"[OK] {os.path.basename(infile)} → {os.path.basename(outfile)}")


def main():
    """
    Project BBOX to UTM_CRS, size the grid, and process every
    "*_paths.geojson" file in INPUT_DIR in sorted order.
    """
    # Closed ring around the bbox as (lon, lat) pairs, projected to meters.
    south, north, west, east = BBOX[0], BBOX[1], BBOX[2], BBOX[3]
    ring = LineString([
        (west, south), (east, south),
        (east, north), (west, north),
        (west, south)
    ])
    projected = gpd.GeoDataFrame(geometry=[ring], crs="EPSG:4326").to_crs(epsg=UTM_CRS)

    xmin, ymin, xmax, ymax = projected.total_bounds
    dx = CELL_SIZE_X
    dy = CELL_SIZE_Y

    # Small-grid dimensions, then big-grid dimensions (ceil division by K).
    nx = int((xmax - xmin) // dx) + 1
    ny = int((ymax - ymin) // dy) + 1
    nbx = (nx + K - 1) // K
    nby = (ny + K - 1) // K

    print(f"Total small grids: {nx * ny}")
    print(f"Total big grids: {nbx * nby}")

    geojson_names = sorted(
        name for name in os.listdir(INPUT_DIR) if name.endswith("_paths.geojson")
    )
    if not geojson_names:
        print("No _paths.geojson files found.")
        return

    print(f"Processing {len(geojson_names)} GeoJSON files …")
    for name in geojson_names:
        process_geojson(
            os.path.join(INPUT_DIR, name),
            OUTPUT_DIR, xmin, ymin, xmax, ymax, dx, dy, nx, ny, K,
        )

    print("All files processed.")


if __name__ == "__main__":
    main()


In [None]:

"""
Normalized Urgency-Weighted Popularity (UWP) computation across big boxes.

Each *_boxes_reach.csv row has:
  box_labels = "big_id/small_id/reaching_time; ...", where reaching_time is in SECONDS.

Let T_final = max(reaching_time) among tokens in the row.
We compute weights only for cross-big tokens (skip tokens where token.big_id == start_big)
using the standard UWP formulation.

Outputs one CSV per starting big box:
  big_<BIGID>_popularity_sparse.csv with columns:
    day_idx, slot_id, nnz, total_urgency, counts_json

Notes:
- Extra columns (e.g., start_minute_in_slot) are ignored.
- JSON values are rounded floats (ROUND_DECIMALS).
"""

import os
import glob
import csv
import json
import math
from collections import defaultdict, Counter
from typing import Optional, Tuple, List

import pandas as pd

# ---------------- CONFIG ----------------
INPUT_DIR  = "tdrive_boxes_time_MINUTES"            # where *_boxes_reach.csv live
OUTPUT_DIR = "tdrive_UWP_sparse_1hour_250_Normal"  # output folder
FILE_GLOB  = "*_boxes_reach.csv"                    # glob applied inside INPUT_DIR

# Column names expected in input CSVs
COL_FILE       = "file"
COL_DATE       = "date"
COL_SLOT       = "slot_id"
COL_BOX_LABELS = "box_labels"  # "big/small/time;big/small/time;..."

# ---- UWP calculation parameters ----
P             = 3.0        # exponent on (1 - t/T_final); higher -> more separation near the start
CLAMP_01      = True       # clamp weights into [0,1]
ROUND_DECIMALS = 6         # rounding for JSON values
# ------------------------------------


def parse_label_with_time(token: str) -> Optional[Tuple[int, int, float]]:
    """
    Parse 'big/small/time' -> (big_id:int, small_id:int, t:float)

    Returns None if parsing fails, if any field is missing, or if the time is
    not a finite number. Fields beyond the third are ignored.

    Non-finite times ("nan", "inf") are rejected because ``float()`` accepts
    them, yet they would corrupt the per-row maximum reaching time and the
    weights derived from it, and would serialize as invalid JSON.
    """
    try:
        parts = token.strip().split("/")
        if len(parts) < 3:
            return None
        big_id = int(parts[0].strip())
        small_id = int(parts[1].strip())
        t = float(parts[2].strip())
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: token is not a string;
        # ValueError: a field is not numeric.
        return None
    if not math.isfinite(t):
        return None
    return big_id, small_id, t


def build_day_index_map(all_dates: List[str]) -> dict:
    """
    Number the distinct date strings 1..N in ascending order.

    Returns a dict mapping each unique date string to its day index.
    """
    ordered_days = sorted(set(all_dates))
    return dict(zip(ordered_days, range(1, len(ordered_days) + 1)))


def _clamp01(x: float) -> float:
    """Limit ``x`` to the range [0, 1] when CLAMP_01 is enabled; otherwise return it unchanged."""
    if not CLAMP_01:
        return x
    if x < 0.0:
        return 0.0
    return 1.0 if x > 1.0 else x


def uwp_weights(times_rel: List[float]) -> List[float]:
    """
    Convert relative reaching times into UWP weights.

    Each weight is (1 - t_rel) raised to the power P, passed through
    _clamp01(). An empty input yields an empty list.
    """
    if not times_rel:
        return []

    weights = []
    for rel in times_rel:
        weights.append(_clamp01((1.0 - rel) ** P))
    return weights


def process_folder(input_dir: str, output_dir: str) -> None:
    """
    Aggregate UWP weights from every FILE_GLOB match in ``input_dir``.

    For each input row, the first parsed label defines the starting big box.
    Labels in other big boxes are weighted by their reaching time relative to
    the row's maximum reaching time, and the weights are summed per small box
    under (start_big, day_idx, slot_id).

    Writes to ``output_dir`` (created if missing):
      - big_<start_big>_popularity_sparse.csv, one per starting big box
      - day_index_map.csv, the date -> day_idx mapping

    Returns early, writing nothing, when no input files or rows are found.
    """
    os.makedirs(output_dir, exist_ok=True)

    # 1) Load all *_boxes_reach.csv
    paths = sorted(glob.glob(os.path.join(input_dir, FILE_GLOB)))
    if not paths:
        print("No *_boxes_reach.csv files found.")
        return

    frames = []
    for p in paths:
        try:
            df = pd.read_csv(p, dtype={COL_FILE: "string", COL_DATE: "string"}, keep_default_na=False)
            # Keep only the columns we need; extra columns (e.g., start_minute_in_slot) are ignored
            cols = [COL_FILE, COL_DATE, COL_SLOT, COL_BOX_LABELS]
            df = df[[c for c in cols if c in df.columns]].copy()
            frames.append(df)
        except Exception as e:
            # Best effort: an unreadable file is reported and skipped.
            print(f"[WARN] Skipping {os.path.basename(p)}: {e}")

    if not frames:
        print("No valid data after reading files.")
        return

    data = pd.concat(frames, ignore_index=True)
    if data.empty:
        print("Input data is empty.")
        return

    # 2) Build day indices
    day_map = build_day_index_map(data[COL_DATE].tolist())
    data["day_idx"] = data[COL_DATE].map(day_map)

    # 3) Accumulator:
    #    uwp[start_big][(day_idx, slot_id)] = Counter({small_id: weight_sum})
    uwp = defaultdict(lambda: defaultdict(Counter))

    # 4) Iterate rows
    for _, row in data.iterrows():
        date_str   = row.get(COL_DATE, "")
        slot_val   = row.get(COL_SLOT, None)
        labels_str = row.get(COL_BOX_LABELS, "")

        if not date_str or slot_val is None or not labels_str:
            continue

        try:
            slot_id = int(slot_val)
        except Exception:
            continue

        tokens = [tok.strip() for tok in labels_str.split(";") if tok.strip()]
        if not tokens:
            continue

        # Parse all tokens with time
        parsed = [parse_label_with_time(tok) for tok in tokens]
        parsed = [x for x in parsed if x is not None]
        if not parsed:
            continue

        # Determine starting big box from the FIRST token
        start_big = parsed[0][0]

        # Destination time = maximum reaching time among ALL tokens in this row
        T_final = max(t for (_, _, t) in parsed)
        if T_final <= 0:
            continue

        # Consider only cross-big tokens for weighting
        cross = [(big, small, t) for (big, small, t) in parsed if big != start_big]
        if not cross:
            continue

        # Relative times for selected tokens
        times_rel = [t / T_final for (_, _, t) in cross]

        # Compute UWP weights
        w_list = uwp_weights(times_rel)

        day_idx = day_map[date_str]
        ctr = uwp[start_big][(day_idx, slot_id)]

        # Accumulate weights per small_id
        for ((_, small_id, _), w) in zip(cross, w_list):
            ctr[int(small_id)] += float(w)

    # 5) Write one CSV per starting big box
    for start_big, bucket in uwp.items():
        out_path = os.path.join(output_dir, f"big_{start_big}_popularity_sparse.csv")
        with open(out_path, "w", newline="") as f:
            writer = csv.writer(f)
            # Header: day_idx, slot_id, nnz (#unique small boxes), total_urgency, counts_json
            writer.writerow(["day_idx", "slot_id", "nnz", "total_urgency", "counts_json"])
            for (day_idx, slot_id), ctr in sorted(bucket.items()):
                nnz = len(ctr)
                total_urgency = float(sum(ctr.values()))
                # NOTE(review): sort_keys=True re-sorts the string keys
                # lexicographically ("10" before "2"), overriding the numeric
                # order of sorted(ctr.items()).
                counts_json = json.dumps(
                    {str(k): round(float(v), ROUND_DECIMALS) for k, v in sorted(ctr.items())},
                    separators=(',', ':'), sort_keys=True
                )
                writer.writerow([day_idx, slot_id, nnz, round(total_urgency, ROUND_DECIMALS), counts_json])

        print(f"[OK] Wrote {os.path.basename(out_path)} with {len(bucket)} rows (UWP computed)")

    # Also write day index map for reference
    pd.DataFrame(
        [{"date": d, "day_idx": idx} for d, idx in sorted(day_map.items(), key=lambda x: x[1])]
    ).to_csv(os.path.join(output_dir, "day_index_map.csv"), index=False)
    print("[OK] Wrote day_index_map.csv")


def main():
    """Create OUTPUT_DIR if needed and run the UWP aggregation from INPUT_DIR into it."""
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    process_folder(input_dir=INPUT_DIR, output_dir=OUTPUT_DIR)


if __name__ == "__main__":
    main()

In [None]:
"""
Choosing the cellular areas (big boxes) for the experiments.
Enumerate small_ids across selected big boxes, align rows/columns, and write:
  - global_small_ids.csv        : col_idx (0-based), small_id
  - reverse_small_ids.csv       : small_id, col_idx (0-based)
  - big_<ID>_vectors.csv        : day_idx, slot_id, vector (zero-filled, aligned)

Behavior:
- Columns = UNION of small_ids observed in the SELECTED files only (not the whole folder).
- Rows    = UNION of (day_idx, slot_id) across the SELECTED files; zero-fill where missing.
- Values  = parsed from counts_json as float (works for both popularity ints and UWP floats).

Adjust SPARSE_DIR / OUT_DIR / BIG_IDS as needed.
"""

import os
import glob
import csv
import json
import pandas as pd

# ---------------- CONFIG ----------------
SPARSE_DIR = "tdrive_UWP_sparse_1hour_250_Normal"  # or popularity folder
OUT_DIR    = "UWP_ENUM_Minutes"    # output folder
FILE_GLOB  = "big_*_popularity_sparse.csv"

# Select ONLY these big boxes; global small_ids are built from them.
# To reproduce the reported results use [21, 22] for T-Drive and
# [12, 13] for San Francisco.
BIG_IDS = [21, 22]   # <-- edit as needed
# Decimals used when serializing vectors (floats are kept to support UWP;
# pure integer counts work as well).
ROUND_DECIMALS = 6
# ----------------------------------------


def safe_load_json(s: str):
    """
    Decode a JSON object from ``s``, never raising.

    Returns the decoded dict, or an empty dict when ``s`` is not a non-empty
    string, is not valid JSON, or decodes to anything other than a JSON
    object (for example a list or a number). Callers iterate the result with
    ``.keys()`` / ``.items()``, so a non-dict value must not leak through.
    """
    if not isinstance(s, str) or not s:
        return {}
    try:
        decoded = json.loads(s)
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError; RecursionError covers
        # pathologically nested input.
        return {}
    return decoded if isinstance(decoded, dict) else {}


def list_available_files(sparse_dir: str, pattern: str):
    """
    Map big-box IDs to the sparse files found in ``sparse_dir``.

    Only files matching ``pattern`` and named
    "big_<integer>_popularity_sparse.csv" are kept; the integer becomes the
    key and the file path the value.
    """
    prefix = "big_"
    suffix = "_popularity_sparse.csv"

    found = {}
    for filepath in glob.glob(os.path.join(sparse_dir, pattern)):
        filename = os.path.basename(filepath)
        if not (filename.startswith(prefix) and filename.endswith(suffix)):
            continue
        id_text = filename[len(prefix):-len(suffix)]
        try:
            big_id = int(id_text)
        except Exception:
            # The part between prefix and suffix is not an integer: ignore the file.
            continue
        found[big_id] = filepath
    return found


def collect_union_small_ids(files_map):
    """
    Gather every small_id that appears as a key in the "counts_json" column
    of the selected files, and return them as a sorted list of ints.

    Files that cannot be read and keys that are not integers are ignored.
    """
    union = set()
    for _, csv_path in sorted(files_map.items()):
        try:
            frame = pd.read_csv(csv_path, usecols=["counts_json"])
        except Exception:
            continue
        for _, record in frame.iterrows():
            counts = safe_load_json(record.get("counts_json", ""))
            for key in counts.keys():
                try:
                    small_id = int(key)
                except Exception:
                    continue
                union.add(small_id)
    return sorted(union)


def collect_union_day_slot(files_map):
    """
    Collect the distinct (day_idx, slot_id) pairs found in the selected files.

    Returns a DataFrame with columns "day_idx" and "slot_id", sorted ascending;
    it is empty when no pair is found. Unreadable files and rows with a
    missing or non-integer value are ignored.
    """
    columns = ["day_idx", "slot_id"]
    seen = set()

    for csv_path in files_map.values():
        try:
            frame = pd.read_csv(csv_path, usecols=["day_idx", "slot_id"])
        except Exception:
            continue
        for _, record in frame.iterrows():
            day = record.get("day_idx")
            slot = record.get("slot_id")
            if pd.isna(day) or pd.isna(slot):
                continue
            try:
                seen.add((int(day), int(slot)))
            except Exception:
                pass

    if seen:
        return pd.DataFrame(sorted(seen), columns=columns)
    return pd.DataFrame(columns=columns)


def write_global_index(out_dir, small_ids_sorted):
    """
    Write "global_small_ids.csv" into ``out_dir`` (created if missing).

    Each row pairs a 0-based column index with the small_id at that position
    of ``small_ids_sorted``.
    """
    os.makedirs(out_dir, exist_ok=True)
    target = os.path.join(out_dir, "global_small_ids.csv")
    with open(target, "w", newline="") as handle:
        writer = csv.writer(handle)
        writer.writerow(["col_idx", "small_id"])
        # enumerate() yields (index, small_id) pairs starting at 0.
        writer.writerows(enumerate(small_ids_sorted))
    print(f"[OK] Wrote global_small_ids.csv (K={len(small_ids_sorted)})")


def write_reverse_mapping(out_dir, small_ids_sorted):
    """
    Write the reverse mapping ``small_id -> col_idx`` (0-based) to
    ``<out_dir>/reverse_small_ids.csv``.

    Parameters
    ----------
    out_dir : str
        Destination folder; created if it does not exist.
    small_ids_sorted : sequence
        Small ids in column order; position ``i`` becomes ``col_idx`` ``i``.
    """
    # Create the folder here as well so the function also works when it is
    # called on its own (write_global_index does the same).
    os.makedirs(out_dir, exist_ok=True)
    path = os.path.join(out_dir, "reverse_small_ids.csv")
    with open(path, "w", newline="") as f:
        w = csv.writer(f)
        w.writerow(["small_id", "col_idx"])
        w.writerows((sid, i) for i, sid in enumerate(small_ids_sorted))  # 0-based
    print(f"[OK] Wrote reverse_small_ids.csv (K={len(small_ids_sorted)})")


def vector_to_string(vec):
    """Render ``vec`` as ``[v0,v1,...]`` with ROUND_DECIMALS fixed decimals per value."""
    rendered = ",".join(format(value, f".{ROUND_DECIMALS}f") for value in vec)
    return "[%s]" % rendered


def load_sparse_map(path):
    """
    Read one big_* sparse file and return
      {(day_idx, slot_id): {small_id: float_value, ...}}

    An empty dict is returned when the file cannot be read or lacks any of
    the columns day_idx / slot_id / counts_json. Rows with a missing or
    non-integer key are skipped, as are individual entries whose id or value
    cannot be converted. A later row with the same key replaces an earlier one.
    """
    try:
        frame = pd.read_csv(path)
    except Exception:
        return {}

    required = {"day_idx", "slot_id", "counts_json"}
    if not required.issubset(frame.columns):
        return {}

    result = {}
    for day, slot, cell in zip(frame["day_idx"], frame["slot_id"], frame["counts_json"]):
        if pd.isna(day) or pd.isna(slot):
            continue
        try:
            key = (int(day), int(slot))
        except Exception:
            continue

        values = {}
        for raw_sid, raw_val in safe_load_json(cell).items():
            try:
                # float() accepts both UWP floats and integer popularity counts
                values[int(raw_sid)] = float(raw_val)
            except Exception:
                continue
        result[key] = values
    return result


def convert_to_vectors(files_map, small_ids_sorted, union_day_slot, out_dir):
    """
    Write one ``big_<bid>_vectors.csv`` per selected big box.

    Every output file has one row per (day_idx, slot_id) in ``union_day_slot``
    and a ``vector`` column of length ``len(small_ids_sorted)``; positions
    with no value in the source file are zero-filled.

    Parameters
    ----------
    files_map : dict
        {big_id: path to the sparse CSV of that big box}.
    small_ids_sorted : sequence
        Small ids in column order (position = vector index).
    union_day_slot : pandas.DataFrame
        Rows to emit, with columns ``day_idx`` and ``slot_id``.
    out_dir : str
        Destination folder; created if it does not exist.
    """
    os.makedirs(out_dir, exist_ok=True)
    col_index = {sid: i for i, sid in enumerate(small_ids_sorted)}
    K = len(small_ids_sorted)
    out_columns = ["day_idx", "slot_id", "vector"]

    # The row keys are the same for every file, so extract them only once.
    day_slot_pairs = [
        (int(rs["day_idx"]), int(rs["slot_id"]))
        for _, rs in union_day_slot.iterrows()
    ]

    for bid, path in sorted(files_map.items()):
        sparse_map = load_sparse_map(path)
        rows = []

        for d, s in day_slot_pairs:
            vec = [0.0] * K
            # A missing (d, s) key yields {} and therefore an all-zero vector.
            for sid, val in sparse_map.get((d, s), {}).items():
                j = col_index.get(sid)
                if j is not None:
                    vec[j] = val

            rows.append({"day_idx": d, "slot_id": s, "vector": vector_to_string(vec)})

        # Passing the columns explicitly keeps sort_values() valid even when
        # there are no rows (a column-less frame would raise KeyError).
        out_df = pd.DataFrame(rows, columns=out_columns).sort_values(["day_idx", "slot_id"])
        out_name = f"big_{bid}_vectors.csv"
        out_path = os.path.join(out_dir, out_name)
        out_df.to_csv(out_path, index=False)
        print(f"[OK] Wrote {out_name}  rows={len(out_df)}  vec_dim={K}")


def main():
    """
    Run the sparse -> aligned-vector conversion for the big boxes in BIG_IDS.

    Steps: pick the requested files, enumerate the union of small ids,
    collect the union of (day_idx, slot_id) rows, then write one zero-filled
    vectors CSV per big box into OUT_DIR.
    """
    # Split the requested ids into those with a file and those without.
    available = list_available_files(SPARSE_DIR, FILE_GLOB)
    files_map = {}
    missing = []
    for bid in BIG_IDS:
        if bid in available:
            files_map[bid] = available[bid]
        else:
            missing.append(bid)

    if missing:
        print(f"[WARN] Missing big IDs: {missing}")
    if not files_map:
        print("ERROR: none of the requested big IDs exist in the folder.")
        return

    # Step 1: global column enumeration (0..K-1) over all small ids.
    small_ids_sorted = collect_union_small_ids(files_map)
    write_global_index(OUT_DIR, small_ids_sorted)
    write_reverse_mapping(OUT_DIR, small_ids_sorted)

    # Step 2: shared row set so every output file has the same rows.
    union_day_slot = collect_union_day_slot(files_map)
    if union_day_slot.empty:
        print("[INFO] No (day_idx, slot_id) pairs found. Nothing to write.")
        return

    # Step 3: one aligned, zero-filled vectors file per big box.
    convert_to_vectors(files_map, small_ids_sorted, union_day_slot, OUT_DIR)

    print(f"All done. Outputs at: {OUT_DIR}")


if __name__ == "__main__":
    main()

In [None]:
import os
import ast
import numpy as np
import pandas as pd

# ---------------- CONFIG ----------------
INPUT_DIR = "UWP_ENUM_Minutes"      # Folder containing the big_<ID>_vectors.csv files
FILES = ["big_21_vectors.csv", "big_22_vectors.csv"]  # One CSV per big box; add more if needed
# Edit FILES to match the big boxes of your experiment: to reproduce the
# T-Drive results use big boxes [21, 22]; for San Francisco use [12, 13].
SAVE_PATH = "big_UWP_tensor.npy"        # Output .npy path (relative to the current directory)
# ----------------------------------------


def parse_vector(vec_str):
    """
    Parse a literal such as '[0,1,2,...]' into a list of floats.

    Any failure (bad literal, non-iterable result, non-numeric element)
    yields an empty list instead of raising.
    """
    try:
        return list(map(float, ast.literal_eval(vec_str)))
    except Exception:
        return []

# One 2-D array per successfully loaded file, in FILES order.
tensors = []

for fname in FILES:
    path = os.path.join(INPUT_DIR, fname)
    if not os.path.exists(path):
        print(f"[WARN] File not found: {path}")
        continue

    # Read CSV and parse vector column
    df = pd.read_csv(path)
    df["vector_list"] = df["vector"].apply(parse_vector)

    # parse_vector returns [] for cells it cannot parse, which would make the
    # rows ragged. Fail early with the file name instead of building an
    # object array or hitting an opaque NumPy error.
    lengths = set(df["vector_list"].map(len))
    if len(lengths) > 1:
        raise ValueError(
            f"{fname}: rows have inconsistent vector lengths {sorted(lengths)} "
            "(a length of 0 usually means an unparsable 'vector' cell)."
        )

    # Convert to NumPy array (num_timeslots, vector_len)
    arr = np.array(df["vector_list"].to_list(), dtype=float)
    tensors.append(arr)
    print(f"[OK] Loaded {fname} → shape {arr.shape}")

if not tensors:
    raise RuntimeError("No valid CSV files loaded. Exiting.")

# np.stack needs identical shapes; report the mismatch explicitly.
shapes = {t.shape for t in tensors}
if len(shapes) > 1:
    raise ValueError(f"Cannot stack files with different shapes: {sorted(shapes)}")

# Stack all arrays into one 3D tensor: (num_big_boxes, num_timeslots, vector_len)
tensor = np.stack(tensors, axis=0)
print(f"\n✅ Final tensor shape: {tensor.shape}")

# Save the tensor to SAVE_PATH (a relative path resolves against the current directory)
np.save(SAVE_PATH, tensor)
print(f"[OK] Saved combined tensor → {os.path.abspath(SAVE_PATH)}")

In [None]:
"""
This code produces, for each cellular area (“big box”) and time slot, the list of
HD map segments (small boxes) requested by vehicles together with their Urgency-Weighted Popularity (UWP) weights.

INPUT_DIR contains many *_boxes_reach.csv files with columns:
  - file (original taxi filename; vehicle id derived from its stem)
  - date (YYYY-MM-DD)
  - slot_id (integer hour slot)
  - start_minute_in_slot (optional; integer minute within slot)
  - box_labels: "big_id/small_id/reaching_time;big_id/small_id/reaching_time;..."

For each row:
  - Determine starting big box (start_big) from the FIRST token's big_id.
  - Compute T_final = max(reaching_time) across tokens in the row.
  - Build a sublist: [vehicle_id, start_minute, [[small_id, uwp_weight], ...]],
    where uwp_weight is computed from relative time t/T_final via the UWP formula.
  - Only include tokens whose big_id != start_big (skip within-same-big-box visits).

We aggregate by (start_big, day_idx, slot_id).
For each (day_idx, slot_id), we collect a list of those sublists (one per input row).

OUTPUT:
  OUTPUT_DIR / big_<BIGID>.csv
    Columns:
      - day_idx
      - slot_id
      - entries_json   (JSON array of sublists as described above)

Also writes:
  OUTPUT_DIR / day_index_map.csv  (date -> day_idx mapping)
"""

import os
import glob
import json
import csv
import math
from collections import defaultdict
from typing import Optional, Tuple, List

import pandas as pd

# ---------------- CONFIG ----------------
INPUT_DIR  = "tdrive_boxes_time_MINUTES"  # folder searched for FILE_GLOB matches
OUTPUT_DIR = "bigbox_slot_dicts_uwp"      # receives big_<BIGID>.csv and day_index_map.csv
FILE_GLOB  = "*_boxes_reach.csv"          # glob pattern for the input files

# UWP parameters
P         = 3.0     # exponent in w = (1 - t/T)^P; controls separation (higher → sharper early emphasis)
CLAMP_01  = True    # clamp weights to [0,1]
ROUND_DEC = 6       # number of decimals kept for weights inside JSON

# Column names in *_boxes_reach.csv
COL_FILE        = "file"
COL_DATE        = "date"
COL_SLOT        = "slot_id"
COL_BOX_LABELS  = "box_labels"
COL_START_MIN   = "start_minute_in_slot"  # optional; -1 is recorded when absent or unparsable
# ----------------------------------------


def parse_label_with_time(token: str) -> Optional[Tuple[int, int, float]]:
    """
    Split a 'big/small/time' label into (big_id, small_id, t).

    Whitespace around each field is ignored and fields beyond the third are
    not used. Returns None when there are fewer than three fields or when
    any conversion fails.
    """
    try:
        fields = [piece.strip() for piece in token.strip().split("/")]
        if len(fields) >= 3:
            return int(fields[0]), int(fields[1]), float(fields[2])
        return None
    except Exception:
        return None


def build_day_index_map(all_dates: List[str]) -> dict:
    """Assign day_idx 1..N to the distinct date strings, in ascending order."""
    ordered_dates = sorted(set(all_dates))
    return dict(zip(ordered_dates, range(1, len(ordered_dates) + 1)))


def stem_from_path(path: str) -> str:
    """
    Return the file name of ``path`` with its last extension removed.

    A name without a dot, or one whose only dot is the leading character,
    is returned unchanged.
    """
    name = os.path.basename(path)
    head, dot, _ext = name.rpartition(".")
    if not dot:
        return name
    # An empty head means the name starts with its only dot: keep the name.
    return head or name


def extract_vehicle_id(file_field: str) -> str:
    """Return the vehicle id for a 'file' column value, i.e. the file name's stem."""
    vehicle_id = stem_from_path(file_field)
    return vehicle_id


def _clamp01(x: float) -> float:
    """Limit ``x`` to the range [0, 1] when CLAMP_01 is set; otherwise return it unchanged."""
    if not CLAMP_01:
        return x
    if x < 0.0:
        return 0.0
    if x > 1.0:
        return 1.0
    return x


def uwp_weights(times_rel: List[float]) -> List[float]:
    """
    Compute one UWP weight per relative time.

    Each relative time ``tr`` (t/T) is turned into ``(1 - tr) ** P`` and then
    passed through ``_clamp01``. An empty input gives an empty list.
    """
    if not times_rel:
        return []
    weights = []
    for rel in times_rel:
        weights.append(_clamp01((1.0 - rel) ** P))
    return weights


def _is_missing(value) -> bool:
    """Return True for None, an empty string, or a pandas NA/NaN scalar."""
    if value is None:
        return True
    if isinstance(value, str):
        return value == ""
    return bool(pd.isna(value))


def process(input_dir: str, output_dir: str) -> None:
    """
    Aggregate all *_boxes_reach.csv rows into one CSV per starting big box.

    For every input row the starting big box is taken from the first valid
    label, labels in other big boxes receive a UWP weight based on
    t / max(t), and the resulting ``[vehicle_id, start_minute, pairs]``
    sublist is stored under (start_big, day_idx, slot_id).

    Parameters
    ----------
    input_dir : str
        Folder searched for files matching FILE_GLOB.
    output_dir : str
        Folder that receives ``big_<start_big>.csv`` (columns day_idx,
        slot_id, entries_json) and ``day_index_map.csv``; created if needed.

    Rows with a missing date, slot, label list or file name, a non-integer
    slot, or no parsable label are skipped. Files that cannot be read are
    reported and skipped.
    """
    os.makedirs(output_dir, exist_ok=True)

    # 1) Load all *_boxes_reach.csv
    paths = sorted(glob.glob(os.path.join(input_dir, FILE_GLOB)))
    if not paths:
        print("No *_boxes_reach.csv files found.")
        return

    frames = []
    for p in paths:
        try:
            df = pd.read_csv(
                p,
                dtype={COL_FILE: "string", COL_DATE: "string"},
                keep_default_na=False,
            )
            # Keep only relevant columns; tolerate missing start_minute_in_slot
            cols = [COL_FILE, COL_DATE, COL_SLOT, COL_BOX_LABELS, COL_START_MIN]
            df = df[[c for c in cols if c in df.columns]].copy()
            frames.append(df)
        except Exception as e:
            print(f"[WARN] Skipping {os.path.basename(p)}: {e}")

    if not frames:
        print("No valid data after reading files.")
        return

    data = pd.concat(frames, ignore_index=True)
    if data.empty:
        print("Input data is empty.")
        return

    # 2) day_idx mapping.
    # Build it from real dates only: blank cells are kept as "" by
    # keep_default_na=False and would otherwise sort first, take day_idx 1
    # and shift every real date by one.
    valid_dates = [d for d in data[COL_DATE].tolist() if not _is_missing(d)]
    day_map = build_day_index_map(valid_dates)
    data["day_idx"] = data[COL_DATE].map(day_map)

    has_start_min_col = (COL_START_MIN in data.columns)

    # 3) Accumulator:
    # agg[start_big][(day_idx, slot_id)] = list of sublists
    # sublist = [vehicle_id, start_minute, [[small_id, uwp_weight], ...]]
    agg = defaultdict(lambda: defaultdict(list))

    # 4) Iterate rows
    for _, row in data.iterrows():
        date_str   = row.get(COL_DATE, "")
        slot_val   = row.get(COL_SLOT, None)
        labels_str = row.get(COL_BOX_LABELS, "")
        file_val   = row.get(COL_FILE, "")

        # _is_missing also covers pd.NA (a column absent from some input
        # files), for which a plain truth test would raise TypeError.
        if (
            _is_missing(date_str)
            or _is_missing(slot_val)
            or _is_missing(labels_str)
            or _is_missing(file_val)
        ):
            continue

        try:
            slot_id = int(slot_val)
        except Exception:
            continue

        tokens = [tok.strip() for tok in str(labels_str).split(";") if tok.strip()]
        if not tokens:
            continue

        parsed = [parse_label_with_time(tok) for tok in tokens]
        parsed = [x for x in parsed if x is not None]
        if not parsed:
            continue

        # Start big from FIRST token
        start_big = parsed[0][0]

        # Destination time = max reaching time among tokens
        T_final = max(t for (_, _, t) in parsed)
        # If no time progression, we end up with empty weights for this row
        if T_final <= 0:
            uwp_pairs: List[List[float]] = []
        else:
            # Keep only cross-big visits
            cross = [(bid, sid, t) for (bid, sid, t) in parsed if bid != start_big]
            if not cross:
                uwp_pairs = []
            else:
                times_rel = [t / T_final for (_, _, t) in cross]
                w_list = uwp_weights(times_rel)
                uwp_pairs = [
                    [int(sid), round(float(w), ROUND_DEC)]
                    for ((_, sid, _), w) in zip(cross, w_list)
                ]

        # minute (-1 when the column is absent, blank or not an integer)
        if has_start_min_col:
            try:
                m = row.get(COL_START_MIN, -1)
                start_min = int(m) if pd.notna(m) else -1
            except Exception:
                start_min = -1
        else:
            start_min = -1

        # vehicle id
        vehicle_id = extract_vehicle_id(str(file_val))

        # sublist for this row
        sublist = [vehicle_id, start_min, uwp_pairs]

        day_idx = int(row["day_idx"])
        agg[start_big][(day_idx, slot_id)].append(sublist)

    # 5) Write one CSV per start_big.
    # The JSON is dumped with ensure_ascii=False, so fix the encoding rather
    # than rely on the platform default.
    for start_big, bucket in agg.items():
        out_path = os.path.join(output_dir, f"big_{start_big}.csv")
        with open(out_path, "w", newline="", encoding="utf-8") as f:
            w = csv.writer(f)
            w.writerow(["day_idx", "slot_id", "entries_json"])
            for (day_idx, slot_id) in sorted(bucket.keys()):
                entries = bucket[(day_idx, slot_id)]  # list of sublists
                entries_json = json.dumps(entries, separators=(",", ":"), ensure_ascii=False)
                w.writerow([day_idx, slot_id, entries_json])
        print(f"[OK] Wrote {os.path.basename(out_path)} with {len(bucket)} keys")

    # Also write reference day index map (header only when no date was valid)
    day_map_path = os.path.join(output_dir, "day_index_map.csv")
    pd.DataFrame(
        [{"date": d, "day_idx": idx} for d, idx in sorted(day_map.items(), key=lambda x: x[1])],
        columns=["date", "day_idx"],
    ).to_csv(day_map_path, index=False)
    print(f"[OK] Wrote day_index_map.csv at {day_map_path}")


def main():
    """Make sure OUTPUT_DIR exists, then aggregate the files found in INPUT_DIR."""
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    process(input_dir=INPUT_DIR, output_dir=OUTPUT_DIR)


if __name__ == "__main__":
    main()


In [None]:
"""
Build a nested Python dictionary from multiple big_<ID>.csv files, i.e. from the subset of
the city's cellular areas (big boxes) that are considered together in the experiments.

INPUT:
  - BIGBOX_IDS: list of big box IDs to load
  - Each file: INPUT_DIR / big_<ID>.csv  (written by the previous step into its OUTPUT_DIR)
      Columns: day_idx, slot_id, entries_json
      entries_json = list of sublists: [vehicle_id, start_minute, [[small_id, uwp_weight], ...]]

OUTPUT:
  - A Python dictionary with structure:
        result = {
            big_id_1: {
                (day_idx, slot_id): [
                    [vehicle_id, start_minute, [[small_id, uwp_weight], ...]],
                    ...
                ],
                ...
            },
            big_id_2: {
                ...
            },
            ...
        }
"""

import os
import json
import pandas as pd

# ---------------- CONFIG ----------------
INPUT_DIR = "bigbox_slot_dicts_uwp"  # folder with big_<ID>.csv
BIGBOX_IDS = [21,22]   # big box (cellular area) IDs to load, here 21 and 22
# ----------------------------------------


def load_bigbox_dict(input_dir: str, big_ids: list) -> dict:
    """
    Load multiple big_<ID>.csv files into a nested dictionary.

    Parameters
    ----------
    input_dir : str
        Folder containing the big_<ID>.csv files.
    big_ids : list
        Big box IDs to load; IDs without a file are reported and skipped.

    Returns
    -------
    dict
        {big_id: {(day_idx, slot_id): entries}} where ``entries`` is the
        decoded ``entries_json`` value of that row. Rows whose
        ``entries_json`` is blank or not valid JSON are reported and stored
        with an empty list.
    """
    result = {}

    for bid in big_ids:
        path = os.path.join(input_dir, f"big_{bid}.csv")
        if not os.path.exists(path):
            print(f"[WARN] File not found: {path}")
            continue

        df = pd.read_csv(path, dtype={"day_idx": int, "slot_id": int})
        big_dict = {}

        for _, row in df.iterrows():
            day_idx = int(row["day_idx"])
            slot_id = int(row["slot_id"])
            key = (day_idx, slot_id)

            try:
                entries = json.loads(row["entries_json"])
            except (TypeError, ValueError):
                # ValueError covers json.JSONDecodeError (malformed text);
                # TypeError covers a blank cell, which pandas reads as a
                # float NaN and json.loads refuses.
                print(f"[ERROR] JSON parse failed for big {bid}, day {day_idx}, slot {slot_id}")
                entries = []

            big_dict[key] = entries

        result[bid] = big_dict
        print(f"[OK] Loaded big_{bid}.csv → {len(big_dict)} time slots")

    return result


# Load the selected big boxes: {big_id: {(day_idx, slot_id): entries}}
bigbox_data = load_bigbox_dict(INPUT_DIR, BIGBOX_IDS)

# ✅ Example usage: iterate over dictionary (prints a short preview only)
for bid, slots in bigbox_data.items():
    print(f"\n=== BIG BOX {bid} ===")
    for (day_idx, slot_id), entries in list(slots.items())[:3]:  # show first 3
        print(f"Day {day_idx}, Slot {slot_id}: {len(entries)} vehicle records")
        for vehicle_entry in entries[:2]:  # show first 2 vehicles
            print("   ", vehicle_entry)

In [None]:
import pandas as pd
import copy

def load_reverse_map(csv_path: str) -> dict:
    """
    Read reverse_small_ids.csv and return {small_id: col_idx}.

    Both columns are read as integers; if a small_id occurs more than once,
    the last row wins.
    """
    table = pd.read_csv(csv_path, dtype={"small_id": int, "col_idx": int})
    small_ids = table["small_id"].tolist()
    col_indices = table["col_idx"].tolist()
    return dict(zip(small_ids, col_indices))

def map_small_ids_to_global(bigbox_data: dict,
                            reverse_map_csv: str,
                            skip_unmapped: bool = True,
                            inplace: bool = False) -> dict:
    """
    Rewrite every ``[small_id, weight]`` pair so that it carries the global
    col_idx from reverse_small_ids.csv instead of the small_id.

    Parameters
    ----------
    bigbox_data : dict
        { big_id: { (day_idx, slot_id): [ [vehicle_id, start_minute, [[small_id, weight], ...]], ... ] } }
    reverse_map_csv : str
        Path to a CSV with the columns small_id,col_idx.
    skip_unmapped : bool
        True: pairs whose small_id has no mapping are dropped.
        False: such pairs are kept with their original small_id.
    inplace : bool
        True: ``bigbox_data`` itself is modified and returned.
        False: a deep copy is modified and returned.

    Returns
    -------
    dict
        Same nesting as the input; ids are col_idx wherever a mapping exists.
        A one-line summary of mapped/unmapped counts is printed.
    """
    rev_map = load_reverse_map(reverse_map_csv)

    target = copy.deepcopy(bigbox_data) if not inplace else bigbox_data

    n_pairs = 0
    n_unmapped = 0

    for slot_dict in target.values():
        for vehicle_list in slot_dict.values():
            for entry in vehicle_list:
                # entry layout: [vehicle_id, start_minute, pairs]
                remapped = []
                for sid, w in entry[2]:
                    n_pairs += 1
                    sid_int = int(sid)
                    if sid_int not in rev_map:
                        n_unmapped += 1
                        if skip_unmapped:
                            continue  # drop the pair
                        remapped.append([sid_int, float(w)])
                    else:
                        remapped.append([rev_map[sid_int], float(w)])
                entry[2] = remapped

    if n_unmapped:
        print(f"[INFO] Mapping done. {n_unmapped}/{n_pairs} pairs had no mapping "
              f"({n_unmapped/n_pairs:.2%}). {'Dropped' if skip_unmapped else 'Kept'} unmapped.")
    else:
        print(f"[INFO] Mapping done. All {n_pairs} pairs mapped.")

    return target


In [None]:
# Path to the reverse map CSV (columns: small_id,col_idx)
reverse_map_csv = "UWP_ENUM_Minutes/reverse_small_ids.csv"

# Create a new mapped dictionary in which small_ids are replaced by col_idx
# (inplace=False works on a deep copy, so bigbox_data is NOT modified)
mapped_data = map_small_ids_to_global(
    bigbox_data, 
    reverse_map_csv,
    skip_unmapped=True,   # drop small IDs not found in the reverse map
    inplace=False         # set to True if you want to overwrite bigbox_data
)

In [None]:
import numpy as np
# Wrap the mapped dictionary in a one-element list and save it with NumPy.
# NOTE(review): the list holds a dict, so this presumably ends up as a pickled
# object array that needs np.load(..., allow_pickle=True) to read — confirm.
car4=[]
car4.append(mapped_data)
np.save('car4real.npy',car4)