diff --git a/examples/workers/l2/vector_add/main.py b/examples/workers/l2/vector_add/main.py
index 6ad1480ad..6db910b84 100644
--- a/examples/workers/l2/vector_add/main.py
+++ b/examples/workers/l2/vector_add/main.py
@@ -127,7 +127,7 @@ def build_chip_callable(platform: str) -> ChipCallable:
     )
 
 
-def _run(worker: Worker, chip_cid: int) -> None:
+def _run(worker: Worker, chip_cid: int):
     """Allocate device memory, copy inputs, execute, copy outputs back, verify."""
     # --- 1. Prepare host arrays ---
     torch.manual_seed(42)
@@ -155,7 +155,8 @@ def _run(worker: Worker, chip_cid: int) -> None:
     # --- 4. Run. CallConfig() defaults are fine for this kernel. ---
     config = CallConfig()
     print("[vector_add] running on device...")
-    worker.run(chip_cid, args, config)
+    timing = worker.run(chip_cid, args, config)
+    print(f"[vector_add] {timing}")
 
     # --- 5. D2H copy back + verify ---
     worker.copy_from(host_out.data_ptr(), dev_out, NBYTES)
@@ -169,6 +170,7 @@ def _run(worker: Worker, chip_cid: int) -> None:
     print(f"[vector_add] max |host_out - expected| = {max_diff:.3e}")
     assert torch.allclose(host_out, expected, rtol=1e-5, atol=1e-5)
     print("[vector_add] golden check PASSED")
+    return timing
 
 
 def run(platform: str, device_id: int) -> int:
diff --git a/examples/workers/l2/vector_add/test_run_timing.py b/examples/workers/l2/vector_add/test_run_timing.py
new file mode 100644
index 000000000..a3944c087
--- /dev/null
+++ b/examples/workers/l2/vector_add/test_run_timing.py
@@ -0,0 +1,115 @@
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Hardware ST asserting Worker.run returns a usable RunTiming.
+
+Reuses the vector_add example's kernel + ChipCallable build so this test
+doesn't drag in its own kernel sources. The contract being verified:
+
+    * `worker.run(...)` returns a `RunTiming` instance (not None).
+    * `host_wall_us` is strictly positive — there's no way a real dispatch
+      took zero steady-clock time, so this would catch a regression where
+      the C ABI stopped writing the timing back to the out-param.
+    * `device_wall_us` is strictly positive on the default build, even
+      without `--enable-l2-swimlane`. The orch_summary write was decoupled
+      from the swimlane shared-region path so device timing is always
+      available when PTO2_PROFILING is compiled in (default).
+    * `host_wall_us > device_wall_us` — host wall must wrap the device
+      dispatch, so the host clock should always read longer.
+"""
+
+import pytest
+from _task_interface import RunTiming  # pyright: ignore[reportMissingImports]
+from simpler.task_interface import CallConfig, ChipStorageTaskArgs, ContinuousTensor, DataType
+from simpler.worker import Worker
+
+from .main import N_COLS, N_ELEMS, N_ROWS, NBYTES, build_chip_callable
+
+
+def _drive_one_run(platform: str, device_id: int, *, enable_l2_swimlane: bool = False):
+    import torch  # noqa: PLC0415
+
+    worker = Worker(
+        level=2,
+        platform=platform,
+        runtime="tensormap_and_ringbuffer",
+        device_id=device_id,
+    )
+    chip_callable = build_chip_callable(platform)
+    chip_cid = worker.register(chip_callable)
+    worker.init()
+    try:
+        # Use deterministic inputs so the run never accidentally hits a
+        # degenerate kernel path that fails to publish a perf record.
+        host_a = torch.full((N_ROWS, N_COLS), 1.0, dtype=torch.float32)
+        host_b = torch.full((N_ROWS, N_COLS), 2.0, dtype=torch.float32)
+
+        dev_a = worker.malloc(NBYTES)
+        dev_b = worker.malloc(NBYTES)
+        dev_out = worker.malloc(NBYTES)
+        worker.copy_to(dev_a, host_a.data_ptr(), NBYTES)
+        worker.copy_to(dev_b, host_b.data_ptr(), NBYTES)
+
+        args = ChipStorageTaskArgs()
+        args.add_tensor(ContinuousTensor.make(dev_a, (N_ROWS, N_COLS), DataType.FLOAT32))
+        args.add_tensor(ContinuousTensor.make(dev_b, (N_ROWS, N_COLS), DataType.FLOAT32))
+        args.add_tensor(ContinuousTensor.make(dev_out, (N_ROWS, N_COLS), DataType.FLOAT32))
+
+        config = CallConfig()
+        config.enable_l2_swimlane = enable_l2_swimlane
+
+        timing = worker.run(chip_cid, args, config)
+
+        # Verify the output is sane (so we know the kernel actually ran and
+        # the timing isn't from an early-error path).
+        host_out = torch.zeros(N_ROWS, N_COLS, dtype=torch.float32)
+        worker.copy_from(host_out.data_ptr(), dev_out, NBYTES)
+        worker.free(dev_a)
+        worker.free(dev_b)
+        worker.free(dev_out)
+        assert torch.allclose(host_out, host_a + host_b, rtol=1e-5, atol=1e-5), (
+            f"vector_add kernel output diverged; max |a+b - out| = "
+            f"{float(torch.max(torch.abs(host_out - (host_a + host_b)))):.3e} "
+            f"(N_ELEMS={N_ELEMS})"
+        )
+        return timing
+    finally:
+        worker.close()
+
+
+@pytest.mark.platforms(["a2a3sim", "a2a3"])
+@pytest.mark.runtime("tensormap_and_ringbuffer")
+@pytest.mark.device_count(1)
+def test_worker_run_returns_run_timing(st_platform, st_device_ids):
+    timing = _drive_one_run(st_platform, int(st_device_ids[0]))
+
+    assert isinstance(timing, RunTiming), (
+        f"Worker.run must return a RunTiming (got {type(timing).__name__}); "
+        f"a None return means the ChipWorker Python wrapper dropped the C++ return value."
+    )
+    # Host wall is measured with steady_clock around the dispatch; even on
+    # sim it covers thread + IPC overhead so the only way to see 0 is a
+    # bug in the out-param plumbing.
+    assert timing.host_wall_us > 0.0, f"host_wall_us must be > 0, got {timing.host_wall_us}"
+    assert timing.host_wall_ns > 0, f"host_wall_ns must be > 0, got {timing.host_wall_ns}"
+    # device_wall must also be > 0 without --enable-l2-swimlane after the
+    # Phase B decoupling: orch_summary is written unconditionally when
+    # PTO2_PROFILING is on (default build). Hitting 0 here means either:
+    #   - the AICPU's l2_perf_aicpu_write_orch_summary path regressed back
+    #     under an is_l2_swimlane_enabled() gate, or
+    #   - the host stopped reading the phase header after the run.
+    assert timing.device_wall_us > 0.0, (
+        f"device_wall_us must be > 0 on the default PTO2_PROFILING build, got {timing.device_wall_us}; "
+        f"regression in the orch_summary capture path."
+    )
+    assert timing.device_wall_ns > 0
+    # Host wall must wrap the device dispatch — anything else points at
+    # a steady-clock / cycle-conversion bug.
+    assert timing.host_wall_us >= timing.device_wall_us, (
+        f"host_wall_us ({timing.host_wall_us}) must wrap device_wall_us ({timing.device_wall_us})"
+    )
diff --git a/python/bindings/task_interface.cpp b/python/bindings/task_interface.cpp
index a6f07c48c..5d4d874fa 100644
--- a/python/bindings/task_interface.cpp
+++ b/python/bindings/task_interface.cpp
@@ -652,6 +652,57 @@ NB_MODULE(_task_interface, m) {
     // one, change the other.
     m.attr("DEFAULT_LOG_THRESHOLD") = 20;  // V5 = Python INFO
 
+    // --- RunTiming ---
+    // Returned by ChipWorker.run_prepared* / Worker.run. Cycles → ns conversion
+    // happens on the platform side (frequency known there); units exposed to
+    // Python are µs as floats to match historical benchmark_rounds.sh output.
+    nb::class_<RunTiming>(m, "RunTiming")
+        .def(nb::init<>())
+        .def(
+            "__init__",
+            [](RunTiming *self, uint64_t host_wall_ns, uint64_t device_wall_ns) {
+                new (self) RunTiming{host_wall_ns, device_wall_ns};
+            },
+            nb::arg("host_wall_ns"), nb::arg("device_wall_ns") = 0,
+            "Construct with explicit ns values (used by the Python Worker.run "
+            "wrapper to surface host-side timing for L3+ DAGs)."
+        )
+        .def_prop_ro(
+            "host_wall_us",
+            [](const RunTiming &t) {
+                return t.host_wall_ns / 1000.0;
+            },
+            "Host steady-clock wall around the dispatch, in microseconds."
+        )
+        .def_prop_ro(
+            "device_wall_us",
+            [](const RunTiming &t) {
+                return t.device_wall_ns / 1000.0;
+            },
+            "On-NPU wall (orch end - orch start), in microseconds. Populated whenever the "
+            "runtime was built with PTO2_PROFILING (the default); independent of "
+            "enable_l2_swimlane after the orch_summary capture was decoupled from the "
+            "swimlane shared region. Zero only on a PTO2_PROFILING-off build."
+        )
+        .def_prop_ro(
+            "host_wall_ns",
+            [](const RunTiming &t) {
+                return t.host_wall_ns;
+            }
+        )
+        .def_prop_ro(
+            "device_wall_ns",
+            [](const RunTiming &t) {
+                return t.device_wall_ns;
+            }
+        )
+        .def("__repr__", [](const RunTiming &t) {
+            std::ostringstream os;
+            os << "RunTiming(host_wall_us=" << t.host_wall_ns / 1000.0
+               << ", device_wall_us=" << t.device_wall_ns / 1000.0 << ")";
+            return os.str();
+        });
+
     // --- ChipWorker ---
     nb::class_<ChipWorker>(m, "_ChipWorker")
         .def(nb::init<>())
@@ -685,19 +736,21 @@ NB_MODULE(_task_interface, m) {
         .def(
             "run",
             [](ChipWorker &self, int32_t callable_id, ChipStorageTaskArgs &args, const CallConfig &config) {
-                self.run(callable_id, &args, config);
+                return self.run(callable_id, &args, config);
             },
             nb::arg("callable_id"), nb::arg("args"), nb::arg("config"),
-            "Launch a callable_id previously staged via prepare_callable."
+            "Launch a callable_id previously staged via prepare_callable. "
+            "Returns RunTiming with host/device wall."
         )
         .def(
             "run",
             [](ChipWorker &self, int32_t callable_id, TaskArgs &args, const CallConfig &config) {
                 TaskArgsView view = make_view(args);
-                self.run(callable_id, view, config);
+                return self.run(callable_id, view, config);
             },
             nb::arg("callable_id"), nb::arg("args"), nb::arg("config"),
-            "Launch a callable_id from a TaskArgs (used for in-process callers)."
+            "Launch a callable_id from a TaskArgs (used for in-process callers). "
+            "Returns RunTiming."
         )
         .def(
             "run_prepared_from_blob",
@@ -710,7 +763,7 @@ NB_MODULE(_task_interface, m) {
                 // loops never re-implement the tensor/scalar layout in Python
                 // (where it has historically dropped fields like child_memory).
                 TaskArgsView view = read_blob(reinterpret_cast<const uint8_t *>(args_blob_ptr), blob_capacity);
-                self.run(callable_id, view, config);
+                return self.run(callable_id, view, config);
             },
             nb::arg("callable_id"), nb::arg("args_blob_ptr"), nb::arg("blob_capacity"), nb::arg("config"),
             "Launch a callable_id from a raw mailbox-blob pointer + capacity "
diff --git a/python/simpler/task_interface.py b/python/simpler/task_interface.py
index 2e0398898..49e8471d6 100644
--- a/python/simpler/task_interface.py
+++ b/python/simpler/task_interface.py
@@ -521,12 +521,14 @@ def run(self, callable_id, args, config=None, **kwargs):
             args: ChipStorageTaskArgs for this invocation.
             config: Optional CallConfig. If None, a default is created.
             **kwargs: Overrides applied to config (e.g. block_dim=24).
+
+        Returns a :class:`RunTiming` with host + device wall.
         """
         if config is None:
             config = CallConfig()
         for k, v in kwargs.items():
             setattr(config, k, v)
-        self._impl.run(int(callable_id), args, config)
+        return self._impl.run(int(callable_id), args, config)
 
     def unregister_callable(self, callable_id):
         """Drop prepared state for ``callable_id`` and release its orch SO share."""
diff --git a/python/simpler/worker.py b/python/simpler/worker.py
index 64c678236..ba02d5e86 100644
--- a/python/simpler/worker.py
+++ b/python/simpler/worker.py
@@ -70,6 +70,7 @@ def my_l4_orch(orch, args, config):
     MAX_REGISTERED_CALLABLE_IDS,
     ChipBootstrapChannel,
     ChipBootstrapMailboxState,
+    RunTiming,
     _mailbox_load_i32,
     _mailbox_store_i32,
     read_args_from_blob,
@@ -1444,7 +1445,7 @@ def copy_from(self, dst: int, src: int, size: int, worker_id: int = 0) -> None:
     # run — uniform entry point
     # ------------------------------------------------------------------
 
-    def run(self, callable, args=None, config=None) -> None:
+    def run(self, callable, args=None, config=None) -> RunTiming:
         """Execute one task (L2) or one DAG (L3+) synchronously.
 
         Dispatch:
@@ -1455,35 +1456,48 @@ def run(self, callable, args=None, config=None) -> None:
 
         ``args``  : TaskArgs (optional)
         ``config``: CallConfig (optional, default-constructed if None)
+
+        Returns a :class:`RunTiming` with ``host_wall_us`` (Python wall-clock
+        around the dispatch) and ``device_wall_us`` (on-NPU orchestrator wall,
+        populated whenever the runtime was built with ``PTO2_PROFILING`` —
+        the default build has it on). For L3+ DAGs, ``host_wall_us`` covers
+        the whole orch fn and ``device_wall_us`` is unset (0) — per-task
+        device timings are not aggregated here.
         """
         assert self._initialized, "Worker not initialized; call init() first"
         cfg = config if config is not None else CallConfig()
 
         if self.level == 2:
             assert self._chip_worker is not None
-            self._chip_worker.run(int(callable), args, cfg)
-        else:
-            self._start_hierarchical()
-            assert self._orch is not None
-            assert self._worker is not None
-            # Drop any error stashed by a previous run() so this call starts
-            # clean. drain() rethrows on the way out; every successful run()
-            # leaves the error slot empty, but an unrelated caller may have
-            # poked it.
-            self._orch._clear_error()
-            self._orch._scope_begin()
-            try:
-                callable(self._orch, args, cfg)
-            finally:
-                # Always release scope refs and drain so ring slots aren't
-                # stranded when the orch fn raises mid-DAG. drain() also
-                # rethrows the first dispatch failure for this run — that
-                # is how child-task exceptions surface to the caller of
-                # Worker.run(). scope_end deliberately does NOT throw: if
-                # it did, released refs would be incomplete and drain
-                # would hang on in-flight tasks.
-                self._orch._scope_end()
-                self._orch._drain()
+            return self._chip_worker.run(int(callable), args, cfg)
+
+        self._start_hierarchical()
+        assert self._orch is not None
+        assert self._worker is not None
+        # Drop any error stashed by a previous run() so this call starts
+        # clean. drain() rethrows on the way out; every successful run()
+        # leaves the error slot empty, but an unrelated caller may have
+        # poked it.
+        self._orch._clear_error()
+        self._orch._scope_begin()
+        t_start = time.perf_counter_ns()
+        try:
+            callable(self._orch, args, cfg)
+        finally:
+            # Always release scope refs and drain so ring slots aren't
+            # stranded when the orch fn raises mid-DAG. drain() also
+            # rethrows the first dispatch failure for this run — that
+            # is how child-task exceptions surface to the caller of
+            # Worker.run(). scope_end deliberately does NOT throw: if
+            # it did, released refs would be incomplete and drain
+            # would hang on in-flight tasks.
+            self._orch._scope_end()
+            self._orch._drain()
+        # device_wall stays 0 for L3+: aggregating per-task device cycles
+        # across a DAG isn't implemented here (would need accumulation in the
+        # ring scheduler). Callers wanting per-task device wall should issue
+        # individual run calls.
+        return RunTiming(time.perf_counter_ns() - t_start, 0)
 
     def prepare_callable(self, callable_id: int, callable) -> None:
         """L2 only: pre-stage a callable under ``callable_id`` (see
diff --git a/simpler_setup/scene_test.py b/simpler_setup/scene_test.py
index e681fa167..5a994b332 100644
--- a/simpler_setup/scene_test.py
+++ b/simpler_setup/scene_test.py
@@ -325,6 +325,69 @@ def _temporary_env(env_updates):
                 os.environ[k] = v
 
 
+def _log_round_timings(timings):
+    """Print per-round + summary host/device wall (µs) for multi-round runs.
+
+    Replaces the device-log scraping that ``tools/benchmark_rounds.sh`` used to
+    do — Worker.run now returns the timing directly, so the per-round table
+    can be emitted by the framework without grep+awk over device logs.
+
+    Output goes to ``print`` (not ``logger.info``) because benchmark numbers
+    are a user-facing artifact and the project's default log level suppresses
+    INFO in standalone test_*.py runs.
+
+    ``timings`` is a list of (host_wall_us, device_wall_us) tuples. The device
+    column reports 0 when the runtime was built without PTO2_PROFILING (the
+    default build has it on) or when an L3+ DAG is in use (per-task device
+    cycles aren't aggregated).
+    """
+    if not timings:
+        return
+    n = len(timings)
+    host_vals = sorted(t[0] for t in timings)
+    dev_vals = sorted(t[1] for t in timings)
+    host_avg = sum(host_vals) / n
+    # Device avg averages over non-zero rounds only. When --rounds > 1 the
+    # framework restricts enable_l2_swimlane (and thus orch_summary capture)
+    # to round 0, so the other rounds report 0; including those zeros in the
+    # average would silently halve / quarter the reported device wall and
+    # mislead the benchmark consumer. dev_count carries the sample size into
+    # the summary line for transparency.
+    dev_nonzero = [v for v in dev_vals if v > 0.0]
+    dev_count = len(dev_nonzero)
+    dev_avg = (sum(dev_nonzero) / dev_count) if dev_count else 0.0
+    show_device = dev_count > 0
+
+    header = f"  {'Round':<6}  {'Host (us)':>12}"
+    if show_device:
+        header += f"  {'Device (us)':>12}"
+    print(header)
+    print("  " + "-" * (len(header) - 2))
+    for i, (h, d) in enumerate(timings):
+        line = f"  {i:<6d}  {h:>12.1f}"
+        if show_device:
+            line += f"  {d:>12.1f}"
+        print(line)
+    summary = f"  Avg Host: {host_avg:.1f} us"
+    if show_device:
+        summary += f"  |  Avg Device: {dev_avg:.1f} us [{dev_count}/{n} rounds captured]"
+    summary += f"  ({n} rounds)"
+    print(summary)
+
+    trim = 10
+    if n > 2 * trim:
+        tc = n - 2 * trim
+        host_trim = sum(host_vals[trim:-trim]) / tc
+        msg = f"  Trimmed Avg Host: {host_trim:.1f} us"
+        if show_device and dev_count > 2 * trim:
+            dev_nonzero_sorted = sorted(dev_nonzero)
+            dev_trim_count = dev_count - 2 * trim
+            dev_trim = sum(dev_nonzero_sorted[trim:-trim]) / dev_trim_count
+            msg += f"  |  Trimmed Avg Device: {dev_trim:.1f} us"
+        msg += f"  (dropped {trim} low + {trim} high, {tc} rounds used)"
+        print(msg)
+
+
 def _resolve_callable_paths(cls, cls_dir):
     """Resolve relative source paths in CALLABLE against cls_dir."""
     callable_spec = cls.CALLABLE
@@ -909,26 +972,37 @@ def _run_and_validate_l2(
                 initial_outputs[name] = getattr(test_args, name).clone()
 
         # Execute rounds
+        timings = []  # populated only when rounds > 1
         for round_idx in range(rounds):
             if round_idx > 0:
                 for name, initial in initial_outputs.items():
                     getattr(test_args, name).copy_(initial)
 
+            # enable_l2_swimlane / enable_dep_gen are already forced False by
+            # the upstream gate in test_run / run_module when rounds > 1, so an
+            # extra `and round_idx == 0` here is dead code; pass them through
+            # verbatim. (If the upstream gate is ever relaxed, restore the
+            # per-round masking here.)
             config = self._build_config(
                 config_dict,
-                enable_l2_swimlane=(enable_l2_swimlane if round_idx == 0 else 0),
+                enable_l2_swimlane=enable_l2_swimlane,
                 enable_dump_tensor=enable_dump_tensor,
                 enable_pmu=enable_pmu,
-                enable_dep_gen=(enable_dep_gen and round_idx == 0),
+                enable_dep_gen=enable_dep_gen,
                 output_prefix=output_prefix,
             )
 
             with _temporary_env(self._resolve_env()):
-                worker.run(cid, chip_args, config=config)
+                timing = worker.run(cid, chip_args, config=config)
+            if rounds > 1 and timing is not None:
+                timings.append((timing.host_wall_us, timing.device_wall_us))
 
             if not skip_golden:
                 _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL)
 
+        if timings:
+            _log_round_timings(timings)
+
     def _run_and_validate_l3(  # noqa: PLR0913 -- threads CLI diagnostic flags + L3 ns context
         self,
         worker,
@@ -981,17 +1055,20 @@ def _run_and_validate_l3(  # noqa: PLR0913 -- threads CLI diagnostic flags + L3
         orch_fn = self.CALLABLE["orchestration"]
 
         # Execute rounds
+        timings = []
         for round_idx in range(rounds):
             if round_idx > 0:
                 for name, initial in initial_tensors.items():
                     getattr(test_args, name).copy_(initial)
 
+            # See _run_and_validate_l2: the per-round masking is dead code
+            # under the existing upstream gate. Keep parity by passing through.
             config = self._build_config(
                 config_dict,
-                enable_l2_swimlane=(enable_l2_swimlane if round_idx == 0 else 0),
+                enable_l2_swimlane=enable_l2_swimlane,
                 enable_dump_tensor=enable_dump_tensor,
                 enable_pmu=enable_pmu,
-                enable_dep_gen=(enable_dep_gen and round_idx == 0),
+                enable_dep_gen=enable_dep_gen,
                 output_prefix=output_prefix,
             )
 
@@ -1001,11 +1078,18 @@ def task_orch(orch, _args, _cfg, _ns=ns, _test_args=test_args, _config=config):
                 orch_fn(orch, _ns, _test_args, _config)
 
             with _temporary_env(self._resolve_env()):
-                worker.run(task_orch)
+                timing = worker.run(task_orch)
+            if rounds > 1 and timing is not None:
+                # L3+ DAGs surface host wall only; device_wall_us is 0 because
+                # per-task device cycles aren't aggregated up to Worker.run.
+                timings.append((timing.host_wall_us, timing.device_wall_us))
 
             if not skip_golden:
                 _compare_outputs(test_args, golden_args, all_tensor_names, self.RTOL, self.ATOL)
 
+        if timings:
+            _log_round_timings(timings)
+
     # ------------------------------------------------------------------
     # pytest auto test method
     # ------------------------------------------------------------------
diff --git a/src/a2a3/platform/include/common/kernel_args.h b/src/a2a3/platform/include/common/kernel_args.h
index 9cffcd169..99e72fdea 100644
--- a/src/a2a3/platform/include/common/kernel_args.h
+++ b/src/a2a3/platform/include/common/kernel_args.h
@@ -95,6 +95,19 @@ struct KernelArgs {
     uint32_t log_info_v{5};             // INFO verbosity threshold (0..9); default V5
     uint32_t enable_profiling_flag{0};  // Profiling umbrella bitmask; bit0=dump_tensor, bit1=l2_swimlane, bit2=pmu
     uint32_t _pad{0};                   // Alignment padding
+
+    // Device pointer to an 8-byte buffer that the platform AICPU entry writes
+    // the run-wall (ns) into. Allocated once at simpler_init, kept resident.
+    // Onboard AICPU receives KernelArgs as a CANN-private copy (see
+    // rtAicpuKernelLaunchExWithArgs in launch_aicpu_kernel), so an inline
+    // field on KernelArgs is write-only from AICPU — host has no way to read
+    // it back. The dedicated device buffer is host-allocated and the address
+    // travels via this field; AICPU writes `*(uint64_t*)device_wall_data_base
+    // = wall_ns`, host pulls the 8 bytes via rtMemcpy(... DEVICE_TO_HOST)
+    // after stream sync. Sim's "device pointer" is a host malloc'd uint64
+    // (no special-casing — write-through works because sim AICPU and host
+    // share memory). Zero when the buffer was not allocated.
+    uint64_t device_wall_data_base{0};
 };
 
 #ifdef __cplusplus
diff --git a/src/a2a3/platform/onboard/aicpu/kernel.cpp b/src/a2a3/platform/onboard/aicpu/kernel.cpp
index 843dc9fd6..32e24a526 100644
--- a/src/a2a3/platform/onboard/aicpu/kernel.cpp
+++ b/src/a2a3/platform/onboard/aicpu/kernel.cpp
@@ -15,6 +15,7 @@
 #include "common/platform_config.h"
 #include "aicpu/dep_gen_collector_aicpu.h"
 #include "aicpu/device_log.h"
+#include "aicpu/device_time.h"
 #include "aicpu/l2_perf_collector_aicpu.h"
 #include "aicpu/pmu_collector_aicpu.h"
 #include "aicpu/platform_regs.h"
@@ -22,6 +23,15 @@
 #include "aicpu/tensor_dump_aicpu.h"
 #include "runtime.h"
 
+// Run-wall capture: g_device_start_cycle is set once in
+// DynTileFwkBackendKernelServerInit (single-threaded launch); each thread
+// of the multi-threaded DynTileFwkBackendKernelServer writes the converted
+// (end - start) into KernelArgs.device_wall_ns on exit. Plain stores —
+// last-writer-wins is fine for wall measurement (concurrent exiting threads'
+// `my_end` values differ by µs, the final overwrite is within benchmark
+// noise of the true max-end - min-start).
+static uint64_t g_device_start_cycle = 0;
+
 // Forward declaration of aicpu_execute (implemented in aicpu_executor.cpp)
 extern "C" int aicpu_execute(Runtime *arg);
 
@@ -55,6 +65,15 @@ extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelSer
     set_log_level(static_cast<int>(k_args->log_level));
     set_log_info_v(static_cast<int>(k_args->log_info_v));
 
+    // Init is launched single-threaded (block_dim=1), so the race-free spot
+    // to capture run start and reset the wall accumulator. Subsequent
+    // DynTileFwkBackendKernelServer threads stamp end on their way out, via
+    // the device-resident 8-byte buffer addressed by device_wall_data_base.
+    g_device_start_cycle = get_sys_cnt_aicpu();
+    if (k_args->device_wall_data_base != 0) {
+        *reinterpret_cast<uint64_t *>(k_args->device_wall_data_base) = 0;
+    }
+
     LOG_INFO_V0("%s", "Runtime Executor Init: Initializing AICPU kernel");
     return 0;
 }
@@ -117,5 +136,14 @@ extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelSer
     }
     LOG_INFO_V0("%s", "DynTileFwkBackendKernelServer: aicpu_execute completed successfully");
 
+    // Stamp end into the device_wall buffer (addressed via
+    // device_wall_data_base). Last-writer-wins across threads — wall
+    // measurements are µs-scale tolerant, see g_device_start_cycle comment.
+    uint64_t my_end = get_sys_cnt_aicpu();
+    if (k_args->device_wall_data_base != 0 && my_end > g_device_start_cycle) {
+        *reinterpret_cast<uint64_t *>(k_args->device_wall_data_base) =
+            static_cast<uint64_t>(cycles_to_us(my_end - g_device_start_cycle) * 1000.0);
+    }
+
     return rc;
 }
diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp
index 7e51b5c94..933247b4f 100644
--- a/src/a2a3/platform/onboard/host/device_runner.cpp
+++ b/src/a2a3/platform/onboard/host/device_runner.cpp
@@ -22,6 +22,7 @@
 #include <dlfcn.h>
 
 #include <cassert>
+#include <cstddef>
 #include <iostream>
 #include <string>
 #include <vector>
@@ -512,6 +513,19 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
         return rc;
     }
 
+    // Lazy-allocate the 8-byte device_wall buffer on first run. AICPU writes
+    // the run wall (ns) into this buffer via KernelArgs::device_wall_data_base;
+    // host pulls it back via rtMemcpy D2H after stream sync (see post-sync
+    // block below). Kept resident across runs; freed in finalize.
+    if (device_wall_dev_ptr_ == nullptr) {
+        device_wall_dev_ptr_ = allocate_tensor(sizeof(uint64_t));
+        if (device_wall_dev_ptr_ != nullptr) {
+            kernel_args_.args.device_wall_data_base = reinterpret_cast<uint64_t>(device_wall_dev_ptr_);
+            uint64_t zero = 0;
+            (void)copy_to_device(device_wall_dev_ptr_, &zero, sizeof(uint64_t));
+        }
+    }
+
     // Validate block_dim against stream resource limits
     rc = validate_block_dim(stream_aicore_, block_dim);
     if (rc != 0) {
@@ -763,6 +777,23 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
         return rc;
     }
 
+    // Pull the platform-level device wall (ns) back from the 8-byte device
+    // buffer that AICPU writes through via KernelArgs::device_wall_data_base.
+    // (We can't use the device_k_args_ shadow here — CANN's
+    // rtAicpuKernelLaunchExWithArgs copies KernelArgs into AICPU-private
+    // memory at launch, so AICPU's writes to its local copy don't propagate
+    // to device_k_args_.) Failure path is a soft warn — wall stays zero.
+    device_wall_ns_ = 0;
+    if (device_wall_dev_ptr_ != nullptr) {
+        int wall_rc = rtMemcpy(
+            &device_wall_ns_, sizeof(uint64_t), device_wall_dev_ptr_, sizeof(uint64_t), RT_MEMCPY_DEVICE_TO_HOST
+        );
+        if (wall_rc != 0) {
+            LOG_WARN("rtMemcpy(device_wall_ns) D2H failed: %d", wall_rc);
+            device_wall_ns_ = 0;
+        }
+    }
+
     // Tear down collectors. stop() joins mgmt then collector in the only safe
     // order (mgmt's final-drain pass into L2 has poll as its consumer).
     // Diagnostic exports use the per-task `output_prefix_` directory the user
@@ -1102,6 +1133,11 @@ int DeviceRunner::finalize() {
         acl_ready_ = false;
     }
 
+    // Free the 8-byte device_wall buffer (allocated lazily in run()).
+    if (device_wall_dev_ptr_ != nullptr) {
+        free_tensor(device_wall_dev_ptr_);
+        device_wall_dev_ptr_ = nullptr;
+    }
     device_id_ = -1;
     block_dim_ = 0;
     worker_count_ = 0;
diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h
index 812ba3c4a..99ad566f4 100644
--- a/src/a2a3/platform/onboard/host/device_runner.h
+++ b/src/a2a3/platform/onboard/host/device_runner.h
@@ -294,6 +294,14 @@ class DeviceRunner {
     void set_output_prefix(const char *prefix) { output_prefix_ = (prefix != nullptr) ? prefix : ""; }
     const std::string &output_prefix() const { return output_prefix_; }
 
+    /**
+     * Device-side wall (ns) from the most recently completed run, written
+     * by the platform AICPU entry (onboard: kernel.cpp; sim: lambda in
+     * run()). Returns 0 before any run completes. Independent of any
+     * profiling / swimlane subsystem.
+     */
+    uint64_t last_device_wall_ns() const { return device_wall_ns_; }
+
     /**
      * Print handshake results from device
      *
@@ -550,6 +558,15 @@ class DeviceRunner {
     rtStream_t stream_aicore_{nullptr};
     AicpuSoInfo so_info_;
     KernelArgsHelper kernel_args_;
+
+    // Platform-level device wall buffer: 8-byte device-resident slot whose
+    // address rides on KernelArgs.device_wall_data_base. AICPU writes the
+    // run wall (ns) through that pointer; this DeviceRunner pulls it back
+    // via copy_from_device after stream sync and caches it for
+    // last_device_wall_ns(). Allocated once at simpler_init, freed in
+    // finalize.
+    void *device_wall_dev_ptr_{nullptr};
+    uint64_t device_wall_ns_{0};
     DeviceArgs device_args_;
 
     // Kernel binary management
diff --git a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp
index 267114b7a..b287e277b 100644
--- a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp
+++ b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp
@@ -23,6 +23,7 @@
 
 #include <pthread.h>
 
+#include <chrono>
 #include <cstdlib>
 #include <utility>
 #include <vector>
@@ -298,8 +299,12 @@ int prepare_callable(DeviceContextHandle ctx, int32_t callable_id, const void *c
 int run_prepared(
     DeviceContextHandle ctx, RuntimeHandle runtime, int32_t callable_id, const void *args, int block_dim,
     int aicpu_thread_num, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, int enable_dep_gen,
-    const char *output_prefix
+    const char *output_prefix, PtoRunTiming *out_timing
 ) {
+    if (out_timing != NULL) {
+        out_timing->host_wall_ns = 0;
+        out_timing->device_wall_ns = 0;
+    }
     if (ctx == NULL || runtime == NULL) return -1;
     DeviceRunner *runner = static_cast<DeviceRunner *>(ctx);
 
@@ -314,6 +319,8 @@ int run_prepared(
         pthread_setspecific(g_runner_key, nullptr);
     });
 
+    const auto host_t0 = std::chrono::steady_clock::now();
+
     try {
         int rc = runner->prepare_run_context(runner->device_id());
         if (rc != 0) return rc;
@@ -367,6 +374,12 @@ int run_prepared(
 
         rc = validate_runtime_impl(r);
         r->~Runtime();
+        if (out_timing != NULL) {
+            const auto host_t1 = std::chrono::steady_clock::now();
+            out_timing->host_wall_ns =
+                static_cast<uint64_t>(std::chrono::duration_cast<std::chrono::nanoseconds>(host_t1 - host_t0).count());
+            out_timing->device_wall_ns = runner->last_device_wall_ns();
+        }
         return rc;
     } catch (...) {
         return -1;
diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp
index 46d3e6c5b..bd2b9a719 100644
--- a/src/a2a3/platform/sim/host/device_runner.cpp
+++ b/src/a2a3/platform/sim/host/device_runner.cpp
@@ -30,6 +30,7 @@
 #include <sys/stat.h>
 
 #include <atomic>
+#include <chrono>
 #include <cstdio>
 #include <string>
 #include <vector>
@@ -351,6 +352,18 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
         return rc;
     }
 
+    // Lazy-allocate the 8-byte device_wall buffer on first run. Sim's
+    // "device pointer" is a host malloc returned by allocate_tensor; the
+    // sim AICPU thread (host CPU thread) writes through this pointer just
+    // like onboard's AICPU does.
+    if (device_wall_dev_ptr_ == nullptr) {
+        device_wall_dev_ptr_ = allocate_tensor(sizeof(uint64_t));
+        if (device_wall_dev_ptr_ != nullptr) {
+            kernel_args_.device_wall_data_base = reinterpret_cast<uint64_t>(device_wall_dev_ptr_);
+            *static_cast<uint64_t *>(device_wall_dev_ptr_) = 0;
+        }
+    }
+
     // Calculate execution parameters
     block_dim_ = block_dim;
     int num_aicore = block_dim * cores_per_blockdim_;
@@ -578,8 +591,20 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
     std::vector<std::thread> aicpu_threads;
     aicpu_threads.reserve(over_launch);
     std::atomic<int> aicpu_rc{0};
+
+    // Sim "device wall" capture — mirrors the onboard kernel.cpp path via the
+    // device_wall_data_base buffer (sim's "device pointer" is a host malloc'd
+    // uint64). sim_t0 is the host steady_clock equivalent of get_sys_cnt_aicpu
+    // in Init; each thread on exit overwrites the buffer with (now - sim_t0).
+    // Last-writer-wins — sim threads are host CPU threads, variance is well
+    // below benchmark noise.
+    if (kernel_args_.device_wall_data_base != 0) {
+        *reinterpret_cast<uint64_t *>(kernel_args_.device_wall_data_base) = 0;
+    }
+    const auto sim_t0 = std::chrono::steady_clock::now();
+
     for (int i = 0; i < over_launch; i++) {
-        aicpu_threads.push_back(create_thread([this, &runtime, launch_aicpu_num, over_launch, &aicpu_rc]() {
+        aicpu_threads.push_back(create_thread([this, &runtime, launch_aicpu_num, over_launch, &aicpu_rc, sim_t0]() {
             if (!platform_aicpu_affinity_gate(launch_aicpu_num, over_launch)) {
                 return;
             }
@@ -588,6 +613,11 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
                 int expected = 0;
                 aicpu_rc.compare_exchange_strong(expected, rc, std::memory_order_acq_rel);
             }
+            if (kernel_args_.device_wall_data_base != 0) {
+                const auto t1 = std::chrono::steady_clock::now();
+                *reinterpret_cast<uint64_t *>(kernel_args_.device_wall_data_base) =
+                    static_cast<uint64_t>(std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - sim_t0).count());
+            }
         }));
     }
 
@@ -615,6 +645,14 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
     }
     LOG_INFO_V0("All threads completed");
 
+    // Snapshot the device_wall buffer into device_wall_ns_. Sim's "device
+    // memory" is host memory, so this is a single dereference — no DMA.
+    // Mirrors the post-sync rtMemcpy D2H that the onboard DeviceRunner does.
+    device_wall_ns_ = 0;
+    if (device_wall_dev_ptr_ != nullptr) {
+        device_wall_ns_ = *static_cast<uint64_t *>(device_wall_dev_ptr_);
+    }
+
     int runtime_rc = aicpu_rc.load(std::memory_order_acquire);
     if (runtime_rc != 0) {
         LOG_ERROR("AICPU execution failed with rc=%d", runtime_rc);
@@ -955,6 +993,11 @@ int DeviceRunner::finalize() {
     mem_alloc_.finalize();
     clear_cpu_sim_shared_storage();
 
+    // Free the 8-byte device_wall buffer (allocated lazily in run()).
+    if (device_wall_dev_ptr_ != nullptr) {
+        free_tensor(device_wall_dev_ptr_);
+        device_wall_dev_ptr_ = nullptr;
+    }
     device_id_ = -1;
     worker_count_ = 0;
     last_runtime_ = nullptr;
diff --git a/src/a2a3/platform/sim/host/device_runner.h b/src/a2a3/platform/sim/host/device_runner.h
index 573236634..748b4e7ab 100644
--- a/src/a2a3/platform/sim/host/device_runner.h
+++ b/src/a2a3/platform/sim/host/device_runner.h
@@ -174,6 +174,14 @@ class DeviceRunner {
     void set_output_prefix(const char *prefix) { output_prefix_ = (prefix != nullptr) ? prefix : ""; }
     const std::string &output_prefix() const { return output_prefix_; }
 
+    /**
+     * Device-side wall (ns) from the most recently completed run, written
+     * by the platform AICPU entry (onboard: kernel.cpp; sim: lambda in
+     * run()). Returns 0 before any run completes. Independent of any
+     * profiling / swimlane subsystem.
+     */
+    uint64_t last_device_wall_ns() const { return device_wall_ns_; }
+
     /**
      * Attach the calling thread to the simulated device.
      *
@@ -257,6 +265,15 @@ class DeviceRunner {
     // Simulation state (no actual device resources)
     KernelArgs kernel_args_;
 
+    // Platform-level device wall buffer: 8-byte device-resident slot whose
+    // address rides on KernelArgs.device_wall_data_base. AICPU writes the
+    // run wall (ns) through that pointer; this DeviceRunner pulls it back
+    // via copy_from_device after stream sync and caches it for
+    // last_device_wall_ns(). Allocated once at simpler_init, freed in
+    // finalize.
+    void *device_wall_dev_ptr_{nullptr};
+    uint64_t device_wall_ns_{0};
+
     // Chip-callable buffer pool (sim path). Keyed by FNV-1a 64-bit content
     // hash of the ChipCallable bytes. Each entry owns a host scratch holding
     // the ChipCallable with each child's resolved_addr_ fixed up to the
diff --git a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp
index afbd85718..b6276f184 100644
--- a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp
+++ b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp
@@ -24,6 +24,7 @@
 #include <new>
 #include <pthread.h>
 
+#include <chrono>
 #include <utility>
 #include <vector>
 
@@ -273,8 +274,12 @@ int prepare_callable(DeviceContextHandle ctx, int32_t callable_id, const void *c
 int run_prepared(
     DeviceContextHandle ctx, RuntimeHandle runtime, int32_t callable_id, const void *args, int block_dim,
     int aicpu_thread_num, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, int enable_dep_gen,
-    const char *output_prefix
+    const char *output_prefix, PtoRunTiming *out_timing
 ) {
+    if (out_timing != NULL) {
+        out_timing->host_wall_ns = 0;
+        out_timing->device_wall_ns = 0;
+    }
     if (ctx == NULL || runtime == NULL) return -1;
     DeviceRunner *runner = static_cast<DeviceRunner *>(ctx);
 
@@ -286,6 +291,8 @@ int run_prepared(
     pthread_once(&g_runner_key_once, create_runner_key);
     pthread_setspecific(g_runner_key, ctx);
 
+    const auto host_t0 = std::chrono::steady_clock::now();
+
     try {
         Runtime *r = new (runtime) Runtime();
         r->host_api.device_malloc = device_malloc;
@@ -331,6 +338,12 @@ int run_prepared(
         rc = validate_runtime_impl(r);
         r->~Runtime();
         pthread_setspecific(g_runner_key, nullptr);
+        if (out_timing != NULL) {
+            const auto host_t1 = std::chrono::steady_clock::now();
+            out_timing->host_wall_ns =
+                static_cast<uint64_t>(std::chrono::duration_cast<std::chrono::nanoseconds>(host_t1 - host_t0).count());
+            out_timing->device_wall_ns = runner->last_device_wall_ns();
+        }
         return rc;
     } catch (...) {
         pthread_setspecific(g_runner_key, nullptr);
diff --git a/src/a5/platform/include/common/kernel_args.h b/src/a5/platform/include/common/kernel_args.h
index 8494ac369..9a5416806 100644
--- a/src/a5/platform/include/common/kernel_args.h
+++ b/src/a5/platform/include/common/kernel_args.h
@@ -81,6 +81,12 @@ struct KernelArgs {
     uint32_t log_info_v{5};                 // INFO verbosity threshold (0..9); default V5
     uint32_t enable_profiling_flag{0};      // Profiling umbrella bitmask; bit0=dump_tensor, bit1=l2_swimlane, bit2=pmu
     uint32_t _pad{0};                       // Alignment padding
+
+    // Device pointer to an 8-byte buffer that the platform AICPU entry writes
+    // the run-wall (ns) into. Allocated once at simpler_init, kept resident.
+    // See the a2a3 kernel_args.h for the full design rationale (CANN's
+    // AICPU args copy makes inline fields write-only).
+    uint64_t device_wall_data_base{0};
 };
 
 #ifdef __cplusplus
diff --git a/src/a5/platform/onboard/aicpu/kernel.cpp b/src/a5/platform/onboard/aicpu/kernel.cpp
index 4fbea2945..4337e4429 100644
--- a/src/a5/platform/onboard/aicpu/kernel.cpp
+++ b/src/a5/platform/onboard/aicpu/kernel.cpp
@@ -14,6 +14,7 @@
 #include "common/kernel_args.h"
 #include "common/platform_config.h"
 #include "aicpu/device_log.h"
+#include "aicpu/device_time.h"
 #include "aicpu/l2_perf_collector_aicpu.h"
 #include "aicpu/platform_regs.h"
 #include "aicpu/platform_aicpu_affinity.h"
@@ -21,6 +22,13 @@
 #include "aicpu/tensor_dump_aicpu.h"
 #include "runtime.h"
 
+// Run-wall capture: g_device_start_cycle is set once in
+// DynTileFwkBackendKernelServerInit (single-threaded launch); each thread
+// of the multi-threaded DynTileFwkBackendKernelServer writes the converted
+// (end - start) into KernelArgs.device_wall_ns on exit. Plain stores —
+// last-writer-wins is fine for wall measurement.
+static uint64_t g_device_start_cycle = 0;
+
 // Forward declaration of aicpu_execute (implemented in aicpu_executor.cpp)
 extern "C" int aicpu_execute(Runtime *arg);
 
@@ -54,6 +62,13 @@ extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelSer
     set_log_level(static_cast<int>(k_args->log_level));
     set_log_info_v(static_cast<int>(k_args->log_info_v));
 
+    // Init is launched single-threaded (block_dim=1) — race-free spot to
+    // capture run start and reset the device_wall buffer.
+    g_device_start_cycle = get_sys_cnt_aicpu();
+    if (k_args->device_wall_data_base != 0) {
+        *reinterpret_cast<uint64_t *>(k_args->device_wall_data_base) = 0;
+    }
+
     LOG_INFO_V0("%s", "Runtime Executor Init: Initializing AICPU kernel");
     return 0;
 }
@@ -114,5 +129,12 @@ extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelSer
     }
     LOG_INFO_V0("%s", "DynTileFwkBackendKernelServer: aicpu_execute completed successfully");
 
+    // Stamp end into the device_wall buffer. Last-writer-wins across threads.
+    uint64_t my_end = get_sys_cnt_aicpu();
+    if (k_args->device_wall_data_base != 0 && my_end > g_device_start_cycle) {
+        *reinterpret_cast<uint64_t *>(k_args->device_wall_data_base) =
+            static_cast<uint64_t>(cycles_to_us(my_end - g_device_start_cycle) * 1000.0);
+    }
+
     return rc;
 }
diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp
index b27d9c6c9..bb46f24fc 100644
--- a/src/a5/platform/onboard/host/device_runner.cpp
+++ b/src/a5/platform/onboard/host/device_runner.cpp
@@ -23,6 +23,7 @@
 #include <dlfcn.h>
 
 #include <cassert>
+#include <cstddef>
 #include <cstring>
 #include <iostream>
 #include <string>
@@ -375,6 +376,17 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
         return rc;
     }
 
+    // Lazy-allocate the 8-byte device_wall buffer on first run. See the a2a3
+    // onboard equivalent for the rationale.
+    if (device_wall_dev_ptr_ == nullptr) {
+        device_wall_dev_ptr_ = allocate_tensor(sizeof(uint64_t));
+        if (device_wall_dev_ptr_ != nullptr) {
+            kernel_args_.args.device_wall_data_base = reinterpret_cast<uint64_t>(device_wall_dev_ptr_);
+            uint64_t zero = 0;
+            (void)copy_to_device(device_wall_dev_ptr_, &zero, sizeof(uint64_t));
+        }
+    }
+
     // Validate block_dim against stream resource limits
     rc = validate_block_dim(stream_aicore_, block_dim);
     if (rc != 0) {
@@ -560,6 +572,20 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
         return rc;
     }
 
+    // Pull the platform-level device wall (ns) back from the dedicated
+    // 8-byte device buffer (AICPU writes via KernelArgs::device_wall_data_base
+    // — CANN copies args at launch so the inline field would be unreachable).
+    device_wall_ns_ = 0;
+    if (device_wall_dev_ptr_ != nullptr) {
+        int wall_rc = rtMemcpy(
+            &device_wall_ns_, sizeof(uint64_t), device_wall_dev_ptr_, sizeof(uint64_t), RT_MEMCPY_DEVICE_TO_HOST
+        );
+        if (wall_rc != 0) {
+            LOG_WARN("rtMemcpy(device_wall_ns) D2H failed: %d", wall_rc);
+            device_wall_ns_ = 0;
+        }
+    }
+
     // Tear down collectors. stop() joins mgmt then collector in the only
     // safe order (mgmt's final-drain pass into L2 has poll as its
     // consumer). Diagnostic exports use the per-task `output_prefix_`
@@ -882,6 +908,11 @@ int DeviceRunner::finalize() {
         return rc;
     }
 
+    // Free the 8-byte device_wall buffer (allocated lazily in run()).
+    if (device_wall_dev_ptr_ != nullptr) {
+        free_tensor(device_wall_dev_ptr_);
+        device_wall_dev_ptr_ = nullptr;
+    }
     device_id_ = -1;
     block_dim_ = 0;
     worker_count_ = 0;
diff --git a/src/a5/platform/onboard/host/device_runner.h b/src/a5/platform/onboard/host/device_runner.h
index 985f9708b..3762eeaee 100644
--- a/src/a5/platform/onboard/host/device_runner.h
+++ b/src/a5/platform/onboard/host/device_runner.h
@@ -279,6 +279,14 @@ class DeviceRunner {
     void set_output_prefix(const char *prefix) { output_prefix_ = (prefix != nullptr) ? prefix : ""; }
     const std::string &output_prefix() const { return output_prefix_; }
 
+    /**
+     * Device-side wall (ns) from the most recently completed run, written
+     * by the platform AICPU entry (onboard: kernel.cpp; sim: lambda in
+     * run()). Returns 0 before any run completes. Independent of any
+     * profiling / swimlane subsystem.
+     */
+    uint64_t last_device_wall_ns() const { return device_wall_ns_; }
+
     /**
      * Print handshake results from device
      *
@@ -457,6 +465,15 @@ class DeviceRunner {
     rtStream_t stream_aicore_{nullptr};
     AicpuSoInfo so_info_;
     KernelArgsHelper kernel_args_;
+
+    // Platform-level device wall buffer: 8-byte device-resident slot whose
+    // address rides on KernelArgs.device_wall_data_base. AICPU writes the
+    // run wall (ns) through that pointer; this DeviceRunner pulls it back
+    // via copy_from_device after stream sync and caches it for
+    // last_device_wall_ns(). Allocated once at simpler_init, freed in
+    // finalize.
+    void *device_wall_dev_ptr_{nullptr};
+    uint64_t device_wall_ns_{0};
     DeviceArgs device_args_;
 
     // Kernel binary management
diff --git a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp
index 06fad5170..ee5b02f67 100644
--- a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp
+++ b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp
@@ -23,6 +23,7 @@
 
 #include <pthread.h>
 
+#include <chrono>
 #include <cstdlib>
 #include <utility>
 #include <vector>
@@ -334,8 +335,12 @@ int prepare_callable(DeviceContextHandle ctx, int32_t callable_id, const void *c
 int run_prepared(
     DeviceContextHandle ctx, RuntimeHandle runtime, int32_t callable_id, const void *args, int block_dim,
     int aicpu_thread_num, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, int /*enable_dep_gen*/,
-    const char *output_prefix
+    const char *output_prefix, PtoRunTiming *out_timing
 ) {
+    if (out_timing != NULL) {
+        out_timing->host_wall_ns = 0;
+        out_timing->device_wall_ns = 0;
+    }
     if (ctx == NULL || runtime == NULL) return -1;
     DeviceRunner *runner = static_cast<DeviceRunner *>(ctx);
 
@@ -350,6 +355,8 @@ int run_prepared(
         pthread_setspecific(g_runner_key, nullptr);
     });
 
+    const auto host_t0 = std::chrono::steady_clock::now();
+
     try {
         int rc = runner->prepare_run_context(runner->device_id());
         if (rc != 0) return rc;
@@ -402,6 +409,12 @@ int run_prepared(
 
         rc = validate_runtime_impl(r);
         r->~Runtime();
+        if (out_timing != NULL) {
+            const auto host_t1 = std::chrono::steady_clock::now();
+            out_timing->host_wall_ns =
+                static_cast<uint64_t>(std::chrono::duration_cast<std::chrono::nanoseconds>(host_t1 - host_t0).count());
+            out_timing->device_wall_ns = runner->last_device_wall_ns();
+        }
         return rc;
     } catch (...) {
         return -1;
diff --git a/src/a5/platform/sim/host/device_runner.cpp b/src/a5/platform/sim/host/device_runner.cpp
index a6f8fdba0..59a3a366b 100644
--- a/src/a5/platform/sim/host/device_runner.cpp
+++ b/src/a5/platform/sim/host/device_runner.cpp
@@ -30,6 +30,7 @@
 #include <sys/stat.h>
 
 #include <atomic>
+#include <chrono>
 #include <cstdio>
 #include <string>
 #include <vector>
@@ -320,6 +321,15 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
         return rc;
     }
 
+    // Lazy-allocate the 8-byte device_wall buffer on first run.
+    if (device_wall_dev_ptr_ == nullptr) {
+        device_wall_dev_ptr_ = allocate_tensor(sizeof(uint64_t));
+        if (device_wall_dev_ptr_ != nullptr) {
+            kernel_args_.device_wall_data_base = reinterpret_cast<uint64_t>(device_wall_dev_ptr_);
+            *static_cast<uint64_t *>(device_wall_dev_ptr_) = 0;
+        }
+    }
+
     // Calculate execution parameters
     block_dim_ = block_dim;
     int num_aicore = block_dim * cores_per_blockdim_;
@@ -494,8 +504,17 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
     std::vector<std::thread> aicpu_threads;
     aicpu_threads.reserve(over_launch);
     std::atomic<int> aicpu_rc{0};
+
+    // Sim "device wall" capture — see a2a3 sim/onboard kernel.cpp for the
+    // shape. Writes via device_wall_data_base (sim's "device pointer" is a
+    // host malloc'd uint64).
+    if (kernel_args_.device_wall_data_base != 0) {
+        *reinterpret_cast<uint64_t *>(kernel_args_.device_wall_data_base) = 0;
+    }
+    const auto sim_t0 = std::chrono::steady_clock::now();
+
     for (int i = 0; i < over_launch; i++) {
-        aicpu_threads.push_back(create_thread([this, &runtime, launch_aicpu_num, over_launch, &aicpu_rc]() {
+        aicpu_threads.push_back(create_thread([this, &runtime, launch_aicpu_num, over_launch, &aicpu_rc, sim_t0]() {
             if (!platform_aicpu_affinity_gate(launch_aicpu_num, over_launch)) {
                 return;
             }
@@ -504,6 +523,11 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
                 int expected = 0;
                 aicpu_rc.compare_exchange_strong(expected, rc, std::memory_order_acq_rel);
             }
+            if (kernel_args_.device_wall_data_base != 0) {
+                const auto t1 = std::chrono::steady_clock::now();
+                *reinterpret_cast<uint64_t *>(kernel_args_.device_wall_data_base) =
+                    static_cast<uint64_t>(std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - sim_t0).count());
+            }
         }));
     }
 
@@ -531,6 +555,13 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
 
     LOG_INFO_V0("All threads completed");
 
+    // Snapshot device_wall buffer into device_wall_ns_ (sim: single deref,
+    // no DMA — mirrors onboard's post-sync rtMemcpy D2H).
+    device_wall_ns_ = 0;
+    if (device_wall_dev_ptr_ != nullptr) {
+        device_wall_ns_ = *static_cast<uint64_t *>(device_wall_dev_ptr_);
+    }
+
     int runtime_rc = aicpu_rc.load(std::memory_order_acquire);
     if (runtime_rc != 0) {
         LOG_ERROR("AICPU execution failed with rc=%d", runtime_rc);
@@ -859,6 +890,11 @@ int DeviceRunner::finalize() {
     mem_alloc_.finalize();
     clear_cpu_sim_shared_storage();
 
+    // Free the 8-byte device_wall buffer (allocated lazily in run()).
+    if (device_wall_dev_ptr_ != nullptr) {
+        free_tensor(device_wall_dev_ptr_);
+        device_wall_dev_ptr_ = nullptr;
+    }
     device_id_ = -1;
     worker_count_ = 0;
     last_runtime_ = nullptr;
diff --git a/src/a5/platform/sim/host/device_runner.h b/src/a5/platform/sim/host/device_runner.h
index 36e8c2016..74b0cd6fa 100644
--- a/src/a5/platform/sim/host/device_runner.h
+++ b/src/a5/platform/sim/host/device_runner.h
@@ -167,6 +167,14 @@ class DeviceRunner {
     void set_output_prefix(const char *prefix) { output_prefix_ = (prefix != nullptr) ? prefix : ""; }
     const std::string &output_prefix() const { return output_prefix_; }
 
+    /**
+     * Device-side wall (ns) from the most recently completed run, written
+     * by the platform AICPU entry (onboard: kernel.cpp; sim: lambda in
+     * run()). Returns 0 before any run completes. Independent of any
+     * profiling / swimlane subsystem.
+     */
+    uint64_t last_device_wall_ns() const { return device_wall_ns_; }
+
     /**
      * Attach the calling thread to the simulated device.
      *
@@ -254,6 +262,15 @@ class DeviceRunner {
     // Simulation state (no actual device resources)
     KernelArgs kernel_args_;
 
+    // Platform-level device wall buffer: 8-byte device-resident slot whose
+    // address rides on KernelArgs.device_wall_data_base. AICPU writes the
+    // run wall (ns) through that pointer; this DeviceRunner pulls it back
+    // via copy_from_device after stream sync and caches it for
+    // last_device_wall_ns(). Allocated once at simpler_init, freed in
+    // finalize.
+    void *device_wall_dev_ptr_{nullptr};
+    uint64_t device_wall_ns_{0};
+
     // Chip-callable buffer pool (sim path). Keyed by FNV-1a 64-bit content
     // hash; each entry owns the host scratch + dlopen handles needed for the
     // children embedded in that buffer. Bulk-freed in finalize().
diff --git a/src/a5/platform/sim/host/pto_runtime_c_api.cpp b/src/a5/platform/sim/host/pto_runtime_c_api.cpp
index 5f5099142..5f398614f 100644
--- a/src/a5/platform/sim/host/pto_runtime_c_api.cpp
+++ b/src/a5/platform/sim/host/pto_runtime_c_api.cpp
@@ -24,6 +24,7 @@
 #include <new>
 #include <pthread.h>
 
+#include <chrono>
 #include <utility>
 #include <vector>
 
@@ -270,8 +271,12 @@ int prepare_callable(DeviceContextHandle ctx, int32_t callable_id, const void *c
 int run_prepared(
     DeviceContextHandle ctx, RuntimeHandle runtime, int32_t callable_id, const void *args, int block_dim,
     int aicpu_thread_num, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, int /*enable_dep_gen*/,
-    const char *output_prefix
+    const char *output_prefix, PtoRunTiming *out_timing
 ) {
+    if (out_timing != NULL) {
+        out_timing->host_wall_ns = 0;
+        out_timing->device_wall_ns = 0;
+    }
     if (ctx == NULL || runtime == NULL) return -1;
     DeviceRunner *runner = static_cast<DeviceRunner *>(ctx);
 
@@ -283,6 +288,8 @@ int run_prepared(
     pthread_once(&g_runner_key_once, create_runner_key);
     pthread_setspecific(g_runner_key, ctx);
 
+    const auto host_t0 = std::chrono::steady_clock::now();
+
     try {
         Runtime *r = new (runtime) Runtime();
         r->host_api.device_malloc = device_malloc;
@@ -327,6 +334,12 @@ int run_prepared(
         rc = validate_runtime_impl(r);
         r->~Runtime();
         pthread_setspecific(g_runner_key, nullptr);
+        if (out_timing != NULL) {
+            const auto host_t1 = std::chrono::steady_clock::now();
+            out_timing->host_wall_ns =
+                static_cast<uint64_t>(std::chrono::duration_cast<std::chrono::nanoseconds>(host_t1 - host_t0).count());
+            out_timing->device_wall_ns = runner->last_device_wall_ns();
+        }
         return rc;
     } catch (...) {
         pthread_setspecific(g_runner_key, nullptr);
diff --git a/src/common/hierarchical/types.h b/src/common/hierarchical/types.h
index aa5121785..938ffe2d4 100644
--- a/src/common/hierarchical/types.h
+++ b/src/common/hierarchical/types.h
@@ -215,3 +215,17 @@ class ReadyQueue {
     std::condition_variable cv_;
     bool shutdown_{false};
 };
+
+// =============================================================================
+// RunTiming — wall-clock breakdown returned by ChipWorker::run
+// =============================================================================
+
+// host_wall_ns is the steady_clock delta wrapping the dispatch; device_wall_ns
+// is on-NPU wall captured by the platform AICPU entry (see KernelArgs::
+// device_wall_ns). Mirrors PtoRunTiming in src/common/worker/pto_runtime_c_api.h
+// so the value flows through unchanged from the dlsym ABI up to the Python
+// binding.
+struct RunTiming {
+    uint64_t host_wall_ns = 0;
+    uint64_t device_wall_ns = 0;
+};
diff --git a/src/common/worker/chip_worker.cpp b/src/common/worker/chip_worker.cpp
index 9b0c7ef39..a402eb5c3 100644
--- a/src/common/worker/chip_worker.cpp
+++ b/src/common/worker/chip_worker.cpp
@@ -300,12 +300,12 @@ void ChipWorker::prepare_callable(int32_t callable_id, const void *callable) {
     }
 }
 
-void ChipWorker::run(int32_t callable_id, TaskArgsView args, const CallConfig &config) {
+RunTiming ChipWorker::run(int32_t callable_id, TaskArgsView args, const CallConfig &config) {
     ChipStorageTaskArgs chip_storage = view_to_chip_storage(args);
-    run(callable_id, &chip_storage, config);
+    return run(callable_id, &chip_storage, config);
 }
 
-void ChipWorker::run(int32_t callable_id, const ChipStorageTaskArgs *args, const CallConfig &config) {
+RunTiming ChipWorker::run(int32_t callable_id, const ChipStorageTaskArgs *args, const CallConfig &config) {
     config.validate();
     if (!initialized_) {
         throw std::runtime_error("ChipWorker not initialized; call init() first");
@@ -313,13 +313,15 @@ void ChipWorker::run(int32_t callable_id, const ChipStorageTaskArgs *args, const
 
     void *rt = runtime_buf_.data();
 
+    PtoRunTiming timing{0, 0};
     int rc = run_prepared_fn_(
         device_ctx_, rt, callable_id, args, config.block_dim, config.aicpu_thread_num, config.enable_l2_swimlane,
-        config.enable_dump_tensor, config.enable_pmu, config.enable_dep_gen, config.output_prefix
+        config.enable_dump_tensor, config.enable_pmu, config.enable_dep_gen, config.output_prefix, &timing
     );
     if (rc != 0) {
         throw std::runtime_error("run_prepared failed with code " + std::to_string(rc));
     }
+    return RunTiming{timing.host_wall_ns, timing.device_wall_ns};
 }
 
 void ChipWorker::unregister_callable(int32_t callable_id) {
diff --git a/src/common/worker/chip_worker.h b/src/common/worker/chip_worker.h
index a93bca33d..79c15d4d4 100644
--- a/src/common/worker/chip_worker.h
+++ b/src/common/worker/chip_worker.h
@@ -19,6 +19,7 @@
 
 #include "../task_interface/call_config.h"
 #include "../task_interface/task_args.h"
+#include "pto_runtime_c_api.h"
 #include "types.h"
 
 class ChipWorker {
@@ -50,12 +51,14 @@ class ChipWorker {
 
     // Launch a cid previously staged via prepare_callable.
     // Materializes a ChipStorageTaskArgs from `args` (one memcpy of T*40B + S*8B
-    // into a stack POD), then delegates to the overload below.
-    void run(int32_t callable_id, TaskArgsView args, const CallConfig &config);
+    // into a stack POD), then delegates to the overload below. Returns
+    // RunTiming with host wall (steady_clock around dispatch) + device wall
+    // (KernelArgs::device_wall_ns captured by the platform AICPU entry).
+    RunTiming run(int32_t callable_id, TaskArgsView args, const CallConfig &config);
     // Same launch, but the caller already holds the runtime.so-ABI POD —
     // skip the view→storage memcpy and hand the pointer straight to the C ABI.
     // Used by the ChipStorageTaskArgs path in the nanobind binding.
-    void run(int32_t callable_id, const ChipStorageTaskArgs *args, const CallConfig &config);
+    RunTiming run(int32_t callable_id, const ChipStorageTaskArgs *args, const CallConfig &config);
 
     // Per-callable_id preparation. Requires init() first and a callable_id
     // in [0, MAX_REGISTERED_CALLABLE_IDS) (cap 64).
@@ -128,7 +131,8 @@ class ChipWorker {
     // CANN dlog sync. Reads the current log level off HostLogger itself.
     using SimplerInitFn = int (*)(void *, int, const uint8_t *, size_t, const uint8_t *, size_t);
     using PrepareCallableFn = int (*)(void *, int32_t, const void *);
-    using RunPreparedFn = int (*)(void *, void *, int32_t, const void *, int, int, int, int, int, int, const char *);
+    using RunPreparedFn =
+        int (*)(void *, void *, int32_t, const void *, int, int, int, int, int, int, const char *, PtoRunTiming *);
     using UnregisterCallableFn = int (*)(void *, int32_t);
     using GetAicpuDlopenCountFn = size_t (*)(void *);
     using FinalizeDeviceFn = int (*)(void *);
diff --git a/src/common/worker/pto_runtime_c_api.h b/src/common/worker/pto_runtime_c_api.h
index 0e2e28cb2..00debb446 100644
--- a/src/common/worker/pto_runtime_c_api.h
+++ b/src/common/worker/pto_runtime_c_api.h
@@ -48,6 +48,31 @@ extern "C" {
 typedef void *RuntimeHandle;
 typedef void *DeviceContextHandle;
 
+/**
+ * Timing breakdown for a single run_prepared() invocation.
+ *
+ *   host_wall_ns   — wall-clock around the host-side dispatch (steady_clock
+ *                    delta wrapping the platform run() call). Always populated
+ *                    when out_timing != NULL.
+ *
+ *   device_wall_ns — on-NPU wall of the most recent orchestrator phase
+ *                    (orch_summary.end_time - .start_time, converted to ns
+ *                    using arch frequency). Populated whenever the runtime
+ *                    was built with PTO2_PROFILING (the default), regardless
+ *                    of enable_l2_swimlane. The orch_summary capture is
+ *                    decoupled from the per-record swimlane buffer pool —
+ *                    only the lightweight phase header allocation is needed
+ *                    for this field. Zero only when PTO2_PROFILING was off
+ *                    at runtime build time.
+ *
+ * Both fields are zeroed by the callee on entry (including on error paths)
+ * so callers can pass an uninitialized struct.
+ */
+typedef struct PtoRunTiming {
+    uint64_t host_wall_ns;
+    uint64_t device_wall_ns;
+} PtoRunTiming;
+
 /* ===========================================================================
  * Public API (resolved by ChipWorker via dlsym)
  * =========================================================================== */
@@ -160,12 +185,16 @@ int prepare_callable(DeviceContextHandle ctx, int32_t callable_id, const void *c
  * `device_id` and the executor binaries are not threaded through this entry
  * — they were captured by `simpler_init` and live on the DeviceRunner.
  *
+ * If `out_timing` is non-NULL, the callee writes the wall-clock breakdown for
+ * this invocation into it (see PtoRunTiming above). The struct is zeroed on
+ * entry and partially populated on early-error returns.
+ *
  * @return 0 on success, negative on error (no prep state, NULL ctx, etc.).
  */
 int run_prepared(
     DeviceContextHandle ctx, RuntimeHandle runtime, int32_t callable_id, const void *args, int block_dim,
     int aicpu_thread_num, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, int enable_dep_gen,
-    const char *output_prefix
+    const char *output_prefix, PtoRunTiming *out_timing
 );
 
 /**
diff --git a/tests/ut/py/test_run_timing.py b/tests/ut/py/test_run_timing.py
new file mode 100644
index 000000000..6b8aaed12
--- /dev/null
+++ b/tests/ut/py/test_run_timing.py
@@ -0,0 +1,122 @@
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Tests for the RunTiming binding returned by Worker.run / run_prepared.
+
+Pure-Python unit tests — no device, no kernels. Verifies that the nanobind
+bound class behaves as documented at the boundary so callers (benchmark
+scripts, perf dashboards) can rely on the contract without spinning up a
+worker.
+"""
+
+import inspect
+
+import pytest
+from _task_interface import RunTiming  # pyright: ignore[reportMissingImports]
+
+
+class TestRunTimingConstruction:
+    def test_default_constructor_zeros(self):
+        t = RunTiming()
+        assert t.host_wall_ns == 0
+        assert t.device_wall_ns == 0
+        assert t.host_wall_us == 0.0
+        assert t.device_wall_us == 0.0
+
+    def test_explicit_values(self):
+        t = RunTiming(1500, 2500)
+        assert t.host_wall_ns == 1500
+        assert t.device_wall_ns == 2500
+
+    def test_device_wall_defaults_to_zero(self):
+        # The (host_ns, device_ns=0) constructor is the path Worker.run uses
+        # for L3+ DAGs, where per-task device cycles aren't aggregated yet.
+        t = RunTiming(9999)
+        assert t.host_wall_ns == 9999
+        assert t.device_wall_ns == 0
+
+
+class TestRunTimingUnitConversion:
+    def test_ns_to_us_host(self):
+        assert RunTiming(1_000, 0).host_wall_us == pytest.approx(1.0)
+        assert RunTiming(1_500_000, 0).host_wall_us == pytest.approx(1500.0)
+
+    def test_ns_to_us_device(self):
+        assert RunTiming(0, 1_000).device_wall_us == pytest.approx(1.0)
+        assert RunTiming(0, 7_250).device_wall_us == pytest.approx(7.25)
+
+    def test_us_returns_float(self):
+        t = RunTiming(1000, 2000)
+        assert isinstance(t.host_wall_us, float)
+        assert isinstance(t.device_wall_us, float)
+
+    def test_ns_returns_int(self):
+        t = RunTiming(1000, 2000)
+        assert isinstance(t.host_wall_ns, int)
+        assert isinstance(t.device_wall_ns, int)
+
+
+class TestRunTimingRepr:
+    def test_repr_includes_both_walls(self):
+        r = repr(RunTiming(1500, 2500))
+        assert "RunTiming" in r
+        assert "host_wall_us=1.5" in r
+        assert "device_wall_us=2.5" in r
+
+    def test_repr_zero(self):
+        r = repr(RunTiming())
+        assert "host_wall_us=0" in r
+        assert "device_wall_us=0" in r
+
+
+class TestRunTimingExports:
+    """Guard against the symbol getting dropped from the public surface — it's
+    re-exported from simpler.worker so end-user code can write
+    `from simpler.worker import RunTiming`.
+    """
+
+    def test_exported_from_task_interface_module(self):
+        import _task_interface  # noqa: PLC0415
+
+        assert hasattr(_task_interface, "RunTiming")
+        assert _task_interface.RunTiming is RunTiming
+
+    def test_exported_from_simpler_worker(self):
+        from simpler.worker import RunTiming as WorkerRunTiming  # noqa: PLC0415
+
+        assert WorkerRunTiming is RunTiming
+
+
+class TestWorkerRunSignature:
+    """Worker.run signature contract: documented to return a RunTiming.
+    We can't actually drive a worker without a device, but we can inspect the
+    wrapper signature + docstring to catch accidental drift (e.g. someone
+    reverting the return to None).
+    """
+
+    def test_worker_run_return_annotation(self):
+        from simpler.worker import Worker  # noqa: PLC0415
+
+        sig = inspect.signature(Worker.run)
+        # The annotation is the RunTiming class itself (PEP 484 style).
+        assert sig.return_annotation is RunTiming, (
+            f"Worker.run lost its RunTiming return annotation; got {sig.return_annotation!r}"
+        )
+
+    def test_chip_worker_wrapper_returns_run_timing(self):
+        # The simpler.task_interface.ChipWorker wrapper is what
+        # Worker._chip_worker uses internally; it must forward the return
+        # rather than swallowing it.
+        from simpler.task_interface import ChipWorker  # noqa: PLC0415  # pyright: ignore[reportAttributeAccessIssue]
+
+        src = inspect.getsource(ChipWorker.run)
+        # Surgical drift guard: the wrapper must `return self._impl.run(...)`
+        # rather than calling it for side effects only. If someone reverts to
+        # the side-effect-only form, RunTiming would silently become None for
+        # every caller without anything else breaking.
+        assert "return self._impl.run" in src
diff --git a/tools/benchmark_rounds.sh b/tools/benchmark_rounds.sh
index 84fc7486e..dc29ef329 100755
--- a/tools/benchmark_rounds.sh
+++ b/tools/benchmark_rounds.sh
@@ -7,8 +7,10 @@
 # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 # See LICENSE in the root of the software repository for the full text of the License.
 # -----------------------------------------------------------------------------------------------------------
-# Benchmark wrapper: run examples on hardware,
-# then parse device-log timing lines to report per-round latency.
+# Benchmark wrapper: run examples on hardware, then parse per-round latency
+# emitted by the test framework directly (see simpler_setup/scene_test.py::
+# _log_round_timings). Worker.run now returns a RunTiming struct so we no
+# longer scrape device logs for sched/orch timestamps.
 #
 # Usage:
 #   ./tools/benchmark_rounds.sh [-p <platform>] [-d <device>] [-n <rounds>] [-r <runtime>]
@@ -22,11 +24,8 @@ PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
 
 # ---------------------------------------------------------------------------
 # Examples to benchmark and their case lists, per runtime.
-# Key   = directory name under tests/st/<platform>/<runtime>/
-# Value = comma-separated case names to run (empty string = run DEFAULT_CASE)
 # ---------------------------------------------------------------------------
 
-# --- tensormap_and_ringbuffer ---
 declare -A TMR_EXAMPLE_CASES=(
     [alternating_matmul_add]="Case1"
     [benchmark_bgemm]="Case0"
@@ -44,9 +43,6 @@ TMR_EXAMPLE_ORDER=(
     spmd_paged_attention
 )
 
-# ---------------------------------------------------------------------------
-# Parse arguments
-# ---------------------------------------------------------------------------
 DEVICE_ID=0
 ROUNDS=100
 PLATFORM=a2a3
@@ -56,62 +52,33 @@ EXTRA_ARGS=()
 
 while [[ $# -gt 0 ]]; do
     case "$1" in
-        -p|--platform)
-            PLATFORM="$2"
-            shift 2
-            ;;
-        -d|--device)
-            DEVICE_ID="$2"
-            shift 2
-            ;;
-        -n|--rounds)
-            ROUNDS="$2"
-            shift 2
-            ;;
-        -r|--runtime)
-            RUNTIME="$2"
-            shift 2
-            ;;
-        -v|--verbose)
-            VERBOSE=1
-            shift
-            ;;
+        -p|--platform) PLATFORM="$2"; shift 2 ;;
+        -d|--device)   DEVICE_ID="$2"; shift 2 ;;
+        -n|--rounds)   ROUNDS="$2"; shift 2 ;;
+        -r|--runtime)  RUNTIME="$2"; shift 2 ;;
+        -v|--verbose)  VERBOSE=1; shift ;;
         --help|-h)
             cat <<'USAGE'
-benchmark_rounds.sh — run all examples and report per-round timing from device logs
-
-Usage:
-  ./tools/benchmark_rounds.sh [-p <platform>] [-d <device>] [-n <rounds>] [-r <runtime>] [-v]
+benchmark_rounds.sh — run examples and report per-round host/device wall
+emitted directly by Worker.run().
 
 Options:
-  -p, --platform Platform to run on (default: a2a3)
-  -d, --device   Device ID (default: 0)
-  -n, --rounds   Override number of rounds for each example (default: 100)
-  -r, --runtime  Runtime to benchmark: tensormap_and_ringbuffer (default)
-  -v, --verbose  Save detailed test_*.py output to a timestamped log file
-  -h, --help     Show this help
-
-All other options are passed through to the underlying `python test_*.py`
-invocation (e.g. --case).
-
-Edit the EXAMPLE_CASES map at the top of this script to control which
-examples and cases to benchmark.
-
-Output:
-  Average elapsed time in microseconds for each example.
+  -p, --platform  Platform (default: a2a3)
+  -d, --device    Device ID (default: 0)
+  -n, --rounds    Rounds per example (default: 100)
+  -r, --runtime   Runtime (default: tensormap_and_ringbuffer)
+  -v, --verbose   Save test output to a timestamped log
+  -h, --help      Show this help
+
+Device-wall numbers are zero only when the runtime is built without
+PTO2_PROFILING. The default build has it on, so the device column is
+populated for every round regardless of --enable-l2-swimlane.
 USAGE
-            exit 0
-            ;;
-        *)
-            EXTRA_ARGS+=("$1")
-            shift
-            ;;
+            exit 0 ;;
+        *) EXTRA_ARGS+=("$1"); shift ;;
     esac
 done
 
-# ---------------------------------------------------------------------------
-# Verbose logging setup
-# ---------------------------------------------------------------------------
 VERBOSE_LOG=""
 if [[ $VERBOSE -eq 1 ]]; then
     mkdir -p "$PROJECT_ROOT/outputs"
@@ -119,357 +86,95 @@ if [[ $VERBOSE -eq 1 ]]; then
     echo "Verbose log: $VERBOSE_LOG"
 fi
 
-vlog() {
-    if [[ -n "$VERBOSE_LOG" ]]; then
-        echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >> "$VERBOSE_LOG"
-    fi
-}
-
-# ---------------------------------------------------------------------------
-# Derive arch from platform and set examples directories
-# ---------------------------------------------------------------------------
-# Search both examples/ (migrated tests) and tests/st/ (legacy tests)
-ARCH="${PLATFORM%%sim}"  # strip "sim" suffix if present
+ARCH="${PLATFORM%%sim}"
 EXAMPLES_DIRS=(
     "$PROJECT_ROOT/tests/st/${ARCH}/${RUNTIME}"
     "$PROJECT_ROOT/examples/${ARCH}/${RUNTIME}"
 )
 
-# Clock frequency (MHz) for converting cycle counts to microseconds
-case "$PLATFORM" in
-    a2a3) FREQ=50 ;;
-    a5)   FREQ=1000 ;;
-    *)    echo "ERROR: unsupported platform '$PLATFORM'. Use a2a3 or a5."; exit 1 ;;
-esac
-
-# Select example cases and order based on runtime
 case "$RUNTIME" in
     tensormap_and_ringbuffer)
         declare -n EXAMPLE_CASES=TMR_EXAMPLE_CASES
         EXAMPLE_ORDER=("${TMR_EXAMPLE_ORDER[@]}")
         ;;
     *)
-        echo "ERROR: unknown runtime '$RUNTIME'. Use tensormap_and_ringbuffer."
-        exit 1
-        ;;
+        echo "ERROR: unknown runtime '$RUNTIME'." >&2
+        exit 1 ;;
 esac
 
 # ---------------------------------------------------------------------------
-# Resolve CANN device log directory: $ASCEND_WORK_PATH/log/debug or ~/ascend/log/debug
+# Parse "Avg Host: X.X us  |  Avg Device: Y.Y us  (N rounds)" emitted by
+# _log_round_timings. Returns "host_avg device_avg" (device may be "-").
 # ---------------------------------------------------------------------------
-if [[ -n "${ASCEND_WORK_PATH:-}" ]]; then
-    LOG_ROOT="$ASCEND_WORK_PATH/log/debug"
-    if [[ ! -d "$LOG_ROOT" ]]; then
-        LOG_ROOT="$HOME/ascend/log/debug"
-    fi
-else
-    LOG_ROOT="$HOME/ascend/log/debug"
-fi
-DEVICE_LOG_DIR="$LOG_ROOT/device-${DEVICE_ID}"
-
-# ---------------------------------------------------------------------------
-# parse_timing <log_file>
-#   Grep for orch_start / end lines, compute per-round elapsed, print summary.
-# ---------------------------------------------------------------------------
-parse_timing() {
-    local log_file="$1"
-
-    local timing
-    timing=$(grep -E 'Thread [0-9]+: (sched_start|orch_start|orch_end|sched_end|orch_stage_end)' "$log_file" || true)
-
-    if [[ -z "$timing" ]]; then
-        echo "  (no benchmark timing data — was PTO2_PROFILING enabled?)"
-        return 1
-    fi
-
-    echo "$timing" | awk -v freq="$FREQ" '
-    function new_round() {
-        flush_round()
-        round++
-        min_start = 0; max_end = 0
-        min_sched_start = 0; max_sched_end = 0
-        min_orch_start = 0; max_orch_end = 0
-        delete sched_seen
-        delete orch_seen
-    }
-    function flush_round() {
-        if (round >= 0 && max_end > 0 && min_start > 0) {
-            results[round] = (max_end - min_start) / freq
-            if (max_sched_end > 0 && min_sched_start > 0)
-                sched_results[round] = (max_sched_end - min_sched_start) / freq
-            if (max_orch_end > 0 && min_orch_start > 0)
-                orch_results[round] = (max_orch_end - min_orch_start) / freq
-            count++
-        }
-    }
-    BEGIN {
-        round = 0; count = 0
-        min_start = 0; max_end = 0
-        min_sched_start = 0; max_sched_end = 0
-        min_orch_start = 0; max_orch_end = 0
-        has_sched = 0; has_orch_end = 0
-    }
-    /sched_start=/ {
-        match($0, /Thread ([0-9]+):/, tm)
-        tid = tm[1] + 0
-        if (tid in sched_seen) new_round()
-        sched_seen[tid] = 1
-        has_sched = 1
-        match($0, /sched_start=([0-9]+)/, m)
-        val = m[1] + 0
-        if (min_sched_start == 0 || val < min_sched_start) min_sched_start = val
-        if (min_start == 0 || val < min_start) min_start = val
-    }
-    /orch_start=/ {
-        match($0, /Thread ([0-9]+):/, tm)
-        tid = tm[1] + 0
-        if (tid in orch_seen) new_round()
-        orch_seen[tid] = 1
-        match($0, /orch_start=([0-9]+)/, m)
-        val = m[1] + 0
-        if (min_orch_start == 0 || val < min_orch_start) min_orch_start = val
-        if (min_start == 0 || val < min_start) min_start = val
-    }
-    /sched_end[^=]*=/ {
-        match($0, /sched_end[^=]*=([0-9]+)/, m)
-        val = m[1] + 0
-        if (val > max_sched_end) max_sched_end = val
-        if (val > max_end) max_end = val
-    }
-    /orch_end=/ {
-        match($0, /orch_end=([0-9]+)/, m)
-        val = m[1] + 0
-        has_orch_end = 1
-        if (val > max_orch_end) max_orch_end = val
-        if (val > max_end) max_end = val
-    }
-    /orch_stage_end=/ {
-        match($0, /orch_stage_end=([0-9]+)/, m)
-        val = m[1] + 0
-        if (val > max_end) max_end = val
-    }
-    END {
-        flush_round()
-        if (count == 0) { print "  (no rounds parsed)"; exit 1 }
-
-        show_sched = has_sched
-        show_orch = has_orch_end
-
-        # Header
-        hdr = sprintf("  %-8s  %12s", "Round", "Elapsed (us)")
-        sep = sprintf("  %-8s  %12s", "-----", "------------")
-        if (show_sched) { hdr = hdr sprintf("  %12s", "Sched (us)"); sep = sep sprintf("  %12s", "----------") }
-        if (show_orch)  { hdr = hdr sprintf("  %12s", "Orch (us)");  sep = sep sprintf("  %12s", "---------")  }
-        print hdr; print sep
-
-        sum_v = 0; min_v = results[0]; max_v = results[0]
-        sum_s = 0; min_s = sched_results[0]; max_s = sched_results[0]
-        sum_o = 0; min_o = orch_results[0]; max_o = orch_results[0]
-
-        for (i = 0; i < count; i++) {
-            line = sprintf("  %-8d  %12.1f", i, results[i])
-            sum_v += results[i]
-            if (results[i] < min_v) min_v = results[i]
-            if (results[i] > max_v) max_v = results[i]
-            if (show_sched) {
-                line = line sprintf("  %12.1f", sched_results[i])
-                sum_s += sched_results[i]
-                if (sched_results[i] < min_s) min_s = sched_results[i]
-                if (sched_results[i] > max_s) max_s = sched_results[i]
-            }
-            if (show_orch) {
-                line = line sprintf("  %12.1f", orch_results[i])
-                sum_o += orch_results[i]
-                if (orch_results[i] < min_o) min_o = orch_results[i]
-                if (orch_results[i] > max_o) max_o = orch_results[i]
-            }
-            print line
-        }
-
-        printf "\n  Avg: %.1f us", sum_v / count
-        if (show_sched) printf "  |  Sched Avg: %.1f us", sum_s / count
-        if (show_orch)  printf "  |  Orch Avg: %.1f us", sum_o / count
-        printf "  (%d rounds)\n", count
-
-        TRIM = 10
-        if (count > 2 * TRIM) {
-            # Insertion sort for each metric
-            for (i = 0; i < count; i++) sv[i] = results[i]
-            for (i = 1; i < count; i++) {
-                k = sv[i]; j = i - 1
-                while (j >= 0 && sv[j] > k) { sv[j+1] = sv[j]; j-- }
-                sv[j+1] = k
-            }
-            tc = count - 2 * TRIM; ts = 0
-            for (i = TRIM; i < count - TRIM; i++) ts += sv[i]
-            printf "  Trimmed Avg: %.1f us  (dropped %d low + %d high, %d rounds used)\n", ts / tc, TRIM, TRIM, tc
-
-            if (show_sched) {
-                for (i = 0; i < count; i++) ss[i] = sched_results[i]
-                for (i = 1; i < count; i++) {
-                    k = ss[i]; j = i - 1
-                    while (j >= 0 && ss[j] > k) { ss[j+1] = ss[j]; j-- }
-                    ss[j+1] = k
-                }
-                ts2 = 0
-                for (i = TRIM; i < count - TRIM; i++) ts2 += ss[i]
-                printf "  Sched Trimmed Avg: %.1f us  (dropped %d low + %d high)\n", ts2 / tc, TRIM, TRIM
-            }
-            if (show_orch) {
-                for (i = 0; i < count; i++) so[i] = orch_results[i]
-                for (i = 1; i < count; i++) {
-                    k = so[i]; j = i - 1
-                    while (j >= 0 && so[j] > k) { so[j+1] = so[j]; j-- }
-                    so[j+1] = k
-                }
-                ts3 = 0
-                for (i = TRIM; i < count - TRIM; i++) ts3 += so[i]
-                printf "  Orch Trimmed Avg: %.1f us  (dropped %d low + %d high)\n", ts3 / tc, TRIM, TRIM
-            }
-        }
-    }'
-}
-
-# ---------------------------------------------------------------------------
-# wait_for_new_log <pre_run_logs_file>
-#   Wait up to 15s for a new .log file in DEVICE_LOG_DIR. Prints the path.
-# ---------------------------------------------------------------------------
-wait_for_new_log() {
-    local pre_file="$1"
-    local new_log=""
-    local deadline=$((SECONDS + 15))
-
-    while [[ $SECONDS -lt $deadline ]]; do
-        if [[ -d "$DEVICE_LOG_DIR" ]]; then
-            new_log=$(comm -13 "$pre_file" <(ls -1 "$DEVICE_LOG_DIR"/*.log 2>/dev/null | sort) 2>/dev/null | tail -1 || true)
-            if [[ -n "$new_log" ]]; then
-                echo "$new_log"
-                return 0
-            fi
-        fi
-        sleep 0.5
-    done
-
-    # Fallback: newest log
-    if [[ -d "$DEVICE_LOG_DIR" ]]; then
-        new_log=$(ls -t "$DEVICE_LOG_DIR"/*.log 2>/dev/null | head -1 || true)
-        if [[ -n "$new_log" ]]; then
-            echo "$new_log"
-            return 0
-        fi
-    fi
-    return 1
+parse_avgs() {
+    local output="$1"
+    local host_avg device_avg
+    host_avg=$(echo "$output" | grep -oE 'Avg Host: [0-9.]+' | tail -1 | awk '{print $3}')
+    device_avg=$(echo "$output" | grep -oE 'Avg Device: [0-9.]+' | tail -1 | awk '{print $3}')
+    [[ -z "$host_avg"   ]] && host_avg="-"
+    [[ -z "$device_avg" ]] && device_avg="-"
+    echo "$host_avg $device_avg"
 }
 
-# ---------------------------------------------------------------------------
-# run_bench <example> <example_dir> [case_name]
-#   Run one benchmark invocation (via `python test_*.py`) and parse timing
-#   from the resulting log. Skips the example if it has no test_*.py.
-#   Sets global PASS / FAIL counters.
-# ---------------------------------------------------------------------------
 run_bench() {
     local example="$1" example_dir="$2" case_name="${3:-}"
+    [[ -n "$case_name" ]] && echo "  ---- $case_name ----"
 
-    if [[ -n "$case_name" ]]; then
-        echo "  ---- $case_name ----"
-    fi
-
-    # Snapshot existing logs
-    local pre_log_file
-    pre_log_file=$(mktemp)
-    trap 'rm -f -- "$pre_log_file"' RETURN
-    ls -1 "$DEVICE_LOG_DIR"/*.log 2>/dev/null | sort > "$pre_log_file" || true
-
-    # Build run command using test_*.py
     local test_file
     test_file=$(find "$example_dir" -maxdepth 1 -name 'test_*.py' -print -quit 2>/dev/null || true)
-
-    local run_cmd
-    if [[ -n "$test_file" ]]; then
-        run_cmd=(
-            python3 "$test_file"
-            --platform "$PLATFORM" --device "$DEVICE_ID"
-            --rounds "$ROUNDS" --skip-golden
-        )
-    else
+    if [[ -z "$test_file" ]]; then
         echo "  SKIPPED: no test_*.py found in $example_dir"
         return
     fi
+
+    local run_cmd=(
+        python3 "$test_file"
+        --platform "$PLATFORM" --device "$DEVICE_ID"
+        --rounds "$ROUNDS" --skip-golden
+    )
     if [[ -n "$case_name" ]]; then
-        run_cmd+=(--case "$case_name")
-        [[ -n "$test_file" ]] && run_cmd+=(--manual include)
+        run_cmd+=(--case "$case_name" --manual include)
     fi
     run_cmd+=("${EXTRA_ARGS[@]}")
 
-    # Run example
-    vlog "Running: ${run_cmd[*]}"
-    local rc=0
+    local output rc=0
+    output=$("${run_cmd[@]}" 2>&1) || rc=$?
     if [[ -n "$VERBOSE_LOG" ]]; then
-        local run_output
-        run_output=$("${run_cmd[@]}" 2>&1) || rc=$?
-        if [[ -n "$run_output" ]]; then echo "$run_output" >> "$VERBOSE_LOG"; fi
-    else
-        "${run_cmd[@]}" > /dev/null 2>&1 || rc=$?
+        echo "==== $example ${case_name} ====" >> "$VERBOSE_LOG"
+        echo "$output" >> "$VERBOSE_LOG"
     fi
     if [[ $rc -ne 0 ]]; then
-        echo "  FAILED: benchmark run returned non-zero"
-        vlog "FAILED: exit code $rc"
+        echo "  FAILED: exit code $rc"
         ((FAIL++)) || true
         return
     fi
 
-    # Find new device log
-    local new_log
-    new_log=$(wait_for_new_log "$pre_log_file")
+    local avgs host_avg device_avg
+    avgs=$(parse_avgs "$output")
+    host_avg=$(echo "$avgs" | awk '{print $1}')
+    device_avg=$(echo "$avgs" | awk '{print $2}')
 
-    if [[ -z "$new_log" ]]; then
-        echo "  FAILED: no device log found in $DEVICE_LOG_DIR"
+    if [[ "$host_avg" == "-" ]]; then
+        echo "  (no per-round timing — was --rounds > 1?)"
         ((FAIL++)) || true
         return
     fi
 
-    echo "  Log: $new_log"
-    local timing_output
-    local parse_rc=0
-    timing_output=$(parse_timing "$new_log") || parse_rc=$?
-    echo "$timing_output"
-
-    if [[ $parse_rc -ne 0 ]]; then
-        ((FAIL++)) || true
-        return
-    fi
+    echo "  Avg Host: $host_avg us  |  Avg Device: $device_avg us  ($ROUNDS rounds)"
     ((PASS++)) || true
 
-    # Extract averages for summary table
     local label="$example"
     [[ -n "$case_name" ]] && label="$example ($case_name)"
-
-    local avg_line
-    avg_line=$(echo "$timing_output" | grep "^  Avg:" || true)
-    local avg_elapsed="-" avg_sched="-" avg_orch="-"
-    if [[ -n "$avg_line" ]]; then
-        avg_elapsed=$(echo "$avg_line" | awk '{print $2}')
-        avg_sched=$(echo "$avg_line" | grep -o 'Sched Avg: [0-9.]*' | awk '{print $3}') || avg_sched="-"
-        avg_orch=$(echo "$avg_line" | grep -o 'Orch Avg: [0-9.]*' | awk '{print $3}') || avg_orch="-"
-    fi
-
     SUMMARY_NAMES+=("$label")
-    SUMMARY_ELAPSED+=("$avg_elapsed")
-    SUMMARY_SCHED+=("$avg_sched")
-    SUMMARY_ORCH+=("$avg_orch")
+    SUMMARY_HOST+=("$host_avg")
+    SUMMARY_DEVICE+=("$device_avg")
 }
 
-# ---------------------------------------------------------------------------
-# Main loop
-# ---------------------------------------------------------------------------
 PASS=0
 FAIL=0
-
-# Summary collection arrays
 SUMMARY_NAMES=()
-SUMMARY_ELAPSED=()
-SUMMARY_SCHED=()
-SUMMARY_ORCH=()
+SUMMARY_HOST=()
+SUMMARY_DEVICE=()
 
 echo ""
 echo "Runtime: $RUNTIME"
@@ -477,25 +182,13 @@ echo "Runtime: $RUNTIME"
 for example in "${EXAMPLE_ORDER[@]}"; do
     case_list="${EXAMPLE_CASES[$example]:-}"
 
-    # Search for example: prefer test_*.py (new style), fall back to golden.py (legacy).
-    # tests/st/ is searched before examples/ since benchmarks use production-scale cases.
     EXAMPLE_DIR=""
     for dir in "${EXAMPLES_DIRS[@]}"; do
         candidate="$dir/$example"
         if [[ -d "$candidate" ]] && ls "$candidate"/test_*.py 1>/dev/null 2>&1; then
-            EXAMPLE_DIR="$candidate"
-            break
+            EXAMPLE_DIR="$candidate"; break
         fi
     done
-    if [[ -z "$EXAMPLE_DIR" ]]; then
-        for dir in "${EXAMPLES_DIRS[@]}"; do
-            candidate="$dir/$example"
-            if [[ -f "$candidate/golden.py" && -d "$candidate/kernels" ]]; then
-                EXAMPLE_DIR="$candidate"
-                break
-            fi
-        done
-    fi
 
     echo ""
     echo "================================================================"
@@ -503,7 +196,7 @@ for example in "${EXAMPLE_ORDER[@]}"; do
     echo "================================================================"
 
     if [[ -z "$EXAMPLE_DIR" ]]; then
-        echo "  SKIP: not found in any search directory"
+        echo "  SKIP: not found"
         ((FAIL++)) || true
         continue
     fi
@@ -518,54 +211,19 @@ for example in "${EXAMPLE_ORDER[@]}"; do
     fi
 done
 
-# ---------------------------------------------------------------------------
-# Performance Summary Table
-# ---------------------------------------------------------------------------
 if [[ ${#SUMMARY_NAMES[@]} -gt 0 ]]; then
-    # Check if any sched/orch data exists across all runs
-    _has_sched=0
-    _has_orch=0
-    for _i in "${!SUMMARY_NAMES[@]}"; do
-        [[ "${SUMMARY_SCHED[$_i]}" != "-" ]] && _has_sched=1
-        [[ "${SUMMARY_ORCH[$_i]}" != "-" ]] && _has_orch=1
-    done
-
     echo ""
     echo "================================================================"
     echo "  Performance Summary ($RUNTIME)"
     echo "================================================================"
     echo ""
-
-    # Header
-    _hdr=$(printf "  %-40s  %12s" "Example" "Elapsed (us)")
-    _sep=$(printf "  %-40s  %12s" "----------------------------------------" "------------")
-    if [[ $_has_sched -eq 1 ]]; then
-        _hdr=$(printf "%s  %12s" "$_hdr" "Sched (us)")
-        _sep=$(printf "%s  %12s" "$_sep" "------------")
-    fi
-    if [[ $_has_orch -eq 1 ]]; then
-        _hdr=$(printf "%s  %12s" "$_hdr" "Orch (us)")
-        _sep=$(printf "%s  %12s" "$_sep" "------------")
-    fi
-    echo "$_hdr"
-    echo "$_sep"
-
-    # Rows
-    for _i in "${!SUMMARY_NAMES[@]}"; do
-        _row=$(printf "  %-40s  %12s" "${SUMMARY_NAMES[$_i]}" "${SUMMARY_ELAPSED[$_i]}")
-        if [[ $_has_sched -eq 1 ]]; then
-            _row=$(printf "%s  %12s" "$_row" "${SUMMARY_SCHED[$_i]}")
-        fi
-        if [[ $_has_orch -eq 1 ]]; then
-            _row=$(printf "%s  %12s" "$_row" "${SUMMARY_ORCH[$_i]}")
-        fi
-        echo "$_row"
+    printf "  %-40s  %12s  %12s\n" "Example" "Host (us)" "Device (us)"
+    printf "  %-40s  %12s  %12s\n" "----------------------------------------" "------------" "------------"
+    for i in "${!SUMMARY_NAMES[@]}"; do
+        printf "  %-40s  %12s  %12s\n" "${SUMMARY_NAMES[$i]}" "${SUMMARY_HOST[$i]}" "${SUMMARY_DEVICE[$i]}"
     done
 fi
 
-# ---------------------------------------------------------------------------
-# Summary
-# ---------------------------------------------------------------------------
 TOTAL=$((PASS + FAIL))
 echo ""
 echo "================================================================"