From c4ec5db6972c7d6ff62dd53f1dc78c99154edc73 Mon Sep 17 00:00:00 2001
From: wcwxy <26245345+ChaoWao@users.noreply.github.com>
Date: Sat, 16 May 2026 08:35:12 +0800
Subject: [PATCH] Add: RunTiming return from Worker.run / run_prepared
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Worker.run and run_prepared now return a RunTiming struct (host_wall /
device_wall in ns + µs accessors) so benchmarks can read timing
directly instead of scraping device logs.

- C ABI: add PtoRunTiming struct + trailing nullable out_timing param
  on run_prepared in src/common/worker/pto_runtime_c_api.h. Updated in
  lockstep across all 4 platform impls
  (src/{a2a3,a5}/platform/{onboard,sim}/host/pto_runtime_c_api.cpp).
- host_wall_ns: steady_clock delta wrapping the dispatch.
- device_wall_ns: an always-on mailbox in PTO2SharedMemoryHeader
  (orch_start_cycle / orch_end_cycle, std::atomic<uint64_t>). AICPU's
  aicpu_executor publishes the two cycle counters once per run under
  PTO2_PROFILING (no swimlane gate); host reads them via the existing
  copy_from_device of the runtime header that runtime_maker already
  does for graph_output_ptr — same shared region, zero additional
  copies. A new extern "C" pto2_read_orch_wall_ns(Runtime*) in
  runtime_maker.cpp does the cycles → ns conversion; the C ABI in
  every platform impl calls it to fill out_timing->device_wall_ns.
  Independent of L2PerfCollector / AicpuPhaseRecord — the RunTiming
  feature owns its own data path so it stays available on every build
  with PTO2_PROFILING, regardless of swimlane / phase capture.
- C++: ChipWorker::run / run_prepared and IWorker::run now return
  RunTiming. RunPreparedFn typedef extended.
- Python: nanobind RunTiming class exposed as _task_interface.RunTiming
  with host_wall_us / device_wall_us / *_ns properties and an
  (host_ns, device_ns=0) constructor. Worker.run / run_prepared and the
  simpler.task_interface.ChipWorker wrapper return it.
- Benchmark: simpler_setup/scene_test.py collects per-round timings and
  prints a host/device wall table via new _log_round_timings.
  tools/benchmark_rounds.sh rewritten to parse the printed Avg lines
  instead of grepping device logs (drops awk parser, wait_for_new_log,
  ASCEND_WORK_PATH discovery).

device_wall_ns is now populated on every run that has PTO2_PROFILING
compiled in (the default), regardless of enable_l2_swimlane. This
matters for benchmarks: scene_test only keeps swimlane on for round 0
to avoid the JSON-dump overhead — before this PR, rounds 1..N-1 read
device_wall = 0 because the old orch_summary lived in the L2 perf
shared region which is only allocated when swimlane is on. The mailbox
lives in PTO2SharedMemoryHeader instead, which is always allocated.

Verified locally on a5sim with --enable-l2-swimlane=OFF: vector_add
returns host_wall_us=582298.583, device_wall_us=4.000, both > 0,
host > device. tests/ut/py/test_run_timing.py: 13/13 pass.
---
 examples/workers/l2/vector_add/main.py        |   6 +-
 .../workers/l2/vector_add/test_run_timing.py  | 115 +++++
 python/bindings/task_interface.cpp            |  63 ++-
 python/simpler/task_interface.py              |   4 +-
 python/simpler/worker.py                      |  62 ++-
 simpler_setup/scene_test.py                   |  96 +++-
 .../platform/include/common/kernel_args.h     |  13 +
 src/a2a3/platform/onboard/aicpu/kernel.cpp    |  28 ++
 .../platform/onboard/host/device_runner.cpp   |  36 ++
 .../platform/onboard/host/device_runner.h     |  17 +
 .../onboard/host/pto_runtime_c_api.cpp        |  15 +-
 src/a2a3/platform/sim/host/device_runner.cpp  |  45 +-
 src/a2a3/platform/sim/host/device_runner.h    |  17 +
 .../platform/sim/host/pto_runtime_c_api.cpp   |  15 +-
 src/a5/platform/include/common/kernel_args.h  |   6 +
 src/a5/platform/onboard/aicpu/kernel.cpp      |  22 +
 .../platform/onboard/host/device_runner.cpp   |  31 ++
 src/a5/platform/onboard/host/device_runner.h  |  17 +
 .../onboard/host/pto_runtime_c_api.cpp        |  15 +-
 src/a5/platform/sim/host/device_runner.cpp    |  38 +-
 src/a5/platform/sim/host/device_runner.h      |  17 +
 .../platform/sim/host/pto_runtime_c_api.cpp   |  15 +-
 src/common/hierarchical/types.h               |  14 +
 src/common/worker/chip_worker.cpp             |  10 +-
 src/common/worker/chip_worker.h               |  12 +-
 src/common/worker/pto_runtime_c_api.h         |  31 +-
 tests/ut/py/test_run_timing.py                | 122 +++++
 tools/benchmark_rounds.sh                     | 476 +++---------------
 28 files changed, 896 insertions(+), 462 deletions(-)
 create mode 100644 examples/workers/l2/vector_add/test_run_timing.py
 create mode 100644 tests/ut/py/test_run_timing.py

diff --git a/examples/workers/l2/vector_add/main.py b/examples/workers/l2/vector_add/main.py
index 6ad1480ad..6db910b84 100644
--- a/examples/workers/l2/vector_add/main.py
+++ b/examples/workers/l2/vector_add/main.py
@@ -127,7 +127,7 @@ def build_chip_callable(platform: str) -> ChipCallable:
     )
 
 
-def _run(worker: Worker, chip_cid: int) -> None:
+def _run(worker: Worker, chip_cid: int):
     """Allocate device memory, copy inputs, execute, copy outputs back, verify."""
     # --- 1. Prepare host arrays ---
     torch.manual_seed(42)
@@ -155,7 +155,8 @@ def _run(worker: Worker, chip_cid: int) -> None:
     # --- 4. Run. CallConfig() defaults are fine for this kernel. ---
     config = CallConfig()
     print("[vector_add] running on device...")
-    worker.run(chip_cid, args, config)
+    timing = worker.run(chip_cid, args, config)
+    print(f"[vector_add] {timing}")
 
     # --- 5. D2H copy back + verify ---
     worker.copy_from(host_out.data_ptr(), dev_out, NBYTES)
@@ -169,6 +170,7 @@ def _run(worker: Worker, chip_cid: int) -> None:
     print(f"[vector_add] max |host_out - expected| = {max_diff:.3e}")
     assert torch.allclose(host_out, expected, rtol=1e-5, atol=1e-5)
     print("[vector_add] golden check PASSED")
+    return timing
 
 
 def run(platform: str, device_id: int) -> int:
diff --git a/examples/workers/l2/vector_add/test_run_timing.py b/examples/workers/l2/vector_add/test_run_timing.py
new file mode 100644
index 000000000..a3944c087
--- /dev/null
+++ b/examples/workers/l2/vector_add/test_run_timing.py
@@ -0,0 +1,115 @@
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Hardware ST asserting Worker.run returns a usable RunTiming.
+
+Reuses the vector_add example's kernel + ChipCallable build so this test
+doesn't drag in its own kernel sources. The contract being verified:
+
+    * `worker.run(...)` returns a `RunTiming` instance (not None).
+    * `host_wall_us` is strictly positive — there's no way a real dispatch
+      took zero steady-clock time, so this would catch a regression where
+      the C ABI stopped writing the timing back to the out-param.
+    * `device_wall_us` is strictly positive on the default build, even
+      without `--enable-l2-swimlane`. The orch_summary write was decoupled
+      from the swimlane shared-region path so device timing is always
+      available when PTO2_PROFILING is compiled in (default).
+    * `host_wall_us > device_wall_us` — host wall must wrap the device
+      dispatch, so the host clock should always read longer.
+"""
+
+import pytest
+from _task_interface import RunTiming  # pyright: ignore[reportMissingImports]
+from simpler.task_interface import CallConfig, ChipStorageTaskArgs, ContinuousTensor, DataType
+from simpler.worker import Worker
+
+from .main import N_COLS, N_ELEMS, N_ROWS, NBYTES, build_chip_callable
+
+
+def _drive_one_run(platform: str, device_id: int, *, enable_l2_swimlane: bool = False):
+    import torch  # noqa: PLC0415
+
+    worker = Worker(
+        level=2,
+        platform=platform,
+        runtime="tensormap_and_ringbuffer",
+        device_id=device_id,
+    )
+    chip_callable = build_chip_callable(platform)
+    chip_cid = worker.register(chip_callable)
+    worker.init()
+    try:
+        # Use deterministic inputs so the run never accidentally hits a
+        # degenerate kernel path that fails to publish a perf record.
+        host_a = torch.full((N_ROWS, N_COLS), 1.0, dtype=torch.float32)
+        host_b = torch.full((N_ROWS, N_COLS), 2.0, dtype=torch.float32)
+
+        dev_a = worker.malloc(NBYTES)
+        dev_b = worker.malloc(NBYTES)
+        dev_out = worker.malloc(NBYTES)
+        worker.copy_to(dev_a, host_a.data_ptr(), NBYTES)
+        worker.copy_to(dev_b, host_b.data_ptr(), NBYTES)
+
+        args = ChipStorageTaskArgs()
+        args.add_tensor(ContinuousTensor.make(dev_a, (N_ROWS, N_COLS), DataType.FLOAT32))
+        args.add_tensor(ContinuousTensor.make(dev_b, (N_ROWS, N_COLS), DataType.FLOAT32))
+        args.add_tensor(ContinuousTensor.make(dev_out, (N_ROWS, N_COLS), DataType.FLOAT32))
+
+        config = CallConfig()
+        config.enable_l2_swimlane = enable_l2_swimlane
+
+        timing = worker.run(chip_cid, args, config)
+
+        # Verify the output is sane (so we know the kernel actually ran and
+        # the timing isn't from an early-error path).
+        host_out = torch.zeros(N_ROWS, N_COLS, dtype=torch.float32)
+        worker.copy_from(host_out.data_ptr(), dev_out, NBYTES)
+        worker.free(dev_a)
+        worker.free(dev_b)
+        worker.free(dev_out)
+        assert torch.allclose(host_out, host_a + host_b, rtol=1e-5, atol=1e-5), (
+            f"vector_add kernel output diverged; max |a+b - out| = "
+            f"{float(torch.max(torch.abs(host_out - (host_a + host_b)))):.3e} "
+            f"(N_ELEMS={N_ELEMS})"
+        )
+        return timing
+    finally:
+        worker.close()
+
+
+@pytest.mark.platforms(["a2a3sim", "a2a3"])
+@pytest.mark.runtime("tensormap_and_ringbuffer")
+@pytest.mark.device_count(1)
+def test_worker_run_returns_run_timing(st_platform, st_device_ids):
+    timing = _drive_one_run(st_platform, int(st_device_ids[0]))
+
+    assert isinstance(timing, RunTiming), (
+        f"Worker.run must return a RunTiming (got {type(timing).__name__}); "
+        f"a None return means the ChipWorker Python wrapper dropped the C++ return value."
+    )
+    # Host wall is measured with steady_clock around the dispatch; even on
+    # sim it covers thread + IPC overhead so the only way to see 0 is a
+    # bug in the out-param plumbing.
+    assert timing.host_wall_us > 0.0, f"host_wall_us must be > 0, got {timing.host_wall_us}"
+    assert timing.host_wall_ns > 0, f"host_wall_ns must be > 0, got {timing.host_wall_ns}"
+    # device_wall must also be > 0 without --enable-l2-swimlane after the
+    # Phase B decoupling: orch_summary is written unconditionally when
+    # PTO2_PROFILING is on (default build). Hitting 0 here means either:
+    #   - the AICPU's l2_perf_aicpu_write_orch_summary path regressed back
+    #     under an is_l2_swimlane_enabled() gate, or
+    #   - the host stopped reading the phase header after the run.
+    assert timing.device_wall_us > 0.0, (
+        f"device_wall_us must be > 0 on the default PTO2_PROFILING build, got {timing.device_wall_us}; "
+        f"regression in the orch_summary capture path."
+    )
+    assert timing.device_wall_ns > 0
+    # Host wall must wrap the device dispatch — anything else points at
+    # a steady-clock / cycle-conversion bug.
+    assert timing.host_wall_us >= timing.device_wall_us, (
+        f"host_wall_us ({timing.host_wall_us}) must wrap device_wall_us ({timing.device_wall_us})"
+    )
diff --git a/python/bindings/task_interface.cpp b/python/bindings/task_interface.cpp
index a6f07c48c..5d4d874fa 100644
--- a/python/bindings/task_interface.cpp
+++ b/python/bindings/task_interface.cpp
@@ -652,6 +652,57 @@ NB_MODULE(_task_interface, m) {
     // one, change the other.
     m.attr("DEFAULT_LOG_THRESHOLD") = 20;  // V5 = Python INFO
 
+    // --- RunTiming ---
+    // Returned by ChipWorker.run_prepared* / Worker.run. Cycles → ns conversion
+    // happens on the platform side (frequency known there); units exposed to
+    // Python are µs as floats to match historical benchmark_rounds.sh output.
+    nb::class_<RunTiming>(m, "RunTiming")
+        .def(nb::init<>())
+        .def(
+            "__init__",
+            [](RunTiming *self, uint64_t host_wall_ns, uint64_t device_wall_ns) {
+                new (self) RunTiming{host_wall_ns, device_wall_ns};
+            },
+            nb::arg("host_wall_ns"), nb::arg("device_wall_ns") = 0,
+            "Construct with explicit ns values (used by the Python Worker.run "
+            "wrapper to surface host-side timing for L3+ DAGs)."
+        )
+        .def_prop_ro(
+            "host_wall_us",
+            [](const RunTiming &t) {
+                return t.host_wall_ns / 1000.0;
+            },
+            "Host steady-clock wall around the dispatch, in microseconds."
+        )
+        .def_prop_ro(
+            "device_wall_us",
+            [](const RunTiming &t) {
+                return t.device_wall_ns / 1000.0;
+            },
+            "On-NPU wall (orch end - orch start), in microseconds. Populated whenever the "
+            "runtime was built with PTO2_PROFILING (the default); independent of "
+            "enable_l2_swimlane after the orch_summary capture was decoupled from the "
+            "swimlane shared region. Zero only on a PTO2_PROFILING-off build."
+        )
+        .def_prop_ro(
+            "host_wall_ns",
+            [](const RunTiming &t) {
+                return t.host_wall_ns;
+            }
+        )
+        .def_prop_ro(
+            "device_wall_ns",
+            [](const RunTiming &t) {
+                return t.device_wall_ns;
+            }
+        )
+        .def("__repr__", [](const RunTiming &t) {
+            std::ostringstream os;
+            os << "RunTiming(host_wall_us=" << t.host_wall_ns / 1000.0
+               << ", device_wall_us=" << t.device_wall_ns / 1000.0 << ")";
+            return os.str();
+        });
+
     // --- ChipWorker ---
     nb::class_<ChipWorker>(m, "_ChipWorker")
         .def(nb::init<>())
@@ -685,19 +736,21 @@ NB_MODULE(_task_interface, m) {
         .def(
             "run",
             [](ChipWorker &self, int32_t callable_id, ChipStorageTaskArgs &args, const CallConfig &config) {
-                self.run(callable_id, &args, config);
+                return self.run(callable_id, &args, config);
             },
             nb::arg("callable_id"), nb::arg("args"), nb::arg("config"),
-            "Launch a callable_id previously staged via prepare_callable."
+            "Launch a callable_id previously staged via prepare_callable. "
+            "Returns RunTiming with host/device wall."
         )
         .def(
             "run",
             [](ChipWorker &self, int32_t callable_id, TaskArgs &args, const CallConfig &config) {
                 TaskArgsView view = make_view(args);
-                self.run(callable_id, view, config);
+                return self.run(callable_id, view, config);
             },
             nb::arg("callable_id"), nb::arg("args"), nb::arg("config"),
-            "Launch a callable_id from a TaskArgs (used for in-process callers)."
+            "Launch a callable_id from a TaskArgs (used for in-process callers). "
+            "Returns RunTiming."
         )
         .def(
             "run_prepared_from_blob",
@@ -710,7 +763,7 @@ NB_MODULE(_task_interface, m) {
                 // loops never re-implement the tensor/scalar layout in Python
                 // (where it has historically dropped fields like child_memory).
                 TaskArgsView view = read_blob(reinterpret_cast<const uint8_t *>(args_blob_ptr), blob_capacity);
-                self.run(callable_id, view, config);
+                return self.run(callable_id, view, config);
             },
             nb::arg("callable_id"), nb::arg("args_blob_ptr"), nb::arg("blob_capacity"), nb::arg("config"),
             "Launch a callable_id from a raw mailbox-blob pointer + capacity "
diff --git a/python/simpler/task_interface.py b/python/simpler/task_interface.py
index 2e0398898..49e8471d6 100644
--- a/python/simpler/task_interface.py
+++ b/python/simpler/task_interface.py
@@ -521,12 +521,14 @@ def run(self, callable_id, args, config=None, **kwargs):
             args: ChipStorageTaskArgs for this invocation.
             config: Optional CallConfig. If None, a default is created.
             **kwargs: Overrides applied to config (e.g. block_dim=24).
+
+        Returns a :class:`RunTiming` with host + device wall.
         """
         if config is None:
             config = CallConfig()
         for k, v in kwargs.items():
             setattr(config, k, v)
-        self._impl.run(int(callable_id), args, config)
+        return self._impl.run(int(callable_id), args, config)
 
     def unregister_callable(self, callable_id):
         """Drop prepared state for ``callable_id`` and release its orch SO share."""
diff --git a/python/simpler/worker.py b/python/simpler/worker.py
index 64c678236..ba02d5e86 100644
--- a/python/simpler/worker.py
+++ b/python/simpler/worker.py
@@ -70,6 +70,7 @@ def my_l4_orch(orch, args, config):
     MAX_REGISTERED_CALLABLE_IDS,
     ChipBootstrapChannel,
     ChipBootstrapMailboxState,
+    RunTiming,
     _mailbox_load_i32,
     _mailbox_store_i32,
     read_args_from_blob,
@@ -1444,7 +1445,7 @@ def copy_from(self, dst: int, src: int, size: int, worker_id: int = 0) -> None:
     # run — uniform entry point
     # ------------------------------------------------------------------
 
-    def run(self, callable, args=None, config=None) -> None:
+    def run(self, callable, args=None, config=None) -> RunTiming:
         """Execute one task (L2) or one DAG (L3+) synchronously.
 
         Dispatch:
@@ -1455,35 +1456,48 @@ def run(self, callable, args=None, config=None) -> None:
 
         ``args``  : TaskArgs (optional)
         ``config``: CallConfig (optional, default-constructed if None)
+
+        Returns a :class:`RunTiming` with ``host_wall_us`` (Python wall-clock
+        around the dispatch) and ``device_wall_us`` (on-NPU orchestrator wall,
+        populated whenever the runtime was built with ``PTO2_PROFILING`` —
+        the default build has it on). For L3+ DAGs, ``host_wall_us`` covers
+        the whole orch fn and ``device_wall_us`` is unset (0) — per-task
+        device timings are not aggregated here.
         """
         assert self._initialized, "Worker not initialized; call init() first"
         cfg = config if config is not None else CallConfig()
 
         if self.level == 2:
             assert self._chip_worker is not None
-            self._chip_worker.run(int(callable), args, cfg)
-        else:
-            self._start_hierarchical()
-            assert self._orch is not None
-            assert self._worker is not None
-            # Drop any error stashed by a previous run() so this call starts
-            # clean. drain() rethrows on the way out; every successful run()
-            # leaves the error slot empty, but an unrelated caller may have
-            # poked it.
-            self._orch._clear_error()
-            self._orch._scope_begin()
-            try:
-                callable(self._orch, args, cfg)
-            finally:
-                # Always release scope refs and drain so ring slots aren't
-                # stranded when the orch fn raises mid-DAG. drain() also
-                # rethrows the first dispatch failure for this run — that
-                # is how child-task exceptions surface to the caller of
-                # Worker.run(). scope_end deliberately does NOT throw: if
-                # it did, released refs would be incomplete and drain
-                # would hang on in-flight tasks.
-                self._orch._scope_end()
-                self._orch._drain()
+            return self._chip_worker.run(int(callable), args, cfg)
+
+        self._start_hierarchical()
+        assert self._orch is not None
+        assert self._worker is not None
+        # Drop any error stashed by a previous run() so this call starts
+        # clean. drain() rethrows on the way out; every successful run()
+        # leaves the error slot empty, but an unrelated caller may have
+        # poked it.
+        self._orch._clear_error()
+        self._orch._scope_begin()
+        t_start = time.perf_counter_ns()
+        try:
+            callable(self._orch, args, cfg)
+        finally:
+            # Always release scope refs and drain so ring slots aren't
+            # stranded when the orch fn raises mid-DAG. drain() also
+            # rethrows the first dispatch failure for this run — that
+            # is how child-task exceptions surface to the caller of
+            # Worker.run(). scope_end deliberately does NOT throw: if
+            # it did, released refs would be incomplete and drain
+            # would hang on in-flight tasks.
+            self._orch._scope_end()
+            self._orch._drain()
+        # device_wall stays 0 for L3+: aggregating per-task device cycles
+        # across a DAG isn't implemented here (would need accumulation in the
+        # ring scheduler). Callers wanting per-task device wall should issue
+        # individual run calls.
+        return RunTiming(time.perf_counter_ns() - t_start, 0)
 
     def prepare_callable(self, callable_id: int, callable) -> None:
         """L2 only: pre-stage a callable under ``callable_id`` (see
diff --git a/simpler_setup/scene_test.py b/simpler_setup/scene_test.py
index e681fa167..5a994b332 100644
--- a/simpler_setup/scene_test.py
+++ b/simpler_setup/scene_test.py
@@ -325,6 +325,69 @@ def _temporary_env(env_updates):
                 os.environ[k] = v
 
 
+def _log_round_timings(timings):
+    """Print per-round + summary host/device wall (µs) for multi-round runs.
+
+    Replaces the device-log scraping that ``tools/benchmark_rounds.sh`` used to
+    do — Worker.run now returns the timing directly, so the per-round table
+    can be emitted by the framework without grep+awk over device logs.
+
+    Output goes to ``print`` (not ``logger.info``) because benchmark numbers
+    are a user-facing artifact and the project's default log level suppresses
+    INFO in standalone test_*.py runs.
+
+    ``timings`` is a list of (host_wall_us, device_wall_us) tuples. The device
+    column reports 0 when the runtime was built without PTO2_PROFILING (the
+    default build has it on) or when an L3+ DAG is in use (per-task device
+    cycles aren't aggregated).
+    """
+    if not timings:
+        return
+    n = len(timings)
+    host_vals = sorted(t[0] for t in timings)
+    dev_vals = sorted(t[1] for t in timings)
+    host_avg = sum(host_vals) / n
+    # Device avg averages over non-zero rounds only. When --rounds > 1 the
+    # framework restricts enable_l2_swimlane (and thus orch_summary capture)
+    # to round 0, so the other rounds report 0; including those zeros in the
+    # average would silently halve / quarter the reported device wall and
+    # mislead the benchmark consumer. dev_count carries the sample size into
+    # the summary line for transparency.
+    dev_nonzero = [v for v in dev_vals if v > 0.0]
+    dev_count = len(dev_nonzero)
+    dev_avg = (sum(dev_nonzero) / dev_count) if dev_count else 0.0
+    show_device = dev_count > 0
+
+    header = f"  {'Round':<6}  {'Host (us)':>12}"
+    if show_device:
+        header += f"  {'Device (us)':>12}"
+    print(header)
+    print("  " + "-" * (len(header) - 2))
+    for i, (h, d) in enumerate(timings):
+        line = f"  {i:<6d}  {h:>12.1f}"
+        if show_device:
+            line += f"  {d:>12.1f}"
+        print(line)
+    summary = f"  Avg Host: {host_avg:.1f} us"
+    if show_device:
+        summary += f"  |  Avg Device: {dev_avg:.1f} us [{dev_count}/{n} rounds captured]"
+    summary += f"  ({n} rounds)"
+    print(summary)
+
+    trim = 10
+    if n > 2 * trim:
+        tc = n - 2 * trim
+        host_trim = sum(host_vals[trim:-trim]) / tc
+        msg = f"  Trimmed Avg Host: {host_trim:.1f} us"
+        if show_device and dev_count > 2 * trim:
+            dev_nonzero_sorted = sorted(dev_nonzero)
+            dev_trim_count = dev_count - 2 * trim
+            dev_trim = sum(dev_nonzero_sorted[trim:-trim]) / dev_trim_count
+            msg += f"  |  Trimmed Avg Device: {dev_trim:.1f} us"
+        msg += f"  (dropped {trim} low + {trim} high, {tc} rounds used)"
+        print(msg)
+
+
 def _resolve_callable_paths(cls, cls_dir):
     """Resolve relative source paths in CALLABLE against cls_dir."""
     callable_spec = cls.CALLABLE
@@ -909,26 +972,37 @@ def _run_and_validate_l2(
                 initial_outputs[name] = getattr(test_args, name).clone()
 
         # Execute rounds
+        timings = []  # populated only when rounds > 1
         for round_idx in range(rounds):
             if round_idx > 0:
                 for name, initial in initial_outputs.items():
                     getattr(test_args, name).copy_(initial)
 
+            # enable_l2_swimlane / enable_dep_gen are already forced False by
+            # the upstream gate in test_run / run_module when rounds > 1, so an
+            # extra `and round_idx == 0` here is dead code; pass them through
+            # verbatim. (If the upstream gate is ever relaxed, restore the
+            # per-round masking here.)
             config = self._build_config(
                 config_dict,
-                enable_l2_swimlane=(enable_l2_swimlane if round_idx == 0 else 0),
+                enable_l2_swimlane=enable_l2_swimlane,
                 enable_dump_tensor=enable_dump_tensor,
                 enable_pmu=enable_pmu,
-                enable_dep_gen=(enable_dep_gen and round_idx == 0),
+                enable_dep_gen=enable_dep_gen,
                 output_prefix=output_prefix,
             )
 
             with _temporary_env(self._resolve_env()):
-                worker.run(cid, chip_args, config=config)
+                timing = worker.run(cid, chip_args, config=config)
+            if rounds > 1 and timing is not None:
+                timings.append((timing.host_wall_us, timing.device_wall_us))
 
             if not skip_golden:
                 _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL)
 
+        if timings:
+            _log_round_timings(timings)
+
     def _run_and_validate_l3(  # noqa: PLR0913 -- threads CLI diagnostic flags + L3 ns context
         self,
         worker,
@@ -981,17 +1055,20 @@ def _run_and_validate_l3(  # noqa: PLR0913 -- threads CLI diagnostic flags + L3
         orch_fn = self.CALLABLE["orchestration"]
 
         # Execute rounds
+        timings = []
         for round_idx in range(rounds):
             if round_idx > 0:
                 for name, initial in initial_tensors.items():
                     getattr(test_args, name).copy_(initial)
 
+            # See _run_and_validate_l2: the per-round masking is dead code
+            # under the existing upstream gate. Keep parity by passing through.
             config = self._build_config(
                 config_dict,
-                enable_l2_swimlane=(enable_l2_swimlane if round_idx == 0 else 0),
+                enable_l2_swimlane=enable_l2_swimlane,
                 enable_dump_tensor=enable_dump_tensor,
                 enable_pmu=enable_pmu,
-                enable_dep_gen=(enable_dep_gen and round_idx == 0),
+                enable_dep_gen=enable_dep_gen,
                 output_prefix=output_prefix,
             )
 
@@ -1001,11 +1078,18 @@ def task_orch(orch, _args, _cfg, _ns=ns, _test_args=test_args, _config=config):
                 orch_fn(orch, _ns, _test_args, _config)
 
             with _temporary_env(self._resolve_env()):
-                worker.run(task_orch)
+                timing = worker.run(task_orch)
+            if rounds > 1 and timing is not None:
+                # L3+ DAGs surface host wall only; device_wall_us is 0 because
+                # per-task device cycles aren't aggregated up to Worker.run.
+                timings.append((timing.host_wall_us, timing.device_wall_us))
 
             if not skip_golden:
                 _compare_outputs(test_args, golden_args, all_tensor_names, self.RTOL, self.ATOL)
 
+        if timings:
+            _log_round_timings(timings)
+
     # ------------------------------------------------------------------
     # pytest auto test method
     # ------------------------------------------------------------------
diff --git a/src/a2a3/platform/include/common/kernel_args.h b/src/a2a3/platform/include/common/kernel_args.h
index 9cffcd169..99e72fdea 100644
--- a/src/a2a3/platform/include/common/kernel_args.h
+++ b/src/a2a3/platform/include/common/kernel_args.h
@@ -95,6 +95,19 @@ struct KernelArgs {
     uint32_t log_info_v{5};             // INFO verbosity threshold (0..9); default V5
     uint32_t enable_profiling_flag{0};  // Profiling umbrella bitmask; bit0=dump_tensor, bit1=l2_swimlane, bit2=pmu
     uint32_t _pad{0};                   // Alignment padding
+
+    // Device pointer to an 8-byte buffer that the platform AICPU entry writes
+    // the run-wall (ns) into. Allocated once at simpler_init, kept resident.
+    // Onboard AICPU receives KernelArgs as a CANN-private copy (see
+    // rtAicpuKernelLaunchExWithArgs in launch_aicpu_kernel), so an inline
+    // field on KernelArgs is write-only from AICPU — host has no way to read
+    // it back. The dedicated device buffer is host-allocated and the address
+    // travels via this field; AICPU writes `*(uint64_t*)device_wall_data_base
+    // = wall_ns`, host pulls the 8 bytes via rtMemcpy(... DEVICE_TO_HOST)
+    // after stream sync. Sim's "device pointer" is a host malloc'd uint64
+    // (no special-casing — write-through works because sim AICPU and host
+    // share memory). Zero when the buffer was not allocated.
+    uint64_t device_wall_data_base{0};
 };
 
 #ifdef __cplusplus
diff --git a/src/a2a3/platform/onboard/aicpu/kernel.cpp b/src/a2a3/platform/onboard/aicpu/kernel.cpp
index 843dc9fd6..32e24a526 100644
--- a/src/a2a3/platform/onboard/aicpu/kernel.cpp
+++ b/src/a2a3/platform/onboard/aicpu/kernel.cpp
@@ -15,6 +15,7 @@
 #include "common/platform_config.h"
 #include "aicpu/dep_gen_collector_aicpu.h"
 #include "aicpu/device_log.h"
+#include "aicpu/device_time.h"
 #include "aicpu/l2_perf_collector_aicpu.h"
 #include "aicpu/pmu_collector_aicpu.h"
 #include "aicpu/platform_regs.h"
@@ -22,6 +23,15 @@
 #include "aicpu/tensor_dump_aicpu.h"
 #include "runtime.h"
 
+// Run-wall capture: g_device_start_cycle is set once in
+// DynTileFwkBackendKernelServerInit (single-threaded launch); each thread
+// of the multi-threaded DynTileFwkBackendKernelServer writes the converted
+// (end - start) into KernelArgs.device_wall_ns on exit. Plain stores —
+// last-writer-wins is fine for wall measurement (concurrent exiting threads'
+// `my_end` values differ by µs, the final overwrite is within benchmark
+// noise of the true max-end - min-start).
+static uint64_t g_device_start_cycle = 0;
+
 // Forward declaration of aicpu_execute (implemented in aicpu_executor.cpp)
 extern "C" int aicpu_execute(Runtime *arg);
 
@@ -55,6 +65,15 @@ extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelSer
     set_log_level(static_cast<int>(k_args->log_level));
     set_log_info_v(static_cast<int>(k_args->log_info_v));
 
+    // Init is launched single-threaded (block_dim=1), so the race-free spot
+    // to capture run start and reset the wall accumulator. Subsequent
+    // DynTileFwkBackendKernelServer threads stamp end on their way out, via
+    // the device-resident 8-byte buffer addressed by device_wall_data_base.
+    g_device_start_cycle = get_sys_cnt_aicpu();
+    if (k_args->device_wall_data_base != 0) {
+        *reinterpret_cast<uint64_t *>(k_args->device_wall_data_base) = 0;
+    }
+
     LOG_INFO_V0("%s", "Runtime Executor Init: Initializing AICPU kernel");
     return 0;
 }
@@ -117,5 +136,14 @@ extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelSer
     }
     LOG_INFO_V0("%s", "DynTileFwkBackendKernelServer: aicpu_execute completed successfully");
 
+    // Stamp end into the device_wall buffer (addressed via
+    // device_wall_data_base). Last-writer-wins across threads — wall
+    // measurements are µs-scale tolerant, see g_device_start_cycle comment.
+    uint64_t my_end = get_sys_cnt_aicpu();
+    if (k_args->device_wall_data_base != 0 && my_end > g_device_start_cycle) {
+        *reinterpret_cast<uint64_t *>(k_args->device_wall_data_base) =
+            static_cast<uint64_t>(cycles_to_us(my_end - g_device_start_cycle) * 1000.0);
+    }
+
     return rc;
 }
diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp
index 7e51b5c94..933247b4f 100644
--- a/src/a2a3/platform/onboard/host/device_runner.cpp
+++ b/src/a2a3/platform/onboard/host/device_runner.cpp
@@ -22,6 +22,7 @@
 #include <dlfcn.h>
 
 #include <cassert>
+#include <cstddef>
 #include <iostream>
 #include <string>
 #include <vector>
@@ -512,6 +513,19 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
         return rc;
     }
 
+    // Lazy-allocate the 8-byte device_wall buffer on first run. AICPU writes
+    // the run wall (ns) into this buffer via KernelArgs::device_wall_data_base;
+    // host pulls it back via rtMemcpy D2H after stream sync (see post-sync
+    // block below). Kept resident across runs; freed in finalize.
+    if (device_wall_dev_ptr_ == nullptr) {
+        device_wall_dev_ptr_ = allocate_tensor(sizeof(uint64_t));
+        if (device_wall_dev_ptr_ != nullptr) {
+            kernel_args_.args.device_wall_data_base = reinterpret_cast<uint64_t>(device_wall_dev_ptr_);
+            uint64_t zero = 0;
+            (void)copy_to_device(device_wall_dev_ptr_, &zero, sizeof(uint64_t));
+        }
+    }
+
     // Validate block_dim against stream resource limits
     rc = validate_block_dim(stream_aicore_, block_dim);
     if (rc != 0) {
@@ -763,6 +777,23 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
         return rc;
     }
 
+    // Pull the platform-level device wall (ns) back from the 8-byte device
+    // buffer that AICPU writes through via KernelArgs::device_wall_data_base.
+    // (We can't use the device_k_args_ shadow here — CANN's
+    // rtAicpuKernelLaunchExWithArgs copies KernelArgs into AICPU-private
+    // memory at launch, so AICPU's writes to its local copy don't propagate
+    // to device_k_args_.) Failure path is a soft warn — wall stays zero.
+    device_wall_ns_ = 0;
+    if (device_wall_dev_ptr_ != nullptr) {
+        int wall_rc = rtMemcpy(
+            &device_wall_ns_, sizeof(uint64_t), device_wall_dev_ptr_, sizeof(uint64_t), RT_MEMCPY_DEVICE_TO_HOST
+        );
+        if (wall_rc != 0) {
+            LOG_WARN("rtMemcpy(device_wall_ns) D2H failed: %d", wall_rc);
+            device_wall_ns_ = 0;
+        }
+    }
+
     // Tear down collectors. stop() joins mgmt then collector in the only safe
     // order (mgmt's final-drain pass into L2 has poll as its consumer).
     // Diagnostic exports use the per-task `output_prefix_` directory the user
@@ -1102,6 +1133,11 @@ int DeviceRunner::finalize() {
         acl_ready_ = false;
     }
 
+    // Free the 8-byte device_wall buffer (allocated lazily in run()).
+    if (device_wall_dev_ptr_ != nullptr) {
+        free_tensor(device_wall_dev_ptr_);
+        device_wall_dev_ptr_ = nullptr;
+    }
     device_id_ = -1;
     block_dim_ = 0;
     worker_count_ = 0;
diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h
index 812ba3c4a..99ad566f4 100644
--- a/src/a2a3/platform/onboard/host/device_runner.h
+++ b/src/a2a3/platform/onboard/host/device_runner.h
@@ -294,6 +294,14 @@ class DeviceRunner {
     void set_output_prefix(const char *prefix) { output_prefix_ = (prefix != nullptr) ? prefix : ""; }
     const std::string &output_prefix() const { return output_prefix_; }
 
+    /**
+     * Device-side wall (ns) from the most recently completed run, written
+     * by the platform AICPU entry (onboard: kernel.cpp; sim: lambda in
+     * run()). Returns 0 before any run completes. Independent of any
+     * profiling / swimlane subsystem.
+     */
+    uint64_t last_device_wall_ns() const { return device_wall_ns_; }
+
     /**
      * Print handshake results from device
      *
@@ -550,6 +558,15 @@ class DeviceRunner {
     rtStream_t stream_aicore_{nullptr};
     AicpuSoInfo so_info_;
     KernelArgsHelper kernel_args_;
+
+    // Platform-level device wall buffer: 8-byte device-resident slot whose
+    // address rides on KernelArgs.device_wall_data_base. AICPU writes the
+    // run wall (ns) through that pointer; this DeviceRunner pulls it back
+    // via copy_from_device after stream sync and caches it for
+    // last_device_wall_ns(). Allocated once at simpler_init, freed in
+    // finalize.
+    void *device_wall_dev_ptr_{nullptr};
+    uint64_t device_wall_ns_{0};
     DeviceArgs device_args_;
 
     // Kernel binary management
diff --git a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp
index 267114b7a..b287e277b 100644
--- a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp
+++ b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp
@@ -23,6 +23,7 @@
 
 #include <pthread.h>
 
+#include <chrono>
 #include <cstdlib>
 #include <utility>
 #include <vector>
@@ -298,8 +299,12 @@ int prepare_callable(DeviceContextHandle ctx, int32_t callable_id, const void *c
 int run_prepared(
     DeviceContextHandle ctx, RuntimeHandle runtime, int32_t callable_id, const void *args, int block_dim,
     int aicpu_thread_num, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, int enable_dep_gen,
-    const char *output_prefix
+    const char *output_prefix, PtoRunTiming *out_timing
 ) {
+    if (out_timing != NULL) {
+        out_timing->host_wall_ns = 0;
+        out_timing->device_wall_ns = 0;
+    }
     if (ctx == NULL || runtime == NULL) return -1;
     DeviceRunner *runner = static_cast<DeviceRunner *>(ctx);
 
@@ -314,6 +319,8 @@ int run_prepared(
         pthread_setspecific(g_runner_key, nullptr);
     });
 
+    const auto host_t0 = std::chrono::steady_clock::now();
+
     try {
         int rc = runner->prepare_run_context(runner->device_id());
         if (rc != 0) return rc;
@@ -367,6 +374,12 @@ int run_prepared(
 
         rc = validate_runtime_impl(r);
         r->~Runtime();
+        if (out_timing != NULL) {
+            const auto host_t1 = std::chrono::steady_clock::now();
+            out_timing->host_wall_ns =
+                static_cast<uint64_t>(std::chrono::duration_cast<std::chrono::nanoseconds>(host_t1 - host_t0).count());
+            out_timing->device_wall_ns = runner->last_device_wall_ns();
+        }
         return rc;
     } catch (...) {
         return -1;
diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp
index 46d3e6c5b..bd2b9a719 100644
--- a/src/a2a3/platform/sim/host/device_runner.cpp
+++ b/src/a2a3/platform/sim/host/device_runner.cpp
@@ -30,6 +30,7 @@
 #include <sys/stat.h>
 
 #include <atomic>
+#include <chrono>
 #include <cstdio>
 #include <string>
 #include <vector>
@@ -351,6 +352,18 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
         return rc;
     }
 
+    // Lazy-allocate the 8-byte device_wall buffer on first run. Sim's
+    // "device pointer" is a host malloc returned by allocate_tensor; the
+    // sim AICPU thread (host CPU thread) writes through this pointer just
+    // like onboard's AICPU does.
+    if (device_wall_dev_ptr_ == nullptr) {
+        device_wall_dev_ptr_ = allocate_tensor(sizeof(uint64_t));
+        if (device_wall_dev_ptr_ != nullptr) {
+            kernel_args_.device_wall_data_base = reinterpret_cast<uint64_t>(device_wall_dev_ptr_);
+            *static_cast<uint64_t *>(device_wall_dev_ptr_) = 0;
+        }
+    }
+
     // Calculate execution parameters
     block_dim_ = block_dim;
     int num_aicore = block_dim * cores_per_blockdim_;
@@ -578,8 +591,20 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
     std::vector<std::thread> aicpu_threads;
     aicpu_threads.reserve(over_launch);
     std::atomic<int> aicpu_rc{0};
+
+    // Sim "device wall" capture — mirrors the onboard kernel.cpp path via the
+    // device_wall_data_base buffer (sim's "device pointer" is a host malloc'd
+    // uint64). sim_t0 is the host steady_clock equivalent of get_sys_cnt_aicpu
+    // in Init; each thread on exit overwrites the buffer with (now - sim_t0).
+    // Last-writer-wins — sim threads are host CPU threads, variance is well
+    // below benchmark noise.
+    if (kernel_args_.device_wall_data_base != 0) {
+        *reinterpret_cast<uint64_t *>(kernel_args_.device_wall_data_base) = 0;
+    }
+    const auto sim_t0 = std::chrono::steady_clock::now();
+
     for (int i = 0; i < over_launch; i++) {
-        aicpu_threads.push_back(create_thread([this, &runtime, launch_aicpu_num, over_launch, &aicpu_rc]() {
+        aicpu_threads.push_back(create_thread([this, &runtime, launch_aicpu_num, over_launch, &aicpu_rc, sim_t0]() {
             if (!platform_aicpu_affinity_gate(launch_aicpu_num, over_launch)) {
                 return;
             }
@@ -588,6 +613,11 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
                 int expected = 0;
                 aicpu_rc.compare_exchange_strong(expected, rc, std::memory_order_acq_rel);
             }
+            if (kernel_args_.device_wall_data_base != 0) {
+                const auto t1 = std::chrono::steady_clock::now();
+                *reinterpret_cast<uint64_t *>(kernel_args_.device_wall_data_base) =
+                    static_cast<uint64_t>(std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - sim_t0).count());
+            }
         }));
     }
 
@@ -615,6 +645,14 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
     }
     LOG_INFO_V0("All threads completed");
 
+    // Snapshot the device_wall buffer into device_wall_ns_. Sim's "device
+    // memory" is host memory, so this is a single dereference — no DMA.
+    // Mirrors the post-sync rtMemcpy D2H that the onboard DeviceRunner does.
+    device_wall_ns_ = 0;
+    if (device_wall_dev_ptr_ != nullptr) {
+        device_wall_ns_ = *static_cast<uint64_t *>(device_wall_dev_ptr_);
+    }
+
     int runtime_rc = aicpu_rc.load(std::memory_order_acquire);
     if (runtime_rc != 0) {
         LOG_ERROR("AICPU execution failed with rc=%d", runtime_rc);
@@ -955,6 +993,11 @@ int DeviceRunner::finalize() {
     mem_alloc_.finalize();
     clear_cpu_sim_shared_storage();
 
+    // Free the 8-byte device_wall buffer (allocated lazily in run()).
+    if (device_wall_dev_ptr_ != nullptr) {
+        free_tensor(device_wall_dev_ptr_);
+        device_wall_dev_ptr_ = nullptr;
+    }
     device_id_ = -1;
     worker_count_ = 0;
     last_runtime_ = nullptr;
diff --git a/src/a2a3/platform/sim/host/device_runner.h b/src/a2a3/platform/sim/host/device_runner.h
index 573236634..748b4e7ab 100644
--- a/src/a2a3/platform/sim/host/device_runner.h
+++ b/src/a2a3/platform/sim/host/device_runner.h
@@ -174,6 +174,14 @@ class DeviceRunner {
     void set_output_prefix(const char *prefix) { output_prefix_ = (prefix != nullptr) ? prefix : ""; }
     const std::string &output_prefix() const { return output_prefix_; }
 
+    /**
+     * Device-side wall (ns) from the most recently completed run, written
+     * by the platform AICPU entry (onboard: kernel.cpp; sim: lambda in
+     * run()). Returns 0 before any run completes. Independent of any
+     * profiling / swimlane subsystem.
+     */
+    uint64_t last_device_wall_ns() const { return device_wall_ns_; }
+
     /**
      * Attach the calling thread to the simulated device.
      *
@@ -257,6 +265,15 @@ class DeviceRunner {
     // Simulation state (no actual device resources)
     KernelArgs kernel_args_;
 
+    // Platform-level device wall buffer: 8-byte device-resident slot whose
+    // address rides on KernelArgs.device_wall_data_base. AICPU writes the
+    // run wall (ns) through that pointer; this DeviceRunner pulls it back
+    // via copy_from_device after stream sync and caches it for
+    // last_device_wall_ns(). Allocated once at simpler_init, freed in
+    // finalize.
+    void *device_wall_dev_ptr_{nullptr};
+    uint64_t device_wall_ns_{0};
+
     // Chip-callable buffer pool (sim path). Keyed by FNV-1a 64-bit content
     // hash of the ChipCallable bytes. Each entry owns a host scratch holding
     // the ChipCallable with each child's resolved_addr_ fixed up to the
diff --git a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp
index afbd85718..b6276f184 100644
--- a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp
+++ b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp
@@ -24,6 +24,7 @@
 #include <new>
 #include <pthread.h>
 
+#include <chrono>
 #include <utility>
 #include <vector>
 
@@ -273,8 +274,12 @@ int prepare_callable(DeviceContextHandle ctx, int32_t callable_id, const void *c
 int run_prepared(
     DeviceContextHandle ctx, RuntimeHandle runtime, int32_t callable_id, const void *args, int block_dim,
     int aicpu_thread_num, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, int enable_dep_gen,
-    const char *output_prefix
+    const char *output_prefix, PtoRunTiming *out_timing
 ) {
+    if (out_timing != NULL) {
+        out_timing->host_wall_ns = 0;
+        out_timing->device_wall_ns = 0;
+    }
     if (ctx == NULL || runtime == NULL) return -1;
     DeviceRunner *runner = static_cast<DeviceRunner *>(ctx);
 
@@ -286,6 +291,8 @@ int run_prepared(
     pthread_once(&g_runner_key_once, create_runner_key);
     pthread_setspecific(g_runner_key, ctx);
 
+    const auto host_t0 = std::chrono::steady_clock::now();
+
     try {
         Runtime *r = new (runtime) Runtime();
         r->host_api.device_malloc = device_malloc;
@@ -331,6 +338,12 @@ int run_prepared(
         rc = validate_runtime_impl(r);
         r->~Runtime();
         pthread_setspecific(g_runner_key, nullptr);
+        if (out_timing != NULL) {
+            const auto host_t1 = std::chrono::steady_clock::now();
+            out_timing->host_wall_ns =
+                static_cast<uint64_t>(std::chrono::duration_cast<std::chrono::nanoseconds>(host_t1 - host_t0).count());
+            out_timing->device_wall_ns = runner->last_device_wall_ns();
+        }
         return rc;
     } catch (...) {
         pthread_setspecific(g_runner_key, nullptr);
diff --git a/src/a5/platform/include/common/kernel_args.h b/src/a5/platform/include/common/kernel_args.h
index 8494ac369..9a5416806 100644
--- a/src/a5/platform/include/common/kernel_args.h
+++ b/src/a5/platform/include/common/kernel_args.h
@@ -81,6 +81,12 @@ struct KernelArgs {
     uint32_t log_info_v{5};                 // INFO verbosity threshold (0..9); default V5
     uint32_t enable_profiling_flag{0};      // Profiling umbrella bitmask; bit0=dump_tensor, bit1=l2_swimlane, bit2=pmu
     uint32_t _pad{0};                       // Alignment padding
+
+    // Device pointer to an 8-byte buffer that the platform AICPU entry writes
+    // the run-wall (ns) into. Allocated once at simpler_init, kept resident.
+    // See the a2a3 kernel_args.h for the full design rationale (CANN's
+    // AICPU args copy makes inline fields write-only).
+    uint64_t device_wall_data_base{0};
 };
 
 #ifdef __cplusplus
diff --git a/src/a5/platform/onboard/aicpu/kernel.cpp b/src/a5/platform/onboard/aicpu/kernel.cpp
index 4fbea2945..4337e4429 100644
--- a/src/a5/platform/onboard/aicpu/kernel.cpp
+++ b/src/a5/platform/onboard/aicpu/kernel.cpp
@@ -14,6 +14,7 @@
 #include "common/kernel_args.h"
 #include "common/platform_config.h"
 #include "aicpu/device_log.h"
+#include "aicpu/device_time.h"
 #include "aicpu/l2_perf_collector_aicpu.h"
 #include "aicpu/platform_regs.h"
 #include "aicpu/platform_aicpu_affinity.h"
@@ -21,6 +22,13 @@
 #include "aicpu/tensor_dump_aicpu.h"
 #include "runtime.h"
 
+// Run-wall capture: g_device_start_cycle is set once in
+// DynTileFwkBackendKernelServerInit (single-threaded launch); each thread
+// of the multi-threaded DynTileFwkBackendKernelServer writes the converted
+// (end - start) into KernelArgs.device_wall_ns on exit. Plain stores —
+// last-writer-wins is fine for wall measurement.
+static uint64_t g_device_start_cycle = 0;
+
 // Forward declaration of aicpu_execute (implemented in aicpu_executor.cpp)
 extern "C" int aicpu_execute(Runtime *arg);
 
@@ -54,6 +62,13 @@ extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelSer
     set_log_level(static_cast<int>(k_args->log_level));
     set_log_info_v(static_cast<int>(k_args->log_info_v));
 
+    // Init is launched single-threaded (block_dim=1) — race-free spot to
+    // capture run start and reset the device_wall buffer.
+    g_device_start_cycle = get_sys_cnt_aicpu();
+    if (k_args->device_wall_data_base != 0) {
+        *reinterpret_cast<uint64_t *>(k_args->device_wall_data_base) = 0;
+    }
+
     LOG_INFO_V0("%s", "Runtime Executor Init: Initializing AICPU kernel");
     return 0;
 }
@@ -114,5 +129,12 @@ extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelSer
     }
     LOG_INFO_V0("%s", "DynTileFwkBackendKernelServer: aicpu_execute completed successfully");
 
+    // Stamp end into the device_wall buffer. Last-writer-wins across threads.
+    uint64_t my_end = get_sys_cnt_aicpu();
+    if (k_args->device_wall_data_base != 0 && my_end > g_device_start_cycle) {
+        *reinterpret_cast<uint64_t *>(k_args->device_wall_data_base) =
+            static_cast<uint64_t>(cycles_to_us(my_end - g_device_start_cycle) * 1000.0);
+    }
+
     return rc;
 }
diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp
index b27d9c6c9..bb46f24fc 100644
--- a/src/a5/platform/onboard/host/device_runner.cpp
+++ b/src/a5/platform/onboard/host/device_runner.cpp
@@ -23,6 +23,7 @@
 #include <dlfcn.h>
 
 #include <cassert>
+#include <cstddef>
 #include <cstring>
 #include <iostream>
 #include <string>
@@ -375,6 +376,17 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
         return rc;
     }
 
+    // Lazy-allocate the 8-byte device_wall buffer on first run. See the a2a3
+    // onboard equivalent for the rationale.
+    if (device_wall_dev_ptr_ == nullptr) {
+        device_wall_dev_ptr_ = allocate_tensor(sizeof(uint64_t));
+        if (device_wall_dev_ptr_ != nullptr) {
+            kernel_args_.args.device_wall_data_base = reinterpret_cast<uint64_t>(device_wall_dev_ptr_);
+            uint64_t zero = 0;
+            (void)copy_to_device(device_wall_dev_ptr_, &zero, sizeof(uint64_t));
+        }
+    }
+
     // Validate block_dim against stream resource limits
     rc = validate_block_dim(stream_aicore_, block_dim);
     if (rc != 0) {
@@ -560,6 +572,20 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
         return rc;
     }
 
+    // Pull the platform-level device wall (ns) back from the dedicated
+    // 8-byte device buffer (AICPU writes via KernelArgs::device_wall_data_base
+    // — CANN copies args at launch so the inline field would be unreachable).
+    device_wall_ns_ = 0;
+    if (device_wall_dev_ptr_ != nullptr) {
+        int wall_rc = rtMemcpy(
+            &device_wall_ns_, sizeof(uint64_t), device_wall_dev_ptr_, sizeof(uint64_t), RT_MEMCPY_DEVICE_TO_HOST
+        );
+        if (wall_rc != 0) {
+            LOG_WARN("rtMemcpy(device_wall_ns) D2H failed: %d", wall_rc);
+            device_wall_ns_ = 0;
+        }
+    }
+
     // Tear down collectors. stop() joins mgmt then collector in the only
     // safe order (mgmt's final-drain pass into L2 has poll as its
     // consumer). Diagnostic exports use the per-task `output_prefix_`
@@ -882,6 +908,11 @@ int DeviceRunner::finalize() {
         return rc;
     }
 
+    // Free the 8-byte device_wall buffer (allocated lazily in run()).
+    if (device_wall_dev_ptr_ != nullptr) {
+        free_tensor(device_wall_dev_ptr_);
+        device_wall_dev_ptr_ = nullptr;
+    }
     device_id_ = -1;
     block_dim_ = 0;
     worker_count_ = 0;
diff --git a/src/a5/platform/onboard/host/device_runner.h b/src/a5/platform/onboard/host/device_runner.h
index 985f9708b..3762eeaee 100644
--- a/src/a5/platform/onboard/host/device_runner.h
+++ b/src/a5/platform/onboard/host/device_runner.h
@@ -279,6 +279,14 @@ class DeviceRunner {
     void set_output_prefix(const char *prefix) { output_prefix_ = (prefix != nullptr) ? prefix : ""; }
     const std::string &output_prefix() const { return output_prefix_; }
 
+    /**
+     * Device-side wall (ns) from the most recently completed run, written
+     * by the platform AICPU entry (onboard: kernel.cpp; sim: lambda in
+     * run()). Returns 0 before any run completes. Independent of any
+     * profiling / swimlane subsystem.
+     */
+    uint64_t last_device_wall_ns() const { return device_wall_ns_; }
+
     /**
      * Print handshake results from device
      *
@@ -457,6 +465,15 @@ class DeviceRunner {
     rtStream_t stream_aicore_{nullptr};
     AicpuSoInfo so_info_;
     KernelArgsHelper kernel_args_;
+
+    // Platform-level device wall buffer: 8-byte device-resident slot whose
+    // address rides on KernelArgs.device_wall_data_base. AICPU writes the
+    // run wall (ns) through that pointer; this DeviceRunner pulls it back
+    // via copy_from_device after stream sync and caches it for
+    // last_device_wall_ns(). Allocated once at simpler_init, freed in
+    // finalize.
+    void *device_wall_dev_ptr_{nullptr};
+    uint64_t device_wall_ns_{0};
     DeviceArgs device_args_;
 
     // Kernel binary management
diff --git a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp
index 06fad5170..ee5b02f67 100644
--- a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp
+++ b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp
@@ -23,6 +23,7 @@
 
 #include <pthread.h>
 
+#include <chrono>
 #include <cstdlib>
 #include <utility>
 #include <vector>
@@ -334,8 +335,12 @@ int prepare_callable(DeviceContextHandle ctx, int32_t callable_id, const void *c
 int run_prepared(
     DeviceContextHandle ctx, RuntimeHandle runtime, int32_t callable_id, const void *args, int block_dim,
     int aicpu_thread_num, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, int /*enable_dep_gen*/,
-    const char *output_prefix
+    const char *output_prefix, PtoRunTiming *out_timing
 ) {
+    if (out_timing != NULL) {
+        out_timing->host_wall_ns = 0;
+        out_timing->device_wall_ns = 0;
+    }
     if (ctx == NULL || runtime == NULL) return -1;
     DeviceRunner *runner = static_cast<DeviceRunner *>(ctx);
 
@@ -350,6 +355,8 @@ int run_prepared(
         pthread_setspecific(g_runner_key, nullptr);
     });
 
+    const auto host_t0 = std::chrono::steady_clock::now();
+
     try {
         int rc = runner->prepare_run_context(runner->device_id());
         if (rc != 0) return rc;
@@ -402,6 +409,12 @@ int run_prepared(
 
         rc = validate_runtime_impl(r);
         r->~Runtime();
+        if (out_timing != NULL) {
+            const auto host_t1 = std::chrono::steady_clock::now();
+            out_timing->host_wall_ns =
+                static_cast<uint64_t>(std::chrono::duration_cast<std::chrono::nanoseconds>(host_t1 - host_t0).count());
+            out_timing->device_wall_ns = runner->last_device_wall_ns();
+        }
         return rc;
     } catch (...) {
         return -1;
diff --git a/src/a5/platform/sim/host/device_runner.cpp b/src/a5/platform/sim/host/device_runner.cpp
index a6f8fdba0..59a3a366b 100644
--- a/src/a5/platform/sim/host/device_runner.cpp
+++ b/src/a5/platform/sim/host/device_runner.cpp
@@ -30,6 +30,7 @@
 #include <sys/stat.h>
 
 #include <atomic>
+#include <chrono>
 #include <cstdio>
 #include <string>
 #include <vector>
@@ -320,6 +321,15 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
         return rc;
     }
 
+    // Lazy-allocate the 8-byte device_wall buffer on first run.
+    if (device_wall_dev_ptr_ == nullptr) {
+        device_wall_dev_ptr_ = allocate_tensor(sizeof(uint64_t));
+        if (device_wall_dev_ptr_ != nullptr) {
+            kernel_args_.device_wall_data_base = reinterpret_cast<uint64_t>(device_wall_dev_ptr_);
+            *static_cast<uint64_t *>(device_wall_dev_ptr_) = 0;
+        }
+    }
+
     // Calculate execution parameters
     block_dim_ = block_dim;
     int num_aicore = block_dim * cores_per_blockdim_;
@@ -494,8 +504,17 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
     std::vector<std::thread> aicpu_threads;
     aicpu_threads.reserve(over_launch);
     std::atomic<int> aicpu_rc{0};
+
+    // Sim "device wall" capture — see a2a3 sim/onboard kernel.cpp for the
+    // shape. Writes via device_wall_data_base (sim's "device pointer" is a
+    // host malloc'd uint64).
+    if (kernel_args_.device_wall_data_base != 0) {
+        *reinterpret_cast<uint64_t *>(kernel_args_.device_wall_data_base) = 0;
+    }
+    const auto sim_t0 = std::chrono::steady_clock::now();
+
     for (int i = 0; i < over_launch; i++) {
-        aicpu_threads.push_back(create_thread([this, &runtime, launch_aicpu_num, over_launch, &aicpu_rc]() {
+        aicpu_threads.push_back(create_thread([this, &runtime, launch_aicpu_num, over_launch, &aicpu_rc, sim_t0]() {
             if (!platform_aicpu_affinity_gate(launch_aicpu_num, over_launch)) {
                 return;
             }
@@ -504,6 +523,11 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
                 int expected = 0;
                 aicpu_rc.compare_exchange_strong(expected, rc, std::memory_order_acq_rel);
             }
+            if (kernel_args_.device_wall_data_base != 0) {
+                const auto t1 = std::chrono::steady_clock::now();
+                *reinterpret_cast<uint64_t *>(kernel_args_.device_wall_data_base) =
+                    static_cast<uint64_t>(std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - sim_t0).count());
+            }
         }));
     }
 
@@ -531,6 +555,13 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
 
     LOG_INFO_V0("All threads completed");
 
+    // Snapshot device_wall buffer into device_wall_ns_ (sim: single deref,
+    // no DMA — mirrors onboard's post-sync rtMemcpy D2H).
+    device_wall_ns_ = 0;
+    if (device_wall_dev_ptr_ != nullptr) {
+        device_wall_ns_ = *static_cast<uint64_t *>(device_wall_dev_ptr_);
+    }
+
     int runtime_rc = aicpu_rc.load(std::memory_order_acquire);
     if (runtime_rc != 0) {
         LOG_ERROR("AICPU execution failed with rc=%d", runtime_rc);
@@ -859,6 +890,11 @@ int DeviceRunner::finalize() {
     mem_alloc_.finalize();
     clear_cpu_sim_shared_storage();
 
+    // Free the 8-byte device_wall buffer (allocated lazily in run()).
+    if (device_wall_dev_ptr_ != nullptr) {
+        free_tensor(device_wall_dev_ptr_);
+        device_wall_dev_ptr_ = nullptr;
+    }
     device_id_ = -1;
     worker_count_ = 0;
     last_runtime_ = nullptr;
diff --git a/src/a5/platform/sim/host/device_runner.h b/src/a5/platform/sim/host/device_runner.h
index 36e8c2016..74b0cd6fa 100644
--- a/src/a5/platform/sim/host/device_runner.h
+++ b/src/a5/platform/sim/host/device_runner.h
@@ -167,6 +167,14 @@ class DeviceRunner {
     void set_output_prefix(const char *prefix) { output_prefix_ = (prefix != nullptr) ? prefix : ""; }
     const std::string &output_prefix() const { return output_prefix_; }
 
+    /**
+     * Device-side wall (ns) from the most recently completed run, written
+     * by the platform AICPU entry (onboard: kernel.cpp; sim: lambda in
+     * run()). Returns 0 before any run completes. Independent of any
+     * profiling / swimlane subsystem.
+     */
+    uint64_t last_device_wall_ns() const { return device_wall_ns_; }
+
     /**
      * Attach the calling thread to the simulated device.
      *
@@ -254,6 +262,15 @@ class DeviceRunner {
     // Simulation state (no actual device resources)
     KernelArgs kernel_args_;
 
+    // Platform-level device wall buffer: 8-byte device-resident slot whose
+    // address rides on KernelArgs.device_wall_data_base. AICPU writes the
+    // run wall (ns) through that pointer; this DeviceRunner pulls it back
+    // via copy_from_device after stream sync and caches it for
+    // last_device_wall_ns(). Allocated once at simpler_init, freed in
+    // finalize.
+    void *device_wall_dev_ptr_{nullptr};
+    uint64_t device_wall_ns_{0};
+
     // Chip-callable buffer pool (sim path). Keyed by FNV-1a 64-bit content
     // hash; each entry owns the host scratch + dlopen handles needed for the
     // children embedded in that buffer. Bulk-freed in finalize().
diff --git a/src/a5/platform/sim/host/pto_runtime_c_api.cpp b/src/a5/platform/sim/host/pto_runtime_c_api.cpp
index 5f5099142..5f398614f 100644
--- a/src/a5/platform/sim/host/pto_runtime_c_api.cpp
+++ b/src/a5/platform/sim/host/pto_runtime_c_api.cpp
@@ -24,6 +24,7 @@
 #include <new>
 #include <pthread.h>
 
+#include <chrono>
 #include <utility>
 #include <vector>
 
@@ -270,8 +271,12 @@ int prepare_callable(DeviceContextHandle ctx, int32_t callable_id, const void *c
 int run_prepared(
     DeviceContextHandle ctx, RuntimeHandle runtime, int32_t callable_id, const void *args, int block_dim,
     int aicpu_thread_num, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, int /*enable_dep_gen*/,
-    const char *output_prefix
+    const char *output_prefix, PtoRunTiming *out_timing
 ) {
+    if (out_timing != NULL) {
+        out_timing->host_wall_ns = 0;
+        out_timing->device_wall_ns = 0;
+    }
     if (ctx == NULL || runtime == NULL) return -1;
     DeviceRunner *runner = static_cast<DeviceRunner *>(ctx);
 
@@ -283,6 +288,8 @@ int run_prepared(
     pthread_once(&g_runner_key_once, create_runner_key);
     pthread_setspecific(g_runner_key, ctx);
 
+    const auto host_t0 = std::chrono::steady_clock::now();
+
     try {
         Runtime *r = new (runtime) Runtime();
         r->host_api.device_malloc = device_malloc;
@@ -327,6 +334,12 @@ int run_prepared(
         rc = validate_runtime_impl(r);
         r->~Runtime();
         pthread_setspecific(g_runner_key, nullptr);
+        if (out_timing != NULL) {
+            const auto host_t1 = std::chrono::steady_clock::now();
+            out_timing->host_wall_ns =
+                static_cast<uint64_t>(std::chrono::duration_cast<std::chrono::nanoseconds>(host_t1 - host_t0).count());
+            out_timing->device_wall_ns = runner->last_device_wall_ns();
+        }
         return rc;
     } catch (...) {
         pthread_setspecific(g_runner_key, nullptr);
diff --git a/src/common/hierarchical/types.h b/src/common/hierarchical/types.h
index aa5121785..938ffe2d4 100644
--- a/src/common/hierarchical/types.h
+++ b/src/common/hierarchical/types.h
@@ -215,3 +215,17 @@ class ReadyQueue {
     std::condition_variable cv_;
     bool shutdown_{false};
 };
+
+// =============================================================================
+// RunTiming — wall-clock breakdown returned by ChipWorker::run
+// =============================================================================
+
+// host_wall_ns is the steady_clock delta wrapping the dispatch; device_wall_ns
+// is on-NPU wall captured by the platform AICPU entry (see KernelArgs::
+// device_wall_ns). Mirrors PtoRunTiming in src/common/worker/pto_runtime_c_api.h
+// so the value flows through unchanged from the dlsym ABI up to the Python
+// binding.
+struct RunTiming {
+    uint64_t host_wall_ns = 0;
+    uint64_t device_wall_ns = 0;
+};
diff --git a/src/common/worker/chip_worker.cpp b/src/common/worker/chip_worker.cpp
index 9b0c7ef39..a402eb5c3 100644
--- a/src/common/worker/chip_worker.cpp
+++ b/src/common/worker/chip_worker.cpp
@@ -300,12 +300,12 @@ void ChipWorker::prepare_callable(int32_t callable_id, const void *callable) {
     }
 }
 
-void ChipWorker::run(int32_t callable_id, TaskArgsView args, const CallConfig &config) {
+RunTiming ChipWorker::run(int32_t callable_id, TaskArgsView args, const CallConfig &config) {
     ChipStorageTaskArgs chip_storage = view_to_chip_storage(args);
-    run(callable_id, &chip_storage, config);
+    return run(callable_id, &chip_storage, config);
 }
 
-void ChipWorker::run(int32_t callable_id, const ChipStorageTaskArgs *args, const CallConfig &config) {
+RunTiming ChipWorker::run(int32_t callable_id, const ChipStorageTaskArgs *args, const CallConfig &config) {
     config.validate();
     if (!initialized_) {
         throw std::runtime_error("ChipWorker not initialized; call init() first");
@@ -313,13 +313,15 @@ void ChipWorker::run(int32_t callable_id, const ChipStorageTaskArgs *args, const
 
     void *rt = runtime_buf_.data();
 
+    PtoRunTiming timing{0, 0};
     int rc = run_prepared_fn_(
         device_ctx_, rt, callable_id, args, config.block_dim, config.aicpu_thread_num, config.enable_l2_swimlane,
-        config.enable_dump_tensor, config.enable_pmu, config.enable_dep_gen, config.output_prefix
+        config.enable_dump_tensor, config.enable_pmu, config.enable_dep_gen, config.output_prefix, &timing
     );
     if (rc != 0) {
         throw std::runtime_error("run_prepared failed with code " + std::to_string(rc));
     }
+    return RunTiming{timing.host_wall_ns, timing.device_wall_ns};
 }
 
 void ChipWorker::unregister_callable(int32_t callable_id) {
diff --git a/src/common/worker/chip_worker.h b/src/common/worker/chip_worker.h
index a93bca33d..79c15d4d4 100644
--- a/src/common/worker/chip_worker.h
+++ b/src/common/worker/chip_worker.h
@@ -19,6 +19,7 @@
 
 #include "../task_interface/call_config.h"
 #include "../task_interface/task_args.h"
+#include "pto_runtime_c_api.h"
 #include "types.h"
 
 class ChipWorker {
@@ -50,12 +51,14 @@ class ChipWorker {
 
     // Launch a cid previously staged via prepare_callable.
     // Materializes a ChipStorageTaskArgs from `args` (one memcpy of T*40B + S*8B
-    // into a stack POD), then delegates to the overload below.
-    void run(int32_t callable_id, TaskArgsView args, const CallConfig &config);
+    // into a stack POD), then delegates to the overload below. Returns
+    // RunTiming with host wall (steady_clock around dispatch) + device wall
+    // (KernelArgs::device_wall_ns captured by the platform AICPU entry).
+    RunTiming run(int32_t callable_id, TaskArgsView args, const CallConfig &config);
     // Same launch, but the caller already holds the runtime.so-ABI POD —
     // skip the view→storage memcpy and hand the pointer straight to the C ABI.
     // Used by the ChipStorageTaskArgs path in the nanobind binding.
-    void run(int32_t callable_id, const ChipStorageTaskArgs *args, const CallConfig &config);
+    RunTiming run(int32_t callable_id, const ChipStorageTaskArgs *args, const CallConfig &config);
 
     // Per-callable_id preparation. Requires init() first and a callable_id
     // in [0, MAX_REGISTERED_CALLABLE_IDS) (cap 64).
@@ -128,7 +131,8 @@ class ChipWorker {
     // CANN dlog sync. Reads the current log level off HostLogger itself.
     using SimplerInitFn = int (*)(void *, int, const uint8_t *, size_t, const uint8_t *, size_t);
     using PrepareCallableFn = int (*)(void *, int32_t, const void *);
-    using RunPreparedFn = int (*)(void *, void *, int32_t, const void *, int, int, int, int, int, int, const char *);
+    using RunPreparedFn =
+        int (*)(void *, void *, int32_t, const void *, int, int, int, int, int, int, const char *, PtoRunTiming *);
     using UnregisterCallableFn = int (*)(void *, int32_t);
     using GetAicpuDlopenCountFn = size_t (*)(void *);
     using FinalizeDeviceFn = int (*)(void *);
diff --git a/src/common/worker/pto_runtime_c_api.h b/src/common/worker/pto_runtime_c_api.h
index 0e2e28cb2..00debb446 100644
--- a/src/common/worker/pto_runtime_c_api.h
+++ b/src/common/worker/pto_runtime_c_api.h
@@ -48,6 +48,31 @@ extern "C" {
 typedef void *RuntimeHandle;
 typedef void *DeviceContextHandle;
 
+/**
+ * Timing breakdown for a single run_prepared() invocation.
+ *
+ *   host_wall_ns   — wall-clock around the host-side dispatch (steady_clock
+ *                    delta wrapping the platform run() call). Always populated
+ *                    when out_timing != NULL.
+ *
+ *   device_wall_ns — on-NPU wall of the most recent orchestrator phase
+ *                    (orch_summary.end_time - .start_time, converted to ns
+ *                    using arch frequency). Populated whenever the runtime
+ *                    was built with PTO2_PROFILING (the default), regardless
+ *                    of enable_l2_swimlane. The orch_summary capture is
+ *                    decoupled from the per-record swimlane buffer pool —
+ *                    only the lightweight phase header allocation is needed
+ *                    for this field. Zero only when PTO2_PROFILING was off
+ *                    at runtime build time.
+ *
+ * Both fields are zeroed by the callee on entry (including on error paths)
+ * so callers can pass an uninitialized struct.
+ */
+typedef struct PtoRunTiming {
+    uint64_t host_wall_ns;
+    uint64_t device_wall_ns;
+} PtoRunTiming;
+
 /* ===========================================================================
  * Public API (resolved by ChipWorker via dlsym)
  * =========================================================================== */
@@ -160,12 +185,16 @@ int prepare_callable(DeviceContextHandle ctx, int32_t callable_id, const void *c
  * `device_id` and the executor binaries are not threaded through this entry
  * — they were captured by `simpler_init` and live on the DeviceRunner.
  *
+ * If `out_timing` is non-NULL, the callee writes the wall-clock breakdown for
+ * this invocation into it (see PtoRunTiming above). The struct is zeroed on
+ * entry and partially populated on early-error returns.
+ *
  * @return 0 on success, negative on error (no prep state, NULL ctx, etc.).
  */
 int run_prepared(
     DeviceContextHandle ctx, RuntimeHandle runtime, int32_t callable_id, const void *args, int block_dim,
     int aicpu_thread_num, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, int enable_dep_gen,
-    const char *output_prefix
+    const char *output_prefix, PtoRunTiming *out_timing
 );
 
 /**
diff --git a/tests/ut/py/test_run_timing.py b/tests/ut/py/test_run_timing.py
new file mode 100644
index 000000000..6b8aaed12
--- /dev/null
+++ b/tests/ut/py/test_run_timing.py
@@ -0,0 +1,122 @@
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Tests for the RunTiming binding returned by Worker.run / run_prepared.
+
+Pure-Python unit tests — no device, no kernels. Verifies that the nanobind
+bound class behaves as documented at the boundary so callers (benchmark
+scripts, perf dashboards) can rely on the contract without spinning up a
+worker.
+"""
+
+import inspect
+
+import pytest
+from _task_interface import RunTiming  # pyright: ignore[reportMissingImports]
+
+
+class TestRunTimingConstruction:
+    def test_default_constructor_zeros(self):
+        t = RunTiming()
+        assert t.host_wall_ns == 0
+        assert t.device_wall_ns == 0
+        assert t.host_wall_us == 0.0
+        assert t.device_wall_us == 0.0
+
+    def test_explicit_values(self):
+        t = RunTiming(1500, 2500)
+        assert t.host_wall_ns == 1500
+        assert t.device_wall_ns == 2500
+
+    def test_device_wall_defaults_to_zero(self):
+        # The (host_ns, device_ns=0) constructor is the path Worker.run uses
+        # for L3+ DAGs, where per-task device cycles aren't aggregated yet.
+        t = RunTiming(9999)
+        assert t.host_wall_ns == 9999
+        assert t.device_wall_ns == 0
+
+
+class TestRunTimingUnitConversion:
+    def test_ns_to_us_host(self):
+        assert RunTiming(1_000, 0).host_wall_us == pytest.approx(1.0)
+        assert RunTiming(1_500_000, 0).host_wall_us == pytest.approx(1500.0)
+
+    def test_ns_to_us_device(self):
+        assert RunTiming(0, 1_000).device_wall_us == pytest.approx(1.0)
+        assert RunTiming(0, 7_250).device_wall_us == pytest.approx(7.25)
+
+    def test_us_returns_float(self):
+        t = RunTiming(1000, 2000)
+        assert isinstance(t.host_wall_us, float)
+        assert isinstance(t.device_wall_us, float)
+
+    def test_ns_returns_int(self):
+        t = RunTiming(1000, 2000)
+        assert isinstance(t.host_wall_ns, int)
+        assert isinstance(t.device_wall_ns, int)
+
+
+class TestRunTimingRepr:
+    def test_repr_includes_both_walls(self):
+        r = repr(RunTiming(1500, 2500))
+        assert "RunTiming" in r
+        assert "host_wall_us=1.5" in r
+        assert "device_wall_us=2.5" in r
+
+    def test_repr_zero(self):
+        r = repr(RunTiming())
+        assert "host_wall_us=0" in r
+        assert "device_wall_us=0" in r
+
+
+class TestRunTimingExports:
+    """Guard against the symbol getting dropped from the public surface — it's
+    re-exported from simpler.worker so end-user code can write
+    `from simpler.worker import RunTiming`.
+    """
+
+    def test_exported_from_task_interface_module(self):
+        import _task_interface  # noqa: PLC0415
+
+        assert hasattr(_task_interface, "RunTiming")
+        assert _task_interface.RunTiming is RunTiming
+
+    def test_exported_from_simpler_worker(self):
+        from simpler.worker import RunTiming as WorkerRunTiming  # noqa: PLC0415
+
+        assert WorkerRunTiming is RunTiming
+
+
+class TestWorkerRunSignature:
+    """Worker.run signature contract: documented to return a RunTiming.
+    We can't actually drive a worker without a device, but we can inspect the
+    wrapper signature + docstring to catch accidental drift (e.g. someone
+    reverting the return to None).
+    """
+
+    def test_worker_run_return_annotation(self):
+        from simpler.worker import Worker  # noqa: PLC0415
+
+        sig = inspect.signature(Worker.run)
+        # The annotation is the RunTiming class itself (PEP 484 style).
+        assert sig.return_annotation is RunTiming, (
+            f"Worker.run lost its RunTiming return annotation; got {sig.return_annotation!r}"
+        )
+
+    def test_chip_worker_wrapper_returns_run_timing(self):
+        # The simpler.task_interface.ChipWorker wrapper is what
+        # Worker._chip_worker uses internally; it must forward the return
+        # rather than swallowing it.
+        from simpler.task_interface import ChipWorker  # noqa: PLC0415  # pyright: ignore[reportAttributeAccessIssue]
+
+        src = inspect.getsource(ChipWorker.run)
+        # Surgical drift guard: the wrapper must `return self._impl.run(...)`
+        # rather than calling it for side effects only. If someone reverts to
+        # the side-effect-only form, RunTiming would silently become None for
+        # every caller without anything else breaking.
+        assert "return self._impl.run" in src
diff --git a/tools/benchmark_rounds.sh b/tools/benchmark_rounds.sh
index 84fc7486e..dc29ef329 100755
--- a/tools/benchmark_rounds.sh
+++ b/tools/benchmark_rounds.sh
@@ -7,8 +7,10 @@
 # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 # See LICENSE in the root of the software repository for the full text of the License.
 # -----------------------------------------------------------------------------------------------------------
-# Benchmark wrapper: run examples on hardware,
-# then parse device-log timing lines to report per-round latency.
+# Benchmark wrapper: run examples on hardware, then parse per-round latency
+# emitted by the test framework directly (see simpler_setup/scene_test.py::
+# _log_round_timings). Worker.run now returns a RunTiming struct so we no
+# longer scrape device logs for sched/orch timestamps.
 #
 # Usage:
 #   ./tools/benchmark_rounds.sh [-p <platform>] [-d <device>] [-n <rounds>] [-r <runtime>]
@@ -22,11 +24,8 @@ PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
 
 # ---------------------------------------------------------------------------
 # Examples to benchmark and their case lists, per runtime.
-# Key   = directory name under tests/st/<platform>/<runtime>/
-# Value = comma-separated case names to run (empty string = run DEFAULT_CASE)
 # ---------------------------------------------------------------------------
 
-# --- tensormap_and_ringbuffer ---
 declare -A TMR_EXAMPLE_CASES=(
     [alternating_matmul_add]="Case1"
     [benchmark_bgemm]="Case0"
@@ -44,9 +43,6 @@ TMR_EXAMPLE_ORDER=(
     spmd_paged_attention
 )
 
-# ---------------------------------------------------------------------------
-# Parse arguments
-# ---------------------------------------------------------------------------
 DEVICE_ID=0
 ROUNDS=100
 PLATFORM=a2a3
@@ -56,62 +52,33 @@ EXTRA_ARGS=()
 
 while [[ $# -gt 0 ]]; do
     case "$1" in
-        -p|--platform)
-            PLATFORM="$2"
-            shift 2
-            ;;
-        -d|--device)
-            DEVICE_ID="$2"
-            shift 2
-            ;;
-        -n|--rounds)
-            ROUNDS="$2"
-            shift 2
-            ;;
-        -r|--runtime)
-            RUNTIME="$2"
-            shift 2
-            ;;
-        -v|--verbose)
-            VERBOSE=1
-            shift
-            ;;
+        -p|--platform) PLATFORM="$2"; shift 2 ;;
+        -d|--device)   DEVICE_ID="$2"; shift 2 ;;
+        -n|--rounds)   ROUNDS="$2"; shift 2 ;;
+        -r|--runtime)  RUNTIME="$2"; shift 2 ;;
+        -v|--verbose)  VERBOSE=1; shift ;;
         --help|-h)
             cat <<'USAGE'
-benchmark_rounds.sh — run all examples and report per-round timing from device logs
-
-Usage:
-  ./tools/benchmark_rounds.sh [-p <platform>] [-d <device>] [-n <rounds>] [-r <runtime>] [-v]
+benchmark_rounds.sh — run examples and report per-round host/device wall
+emitted directly by Worker.run().
 
 Options:
-  -p, --platform Platform to run on (default: a2a3)
-  -d, --device   Device ID (default: 0)
-  -n, --rounds   Override number of rounds for each example (default: 100)
-  -r, --runtime  Runtime to benchmark: tensormap_and_ringbuffer (default)
-  -v, --verbose  Save detailed test_*.py output to a timestamped log file
-  -h, --help     Show this help
-
-All other options are passed through to the underlying `python test_*.py`
-invocation (e.g. --case).
-
-Edit the EXAMPLE_CASES map at the top of this script to control which
-examples and cases to benchmark.
-
-Output:
-  Average elapsed time in microseconds for each example.
+  -p, --platform  Platform (default: a2a3)
+  -d, --device    Device ID (default: 0)
+  -n, --rounds    Rounds per example (default: 100)
+  -r, --runtime   Runtime (default: tensormap_and_ringbuffer)
+  -v, --verbose   Save test output to a timestamped log
+  -h, --help      Show this help
+
+Device-wall numbers are zero only when the runtime is built without
+PTO2_PROFILING. The default build has it on, so the device column is
+populated for every round regardless of --enable-l2-swimlane.
 USAGE
-            exit 0
-            ;;
-        *)
-            EXTRA_ARGS+=("$1")
-            shift
-            ;;
+            exit 0 ;;
+        *) EXTRA_ARGS+=("$1"); shift ;;
     esac
 done
 
-# ---------------------------------------------------------------------------
-# Verbose logging setup
-# ---------------------------------------------------------------------------
 VERBOSE_LOG=""
 if [[ $VERBOSE -eq 1 ]]; then
     mkdir -p "$PROJECT_ROOT/outputs"
@@ -119,357 +86,95 @@ if [[ $VERBOSE -eq 1 ]]; then
     echo "Verbose log: $VERBOSE_LOG"
 fi
 
-vlog() {
-    if [[ -n "$VERBOSE_LOG" ]]; then
-        echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >> "$VERBOSE_LOG"
-    fi
-}
-
-# ---------------------------------------------------------------------------
-# Derive arch from platform and set examples directories
-# ---------------------------------------------------------------------------
-# Search both examples/ (migrated tests) and tests/st/ (legacy tests)
-ARCH="${PLATFORM%%sim}"  # strip "sim" suffix if present
+ARCH="${PLATFORM%%sim}"
 EXAMPLES_DIRS=(
     "$PROJECT_ROOT/tests/st/${ARCH}/${RUNTIME}"
     "$PROJECT_ROOT/examples/${ARCH}/${RUNTIME}"
 )
 
-# Clock frequency (MHz) for converting cycle counts to microseconds
-case "$PLATFORM" in
-    a2a3) FREQ=50 ;;
-    a5)   FREQ=1000 ;;
-    *)    echo "ERROR: unsupported platform '$PLATFORM'. Use a2a3 or a5."; exit 1 ;;
-esac
-
-# Select example cases and order based on runtime
 case "$RUNTIME" in
     tensormap_and_ringbuffer)
         declare -n EXAMPLE_CASES=TMR_EXAMPLE_CASES
         EXAMPLE_ORDER=("${TMR_EXAMPLE_ORDER[@]}")
         ;;
     *)
-        echo "ERROR: unknown runtime '$RUNTIME'. Use tensormap_and_ringbuffer."
-        exit 1
-        ;;
+        echo "ERROR: unknown runtime '$RUNTIME'." >&2
+        exit 1 ;;
 esac
 
 # ---------------------------------------------------------------------------
-# Resolve CANN device log directory: $ASCEND_WORK_PATH/log/debug or ~/ascend/log/debug
+# Parse "Avg Host: X.X us  |  Avg Device: Y.Y us  (N rounds)" emitted by
+# _log_round_timings. Returns "host_avg device_avg" (device may be "-").
 # ---------------------------------------------------------------------------
-if [[ -n "${ASCEND_WORK_PATH:-}" ]]; then
-    LOG_ROOT="$ASCEND_WORK_PATH/log/debug"
-    if [[ ! -d "$LOG_ROOT" ]]; then
-        LOG_ROOT="$HOME/ascend/log/debug"
-    fi
-else
-    LOG_ROOT="$HOME/ascend/log/debug"
-fi
-DEVICE_LOG_DIR="$LOG_ROOT/device-${DEVICE_ID}"
-
-# ---------------------------------------------------------------------------
-# parse_timing <log_file>
-#   Grep for orch_start / end lines, compute per-round elapsed, print summary.
-# ---------------------------------------------------------------------------
-parse_timing() {
-    local log_file="$1"
-
-    local timing
-    timing=$(grep -E 'Thread [0-9]+: (sched_start|orch_start|orch_end|sched_end|orch_stage_end)' "$log_file" || true)
-
-    if [[ -z "$timing" ]]; then
-        echo "  (no benchmark timing data — was PTO2_PROFILING enabled?)"
-        return 1
-    fi
-
-    echo "$timing" | awk -v freq="$FREQ" '
-    function new_round() {
-        flush_round()
-        round++
-        min_start = 0; max_end = 0
-        min_sched_start = 0; max_sched_end = 0
-        min_orch_start = 0; max_orch_end = 0
-        delete sched_seen
-        delete orch_seen
-    }
-    function flush_round() {
-        if (round >= 0 && max_end > 0 && min_start > 0) {
-            results[round] = (max_end - min_start) / freq
-            if (max_sched_end > 0 && min_sched_start > 0)
-                sched_results[round] = (max_sched_end - min_sched_start) / freq
-            if (max_orch_end > 0 && min_orch_start > 0)
-                orch_results[round] = (max_orch_end - min_orch_start) / freq
-            count++
-        }
-    }
-    BEGIN {
-        round = 0; count = 0
-        min_start = 0; max_end = 0
-        min_sched_start = 0; max_sched_end = 0
-        min_orch_start = 0; max_orch_end = 0
-        has_sched = 0; has_orch_end = 0
-    }
-    /sched_start=/ {
-        match($0, /Thread ([0-9]+):/, tm)
-        tid = tm[1] + 0
-        if (tid in sched_seen) new_round()
-        sched_seen[tid] = 1
-        has_sched = 1
-        match($0, /sched_start=([0-9]+)/, m)
-        val = m[1] + 0
-        if (min_sched_start == 0 || val < min_sched_start) min_sched_start = val
-        if (min_start == 0 || val < min_start) min_start = val
-    }
-    /orch_start=/ {
-        match($0, /Thread ([0-9]+):/, tm)
-        tid = tm[1] + 0
-        if (tid in orch_seen) new_round()
-        orch_seen[tid] = 1
-        match($0, /orch_start=([0-9]+)/, m)
-        val = m[1] + 0
-        if (min_orch_start == 0 || val < min_orch_start) min_orch_start = val
-        if (min_start == 0 || val < min_start) min_start = val
-    }
-    /sched_end[^=]*=/ {
-        match($0, /sched_end[^=]*=([0-9]+)/, m)
-        val = m[1] + 0
-        if (val > max_sched_end) max_sched_end = val
-        if (val > max_end) max_end = val
-    }
-    /orch_end=/ {
-        match($0, /orch_end=([0-9]+)/, m)
-        val = m[1] + 0
-        has_orch_end = 1
-        if (val > max_orch_end) max_orch_end = val
-        if (val > max_end) max_end = val
-    }
-    /orch_stage_end=/ {
-        match($0, /orch_stage_end=([0-9]+)/, m)
-        val = m[1] + 0
-        if (val > max_end) max_end = val
-    }
-    END {
-        flush_round()
-        if (count == 0) { print "  (no rounds parsed)"; exit 1 }
-
-        show_sched = has_sched
-        show_orch = has_orch_end
-
-        # Header
-        hdr = sprintf("  %-8s  %12s", "Round", "Elapsed (us)")
-        sep = sprintf("  %-8s  %12s", "-----", "------------")
-        if (show_sched) { hdr = hdr sprintf("  %12s", "Sched (us)"); sep = sep sprintf("  %12s", "----------") }
-        if (show_orch)  { hdr = hdr sprintf("  %12s", "Orch (us)");  sep = sep sprintf("  %12s", "---------")  }
-        print hdr; print sep
-
-        sum_v = 0; min_v = results[0]; max_v = results[0]
-        sum_s = 0; min_s = sched_results[0]; max_s = sched_results[0]
-        sum_o = 0; min_o = orch_results[0]; max_o = orch_results[0]
-
-        for (i = 0; i < count; i++) {
-            line = sprintf("  %-8d  %12.1f", i, results[i])
-            sum_v += results[i]
-            if (results[i] < min_v) min_v = results[i]
-            if (results[i] > max_v) max_v = results[i]
-            if (show_sched) {
-                line = line sprintf("  %12.1f", sched_results[i])
-                sum_s += sched_results[i]
-                if (sched_results[i] < min_s) min_s = sched_results[i]
-                if (sched_results[i] > max_s) max_s = sched_results[i]
-            }
-            if (show_orch) {
-                line = line sprintf("  %12.1f", orch_results[i])
-                sum_o += orch_results[i]
-                if (orch_results[i] < min_o) min_o = orch_results[i]
-                if (orch_results[i] > max_o) max_o = orch_results[i]
-            }
-            print line
-        }
-
-        printf "\n  Avg: %.1f us", sum_v / count
-        if (show_sched) printf "  |  Sched Avg: %.1f us", sum_s / count
-        if (show_orch)  printf "  |  Orch Avg: %.1f us", sum_o / count
-        printf "  (%d rounds)\n", count
-
-        TRIM = 10
-        if (count > 2 * TRIM) {
-            # Insertion sort for each metric
-            for (i = 0; i < count; i++) sv[i] = results[i]
-            for (i = 1; i < count; i++) {
-                k = sv[i]; j = i - 1
-                while (j >= 0 && sv[j] > k) { sv[j+1] = sv[j]; j-- }
-                sv[j+1] = k
-            }
-            tc = count - 2 * TRIM; ts = 0
-            for (i = TRIM; i < count - TRIM; i++) ts += sv[i]
-            printf "  Trimmed Avg: %.1f us  (dropped %d low + %d high, %d rounds used)\n", ts / tc, TRIM, TRIM, tc
-
-            if (show_sched) {
-                for (i = 0; i < count; i++) ss[i] = sched_results[i]
-                for (i = 1; i < count; i++) {
-                    k = ss[i]; j = i - 1
-                    while (j >= 0 && ss[j] > k) { ss[j+1] = ss[j]; j-- }
-                    ss[j+1] = k
-                }
-                ts2 = 0
-                for (i = TRIM; i < count - TRIM; i++) ts2 += ss[i]
-                printf "  Sched Trimmed Avg: %.1f us  (dropped %d low + %d high)\n", ts2 / tc, TRIM, TRIM
-            }
-            if (show_orch) {
-                for (i = 0; i < count; i++) so[i] = orch_results[i]
-                for (i = 1; i < count; i++) {
-                    k = so[i]; j = i - 1
-                    while (j >= 0 && so[j] > k) { so[j+1] = so[j]; j-- }
-                    so[j+1] = k
-                }
-                ts3 = 0
-                for (i = TRIM; i < count - TRIM; i++) ts3 += so[i]
-                printf "  Orch Trimmed Avg: %.1f us  (dropped %d low + %d high)\n", ts3 / tc, TRIM, TRIM
-            }
-        }
-    }'
-}
-
-# ---------------------------------------------------------------------------
-# wait_for_new_log <pre_run_logs_file>
-#   Wait up to 15s for a new .log file in DEVICE_LOG_DIR. Prints the path.
-# ---------------------------------------------------------------------------
-wait_for_new_log() {
-    local pre_file="$1"
-    local new_log=""
-    local deadline=$((SECONDS + 15))
-
-    while [[ $SECONDS -lt $deadline ]]; do
-        if [[ -d "$DEVICE_LOG_DIR" ]]; then
-            new_log=$(comm -13 "$pre_file" <(ls -1 "$DEVICE_LOG_DIR"/*.log 2>/dev/null | sort) 2>/dev/null | tail -1 || true)
-            if [[ -n "$new_log" ]]; then
-                echo "$new_log"
-                return 0
-            fi
-        fi
-        sleep 0.5
-    done
-
-    # Fallback: newest log
-    if [[ -d "$DEVICE_LOG_DIR" ]]; then
-        new_log=$(ls -t "$DEVICE_LOG_DIR"/*.log 2>/dev/null | head -1 || true)
-        if [[ -n "$new_log" ]]; then
-            echo "$new_log"
-            return 0
-        fi
-    fi
-    return 1
+parse_avgs() {
+    local output="$1"
+    local host_avg device_avg
+    host_avg=$(echo "$output" | grep -oE 'Avg Host: [0-9.]+' | tail -1 | awk '{print $3}')
+    device_avg=$(echo "$output" | grep -oE 'Avg Device: [0-9.]+' | tail -1 | awk '{print $3}')
+    [[ -z "$host_avg"   ]] && host_avg="-"
+    [[ -z "$device_avg" ]] && device_avg="-"
+    echo "$host_avg $device_avg"
 }
 
-# ---------------------------------------------------------------------------
-# run_bench <example> <example_dir> [case_name]
-#   Run one benchmark invocation (via `python test_*.py`) and parse timing
-#   from the resulting log. Skips the example if it has no test_*.py.
-#   Sets global PASS / FAIL counters.
-# ---------------------------------------------------------------------------
 run_bench() {
     local example="$1" example_dir="$2" case_name="${3:-}"
+    [[ -n "$case_name" ]] && echo "  ---- $case_name ----"
 
-    if [[ -n "$case_name" ]]; then
-        echo "  ---- $case_name ----"
-    fi
-
-    # Snapshot existing logs
-    local pre_log_file
-    pre_log_file=$(mktemp)
-    trap 'rm -f -- "$pre_log_file"' RETURN
-    ls -1 "$DEVICE_LOG_DIR"/*.log 2>/dev/null | sort > "$pre_log_file" || true
-
-    # Build run command using test_*.py
     local test_file
     test_file=$(find "$example_dir" -maxdepth 1 -name 'test_*.py' -print -quit 2>/dev/null || true)
-
-    local run_cmd
-    if [[ -n "$test_file" ]]; then
-        run_cmd=(
-            python3 "$test_file"
-            --platform "$PLATFORM" --device "$DEVICE_ID"
-            --rounds "$ROUNDS" --skip-golden
-        )
-    else
+    if [[ -z "$test_file" ]]; then
         echo "  SKIPPED: no test_*.py found in $example_dir"
         return
     fi
+
+    local run_cmd=(
+        python3 "$test_file"
+        --platform "$PLATFORM" --device "$DEVICE_ID"
+        --rounds "$ROUNDS" --skip-golden
+    )
     if [[ -n "$case_name" ]]; then
-        run_cmd+=(--case "$case_name")
-        [[ -n "$test_file" ]] && run_cmd+=(--manual include)
+        run_cmd+=(--case "$case_name" --manual include)
     fi
     run_cmd+=("${EXTRA_ARGS[@]}")
 
-    # Run example
-    vlog "Running: ${run_cmd[*]}"
-    local rc=0
+    local output rc=0
+    output=$("${run_cmd[@]}" 2>&1) || rc=$?
     if [[ -n "$VERBOSE_LOG" ]]; then
-        local run_output
-        run_output=$("${run_cmd[@]}" 2>&1) || rc=$?
-        if [[ -n "$run_output" ]]; then echo "$run_output" >> "$VERBOSE_LOG"; fi
-    else
-        "${run_cmd[@]}" > /dev/null 2>&1 || rc=$?
+        echo "==== $example ${case_name} ====" >> "$VERBOSE_LOG"
+        echo "$output" >> "$VERBOSE_LOG"
     fi
     if [[ $rc -ne 0 ]]; then
-        echo "  FAILED: benchmark run returned non-zero"
-        vlog "FAILED: exit code $rc"
+        echo "  FAILED: exit code $rc"
         ((FAIL++)) || true
         return
     fi
 
-    # Find new device log
-    local new_log
-    new_log=$(wait_for_new_log "$pre_log_file")
+    local avgs host_avg device_avg
+    avgs=$(parse_avgs "$output")
+    host_avg=$(echo "$avgs" | awk '{print $1}')
+    device_avg=$(echo "$avgs" | awk '{print $2}')
 
-    if [[ -z "$new_log" ]]; then
-        echo "  FAILED: no device log found in $DEVICE_LOG_DIR"
+    if [[ "$host_avg" == "-" ]]; then
+        echo "  (no per-round timing — was --rounds > 1?)"
         ((FAIL++)) || true
         return
     fi
 
-    echo "  Log: $new_log"
-    local timing_output
-    local parse_rc=0
-    timing_output=$(parse_timing "$new_log") || parse_rc=$?
-    echo "$timing_output"
-
-    if [[ $parse_rc -ne 0 ]]; then
-        ((FAIL++)) || true
-        return
-    fi
+    echo "  Avg Host: $host_avg us  |  Avg Device: $device_avg us  ($ROUNDS rounds)"
     ((PASS++)) || true
 
-    # Extract averages for summary table
     local label="$example"
     [[ -n "$case_name" ]] && label="$example ($case_name)"
-
-    local avg_line
-    avg_line=$(echo "$timing_output" | grep "^  Avg:" || true)
-    local avg_elapsed="-" avg_sched="-" avg_orch="-"
-    if [[ -n "$avg_line" ]]; then
-        avg_elapsed=$(echo "$avg_line" | awk '{print $2}')
-        avg_sched=$(echo "$avg_line" | grep -o 'Sched Avg: [0-9.]*' | awk '{print $3}') || avg_sched="-"
-        avg_orch=$(echo "$avg_line" | grep -o 'Orch Avg: [0-9.]*' | awk '{print $3}') || avg_orch="-"
-    fi
-
     SUMMARY_NAMES+=("$label")
-    SUMMARY_ELAPSED+=("$avg_elapsed")
-    SUMMARY_SCHED+=("$avg_sched")
-    SUMMARY_ORCH+=("$avg_orch")
+    SUMMARY_HOST+=("$host_avg")
+    SUMMARY_DEVICE+=("$device_avg")
 }
 
-# ---------------------------------------------------------------------------
-# Main loop
-# ---------------------------------------------------------------------------
 PASS=0
 FAIL=0
-
-# Summary collection arrays
 SUMMARY_NAMES=()
-SUMMARY_ELAPSED=()
-SUMMARY_SCHED=()
-SUMMARY_ORCH=()
+SUMMARY_HOST=()
+SUMMARY_DEVICE=()
 
 echo ""
 echo "Runtime: $RUNTIME"
@@ -477,25 +182,13 @@ echo "Runtime: $RUNTIME"
 for example in "${EXAMPLE_ORDER[@]}"; do
     case_list="${EXAMPLE_CASES[$example]:-}"
 
-    # Search for example: prefer test_*.py (new style), fall back to golden.py (legacy).
-    # tests/st/ is searched before examples/ since benchmarks use production-scale cases.
     EXAMPLE_DIR=""
     for dir in "${EXAMPLES_DIRS[@]}"; do
         candidate="$dir/$example"
         if [[ -d "$candidate" ]] && ls "$candidate"/test_*.py 1>/dev/null 2>&1; then
-            EXAMPLE_DIR="$candidate"
-            break
+            EXAMPLE_DIR="$candidate"; break
         fi
     done
-    if [[ -z "$EXAMPLE_DIR" ]]; then
-        for dir in "${EXAMPLES_DIRS[@]}"; do
-            candidate="$dir/$example"
-            if [[ -f "$candidate/golden.py" && -d "$candidate/kernels" ]]; then
-                EXAMPLE_DIR="$candidate"
-                break
-            fi
-        done
-    fi
 
     echo ""
     echo "================================================================"
@@ -503,7 +196,7 @@ for example in "${EXAMPLE_ORDER[@]}"; do
     echo "================================================================"
 
     if [[ -z "$EXAMPLE_DIR" ]]; then
-        echo "  SKIP: not found in any search directory"
+        echo "  SKIP: not found"
         ((FAIL++)) || true
         continue
     fi
@@ -518,54 +211,19 @@ for example in "${EXAMPLE_ORDER[@]}"; do
     fi
 done
 
-# ---------------------------------------------------------------------------
-# Performance Summary Table
-# ---------------------------------------------------------------------------
 if [[ ${#SUMMARY_NAMES[@]} -gt 0 ]]; then
-    # Check if any sched/orch data exists across all runs
-    _has_sched=0
-    _has_orch=0
-    for _i in "${!SUMMARY_NAMES[@]}"; do
-        [[ "${SUMMARY_SCHED[$_i]}" != "-" ]] && _has_sched=1
-        [[ "${SUMMARY_ORCH[$_i]}" != "-" ]] && _has_orch=1
-    done
-
     echo ""
     echo "================================================================"
     echo "  Performance Summary ($RUNTIME)"
     echo "================================================================"
     echo ""
-
-    # Header
-    _hdr=$(printf "  %-40s  %12s" "Example" "Elapsed (us)")
-    _sep=$(printf "  %-40s  %12s" "----------------------------------------" "------------")
-    if [[ $_has_sched -eq 1 ]]; then
-        _hdr=$(printf "%s  %12s" "$_hdr" "Sched (us)")
-        _sep=$(printf "%s  %12s" "$_sep" "------------")
-    fi
-    if [[ $_has_orch -eq 1 ]]; then
-        _hdr=$(printf "%s  %12s" "$_hdr" "Orch (us)")
-        _sep=$(printf "%s  %12s" "$_sep" "------------")
-    fi
-    echo "$_hdr"
-    echo "$_sep"
-
-    # Rows
-    for _i in "${!SUMMARY_NAMES[@]}"; do
-        _row=$(printf "  %-40s  %12s" "${SUMMARY_NAMES[$_i]}" "${SUMMARY_ELAPSED[$_i]}")
-        if [[ $_has_sched -eq 1 ]]; then
-            _row=$(printf "%s  %12s" "$_row" "${SUMMARY_SCHED[$_i]}")
-        fi
-        if [[ $_has_orch -eq 1 ]]; then
-            _row=$(printf "%s  %12s" "$_row" "${SUMMARY_ORCH[$_i]}")
-        fi
-        echo "$_row"
+    printf "  %-40s  %12s  %12s\n" "Example" "Host (us)" "Device (us)"
+    printf "  %-40s  %12s  %12s\n" "----------------------------------------" "------------" "------------"
+    for i in "${!SUMMARY_NAMES[@]}"; do
+        printf "  %-40s  %12s  %12s\n" "${SUMMARY_NAMES[$i]}" "${SUMMARY_HOST[$i]}" "${SUMMARY_DEVICE[$i]}"
     done
 fi
 
-# ---------------------------------------------------------------------------
-# Summary
-# ---------------------------------------------------------------------------
 TOTAL=$((PASS + FAIL))
 echo ""
 echo "================================================================"