diff --git a/examples/workers/l2/vector_add/main.py b/examples/workers/l2/vector_add/main.py index 6ad1480ad..6db910b84 100644 --- a/examples/workers/l2/vector_add/main.py +++ b/examples/workers/l2/vector_add/main.py @@ -127,7 +127,7 @@ def build_chip_callable(platform: str) -> ChipCallable: ) -def _run(worker: Worker, chip_cid: int) -> None: +def _run(worker: Worker, chip_cid: int): """Allocate device memory, copy inputs, execute, copy outputs back, verify.""" # --- 1. Prepare host arrays --- torch.manual_seed(42) @@ -155,7 +155,8 @@ def _run(worker: Worker, chip_cid: int) -> None: # --- 4. Run. CallConfig() defaults are fine for this kernel. --- config = CallConfig() print("[vector_add] running on device...") - worker.run(chip_cid, args, config) + timing = worker.run(chip_cid, args, config) + print(f"[vector_add] {timing}") # --- 5. D2H copy back + verify --- worker.copy_from(host_out.data_ptr(), dev_out, NBYTES) @@ -169,6 +170,7 @@ def _run(worker: Worker, chip_cid: int) -> None: print(f"[vector_add] max |host_out - expected| = {max_diff:.3e}") assert torch.allclose(host_out, expected, rtol=1e-5, atol=1e-5) print("[vector_add] golden check PASSED") + return timing def run(platform: str, device_id: int) -> int: diff --git a/examples/workers/l2/vector_add/test_run_timing.py b/examples/workers/l2/vector_add/test_run_timing.py new file mode 100644 index 000000000..a3944c087 --- /dev/null +++ b/examples/workers/l2/vector_add/test_run_timing.py @@ -0,0 +1,115 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""Hardware ST asserting Worker.run returns a usable RunTiming. + +Reuses the vector_add example's kernel + ChipCallable build so this test +doesn't drag in its own kernel sources. The contract being verified: + + * `worker.run(...)` returns a `RunTiming` instance (not None). + * `host_wall_us` is strictly positive — there's no way a real dispatch + took zero steady-clock time, so this would catch a regression where + the C ABI stopped writing the timing back to the out-param. + * `device_wall_us` is strictly positive on the default build, even + without `--enable-l2-swimlane`. The orch_summary write was decoupled + from the swimlane shared-region path so device timing is always + available when PTO2_PROFILING is compiled in (default). + * `host_wall_us > device_wall_us` — host wall must wrap the device + dispatch, so the host clock should always read longer. +""" + +import pytest +from _task_interface import RunTiming # pyright: ignore[reportMissingImports] +from simpler.task_interface import CallConfig, ChipStorageTaskArgs, ContinuousTensor, DataType +from simpler.worker import Worker + +from .main import N_COLS, N_ELEMS, N_ROWS, NBYTES, build_chip_callable + + +def _drive_one_run(platform: str, device_id: int, *, enable_l2_swimlane: bool = False): + import torch # noqa: PLC0415 + + worker = Worker( + level=2, + platform=platform, + runtime="tensormap_and_ringbuffer", + device_id=device_id, + ) + chip_callable = build_chip_callable(platform) + chip_cid = worker.register(chip_callable) + worker.init() + try: + # Use deterministic inputs so the run never accidentally hits a + # degenerate kernel path that fails to publish a perf record. + host_a = torch.full((N_ROWS, N_COLS), 1.0, dtype=torch.float32) + host_b = torch.full((N_ROWS, N_COLS), 2.0, dtype=torch.float32) + + dev_a = worker.malloc(NBYTES) + dev_b = worker.malloc(NBYTES) + dev_out = worker.malloc(NBYTES) + worker.copy_to(dev_a, host_a.data_ptr(), NBYTES) + worker.copy_to(dev_b, host_b.data_ptr(), NBYTES) + + args = ChipStorageTaskArgs() + args.add_tensor(ContinuousTensor.make(dev_a, (N_ROWS, N_COLS), DataType.FLOAT32)) + args.add_tensor(ContinuousTensor.make(dev_b, (N_ROWS, N_COLS), DataType.FLOAT32)) + args.add_tensor(ContinuousTensor.make(dev_out, (N_ROWS, N_COLS), DataType.FLOAT32)) + + config = CallConfig() + config.enable_l2_swimlane = enable_l2_swimlane + + timing = worker.run(chip_cid, args, config) + + # Verify the output is sane (so we know the kernel actually ran and + # the timing isn't from an early-error path). + host_out = torch.zeros(N_ROWS, N_COLS, dtype=torch.float32) + worker.copy_from(host_out.data_ptr(), dev_out, NBYTES) + worker.free(dev_a) + worker.free(dev_b) + worker.free(dev_out) + assert torch.allclose(host_out, host_a + host_b, rtol=1e-5, atol=1e-5), ( + f"vector_add kernel output diverged; max |a+b - out| = " + f"{float(torch.max(torch.abs(host_out - (host_a + host_b)))):.3e} " + f"(N_ELEMS={N_ELEMS})" + ) + return timing + finally: + worker.close() + + +@pytest.mark.platforms(["a2a3sim", "a2a3"]) +@pytest.mark.runtime("tensormap_and_ringbuffer") +@pytest.mark.device_count(1) +def test_worker_run_returns_run_timing(st_platform, st_device_ids): + timing = _drive_one_run(st_platform, int(st_device_ids[0])) + + assert isinstance(timing, RunTiming), ( + f"Worker.run must return a RunTiming (got {type(timing).__name__}); " + f"a None return means the ChipWorker Python wrapper dropped the C++ return value." + ) + # Host wall is measured with steady_clock around the dispatch; even on + # sim it covers thread + IPC overhead so the only way to see 0 is a + # bug in the out-param plumbing. + assert timing.host_wall_us > 0.0, f"host_wall_us must be > 0, got {timing.host_wall_us}" + assert timing.host_wall_ns > 0, f"host_wall_ns must be > 0, got {timing.host_wall_ns}" + # device_wall must also be > 0 without --enable-l2-swimlane after the + # Phase B decoupling: orch_summary is written unconditionally when + # PTO2_PROFILING is on (default build). Hitting 0 here means either: + # - the AICPU's l2_perf_aicpu_write_orch_summary path regressed back + # under an is_l2_swimlane_enabled() gate, or + # - the host stopped reading the phase header after the run. + assert timing.device_wall_us > 0.0, ( + f"device_wall_us must be > 0 on the default PTO2_PROFILING build, got {timing.device_wall_us}; " + f"regression in the orch_summary capture path." + ) + assert timing.device_wall_ns > 0 + # Host wall must wrap the device dispatch — anything else points at + # a steady-clock / cycle-conversion bug. + assert timing.host_wall_us >= timing.device_wall_us, ( + f"host_wall_us ({timing.host_wall_us}) must wrap device_wall_us ({timing.device_wall_us})" + ) diff --git a/python/bindings/task_interface.cpp b/python/bindings/task_interface.cpp index a6f07c48c..5d4d874fa 100644 --- a/python/bindings/task_interface.cpp +++ b/python/bindings/task_interface.cpp @@ -652,6 +652,57 @@ NB_MODULE(_task_interface, m) { // one, change the other. m.attr("DEFAULT_LOG_THRESHOLD") = 20; // V5 = Python INFO + // --- RunTiming --- + // Returned by ChipWorker.run_prepared* / Worker.run. Cycles → ns conversion + // happens on the platform side (frequency known there); units exposed to + // Python are µs as floats to match historical benchmark_rounds.sh output. + nb::class_(m, "RunTiming") + .def(nb::init<>()) + .def( + "__init__", + [](RunTiming *self, uint64_t host_wall_ns, uint64_t device_wall_ns) { + new (self) RunTiming{host_wall_ns, device_wall_ns}; + }, + nb::arg("host_wall_ns"), nb::arg("device_wall_ns") = 0, + "Construct with explicit ns values (used by the Python Worker.run " + "wrapper to surface host-side timing for L3+ DAGs)." + ) + .def_prop_ro( + "host_wall_us", + [](const RunTiming &t) { + return t.host_wall_ns / 1000.0; + }, + "Host steady-clock wall around the dispatch, in microseconds." + ) + .def_prop_ro( + "device_wall_us", + [](const RunTiming &t) { + return t.device_wall_ns / 1000.0; + }, + "On-NPU wall (orch end - orch start), in microseconds. Populated whenever the " + "runtime was built with PTO2_PROFILING (the default); independent of " + "enable_l2_swimlane after the orch_summary capture was decoupled from the " + "swimlane shared region. Zero only on a PTO2_PROFILING-off build." + ) + .def_prop_ro( + "host_wall_ns", + [](const RunTiming &t) { + return t.host_wall_ns; + } + ) + .def_prop_ro( + "device_wall_ns", + [](const RunTiming &t) { + return t.device_wall_ns; + } + ) + .def("__repr__", [](const RunTiming &t) { + std::ostringstream os; + os << "RunTiming(host_wall_us=" << t.host_wall_ns / 1000.0 + << ", device_wall_us=" << t.device_wall_ns / 1000.0 << ")"; + return os.str(); + }); + // --- ChipWorker --- nb::class_(m, "_ChipWorker") .def(nb::init<>()) @@ -685,19 +736,21 @@ NB_MODULE(_task_interface, m) { .def( "run", [](ChipWorker &self, int32_t callable_id, ChipStorageTaskArgs &args, const CallConfig &config) { - self.run(callable_id, &args, config); + return self.run(callable_id, &args, config); }, nb::arg("callable_id"), nb::arg("args"), nb::arg("config"), - "Launch a callable_id previously staged via prepare_callable." + "Launch a callable_id previously staged via prepare_callable. " + "Returns RunTiming with host/device wall." ) .def( "run", [](ChipWorker &self, int32_t callable_id, TaskArgs &args, const CallConfig &config) { TaskArgsView view = make_view(args); - self.run(callable_id, view, config); + return self.run(callable_id, view, config); }, nb::arg("callable_id"), nb::arg("args"), nb::arg("config"), - "Launch a callable_id from a TaskArgs (used for in-process callers)." + "Launch a callable_id from a TaskArgs (used for in-process callers). " + "Returns RunTiming." ) .def( "run_prepared_from_blob", @@ -710,7 +763,7 @@ NB_MODULE(_task_interface, m) { // loops never re-implement the tensor/scalar layout in Python // (where it has historically dropped fields like child_memory). TaskArgsView view = read_blob(reinterpret_cast(args_blob_ptr), blob_capacity); - self.run(callable_id, view, config); + return self.run(callable_id, view, config); }, nb::arg("callable_id"), nb::arg("args_blob_ptr"), nb::arg("blob_capacity"), nb::arg("config"), "Launch a callable_id from a raw mailbox-blob pointer + capacity " diff --git a/python/simpler/task_interface.py b/python/simpler/task_interface.py index 2e0398898..49e8471d6 100644 --- a/python/simpler/task_interface.py +++ b/python/simpler/task_interface.py @@ -521,12 +521,14 @@ def run(self, callable_id, args, config=None, **kwargs): args: ChipStorageTaskArgs for this invocation. config: Optional CallConfig. If None, a default is created. **kwargs: Overrides applied to config (e.g. block_dim=24). + + Returns a :class:`RunTiming` with host + device wall. """ if config is None: config = CallConfig() for k, v in kwargs.items(): setattr(config, k, v) - self._impl.run(int(callable_id), args, config) + return self._impl.run(int(callable_id), args, config) def unregister_callable(self, callable_id): """Drop prepared state for ``callable_id`` and release its orch SO share.""" diff --git a/python/simpler/worker.py b/python/simpler/worker.py index 64c678236..ba02d5e86 100644 --- a/python/simpler/worker.py +++ b/python/simpler/worker.py @@ -70,6 +70,7 @@ def my_l4_orch(orch, args, config): MAX_REGISTERED_CALLABLE_IDS, ChipBootstrapChannel, ChipBootstrapMailboxState, + RunTiming, _mailbox_load_i32, _mailbox_store_i32, read_args_from_blob, @@ -1444,7 +1445,7 @@ def copy_from(self, dst: int, src: int, size: int, worker_id: int = 0) -> None: # run — uniform entry point # ------------------------------------------------------------------ - def run(self, callable, args=None, config=None) -> None: + def run(self, callable, args=None, config=None) -> RunTiming: """Execute one task (L2) or one DAG (L3+) synchronously. Dispatch: @@ -1455,35 +1456,48 @@ def run(self, callable, args=None, config=None) -> None: ``args`` : TaskArgs (optional) ``config``: CallConfig (optional, default-constructed if None) + + Returns a :class:`RunTiming` with ``host_wall_us`` (Python wall-clock + around the dispatch) and ``device_wall_us`` (on-NPU orchestrator wall, + populated whenever the runtime was built with ``PTO2_PROFILING`` — + the default build has it on). For L3+ DAGs, ``host_wall_us`` covers + the whole orch fn and ``device_wall_us`` is unset (0) — per-task + device timings are not aggregated here. """ assert self._initialized, "Worker not initialized; call init() first" cfg = config if config is not None else CallConfig() if self.level == 2: assert self._chip_worker is not None - self._chip_worker.run(int(callable), args, cfg) - else: - self._start_hierarchical() - assert self._orch is not None - assert self._worker is not None - # Drop any error stashed by a previous run() so this call starts - # clean. drain() rethrows on the way out; every successful run() - # leaves the error slot empty, but an unrelated caller may have - # poked it. - self._orch._clear_error() - self._orch._scope_begin() - try: - callable(self._orch, args, cfg) - finally: - # Always release scope refs and drain so ring slots aren't - # stranded when the orch fn raises mid-DAG. drain() also - # rethrows the first dispatch failure for this run — that - # is how child-task exceptions surface to the caller of - # Worker.run(). scope_end deliberately does NOT throw: if - # it did, released refs would be incomplete and drain - # would hang on in-flight tasks. - self._orch._scope_end() - self._orch._drain() + return self._chip_worker.run(int(callable), args, cfg) + + self._start_hierarchical() + assert self._orch is not None + assert self._worker is not None + # Drop any error stashed by a previous run() so this call starts + # clean. drain() rethrows on the way out; every successful run() + # leaves the error slot empty, but an unrelated caller may have + # poked it. + self._orch._clear_error() + self._orch._scope_begin() + t_start = time.perf_counter_ns() + try: + callable(self._orch, args, cfg) + finally: + # Always release scope refs and drain so ring slots aren't + # stranded when the orch fn raises mid-DAG. drain() also + # rethrows the first dispatch failure for this run — that + # is how child-task exceptions surface to the caller of + # Worker.run(). scope_end deliberately does NOT throw: if + # it did, released refs would be incomplete and drain + # would hang on in-flight tasks. + self._orch._scope_end() + self._orch._drain() + # device_wall stays 0 for L3+: aggregating per-task device cycles + # across a DAG isn't implemented here (would need accumulation in the + # ring scheduler). Callers wanting per-task device wall should issue + # individual run calls. + return RunTiming(time.perf_counter_ns() - t_start, 0) def prepare_callable(self, callable_id: int, callable) -> None: """L2 only: pre-stage a callable under ``callable_id`` (see diff --git a/simpler_setup/scene_test.py b/simpler_setup/scene_test.py index e681fa167..5a994b332 100644 --- a/simpler_setup/scene_test.py +++ b/simpler_setup/scene_test.py @@ -325,6 +325,69 @@ def _temporary_env(env_updates): os.environ[k] = v +def _log_round_timings(timings): + """Print per-round + summary host/device wall (µs) for multi-round runs. + + Replaces the device-log scraping that ``tools/benchmark_rounds.sh`` used to + do — Worker.run now returns the timing directly, so the per-round table + can be emitted by the framework without grep+awk over device logs. + + Output goes to ``print`` (not ``logger.info``) because benchmark numbers + are a user-facing artifact and the project's default log level suppresses + INFO in standalone test_*.py runs. + + ``timings`` is a list of (host_wall_us, device_wall_us) tuples. The device + column reports 0 when the runtime was built without PTO2_PROFILING (the + default build has it on) or when an L3+ DAG is in use (per-task device + cycles aren't aggregated). + """ + if not timings: + return + n = len(timings) + host_vals = sorted(t[0] for t in timings) + dev_vals = sorted(t[1] for t in timings) + host_avg = sum(host_vals) / n + # Device avg averages over non-zero rounds only. When --rounds > 1 the + # framework restricts enable_l2_swimlane (and thus orch_summary capture) + # to round 0, so the other rounds report 0; including those zeros in the + # average would silently halve / quarter the reported device wall and + # mislead the benchmark consumer. dev_count carries the sample size into + # the summary line for transparency. + dev_nonzero = [v for v in dev_vals if v > 0.0] + dev_count = len(dev_nonzero) + dev_avg = (sum(dev_nonzero) / dev_count) if dev_count else 0.0 + show_device = dev_count > 0 + + header = f" {'Round':<6} {'Host (us)':>12}" + if show_device: + header += f" {'Device (us)':>12}" + print(header) + print(" " + "-" * (len(header) - 2)) + for i, (h, d) in enumerate(timings): + line = f" {i:<6d} {h:>12.1f}" + if show_device: + line += f" {d:>12.1f}" + print(line) + summary = f" Avg Host: {host_avg:.1f} us" + if show_device: + summary += f" | Avg Device: {dev_avg:.1f} us [{dev_count}/{n} rounds captured]" + summary += f" ({n} rounds)" + print(summary) + + trim = 10 + if n > 2 * trim: + tc = n - 2 * trim + host_trim = sum(host_vals[trim:-trim]) / tc + msg = f" Trimmed Avg Host: {host_trim:.1f} us" + if show_device and dev_count > 2 * trim: + dev_nonzero_sorted = sorted(dev_nonzero) + dev_trim_count = dev_count - 2 * trim + dev_trim = sum(dev_nonzero_sorted[trim:-trim]) / dev_trim_count + msg += f" | Trimmed Avg Device: {dev_trim:.1f} us" + msg += f" (dropped {trim} low + {trim} high, {tc} rounds used)" + print(msg) + + def _resolve_callable_paths(cls, cls_dir): """Resolve relative source paths in CALLABLE against cls_dir.""" callable_spec = cls.CALLABLE @@ -909,26 +972,37 @@ def _run_and_validate_l2( initial_outputs[name] = getattr(test_args, name).clone() # Execute rounds + timings = [] # populated only when rounds > 1 for round_idx in range(rounds): if round_idx > 0: for name, initial in initial_outputs.items(): getattr(test_args, name).copy_(initial) + # enable_l2_swimlane / enable_dep_gen are already forced False by + # the upstream gate in test_run / run_module when rounds > 1, so an + # extra `and round_idx == 0` here is dead code; pass them through + # verbatim. (If the upstream gate is ever relaxed, restore the + # per-round masking here.) config = self._build_config( config_dict, - enable_l2_swimlane=(enable_l2_swimlane if round_idx == 0 else 0), + enable_l2_swimlane=enable_l2_swimlane, enable_dump_tensor=enable_dump_tensor, enable_pmu=enable_pmu, - enable_dep_gen=(enable_dep_gen and round_idx == 0), + enable_dep_gen=enable_dep_gen, output_prefix=output_prefix, ) with _temporary_env(self._resolve_env()): - worker.run(cid, chip_args, config=config) + timing = worker.run(cid, chip_args, config=config) + if rounds > 1 and timing is not None: + timings.append((timing.host_wall_us, timing.device_wall_us)) if not skip_golden: _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) + if timings: + _log_round_timings(timings) + def _run_and_validate_l3( # noqa: PLR0913 -- threads CLI diagnostic flags + L3 ns context self, worker, @@ -981,17 +1055,20 @@ def _run_and_validate_l3( # noqa: PLR0913 -- threads CLI diagnostic flags + L3 orch_fn = self.CALLABLE["orchestration"] # Execute rounds + timings = [] for round_idx in range(rounds): if round_idx > 0: for name, initial in initial_tensors.items(): getattr(test_args, name).copy_(initial) + # See _run_and_validate_l2: the per-round masking is dead code + # under the existing upstream gate. Keep parity by passing through. config = self._build_config( config_dict, - enable_l2_swimlane=(enable_l2_swimlane if round_idx == 0 else 0), + enable_l2_swimlane=enable_l2_swimlane, enable_dump_tensor=enable_dump_tensor, enable_pmu=enable_pmu, - enable_dep_gen=(enable_dep_gen and round_idx == 0), + enable_dep_gen=enable_dep_gen, output_prefix=output_prefix, ) @@ -1001,11 +1078,18 @@ def task_orch(orch, _args, _cfg, _ns=ns, _test_args=test_args, _config=config): orch_fn(orch, _ns, _test_args, _config) with _temporary_env(self._resolve_env()): - worker.run(task_orch) + timing = worker.run(task_orch) + if rounds > 1 and timing is not None: + # L3+ DAGs surface host wall only; device_wall_us is 0 because + # per-task device cycles aren't aggregated up to Worker.run. + timings.append((timing.host_wall_us, timing.device_wall_us)) if not skip_golden: _compare_outputs(test_args, golden_args, all_tensor_names, self.RTOL, self.ATOL) + if timings: + _log_round_timings(timings) + # ------------------------------------------------------------------ # pytest auto test method # ------------------------------------------------------------------ diff --git a/src/a2a3/platform/include/common/kernel_args.h b/src/a2a3/platform/include/common/kernel_args.h index 9cffcd169..99e72fdea 100644 --- a/src/a2a3/platform/include/common/kernel_args.h +++ b/src/a2a3/platform/include/common/kernel_args.h @@ -95,6 +95,19 @@ struct KernelArgs { uint32_t log_info_v{5}; // INFO verbosity threshold (0..9); default V5 uint32_t enable_profiling_flag{0}; // Profiling umbrella bitmask; bit0=dump_tensor, bit1=l2_swimlane, bit2=pmu uint32_t _pad{0}; // Alignment padding + + // Device pointer to an 8-byte buffer that the platform AICPU entry writes + // the run-wall (ns) into. Allocated once at simpler_init, kept resident. + // Onboard AICPU receives KernelArgs as a CANN-private copy (see + // rtAicpuKernelLaunchExWithArgs in launch_aicpu_kernel), so an inline + // field on KernelArgs is write-only from AICPU — host has no way to read + // it back. The dedicated device buffer is host-allocated and the address + // travels via this field; AICPU writes `*(uint64_t*)device_wall_data_base + // = wall_ns`, host pulls the 8 bytes via rtMemcpy(... DEVICE_TO_HOST) + // after stream sync. Sim's "device pointer" is a host malloc'd uint64 + // (no special-casing — write-through works because sim AICPU and host + // share memory). Zero when the buffer was not allocated. + uint64_t device_wall_data_base{0}; }; #ifdef __cplusplus diff --git a/src/a2a3/platform/onboard/aicpu/kernel.cpp b/src/a2a3/platform/onboard/aicpu/kernel.cpp index 843dc9fd6..32e24a526 100644 --- a/src/a2a3/platform/onboard/aicpu/kernel.cpp +++ b/src/a2a3/platform/onboard/aicpu/kernel.cpp @@ -15,6 +15,7 @@ #include "common/platform_config.h" #include "aicpu/dep_gen_collector_aicpu.h" #include "aicpu/device_log.h" +#include "aicpu/device_time.h" #include "aicpu/l2_perf_collector_aicpu.h" #include "aicpu/pmu_collector_aicpu.h" #include "aicpu/platform_regs.h" @@ -22,6 +23,15 @@ #include "aicpu/tensor_dump_aicpu.h" #include "runtime.h" +// Run-wall capture: g_device_start_cycle is set once in +// DynTileFwkBackendKernelServerInit (single-threaded launch); each thread +// of the multi-threaded DynTileFwkBackendKernelServer writes the converted +// (end - start) into KernelArgs.device_wall_ns on exit. Plain stores — +// last-writer-wins is fine for wall measurement (concurrent exiting threads' +// `my_end` values differ by µs, the final overwrite is within benchmark +// noise of the true max-end - min-start). +static uint64_t g_device_start_cycle = 0; + // Forward declaration of aicpu_execute (implemented in aicpu_executor.cpp) extern "C" int aicpu_execute(Runtime *arg); @@ -55,6 +65,15 @@ extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelSer set_log_level(static_cast(k_args->log_level)); set_log_info_v(static_cast(k_args->log_info_v)); + // Init is launched single-threaded (block_dim=1), so the race-free spot + // to capture run start and reset the wall accumulator. Subsequent + // DynTileFwkBackendKernelServer threads stamp end on their way out, via + // the device-resident 8-byte buffer addressed by device_wall_data_base. + g_device_start_cycle = get_sys_cnt_aicpu(); + if (k_args->device_wall_data_base != 0) { + *reinterpret_cast(k_args->device_wall_data_base) = 0; + } + LOG_INFO_V0("%s", "Runtime Executor Init: Initializing AICPU kernel"); return 0; } @@ -117,5 +136,14 @@ extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelSer } LOG_INFO_V0("%s", "DynTileFwkBackendKernelServer: aicpu_execute completed successfully"); + // Stamp end into the device_wall buffer (addressed via + // device_wall_data_base). Last-writer-wins across threads — wall + // measurements are µs-scale tolerant, see g_device_start_cycle comment. + uint64_t my_end = get_sys_cnt_aicpu(); + if (k_args->device_wall_data_base != 0 && my_end > g_device_start_cycle) { + *reinterpret_cast(k_args->device_wall_data_base) = + static_cast(cycles_to_us(my_end - g_device_start_cycle) * 1000.0); + } + return rc; } diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp index 7e51b5c94..933247b4f 100644 --- a/src/a2a3/platform/onboard/host/device_runner.cpp +++ b/src/a2a3/platform/onboard/host/device_runner.cpp @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -512,6 +513,19 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { return rc; } + // Lazy-allocate the 8-byte device_wall buffer on first run. AICPU writes + // the run wall (ns) into this buffer via KernelArgs::device_wall_data_base; + // host pulls it back via rtMemcpy D2H after stream sync (see post-sync + // block below). Kept resident across runs; freed in finalize. + if (device_wall_dev_ptr_ == nullptr) { + device_wall_dev_ptr_ = allocate_tensor(sizeof(uint64_t)); + if (device_wall_dev_ptr_ != nullptr) { + kernel_args_.args.device_wall_data_base = reinterpret_cast(device_wall_dev_ptr_); + uint64_t zero = 0; + (void)copy_to_device(device_wall_dev_ptr_, &zero, sizeof(uint64_t)); + } + } + // Validate block_dim against stream resource limits rc = validate_block_dim(stream_aicore_, block_dim); if (rc != 0) { @@ -763,6 +777,23 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { return rc; } + // Pull the platform-level device wall (ns) back from the 8-byte device + // buffer that AICPU writes through via KernelArgs::device_wall_data_base. + // (We can't use the device_k_args_ shadow here — CANN's + // rtAicpuKernelLaunchExWithArgs copies KernelArgs into AICPU-private + // memory at launch, so AICPU's writes to its local copy don't propagate + // to device_k_args_.) Failure path is a soft warn — wall stays zero. + device_wall_ns_ = 0; + if (device_wall_dev_ptr_ != nullptr) { + int wall_rc = rtMemcpy( + &device_wall_ns_, sizeof(uint64_t), device_wall_dev_ptr_, sizeof(uint64_t), RT_MEMCPY_DEVICE_TO_HOST + ); + if (wall_rc != 0) { + LOG_WARN("rtMemcpy(device_wall_ns) D2H failed: %d", wall_rc); + device_wall_ns_ = 0; + } + } + // Tear down collectors. stop() joins mgmt then collector in the only safe // order (mgmt's final-drain pass into L2 has poll as its consumer). // Diagnostic exports use the per-task `output_prefix_` directory the user @@ -1102,6 +1133,11 @@ int DeviceRunner::finalize() { acl_ready_ = false; } + // Free the 8-byte device_wall buffer (allocated lazily in run()). + if (device_wall_dev_ptr_ != nullptr) { + free_tensor(device_wall_dev_ptr_); + device_wall_dev_ptr_ = nullptr; + } device_id_ = -1; block_dim_ = 0; worker_count_ = 0; diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h index 812ba3c4a..99ad566f4 100644 --- a/src/a2a3/platform/onboard/host/device_runner.h +++ b/src/a2a3/platform/onboard/host/device_runner.h @@ -294,6 +294,14 @@ class DeviceRunner { void set_output_prefix(const char *prefix) { output_prefix_ = (prefix != nullptr) ? prefix : ""; } const std::string &output_prefix() const { return output_prefix_; } + /** + * Device-side wall (ns) from the most recently completed run, written + * by the platform AICPU entry (onboard: kernel.cpp; sim: lambda in + * run()). Returns 0 before any run completes. Independent of any + * profiling / swimlane subsystem. + */ + uint64_t last_device_wall_ns() const { return device_wall_ns_; } + /** * Print handshake results from device * @@ -550,6 +558,15 @@ class DeviceRunner { rtStream_t stream_aicore_{nullptr}; AicpuSoInfo so_info_; KernelArgsHelper kernel_args_; + + // Platform-level device wall buffer: 8-byte device-resident slot whose + // address rides on KernelArgs.device_wall_data_base. AICPU writes the + // run wall (ns) through that pointer; this DeviceRunner pulls it back + // via copy_from_device after stream sync and caches it for + // last_device_wall_ns(). Allocated once at simpler_init, freed in + // finalize. + void *device_wall_dev_ptr_{nullptr}; + uint64_t device_wall_ns_{0}; DeviceArgs device_args_; // Kernel binary management diff --git a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp index 267114b7a..b287e277b 100644 --- a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp @@ -23,6 +23,7 @@ #include +#include #include #include #include @@ -298,8 +299,12 @@ int prepare_callable(DeviceContextHandle ctx, int32_t callable_id, const void *c int run_prepared( DeviceContextHandle ctx, RuntimeHandle runtime, int32_t callable_id, const void *args, int block_dim, int aicpu_thread_num, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, int enable_dep_gen, - const char *output_prefix + const char *output_prefix, PtoRunTiming *out_timing ) { + if (out_timing != NULL) { + out_timing->host_wall_ns = 0; + out_timing->device_wall_ns = 0; + } if (ctx == NULL || runtime == NULL) return -1; DeviceRunner *runner = static_cast(ctx); @@ -314,6 +319,8 @@ int run_prepared( pthread_setspecific(g_runner_key, nullptr); }); + const auto host_t0 = std::chrono::steady_clock::now(); + try { int rc = runner->prepare_run_context(runner->device_id()); if (rc != 0) return rc; @@ -367,6 +374,12 @@ int run_prepared( rc = validate_runtime_impl(r); r->~Runtime(); + if (out_timing != NULL) { + const auto host_t1 = std::chrono::steady_clock::now(); + out_timing->host_wall_ns = + static_cast(std::chrono::duration_cast(host_t1 - host_t0).count()); + out_timing->device_wall_ns = runner->last_device_wall_ns(); + } return rc; } catch (...) { return -1; diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp index 46d3e6c5b..bd2b9a719 100644 --- a/src/a2a3/platform/sim/host/device_runner.cpp +++ b/src/a2a3/platform/sim/host/device_runner.cpp @@ -30,6 +30,7 @@ #include #include +#include #include #include #include @@ -351,6 +352,18 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { return rc; } + // Lazy-allocate the 8-byte device_wall buffer on first run. Sim's + // "device pointer" is a host malloc returned by allocate_tensor; the + // sim AICPU thread (host CPU thread) writes through this pointer just + // like onboard's AICPU does. + if (device_wall_dev_ptr_ == nullptr) { + device_wall_dev_ptr_ = allocate_tensor(sizeof(uint64_t)); + if (device_wall_dev_ptr_ != nullptr) { + kernel_args_.device_wall_data_base = reinterpret_cast(device_wall_dev_ptr_); + *static_cast(device_wall_dev_ptr_) = 0; + } + } + // Calculate execution parameters block_dim_ = block_dim; int num_aicore = block_dim * cores_per_blockdim_; @@ -578,8 +591,20 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { std::vector aicpu_threads; aicpu_threads.reserve(over_launch); std::atomic aicpu_rc{0}; + + // Sim "device wall" capture — mirrors the onboard kernel.cpp path via the + // device_wall_data_base buffer (sim's "device pointer" is a host malloc'd + // uint64). sim_t0 is the host steady_clock equivalent of get_sys_cnt_aicpu + // in Init; each thread on exit overwrites the buffer with (now - sim_t0). + // Last-writer-wins — sim threads are host CPU threads, variance is well + // below benchmark noise. + if (kernel_args_.device_wall_data_base != 0) { + *reinterpret_cast(kernel_args_.device_wall_data_base) = 0; + } + const auto sim_t0 = std::chrono::steady_clock::now(); + for (int i = 0; i < over_launch; i++) { - aicpu_threads.push_back(create_thread([this, &runtime, launch_aicpu_num, over_launch, &aicpu_rc]() { + aicpu_threads.push_back(create_thread([this, &runtime, launch_aicpu_num, over_launch, &aicpu_rc, sim_t0]() { if (!platform_aicpu_affinity_gate(launch_aicpu_num, over_launch)) { return; } @@ -588,6 +613,11 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { int expected = 0; aicpu_rc.compare_exchange_strong(expected, rc, std::memory_order_acq_rel); } + if (kernel_args_.device_wall_data_base != 0) { + const auto t1 = std::chrono::steady_clock::now(); + *reinterpret_cast(kernel_args_.device_wall_data_base) = + static_cast(std::chrono::duration_cast(t1 - sim_t0).count()); + } })); } @@ -615,6 +645,14 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { } LOG_INFO_V0("All threads completed"); + // Snapshot the device_wall buffer into device_wall_ns_. Sim's "device + // memory" is host memory, so this is a single dereference — no DMA. + // Mirrors the post-sync rtMemcpy D2H that the onboard DeviceRunner does. + device_wall_ns_ = 0; + if (device_wall_dev_ptr_ != nullptr) { + device_wall_ns_ = *static_cast(device_wall_dev_ptr_); + } + int runtime_rc = aicpu_rc.load(std::memory_order_acquire); if (runtime_rc != 0) { LOG_ERROR("AICPU execution failed with rc=%d", runtime_rc); @@ -955,6 +993,11 @@ int DeviceRunner::finalize() { mem_alloc_.finalize(); clear_cpu_sim_shared_storage(); + // Free the 8-byte device_wall buffer (allocated lazily in run()). + if (device_wall_dev_ptr_ != nullptr) { + free_tensor(device_wall_dev_ptr_); + device_wall_dev_ptr_ = nullptr; + } device_id_ = -1; worker_count_ = 0; last_runtime_ = nullptr; diff --git a/src/a2a3/platform/sim/host/device_runner.h b/src/a2a3/platform/sim/host/device_runner.h index 573236634..748b4e7ab 100644 --- a/src/a2a3/platform/sim/host/device_runner.h +++ b/src/a2a3/platform/sim/host/device_runner.h @@ -174,6 +174,14 @@ class DeviceRunner { void set_output_prefix(const char *prefix) { output_prefix_ = (prefix != nullptr) ? prefix : ""; } const std::string &output_prefix() const { return output_prefix_; } + /** + * Device-side wall (ns) from the most recently completed run, written + * by the platform AICPU entry (onboard: kernel.cpp; sim: lambda in + * run()). Returns 0 before any run completes. Independent of any + * profiling / swimlane subsystem. + */ + uint64_t last_device_wall_ns() const { return device_wall_ns_; } + /** * Attach the calling thread to the simulated device. * @@ -257,6 +265,15 @@ class DeviceRunner { // Simulation state (no actual device resources) KernelArgs kernel_args_; + // Platform-level device wall buffer: 8-byte device-resident slot whose + // address rides on KernelArgs.device_wall_data_base. AICPU writes the + // run wall (ns) through that pointer; this DeviceRunner pulls it back + // via copy_from_device after stream sync and caches it for + // last_device_wall_ns(). Allocated once at simpler_init, freed in + // finalize. + void *device_wall_dev_ptr_{nullptr}; + uint64_t device_wall_ns_{0}; + // Chip-callable buffer pool (sim path). Keyed by FNV-1a 64-bit content // hash of the ChipCallable bytes. Each entry owns a host scratch holding // the ChipCallable with each child's resolved_addr_ fixed up to the diff --git a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp index afbd85718..b6276f184 100644 --- a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp @@ -24,6 +24,7 @@ #include #include +#include #include #include @@ -273,8 +274,12 @@ int prepare_callable(DeviceContextHandle ctx, int32_t callable_id, const void *c int run_prepared( DeviceContextHandle ctx, RuntimeHandle runtime, int32_t callable_id, const void *args, int block_dim, int aicpu_thread_num, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, int enable_dep_gen, - const char *output_prefix + const char *output_prefix, PtoRunTiming *out_timing ) { + if (out_timing != NULL) { + out_timing->host_wall_ns = 0; + out_timing->device_wall_ns = 0; + } if (ctx == NULL || runtime == NULL) return -1; DeviceRunner *runner = static_cast(ctx); @@ -286,6 +291,8 @@ int run_prepared( pthread_once(&g_runner_key_once, create_runner_key); pthread_setspecific(g_runner_key, ctx); + const auto host_t0 = std::chrono::steady_clock::now(); + try { Runtime *r = new (runtime) Runtime(); r->host_api.device_malloc = device_malloc; @@ -331,6 +338,12 @@ int run_prepared( rc = validate_runtime_impl(r); r->~Runtime(); pthread_setspecific(g_runner_key, nullptr); + if (out_timing != NULL) { + const auto host_t1 = std::chrono::steady_clock::now(); + out_timing->host_wall_ns = + static_cast(std::chrono::duration_cast(host_t1 - host_t0).count()); + out_timing->device_wall_ns = runner->last_device_wall_ns(); + } return rc; } catch (...) { pthread_setspecific(g_runner_key, nullptr); diff --git a/src/a5/platform/include/common/kernel_args.h b/src/a5/platform/include/common/kernel_args.h index 8494ac369..9a5416806 100644 --- a/src/a5/platform/include/common/kernel_args.h +++ b/src/a5/platform/include/common/kernel_args.h @@ -81,6 +81,12 @@ struct KernelArgs { uint32_t log_info_v{5}; // INFO verbosity threshold (0..9); default V5 uint32_t enable_profiling_flag{0}; // Profiling umbrella bitmask; bit0=dump_tensor, bit1=l2_swimlane, bit2=pmu uint32_t _pad{0}; // Alignment padding + + // Device pointer to an 8-byte buffer that the platform AICPU entry writes + // the run-wall (ns) into. Allocated once at simpler_init, kept resident. + // See the a2a3 kernel_args.h for the full design rationale (CANN's + // AICPU args copy makes inline fields write-only). + uint64_t device_wall_data_base{0}; }; #ifdef __cplusplus diff --git a/src/a5/platform/onboard/aicpu/kernel.cpp b/src/a5/platform/onboard/aicpu/kernel.cpp index 4fbea2945..4337e4429 100644 --- a/src/a5/platform/onboard/aicpu/kernel.cpp +++ b/src/a5/platform/onboard/aicpu/kernel.cpp @@ -14,6 +14,7 @@ #include "common/kernel_args.h" #include "common/platform_config.h" #include "aicpu/device_log.h" +#include "aicpu/device_time.h" #include "aicpu/l2_perf_collector_aicpu.h" #include "aicpu/platform_regs.h" #include "aicpu/platform_aicpu_affinity.h" @@ -21,6 +22,13 @@ #include "aicpu/tensor_dump_aicpu.h" #include "runtime.h" +// Run-wall capture: g_device_start_cycle is set once in +// DynTileFwkBackendKernelServerInit (single-threaded launch); each thread +// of the multi-threaded DynTileFwkBackendKernelServer writes the converted +// (end - start) into KernelArgs.device_wall_ns on exit. Plain stores — +// last-writer-wins is fine for wall measurement. +static uint64_t g_device_start_cycle = 0; + // Forward declaration of aicpu_execute (implemented in aicpu_executor.cpp) extern "C" int aicpu_execute(Runtime *arg); @@ -54,6 +62,13 @@ extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelSer set_log_level(static_cast(k_args->log_level)); set_log_info_v(static_cast(k_args->log_info_v)); + // Init is launched single-threaded (block_dim=1) — race-free spot to + // capture run start and reset the device_wall buffer. + g_device_start_cycle = get_sys_cnt_aicpu(); + if (k_args->device_wall_data_base != 0) { + *reinterpret_cast(k_args->device_wall_data_base) = 0; + } + LOG_INFO_V0("%s", "Runtime Executor Init: Initializing AICPU kernel"); return 0; } @@ -114,5 +129,12 @@ extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelSer } LOG_INFO_V0("%s", "DynTileFwkBackendKernelServer: aicpu_execute completed successfully"); + // Stamp end into the device_wall buffer. Last-writer-wins across threads. + uint64_t my_end = get_sys_cnt_aicpu(); + if (k_args->device_wall_data_base != 0 && my_end > g_device_start_cycle) { + *reinterpret_cast(k_args->device_wall_data_base) = + static_cast(cycles_to_us(my_end - g_device_start_cycle) * 1000.0); + } + return rc; } diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp index b27d9c6c9..bb46f24fc 100644 --- a/src/a5/platform/onboard/host/device_runner.cpp +++ b/src/a5/platform/onboard/host/device_runner.cpp @@ -23,6 +23,7 @@ #include #include +#include #include #include #include @@ -375,6 +376,17 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { return rc; } + // Lazy-allocate the 8-byte device_wall buffer on first run. See the a2a3 + // onboard equivalent for the rationale. + if (device_wall_dev_ptr_ == nullptr) { + device_wall_dev_ptr_ = allocate_tensor(sizeof(uint64_t)); + if (device_wall_dev_ptr_ != nullptr) { + kernel_args_.args.device_wall_data_base = reinterpret_cast(device_wall_dev_ptr_); + uint64_t zero = 0; + (void)copy_to_device(device_wall_dev_ptr_, &zero, sizeof(uint64_t)); + } + } + // Validate block_dim against stream resource limits rc = validate_block_dim(stream_aicore_, block_dim); if (rc != 0) { @@ -560,6 +572,20 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { return rc; } + // Pull the platform-level device wall (ns) back from the dedicated + // 8-byte device buffer (AICPU writes via KernelArgs::device_wall_data_base + // — CANN copies args at launch so the inline field would be unreachable). + device_wall_ns_ = 0; + if (device_wall_dev_ptr_ != nullptr) { + int wall_rc = rtMemcpy( + &device_wall_ns_, sizeof(uint64_t), device_wall_dev_ptr_, sizeof(uint64_t), RT_MEMCPY_DEVICE_TO_HOST + ); + if (wall_rc != 0) { + LOG_WARN("rtMemcpy(device_wall_ns) D2H failed: %d", wall_rc); + device_wall_ns_ = 0; + } + } + // Tear down collectors. stop() joins mgmt then collector in the only // safe order (mgmt's final-drain pass into L2 has poll as its // consumer). Diagnostic exports use the per-task `output_prefix_` @@ -882,6 +908,11 @@ int DeviceRunner::finalize() { return rc; } + // Free the 8-byte device_wall buffer (allocated lazily in run()). + if (device_wall_dev_ptr_ != nullptr) { + free_tensor(device_wall_dev_ptr_); + device_wall_dev_ptr_ = nullptr; + } device_id_ = -1; block_dim_ = 0; worker_count_ = 0; diff --git a/src/a5/platform/onboard/host/device_runner.h b/src/a5/platform/onboard/host/device_runner.h index 985f9708b..3762eeaee 100644 --- a/src/a5/platform/onboard/host/device_runner.h +++ b/src/a5/platform/onboard/host/device_runner.h @@ -279,6 +279,14 @@ class DeviceRunner { void set_output_prefix(const char *prefix) { output_prefix_ = (prefix != nullptr) ? prefix : ""; } const std::string &output_prefix() const { return output_prefix_; } + /** + * Device-side wall (ns) from the most recently completed run, written + * by the platform AICPU entry (onboard: kernel.cpp; sim: lambda in + * run()). Returns 0 before any run completes. Independent of any + * profiling / swimlane subsystem. + */ + uint64_t last_device_wall_ns() const { return device_wall_ns_; } + /** * Print handshake results from device * @@ -457,6 +465,15 @@ class DeviceRunner { rtStream_t stream_aicore_{nullptr}; AicpuSoInfo so_info_; KernelArgsHelper kernel_args_; + + // Platform-level device wall buffer: 8-byte device-resident slot whose + // address rides on KernelArgs.device_wall_data_base. AICPU writes the + // run wall (ns) through that pointer; this DeviceRunner pulls it back + // via copy_from_device after stream sync and caches it for + // last_device_wall_ns(). Allocated once at simpler_init, freed in + // finalize. + void *device_wall_dev_ptr_{nullptr}; + uint64_t device_wall_ns_{0}; DeviceArgs device_args_; // Kernel binary management diff --git a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp index 06fad5170..ee5b02f67 100644 --- a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp @@ -23,6 +23,7 @@ #include +#include #include #include #include @@ -334,8 +335,12 @@ int prepare_callable(DeviceContextHandle ctx, int32_t callable_id, const void *c int run_prepared( DeviceContextHandle ctx, RuntimeHandle runtime, int32_t callable_id, const void *args, int block_dim, int aicpu_thread_num, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, int /*enable_dep_gen*/, - const char *output_prefix + const char *output_prefix, PtoRunTiming *out_timing ) { + if (out_timing != NULL) { + out_timing->host_wall_ns = 0; + out_timing->device_wall_ns = 0; + } if (ctx == NULL || runtime == NULL) return -1; DeviceRunner *runner = static_cast(ctx); @@ -350,6 +355,8 @@ int run_prepared( pthread_setspecific(g_runner_key, nullptr); }); + const auto host_t0 = std::chrono::steady_clock::now(); + try { int rc = runner->prepare_run_context(runner->device_id()); if (rc != 0) return rc; @@ -402,6 +409,12 @@ int run_prepared( rc = validate_runtime_impl(r); r->~Runtime(); + if (out_timing != NULL) { + const auto host_t1 = std::chrono::steady_clock::now(); + out_timing->host_wall_ns = + static_cast(std::chrono::duration_cast(host_t1 - host_t0).count()); + out_timing->device_wall_ns = runner->last_device_wall_ns(); + } return rc; } catch (...) { return -1; diff --git a/src/a5/platform/sim/host/device_runner.cpp b/src/a5/platform/sim/host/device_runner.cpp index a6f8fdba0..59a3a366b 100644 --- a/src/a5/platform/sim/host/device_runner.cpp +++ b/src/a5/platform/sim/host/device_runner.cpp @@ -30,6 +30,7 @@ #include #include +#include #include #include #include @@ -320,6 +321,15 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { return rc; } + // Lazy-allocate the 8-byte device_wall buffer on first run. + if (device_wall_dev_ptr_ == nullptr) { + device_wall_dev_ptr_ = allocate_tensor(sizeof(uint64_t)); + if (device_wall_dev_ptr_ != nullptr) { + kernel_args_.device_wall_data_base = reinterpret_cast(device_wall_dev_ptr_); + *static_cast(device_wall_dev_ptr_) = 0; + } + } + // Calculate execution parameters block_dim_ = block_dim; int num_aicore = block_dim * cores_per_blockdim_; @@ -494,8 +504,17 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { std::vector aicpu_threads; aicpu_threads.reserve(over_launch); std::atomic aicpu_rc{0}; + + // Sim "device wall" capture — see a2a3 sim/onboard kernel.cpp for the + // shape. Writes via device_wall_data_base (sim's "device pointer" is a + // host malloc'd uint64). + if (kernel_args_.device_wall_data_base != 0) { + *reinterpret_cast(kernel_args_.device_wall_data_base) = 0; + } + const auto sim_t0 = std::chrono::steady_clock::now(); + for (int i = 0; i < over_launch; i++) { - aicpu_threads.push_back(create_thread([this, &runtime, launch_aicpu_num, over_launch, &aicpu_rc]() { + aicpu_threads.push_back(create_thread([this, &runtime, launch_aicpu_num, over_launch, &aicpu_rc, sim_t0]() { if (!platform_aicpu_affinity_gate(launch_aicpu_num, over_launch)) { return; } @@ -504,6 +523,11 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { int expected = 0; aicpu_rc.compare_exchange_strong(expected, rc, std::memory_order_acq_rel); } + if (kernel_args_.device_wall_data_base != 0) { + const auto t1 = std::chrono::steady_clock::now(); + *reinterpret_cast(kernel_args_.device_wall_data_base) = + static_cast(std::chrono::duration_cast(t1 - sim_t0).count()); + } })); } @@ -531,6 +555,13 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { LOG_INFO_V0("All threads completed"); + // Snapshot device_wall buffer into device_wall_ns_ (sim: single deref, + // no DMA — mirrors onboard's post-sync rtMemcpy D2H). + device_wall_ns_ = 0; + if (device_wall_dev_ptr_ != nullptr) { + device_wall_ns_ = *static_cast(device_wall_dev_ptr_); + } + int runtime_rc = aicpu_rc.load(std::memory_order_acquire); if (runtime_rc != 0) { LOG_ERROR("AICPU execution failed with rc=%d", runtime_rc); @@ -859,6 +890,11 @@ int DeviceRunner::finalize() { mem_alloc_.finalize(); clear_cpu_sim_shared_storage(); + // Free the 8-byte device_wall buffer (allocated lazily in run()). + if (device_wall_dev_ptr_ != nullptr) { + free_tensor(device_wall_dev_ptr_); + device_wall_dev_ptr_ = nullptr; + } device_id_ = -1; worker_count_ = 0; last_runtime_ = nullptr; diff --git a/src/a5/platform/sim/host/device_runner.h b/src/a5/platform/sim/host/device_runner.h index 36e8c2016..74b0cd6fa 100644 --- a/src/a5/platform/sim/host/device_runner.h +++ b/src/a5/platform/sim/host/device_runner.h @@ -167,6 +167,14 @@ class DeviceRunner { void set_output_prefix(const char *prefix) { output_prefix_ = (prefix != nullptr) ? prefix : ""; } const std::string &output_prefix() const { return output_prefix_; } + /** + * Device-side wall (ns) from the most recently completed run, written + * by the platform AICPU entry (onboard: kernel.cpp; sim: lambda in + * run()). Returns 0 before any run completes. Independent of any + * profiling / swimlane subsystem. + */ + uint64_t last_device_wall_ns() const { return device_wall_ns_; } + /** * Attach the calling thread to the simulated device. * @@ -254,6 +262,15 @@ class DeviceRunner { // Simulation state (no actual device resources) KernelArgs kernel_args_; + // Platform-level device wall buffer: 8-byte device-resident slot whose + // address rides on KernelArgs.device_wall_data_base. AICPU writes the + // run wall (ns) through that pointer; this DeviceRunner pulls it back + // via copy_from_device after stream sync and caches it for + // last_device_wall_ns(). Allocated once at simpler_init, freed in + // finalize. + void *device_wall_dev_ptr_{nullptr}; + uint64_t device_wall_ns_{0}; + // Chip-callable buffer pool (sim path). Keyed by FNV-1a 64-bit content // hash; each entry owns the host scratch + dlopen handles needed for the // children embedded in that buffer. Bulk-freed in finalize(). diff --git a/src/a5/platform/sim/host/pto_runtime_c_api.cpp b/src/a5/platform/sim/host/pto_runtime_c_api.cpp index 5f5099142..5f398614f 100644 --- a/src/a5/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a5/platform/sim/host/pto_runtime_c_api.cpp @@ -24,6 +24,7 @@ #include #include +#include #include #include @@ -270,8 +271,12 @@ int prepare_callable(DeviceContextHandle ctx, int32_t callable_id, const void *c int run_prepared( DeviceContextHandle ctx, RuntimeHandle runtime, int32_t callable_id, const void *args, int block_dim, int aicpu_thread_num, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, int /*enable_dep_gen*/, - const char *output_prefix + const char *output_prefix, PtoRunTiming *out_timing ) { + if (out_timing != NULL) { + out_timing->host_wall_ns = 0; + out_timing->device_wall_ns = 0; + } if (ctx == NULL || runtime == NULL) return -1; DeviceRunner *runner = static_cast(ctx); @@ -283,6 +288,8 @@ int run_prepared( pthread_once(&g_runner_key_once, create_runner_key); pthread_setspecific(g_runner_key, ctx); + const auto host_t0 = std::chrono::steady_clock::now(); + try { Runtime *r = new (runtime) Runtime(); r->host_api.device_malloc = device_malloc; @@ -327,6 +334,12 @@ int run_prepared( rc = validate_runtime_impl(r); r->~Runtime(); pthread_setspecific(g_runner_key, nullptr); + if (out_timing != NULL) { + const auto host_t1 = std::chrono::steady_clock::now(); + out_timing->host_wall_ns = + static_cast(std::chrono::duration_cast(host_t1 - host_t0).count()); + out_timing->device_wall_ns = runner->last_device_wall_ns(); + } return rc; } catch (...) { pthread_setspecific(g_runner_key, nullptr); diff --git a/src/common/hierarchical/types.h b/src/common/hierarchical/types.h index aa5121785..938ffe2d4 100644 --- a/src/common/hierarchical/types.h +++ b/src/common/hierarchical/types.h @@ -215,3 +215,17 @@ class ReadyQueue { std::condition_variable cv_; bool shutdown_{false}; }; + +// ============================================================================= +// RunTiming — wall-clock breakdown returned by ChipWorker::run +// ============================================================================= + +// host_wall_ns is the steady_clock delta wrapping the dispatch; device_wall_ns +// is on-NPU wall captured by the platform AICPU entry (see KernelArgs:: +// device_wall_ns). Mirrors PtoRunTiming in src/common/worker/pto_runtime_c_api.h +// so the value flows through unchanged from the dlsym ABI up to the Python +// binding. +struct RunTiming { + uint64_t host_wall_ns = 0; + uint64_t device_wall_ns = 0; +}; diff --git a/src/common/worker/chip_worker.cpp b/src/common/worker/chip_worker.cpp index 9b0c7ef39..a402eb5c3 100644 --- a/src/common/worker/chip_worker.cpp +++ b/src/common/worker/chip_worker.cpp @@ -300,12 +300,12 @@ void ChipWorker::prepare_callable(int32_t callable_id, const void *callable) { } } -void ChipWorker::run(int32_t callable_id, TaskArgsView args, const CallConfig &config) { +RunTiming ChipWorker::run(int32_t callable_id, TaskArgsView args, const CallConfig &config) { ChipStorageTaskArgs chip_storage = view_to_chip_storage(args); - run(callable_id, &chip_storage, config); + return run(callable_id, &chip_storage, config); } -void ChipWorker::run(int32_t callable_id, const ChipStorageTaskArgs *args, const CallConfig &config) { +RunTiming ChipWorker::run(int32_t callable_id, const ChipStorageTaskArgs *args, const CallConfig &config) { config.validate(); if (!initialized_) { throw std::runtime_error("ChipWorker not initialized; call init() first"); @@ -313,13 +313,15 @@ void ChipWorker::run(int32_t callable_id, const ChipStorageTaskArgs *args, const void *rt = runtime_buf_.data(); + PtoRunTiming timing{0, 0}; int rc = run_prepared_fn_( device_ctx_, rt, callable_id, args, config.block_dim, config.aicpu_thread_num, config.enable_l2_swimlane, - config.enable_dump_tensor, config.enable_pmu, config.enable_dep_gen, config.output_prefix + config.enable_dump_tensor, config.enable_pmu, config.enable_dep_gen, config.output_prefix, &timing ); if (rc != 0) { throw std::runtime_error("run_prepared failed with code " + std::to_string(rc)); } + return RunTiming{timing.host_wall_ns, timing.device_wall_ns}; } void ChipWorker::unregister_callable(int32_t callable_id) { diff --git a/src/common/worker/chip_worker.h b/src/common/worker/chip_worker.h index a93bca33d..79c15d4d4 100644 --- a/src/common/worker/chip_worker.h +++ b/src/common/worker/chip_worker.h @@ -19,6 +19,7 @@ #include "../task_interface/call_config.h" #include "../task_interface/task_args.h" +#include "pto_runtime_c_api.h" #include "types.h" class ChipWorker { @@ -50,12 +51,14 @@ class ChipWorker { // Launch a cid previously staged via prepare_callable. // Materializes a ChipStorageTaskArgs from `args` (one memcpy of T*40B + S*8B - // into a stack POD), then delegates to the overload below. - void run(int32_t callable_id, TaskArgsView args, const CallConfig &config); + // into a stack POD), then delegates to the overload below. Returns + // RunTiming with host wall (steady_clock around dispatch) + device wall + // (KernelArgs::device_wall_ns captured by the platform AICPU entry). + RunTiming run(int32_t callable_id, TaskArgsView args, const CallConfig &config); // Same launch, but the caller already holds the runtime.so-ABI POD — // skip the view→storage memcpy and hand the pointer straight to the C ABI. // Used by the ChipStorageTaskArgs path in the nanobind binding. - void run(int32_t callable_id, const ChipStorageTaskArgs *args, const CallConfig &config); + RunTiming run(int32_t callable_id, const ChipStorageTaskArgs *args, const CallConfig &config); // Per-callable_id preparation. Requires init() first and a callable_id // in [0, MAX_REGISTERED_CALLABLE_IDS) (cap 64). @@ -128,7 +131,8 @@ class ChipWorker { // CANN dlog sync. Reads the current log level off HostLogger itself. using SimplerInitFn = int (*)(void *, int, const uint8_t *, size_t, const uint8_t *, size_t); using PrepareCallableFn = int (*)(void *, int32_t, const void *); - using RunPreparedFn = int (*)(void *, void *, int32_t, const void *, int, int, int, int, int, int, const char *); + using RunPreparedFn = + int (*)(void *, void *, int32_t, const void *, int, int, int, int, int, int, const char *, PtoRunTiming *); using UnregisterCallableFn = int (*)(void *, int32_t); using GetAicpuDlopenCountFn = size_t (*)(void *); using FinalizeDeviceFn = int (*)(void *); diff --git a/src/common/worker/pto_runtime_c_api.h b/src/common/worker/pto_runtime_c_api.h index 0e2e28cb2..00debb446 100644 --- a/src/common/worker/pto_runtime_c_api.h +++ b/src/common/worker/pto_runtime_c_api.h @@ -48,6 +48,31 @@ extern "C" { typedef void *RuntimeHandle; typedef void *DeviceContextHandle; +/** + * Timing breakdown for a single run_prepared() invocation. + * + * host_wall_ns — wall-clock around the host-side dispatch (steady_clock + * delta wrapping the platform run() call). Always populated + * when out_timing != NULL. + * + * device_wall_ns — on-NPU wall of the most recent orchestrator phase + * (orch_summary.end_time - .start_time, converted to ns + * using arch frequency). Populated whenever the runtime + * was built with PTO2_PROFILING (the default), regardless + * of enable_l2_swimlane. The orch_summary capture is + * decoupled from the per-record swimlane buffer pool — + * only the lightweight phase header allocation is needed + * for this field. Zero only when PTO2_PROFILING was off + * at runtime build time. + * + * Both fields are zeroed by the callee on entry (including on error paths) + * so callers can pass an uninitialized struct. + */ +typedef struct PtoRunTiming { + uint64_t host_wall_ns; + uint64_t device_wall_ns; +} PtoRunTiming; + /* =========================================================================== * Public API (resolved by ChipWorker via dlsym) * =========================================================================== */ @@ -160,12 +185,16 @@ int prepare_callable(DeviceContextHandle ctx, int32_t callable_id, const void *c * `device_id` and the executor binaries are not threaded through this entry * — they were captured by `simpler_init` and live on the DeviceRunner. * + * If `out_timing` is non-NULL, the callee writes the wall-clock breakdown for + * this invocation into it (see PtoRunTiming above). The struct is zeroed on + * entry and partially populated on early-error returns. + * * @return 0 on success, negative on error (no prep state, NULL ctx, etc.). */ int run_prepared( DeviceContextHandle ctx, RuntimeHandle runtime, int32_t callable_id, const void *args, int block_dim, int aicpu_thread_num, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, int enable_dep_gen, - const char *output_prefix + const char *output_prefix, PtoRunTiming *out_timing ); /** diff --git a/tests/ut/py/test_run_timing.py b/tests/ut/py/test_run_timing.py new file mode 100644 index 000000000..6b8aaed12 --- /dev/null +++ b/tests/ut/py/test_run_timing.py @@ -0,0 +1,122 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""Tests for the RunTiming binding returned by Worker.run / run_prepared. + +Pure-Python unit tests — no device, no kernels. Verifies that the nanobind +bound class behaves as documented at the boundary so callers (benchmark +scripts, perf dashboards) can rely on the contract without spinning up a +worker. +""" + +import inspect + +import pytest +from _task_interface import RunTiming # pyright: ignore[reportMissingImports] + + +class TestRunTimingConstruction: + def test_default_constructor_zeros(self): + t = RunTiming() + assert t.host_wall_ns == 0 + assert t.device_wall_ns == 0 + assert t.host_wall_us == 0.0 + assert t.device_wall_us == 0.0 + + def test_explicit_values(self): + t = RunTiming(1500, 2500) + assert t.host_wall_ns == 1500 + assert t.device_wall_ns == 2500 + + def test_device_wall_defaults_to_zero(self): + # The (host_ns, device_ns=0) constructor is the path Worker.run uses + # for L3+ DAGs, where per-task device cycles aren't aggregated yet. + t = RunTiming(9999) + assert t.host_wall_ns == 9999 + assert t.device_wall_ns == 0 + + +class TestRunTimingUnitConversion: + def test_ns_to_us_host(self): + assert RunTiming(1_000, 0).host_wall_us == pytest.approx(1.0) + assert RunTiming(1_500_000, 0).host_wall_us == pytest.approx(1500.0) + + def test_ns_to_us_device(self): + assert RunTiming(0, 1_000).device_wall_us == pytest.approx(1.0) + assert RunTiming(0, 7_250).device_wall_us == pytest.approx(7.25) + + def test_us_returns_float(self): + t = RunTiming(1000, 2000) + assert isinstance(t.host_wall_us, float) + assert isinstance(t.device_wall_us, float) + + def test_ns_returns_int(self): + t = RunTiming(1000, 2000) + assert isinstance(t.host_wall_ns, int) + assert isinstance(t.device_wall_ns, int) + + +class TestRunTimingRepr: + def test_repr_includes_both_walls(self): + r = repr(RunTiming(1500, 2500)) + assert "RunTiming" in r + assert "host_wall_us=1.5" in r + assert "device_wall_us=2.5" in r + + def test_repr_zero(self): + r = repr(RunTiming()) + assert "host_wall_us=0" in r + assert "device_wall_us=0" in r + + +class TestRunTimingExports: + """Guard against the symbol getting dropped from the public surface — it's + re-exported from simpler.worker so end-user code can write + `from simpler.worker import RunTiming`. + """ + + def test_exported_from_task_interface_module(self): + import _task_interface # noqa: PLC0415 + + assert hasattr(_task_interface, "RunTiming") + assert _task_interface.RunTiming is RunTiming + + def test_exported_from_simpler_worker(self): + from simpler.worker import RunTiming as WorkerRunTiming # noqa: PLC0415 + + assert WorkerRunTiming is RunTiming + + +class TestWorkerRunSignature: + """Worker.run signature contract: documented to return a RunTiming. + We can't actually drive a worker without a device, but we can inspect the + wrapper signature + docstring to catch accidental drift (e.g. someone + reverting the return to None). + """ + + def test_worker_run_return_annotation(self): + from simpler.worker import Worker # noqa: PLC0415 + + sig = inspect.signature(Worker.run) + # The annotation is the RunTiming class itself (PEP 484 style). + assert sig.return_annotation is RunTiming, ( + f"Worker.run lost its RunTiming return annotation; got {sig.return_annotation!r}" + ) + + def test_chip_worker_wrapper_returns_run_timing(self): + # The simpler.task_interface.ChipWorker wrapper is what + # Worker._chip_worker uses internally; it must forward the return + # rather than swallowing it. + from simpler.task_interface import ChipWorker # noqa: PLC0415 # pyright: ignore[reportAttributeAccessIssue] + + src = inspect.getsource(ChipWorker.run) + # Surgical drift guard: the wrapper must `return self._impl.run(...)` + # rather than calling it for side effects only. If someone reverts to + # the side-effect-only form, RunTiming would silently become None for + # every caller without anything else breaking. + assert "return self._impl.run" in src diff --git a/tools/benchmark_rounds.sh b/tools/benchmark_rounds.sh index 84fc7486e..dc29ef329 100755 --- a/tools/benchmark_rounds.sh +++ b/tools/benchmark_rounds.sh @@ -7,8 +7,10 @@ # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. # See LICENSE in the root of the software repository for the full text of the License. # ----------------------------------------------------------------------------------------------------------- -# Benchmark wrapper: run examples on hardware, -# then parse device-log timing lines to report per-round latency. +# Benchmark wrapper: run examples on hardware, then parse per-round latency +# emitted by the test framework directly (see simpler_setup/scene_test.py:: +# _log_round_timings). Worker.run now returns a RunTiming struct so we no +# longer scrape device logs for sched/orch timestamps. # # Usage: # ./tools/benchmark_rounds.sh [-p ] [-d ] [-n ] [-r ] @@ -22,11 +24,8 @@ PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" # --------------------------------------------------------------------------- # Examples to benchmark and their case lists, per runtime. -# Key = directory name under tests/st/// -# Value = comma-separated case names to run (empty string = run DEFAULT_CASE) # --------------------------------------------------------------------------- -# --- tensormap_and_ringbuffer --- declare -A TMR_EXAMPLE_CASES=( [alternating_matmul_add]="Case1" [benchmark_bgemm]="Case0" @@ -44,9 +43,6 @@ TMR_EXAMPLE_ORDER=( spmd_paged_attention ) -# --------------------------------------------------------------------------- -# Parse arguments -# --------------------------------------------------------------------------- DEVICE_ID=0 ROUNDS=100 PLATFORM=a2a3 @@ -56,62 +52,33 @@ EXTRA_ARGS=() while [[ $# -gt 0 ]]; do case "$1" in - -p|--platform) - PLATFORM="$2" - shift 2 - ;; - -d|--device) - DEVICE_ID="$2" - shift 2 - ;; - -n|--rounds) - ROUNDS="$2" - shift 2 - ;; - -r|--runtime) - RUNTIME="$2" - shift 2 - ;; - -v|--verbose) - VERBOSE=1 - shift - ;; + -p|--platform) PLATFORM="$2"; shift 2 ;; + -d|--device) DEVICE_ID="$2"; shift 2 ;; + -n|--rounds) ROUNDS="$2"; shift 2 ;; + -r|--runtime) RUNTIME="$2"; shift 2 ;; + -v|--verbose) VERBOSE=1; shift ;; --help|-h) cat <<'USAGE' -benchmark_rounds.sh — run all examples and report per-round timing from device logs - -Usage: - ./tools/benchmark_rounds.sh [-p ] [-d ] [-n ] [-r ] [-v] +benchmark_rounds.sh — run examples and report per-round host/device wall +emitted directly by Worker.run(). Options: - -p, --platform Platform to run on (default: a2a3) - -d, --device Device ID (default: 0) - -n, --rounds Override number of rounds for each example (default: 100) - -r, --runtime Runtime to benchmark: tensormap_and_ringbuffer (default) - -v, --verbose Save detailed test_*.py output to a timestamped log file - -h, --help Show this help - -All other options are passed through to the underlying `python test_*.py` -invocation (e.g. --case). - -Edit the EXAMPLE_CASES map at the top of this script to control which -examples and cases to benchmark. - -Output: - Average elapsed time in microseconds for each example. + -p, --platform Platform (default: a2a3) + -d, --device Device ID (default: 0) + -n, --rounds Rounds per example (default: 100) + -r, --runtime Runtime (default: tensormap_and_ringbuffer) + -v, --verbose Save test output to a timestamped log + -h, --help Show this help + +Device-wall numbers are zero only when the runtime is built without +PTO2_PROFILING. The default build has it on, so the device column is +populated for every round regardless of --enable-l2-swimlane. USAGE - exit 0 - ;; - *) - EXTRA_ARGS+=("$1") - shift - ;; + exit 0 ;; + *) EXTRA_ARGS+=("$1"); shift ;; esac done -# --------------------------------------------------------------------------- -# Verbose logging setup -# --------------------------------------------------------------------------- VERBOSE_LOG="" if [[ $VERBOSE -eq 1 ]]; then mkdir -p "$PROJECT_ROOT/outputs" @@ -119,357 +86,95 @@ if [[ $VERBOSE -eq 1 ]]; then echo "Verbose log: $VERBOSE_LOG" fi -vlog() { - if [[ -n "$VERBOSE_LOG" ]]; then - echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >> "$VERBOSE_LOG" - fi -} - -# --------------------------------------------------------------------------- -# Derive arch from platform and set examples directories -# --------------------------------------------------------------------------- -# Search both examples/ (migrated tests) and tests/st/ (legacy tests) -ARCH="${PLATFORM%%sim}" # strip "sim" suffix if present +ARCH="${PLATFORM%%sim}" EXAMPLES_DIRS=( "$PROJECT_ROOT/tests/st/${ARCH}/${RUNTIME}" "$PROJECT_ROOT/examples/${ARCH}/${RUNTIME}" ) -# Clock frequency (MHz) for converting cycle counts to microseconds -case "$PLATFORM" in - a2a3) FREQ=50 ;; - a5) FREQ=1000 ;; - *) echo "ERROR: unsupported platform '$PLATFORM'. Use a2a3 or a5."; exit 1 ;; -esac - -# Select example cases and order based on runtime case "$RUNTIME" in tensormap_and_ringbuffer) declare -n EXAMPLE_CASES=TMR_EXAMPLE_CASES EXAMPLE_ORDER=("${TMR_EXAMPLE_ORDER[@]}") ;; *) - echo "ERROR: unknown runtime '$RUNTIME'. Use tensormap_and_ringbuffer." - exit 1 - ;; + echo "ERROR: unknown runtime '$RUNTIME'." >&2 + exit 1 ;; esac # --------------------------------------------------------------------------- -# Resolve CANN device log directory: $ASCEND_WORK_PATH/log/debug or ~/ascend/log/debug +# Parse "Avg Host: X.X us | Avg Device: Y.Y us (N rounds)" emitted by +# _log_round_timings. Returns "host_avg device_avg" (device may be "-"). # --------------------------------------------------------------------------- -if [[ -n "${ASCEND_WORK_PATH:-}" ]]; then - LOG_ROOT="$ASCEND_WORK_PATH/log/debug" - if [[ ! -d "$LOG_ROOT" ]]; then - LOG_ROOT="$HOME/ascend/log/debug" - fi -else - LOG_ROOT="$HOME/ascend/log/debug" -fi -DEVICE_LOG_DIR="$LOG_ROOT/device-${DEVICE_ID}" - -# --------------------------------------------------------------------------- -# parse_timing -# Grep for orch_start / end lines, compute per-round elapsed, print summary. -# --------------------------------------------------------------------------- -parse_timing() { - local log_file="$1" - - local timing - timing=$(grep -E 'Thread [0-9]+: (sched_start|orch_start|orch_end|sched_end|orch_stage_end)' "$log_file" || true) - - if [[ -z "$timing" ]]; then - echo " (no benchmark timing data — was PTO2_PROFILING enabled?)" - return 1 - fi - - echo "$timing" | awk -v freq="$FREQ" ' - function new_round() { - flush_round() - round++ - min_start = 0; max_end = 0 - min_sched_start = 0; max_sched_end = 0 - min_orch_start = 0; max_orch_end = 0 - delete sched_seen - delete orch_seen - } - function flush_round() { - if (round >= 0 && max_end > 0 && min_start > 0) { - results[round] = (max_end - min_start) / freq - if (max_sched_end > 0 && min_sched_start > 0) - sched_results[round] = (max_sched_end - min_sched_start) / freq - if (max_orch_end > 0 && min_orch_start > 0) - orch_results[round] = (max_orch_end - min_orch_start) / freq - count++ - } - } - BEGIN { - round = 0; count = 0 - min_start = 0; max_end = 0 - min_sched_start = 0; max_sched_end = 0 - min_orch_start = 0; max_orch_end = 0 - has_sched = 0; has_orch_end = 0 - } - /sched_start=/ { - match($0, /Thread ([0-9]+):/, tm) - tid = tm[1] + 0 - if (tid in sched_seen) new_round() - sched_seen[tid] = 1 - has_sched = 1 - match($0, /sched_start=([0-9]+)/, m) - val = m[1] + 0 - if (min_sched_start == 0 || val < min_sched_start) min_sched_start = val - if (min_start == 0 || val < min_start) min_start = val - } - /orch_start=/ { - match($0, /Thread ([0-9]+):/, tm) - tid = tm[1] + 0 - if (tid in orch_seen) new_round() - orch_seen[tid] = 1 - match($0, /orch_start=([0-9]+)/, m) - val = m[1] + 0 - if (min_orch_start == 0 || val < min_orch_start) min_orch_start = val - if (min_start == 0 || val < min_start) min_start = val - } - /sched_end[^=]*=/ { - match($0, /sched_end[^=]*=([0-9]+)/, m) - val = m[1] + 0 - if (val > max_sched_end) max_sched_end = val - if (val > max_end) max_end = val - } - /orch_end=/ { - match($0, /orch_end=([0-9]+)/, m) - val = m[1] + 0 - has_orch_end = 1 - if (val > max_orch_end) max_orch_end = val - if (val > max_end) max_end = val - } - /orch_stage_end=/ { - match($0, /orch_stage_end=([0-9]+)/, m) - val = m[1] + 0 - if (val > max_end) max_end = val - } - END { - flush_round() - if (count == 0) { print " (no rounds parsed)"; exit 1 } - - show_sched = has_sched - show_orch = has_orch_end - - # Header - hdr = sprintf(" %-8s %12s", "Round", "Elapsed (us)") - sep = sprintf(" %-8s %12s", "-----", "------------") - if (show_sched) { hdr = hdr sprintf(" %12s", "Sched (us)"); sep = sep sprintf(" %12s", "----------") } - if (show_orch) { hdr = hdr sprintf(" %12s", "Orch (us)"); sep = sep sprintf(" %12s", "---------") } - print hdr; print sep - - sum_v = 0; min_v = results[0]; max_v = results[0] - sum_s = 0; min_s = sched_results[0]; max_s = sched_results[0] - sum_o = 0; min_o = orch_results[0]; max_o = orch_results[0] - - for (i = 0; i < count; i++) { - line = sprintf(" %-8d %12.1f", i, results[i]) - sum_v += results[i] - if (results[i] < min_v) min_v = results[i] - if (results[i] > max_v) max_v = results[i] - if (show_sched) { - line = line sprintf(" %12.1f", sched_results[i]) - sum_s += sched_results[i] - if (sched_results[i] < min_s) min_s = sched_results[i] - if (sched_results[i] > max_s) max_s = sched_results[i] - } - if (show_orch) { - line = line sprintf(" %12.1f", orch_results[i]) - sum_o += orch_results[i] - if (orch_results[i] < min_o) min_o = orch_results[i] - if (orch_results[i] > max_o) max_o = orch_results[i] - } - print line - } - - printf "\n Avg: %.1f us", sum_v / count - if (show_sched) printf " | Sched Avg: %.1f us", sum_s / count - if (show_orch) printf " | Orch Avg: %.1f us", sum_o / count - printf " (%d rounds)\n", count - - TRIM = 10 - if (count > 2 * TRIM) { - # Insertion sort for each metric - for (i = 0; i < count; i++) sv[i] = results[i] - for (i = 1; i < count; i++) { - k = sv[i]; j = i - 1 - while (j >= 0 && sv[j] > k) { sv[j+1] = sv[j]; j-- } - sv[j+1] = k - } - tc = count - 2 * TRIM; ts = 0 - for (i = TRIM; i < count - TRIM; i++) ts += sv[i] - printf " Trimmed Avg: %.1f us (dropped %d low + %d high, %d rounds used)\n", ts / tc, TRIM, TRIM, tc - - if (show_sched) { - for (i = 0; i < count; i++) ss[i] = sched_results[i] - for (i = 1; i < count; i++) { - k = ss[i]; j = i - 1 - while (j >= 0 && ss[j] > k) { ss[j+1] = ss[j]; j-- } - ss[j+1] = k - } - ts2 = 0 - for (i = TRIM; i < count - TRIM; i++) ts2 += ss[i] - printf " Sched Trimmed Avg: %.1f us (dropped %d low + %d high)\n", ts2 / tc, TRIM, TRIM - } - if (show_orch) { - for (i = 0; i < count; i++) so[i] = orch_results[i] - for (i = 1; i < count; i++) { - k = so[i]; j = i - 1 - while (j >= 0 && so[j] > k) { so[j+1] = so[j]; j-- } - so[j+1] = k - } - ts3 = 0 - for (i = TRIM; i < count - TRIM; i++) ts3 += so[i] - printf " Orch Trimmed Avg: %.1f us (dropped %d low + %d high)\n", ts3 / tc, TRIM, TRIM - } - } - }' -} - -# --------------------------------------------------------------------------- -# wait_for_new_log -# Wait up to 15s for a new .log file in DEVICE_LOG_DIR. Prints the path. -# --------------------------------------------------------------------------- -wait_for_new_log() { - local pre_file="$1" - local new_log="" - local deadline=$((SECONDS + 15)) - - while [[ $SECONDS -lt $deadline ]]; do - if [[ -d "$DEVICE_LOG_DIR" ]]; then - new_log=$(comm -13 "$pre_file" <(ls -1 "$DEVICE_LOG_DIR"/*.log 2>/dev/null | sort) 2>/dev/null | tail -1 || true) - if [[ -n "$new_log" ]]; then - echo "$new_log" - return 0 - fi - fi - sleep 0.5 - done - - # Fallback: newest log - if [[ -d "$DEVICE_LOG_DIR" ]]; then - new_log=$(ls -t "$DEVICE_LOG_DIR"/*.log 2>/dev/null | head -1 || true) - if [[ -n "$new_log" ]]; then - echo "$new_log" - return 0 - fi - fi - return 1 +parse_avgs() { + local output="$1" + local host_avg device_avg + host_avg=$(echo "$output" | grep -oE 'Avg Host: [0-9.]+' | tail -1 | awk '{print $3}') + device_avg=$(echo "$output" | grep -oE 'Avg Device: [0-9.]+' | tail -1 | awk '{print $3}') + [[ -z "$host_avg" ]] && host_avg="-" + [[ -z "$device_avg" ]] && device_avg="-" + echo "$host_avg $device_avg" } -# --------------------------------------------------------------------------- -# run_bench [case_name] -# Run one benchmark invocation (via `python test_*.py`) and parse timing -# from the resulting log. Skips the example if it has no test_*.py. -# Sets global PASS / FAIL counters. -# --------------------------------------------------------------------------- run_bench() { local example="$1" example_dir="$2" case_name="${3:-}" + [[ -n "$case_name" ]] && echo " ---- $case_name ----" - if [[ -n "$case_name" ]]; then - echo " ---- $case_name ----" - fi - - # Snapshot existing logs - local pre_log_file - pre_log_file=$(mktemp) - trap 'rm -f -- "$pre_log_file"' RETURN - ls -1 "$DEVICE_LOG_DIR"/*.log 2>/dev/null | sort > "$pre_log_file" || true - - # Build run command using test_*.py local test_file test_file=$(find "$example_dir" -maxdepth 1 -name 'test_*.py' -print -quit 2>/dev/null || true) - - local run_cmd - if [[ -n "$test_file" ]]; then - run_cmd=( - python3 "$test_file" - --platform "$PLATFORM" --device "$DEVICE_ID" - --rounds "$ROUNDS" --skip-golden - ) - else + if [[ -z "$test_file" ]]; then echo " SKIPPED: no test_*.py found in $example_dir" return fi + + local run_cmd=( + python3 "$test_file" + --platform "$PLATFORM" --device "$DEVICE_ID" + --rounds "$ROUNDS" --skip-golden + ) if [[ -n "$case_name" ]]; then - run_cmd+=(--case "$case_name") - [[ -n "$test_file" ]] && run_cmd+=(--manual include) + run_cmd+=(--case "$case_name" --manual include) fi run_cmd+=("${EXTRA_ARGS[@]}") - # Run example - vlog "Running: ${run_cmd[*]}" - local rc=0 + local output rc=0 + output=$("${run_cmd[@]}" 2>&1) || rc=$? if [[ -n "$VERBOSE_LOG" ]]; then - local run_output - run_output=$("${run_cmd[@]}" 2>&1) || rc=$? - if [[ -n "$run_output" ]]; then echo "$run_output" >> "$VERBOSE_LOG"; fi - else - "${run_cmd[@]}" > /dev/null 2>&1 || rc=$? + echo "==== $example ${case_name} ====" >> "$VERBOSE_LOG" + echo "$output" >> "$VERBOSE_LOG" fi if [[ $rc -ne 0 ]]; then - echo " FAILED: benchmark run returned non-zero" - vlog "FAILED: exit code $rc" + echo " FAILED: exit code $rc" ((FAIL++)) || true return fi - # Find new device log - local new_log - new_log=$(wait_for_new_log "$pre_log_file") + local avgs host_avg device_avg + avgs=$(parse_avgs "$output") + host_avg=$(echo "$avgs" | awk '{print $1}') + device_avg=$(echo "$avgs" | awk '{print $2}') - if [[ -z "$new_log" ]]; then - echo " FAILED: no device log found in $DEVICE_LOG_DIR" + if [[ "$host_avg" == "-" ]]; then + echo " (no per-round timing — was --rounds > 1?)" ((FAIL++)) || true return fi - echo " Log: $new_log" - local timing_output - local parse_rc=0 - timing_output=$(parse_timing "$new_log") || parse_rc=$? - echo "$timing_output" - - if [[ $parse_rc -ne 0 ]]; then - ((FAIL++)) || true - return - fi + echo " Avg Host: $host_avg us | Avg Device: $device_avg us ($ROUNDS rounds)" ((PASS++)) || true - # Extract averages for summary table local label="$example" [[ -n "$case_name" ]] && label="$example ($case_name)" - - local avg_line - avg_line=$(echo "$timing_output" | grep "^ Avg:" || true) - local avg_elapsed="-" avg_sched="-" avg_orch="-" - if [[ -n "$avg_line" ]]; then - avg_elapsed=$(echo "$avg_line" | awk '{print $2}') - avg_sched=$(echo "$avg_line" | grep -o 'Sched Avg: [0-9.]*' | awk '{print $3}') || avg_sched="-" - avg_orch=$(echo "$avg_line" | grep -o 'Orch Avg: [0-9.]*' | awk '{print $3}') || avg_orch="-" - fi - SUMMARY_NAMES+=("$label") - SUMMARY_ELAPSED+=("$avg_elapsed") - SUMMARY_SCHED+=("$avg_sched") - SUMMARY_ORCH+=("$avg_orch") + SUMMARY_HOST+=("$host_avg") + SUMMARY_DEVICE+=("$device_avg") } -# --------------------------------------------------------------------------- -# Main loop -# --------------------------------------------------------------------------- PASS=0 FAIL=0 - -# Summary collection arrays SUMMARY_NAMES=() -SUMMARY_ELAPSED=() -SUMMARY_SCHED=() -SUMMARY_ORCH=() +SUMMARY_HOST=() +SUMMARY_DEVICE=() echo "" echo "Runtime: $RUNTIME" @@ -477,25 +182,13 @@ echo "Runtime: $RUNTIME" for example in "${EXAMPLE_ORDER[@]}"; do case_list="${EXAMPLE_CASES[$example]:-}" - # Search for example: prefer test_*.py (new style), fall back to golden.py (legacy). - # tests/st/ is searched before examples/ since benchmarks use production-scale cases. EXAMPLE_DIR="" for dir in "${EXAMPLES_DIRS[@]}"; do candidate="$dir/$example" if [[ -d "$candidate" ]] && ls "$candidate"/test_*.py 1>/dev/null 2>&1; then - EXAMPLE_DIR="$candidate" - break + EXAMPLE_DIR="$candidate"; break fi done - if [[ -z "$EXAMPLE_DIR" ]]; then - for dir in "${EXAMPLES_DIRS[@]}"; do - candidate="$dir/$example" - if [[ -f "$candidate/golden.py" && -d "$candidate/kernels" ]]; then - EXAMPLE_DIR="$candidate" - break - fi - done - fi echo "" echo "================================================================" @@ -503,7 +196,7 @@ for example in "${EXAMPLE_ORDER[@]}"; do echo "================================================================" if [[ -z "$EXAMPLE_DIR" ]]; then - echo " SKIP: not found in any search directory" + echo " SKIP: not found" ((FAIL++)) || true continue fi @@ -518,54 +211,19 @@ for example in "${EXAMPLE_ORDER[@]}"; do fi done -# --------------------------------------------------------------------------- -# Performance Summary Table -# --------------------------------------------------------------------------- if [[ ${#SUMMARY_NAMES[@]} -gt 0 ]]; then - # Check if any sched/orch data exists across all runs - _has_sched=0 - _has_orch=0 - for _i in "${!SUMMARY_NAMES[@]}"; do - [[ "${SUMMARY_SCHED[$_i]}" != "-" ]] && _has_sched=1 - [[ "${SUMMARY_ORCH[$_i]}" != "-" ]] && _has_orch=1 - done - echo "" echo "================================================================" echo " Performance Summary ($RUNTIME)" echo "================================================================" echo "" - - # Header - _hdr=$(printf " %-40s %12s" "Example" "Elapsed (us)") - _sep=$(printf " %-40s %12s" "----------------------------------------" "------------") - if [[ $_has_sched -eq 1 ]]; then - _hdr=$(printf "%s %12s" "$_hdr" "Sched (us)") - _sep=$(printf "%s %12s" "$_sep" "------------") - fi - if [[ $_has_orch -eq 1 ]]; then - _hdr=$(printf "%s %12s" "$_hdr" "Orch (us)") - _sep=$(printf "%s %12s" "$_sep" "------------") - fi - echo "$_hdr" - echo "$_sep" - - # Rows - for _i in "${!SUMMARY_NAMES[@]}"; do - _row=$(printf " %-40s %12s" "${SUMMARY_NAMES[$_i]}" "${SUMMARY_ELAPSED[$_i]}") - if [[ $_has_sched -eq 1 ]]; then - _row=$(printf "%s %12s" "$_row" "${SUMMARY_SCHED[$_i]}") - fi - if [[ $_has_orch -eq 1 ]]; then - _row=$(printf "%s %12s" "$_row" "${SUMMARY_ORCH[$_i]}") - fi - echo "$_row" + printf " %-40s %12s %12s\n" "Example" "Host (us)" "Device (us)" + printf " %-40s %12s %12s\n" "----------------------------------------" "------------" "------------" + for i in "${!SUMMARY_NAMES[@]}"; do + printf " %-40s %12s %12s\n" "${SUMMARY_NAMES[$i]}" "${SUMMARY_HOST[$i]}" "${SUMMARY_DEVICE[$i]}" done fi -# --------------------------------------------------------------------------- -# Summary -# --------------------------------------------------------------------------- TOTAL=$((PASS + FAIL)) echo "" echo "================================================================"