From c378e9d5fedef1105d74b66eafd394b7106e64cd Mon Sep 17 00:00:00 2001 From: Chao Wang <26245345+ChaoWao@users.noreply.github.com> Date: Sat, 30 May 2026 18:41:34 +0800 Subject: [PATCH] Refactor: AICPU fills AICore rotation table (move from host) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before: host computed each per-core `&L2PerfAicoreBufferState::rotation` device address (host-to-device translation against `perf_dev_ptr`) and wrote the resulting uint64_t into the `aicore_ring_addr` table inside `L2PerfCollector::initialize`. This coupled host to the AICore-side shared-memory layout — it had to reach into `get_aicore_buffer_state` and translate addresses for a table whose entries are pure device- internal pointers (only AICore reads them). After: AICPU fills the table inside `l2_perf_aicpu_init` reading `&ac_state->rotation` directly (no translation needed — it's already on the device). Host's job shrinks to "alloc the table bytes and hand the device pointer to AICPU via `KernelArgs::aicore_ring_addr`". To make this safe, AICore now defers reading the table until the first task is dispatched (AICPU writes the dispatch register only after init has completed, so by the time AICore reaches into the table, the entries are populated): - AICore kernel entry stashes the *slot pointer* (`&table[block_idx]`) via the new `set_aicore_rotation_slot`, NOT the dereferenced rotation address. Old `set_aicore_rotation` is removed. - `get_aicore_rotation()` lazily dereferences the slot on first call and caches the result. - aicore_executor moves the `get_aicore_rotation()` call from pre-loop init into the per-task branch (still amortized to 0 after first task since the getter caches). Without this deferral, AICore's kernel-entry deref would race with AICPU's `l2_perf_aicpu_init` table fill — observed as a hardware-only `aclrtSynchronizeStreamWithTimeout (AICPU) failed: 507018` because AICore stashed a garbage rotation pointer at entry and crashed when later dcci'ing it. The plumbing follows the existing `set_platform_l2_perf_base` pattern: - New `set_platform_aicore_rotation_table` / `get_*` setters on AICPU - On onboard: AICPU `kernel.cpp` calls the setter from `k_args` - On sim: host dlsyms the setter and calls it before triggering AICPU a2a3 only — a5 uses an independent dev alloc for each AICore ring (`state->aicore_ring_ptr`), so the host already has the dev address and the translation issue doesn't exist there. Tested: a2a3 + a2a3sim build clean; tests/st/.../l2_swimlane pass on both a2a3sim and a2a3 device 1 (test_l2_swimlane + test_l2_swimlane_mixed). --- .../include/aicore/aicore_profiling_state.h | 38 +++++++++++++------ .../include/aicpu/l2_perf_collector_aicpu.h | 10 +++++ src/a2a3/platform/onboard/aicore/kernel.cpp | 27 +++++++++++-- src/a2a3/platform/onboard/aicpu/kernel.cpp | 1 + src/a2a3/platform/sim/aicore/kernel.cpp | 25 +++++++++--- src/a2a3/platform/sim/host/device_runner.cpp | 9 +++++ src/a2a3/platform/sim/host/device_runner.h | 1 + .../src/aicpu/l2_perf_collector_aicpu.cpp | 23 +++++++++++ .../platform/src/host/l2_perf_collector.cpp | 25 ++++-------- .../aicore/aicore_executor.cpp | 9 ++++- .../aicore/aicore_executor.cpp | 20 +++++++--- 11 files changed, 144 insertions(+), 44 deletions(-) diff --git a/src/a2a3/platform/include/aicore/aicore_profiling_state.h b/src/a2a3/platform/include/aicore/aicore_profiling_state.h index 0a4f6b6fd..b41a60dbb 100644 --- a/src/a2a3/platform/include/aicore/aicore_profiling_state.h +++ b/src/a2a3/platform/include/aicore/aicore_profiling_state.h @@ -25,12 +25,18 @@ * * Lifecycle: * 1. Host fills `KernelArgs::enable_profiling_flag` and - * `KernelArgs::aicore_ring_addr` (now points to a per-core - * `AicoreRotation` table). - * 2. AICore kernel entry indexes `aicore_ring_addr[block_idx]` for this - * core's `AicoreRotation*` and calls `set_aicore_profiling_flag()` + - * `set_aicore_rotation()` before invoking `aicore_execute`. - * 3. `aicore_execute` and downstream profiling helpers read via getters. + * `KernelArgs::aicore_ring_addr` (points to a per-core `AicoreRotation` + * device-address table). Host allocates the table bytes; AICPU populates + * the entries inside `l2_perf_aicpu_init`. + * 2. AICore kernel entry stashes `&aicore_ring_addr[block_idx]` (the slot + * pointer — NOT the dereferenced rotation pointer yet) via + * `set_aicore_rotation_slot()`, and calls `set_aicore_profiling_flag()`, + * before invoking `aicore_execute`. + * 3. `get_aicore_rotation()` lazily dereferences the slot the first time + * it is called. Callers must defer the call until AFTER AICPU has + * dispatched the first task (so AICPU init has had a chance to populate + * the table). The executor handles this by calling it inside the main + * loop's first-task branch. */ #ifndef PLATFORM_AICORE_AICORE_PROFILING_STATE_H_ @@ -50,12 +56,22 @@ __aicore__ void set_aicore_profiling_flag(uint32_t flag); __aicore__ uint32_t get_aicore_profiling_flag(); /** - * Per-core AICore rotation channel. Set once at kernel entry from - * `((uint64_t*)k_args->aicore_ring_addr)[block_idx]`; nullptr when the L2 - * swimlane bit is off or the address table itself is null. AICore reads - * this cache line per task to pick up the current L2PerfAicoreBuffer. + * Per-core AICore rotation channel. + * + * `set_aicore_rotation_slot(slot)` stashes the address of THIS core's slot + * in the rotation-address table — `&((uint64_t*)k_args->aicore_ring_addr)[block_idx]`. + * No dereference happens here, because at kernel entry the AICPU side may + * not yet have populated the table (the host launches both kernels and + * AICPU's init runs concurrently with AICore's entry). + * + * `get_aicore_rotation()` lazily dereferences the stashed slot on first use, + * caches the result, and returns it on subsequent calls. Callers MUST defer + * the first call until after AICPU has dispatched the first task — by then + * AICPU's init has completed and the slot holds a valid device address. + * The executor's main loop honours this by reading the rotation only inside + * the first-task branch of the dispatch poll. */ -__aicore__ void set_aicore_rotation(__gm__ AicoreRotation *rotation); +__aicore__ void set_aicore_rotation_slot(__gm__ uint64_t *slot_ptr); __aicore__ __gm__ AicoreRotation *get_aicore_rotation(); #endif // PLATFORM_AICORE_AICORE_PROFILING_STATE_H_ diff --git a/src/a2a3/platform/include/aicpu/l2_perf_collector_aicpu.h b/src/a2a3/platform/include/aicpu/l2_perf_collector_aicpu.h index 341b426fd..03465e02b 100644 --- a/src/a2a3/platform/include/aicpu/l2_perf_collector_aicpu.h +++ b/src/a2a3/platform/include/aicpu/l2_perf_collector_aicpu.h @@ -46,6 +46,16 @@ extern "C" uint64_t get_platform_l2_perf_base(); extern "C" void set_l2_swimlane_enabled(bool enable); extern "C" bool is_l2_swimlane_enabled(); +// AICore rotation-table device pointer (= KernelArgs::aicore_ring_addr). +// Published by the host before AICPU init runs; AICPU init fills the table +// with the per-core `&L2PerfAicoreBufferState::rotation` device addresses so +// AICore can index `aicore_ring_addr[block_idx]` to find its rotation channel. +// Moved from host into AICPU so the host stays decoupled from the AICore-side +// shared-memory layout (host previously did host-to-device address translation +// + reached into get_aicore_buffer_state to fill this). +extern "C" void set_platform_aicore_rotation_table(uint64_t table_addr); +extern "C" uint64_t get_platform_aicore_rotation_table(); + // Typed getter for the granular perf_level (promoted from the shared-memory // header inside l2_perf_aicpu_init). Gate sites should use this so the // comparison RHS is a named L2PerfLevel constant. diff --git a/src/a2a3/platform/onboard/aicore/kernel.cpp b/src/a2a3/platform/onboard/aicore/kernel.cpp index dff633920..e87cb81ef 100644 --- a/src/a2a3/platform/onboard/aicore/kernel.cpp +++ b/src/a2a3/platform/onboard/aicore/kernel.cpp @@ -43,15 +43,28 @@ // linker dedup the otherwise-duplicate symbol definitions across the two // compilation units. [[block_local]] static uint32_t s_aicore_profiling_flag; +// Slot pointer (NOT the dereferenced rotation address) — see +// aicore_profiling_state.h for the lazy-deref contract. +[[block_local]] static __gm__ uint64_t *s_aicore_rotation_slot; [[block_local]] static __gm__ AicoreRotation *s_aicore_rotation; __attribute__((weak)) __aicore__ void set_aicore_profiling_flag(uint32_t flag) { s_aicore_profiling_flag = flag; } __attribute__((weak)) __aicore__ uint32_t get_aicore_profiling_flag() { return s_aicore_profiling_flag; } -__attribute__((weak)) __aicore__ void set_aicore_rotation(__gm__ AicoreRotation *rotation) { - s_aicore_rotation = rotation; +__attribute__((weak)) __aicore__ void set_aicore_rotation_slot(__gm__ uint64_t *slot_ptr) { + s_aicore_rotation_slot = slot_ptr; + s_aicore_rotation = nullptr; // force lazy resolution on next get +} +__attribute__((weak)) __aicore__ __gm__ AicoreRotation *get_aicore_rotation() { + // Lazy first-call resolve: AICPU init populates `*s_aicore_rotation_slot` + // before dispatching the first task, so by the time the executor reaches + // for the rotation (inside the first-task branch of the dispatch poll) + // the slot holds a valid device address. + if (s_aicore_rotation == nullptr && s_aicore_rotation_slot != nullptr) { + s_aicore_rotation = reinterpret_cast<__gm__ AicoreRotation *>(*s_aicore_rotation_slot); + } + return s_aicore_rotation; } -__attribute__((weak)) __aicore__ __gm__ AicoreRotation *get_aicore_rotation() { return s_aicore_rotation; } extern __aicore__ void aicore_execute(__gm__ Runtime *runtime, int block_idx, CoreType core_type); @@ -91,8 +104,14 @@ extern "C" __global__ __aicore__ void KERNEL_ENTRY(aicore_kernel)(__gm__ KernelA // get_aicore_rotation() — never touches Handshake for profiling. set_aicore_profiling_flag(k_args->enable_profiling_flag); if (GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_L2_SWIMLANE)) { + // Stash only the slot pointer. The slot CONTENTS are written by + // AICPU's `l2_perf_aicpu_init` which runs concurrently with this + // entry; dereferencing here would race with AICPU's write. The + // executor defers the deref via `get_aicore_rotation()` until inside + // the first-task branch — by then AICPU has dispatched, so init is + // done and the slot is populated. __gm__ uint64_t *rotation_table = reinterpret_cast<__gm__ uint64_t *>(k_args->aicore_ring_addr); - set_aicore_rotation(reinterpret_cast<__gm__ AicoreRotation *>(rotation_table[block_idx])); + set_aicore_rotation_slot(rotation_table != nullptr ? &rotation_table[block_idx] : nullptr); } aicore_execute(k_args->runtime_args, block_idx, core_type); diff --git a/src/a2a3/platform/onboard/aicpu/kernel.cpp b/src/a2a3/platform/onboard/aicpu/kernel.cpp index 291cafc88..7926fa0e7 100644 --- a/src/a2a3/platform/onboard/aicpu/kernel.cpp +++ b/src/a2a3/platform/onboard/aicpu/kernel.cpp @@ -110,6 +110,7 @@ extern "C" __attribute__((visibility("default"))) int simpler_aicpu_exec(void *a set_platform_dump_base(k_args->dump_data_base); set_dump_tensor_enabled(GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_DUMP_TENSOR)); set_platform_l2_perf_base(k_args->l2_perf_data_base); + set_platform_aicore_rotation_table(k_args->aicore_ring_addr); set_l2_swimlane_enabled(GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_L2_SWIMLANE)); set_platform_pmu_base(k_args->pmu_data_base); set_platform_pmu_reg_addrs(k_args->pmu_reg_addrs); diff --git a/src/a2a3/platform/sim/aicore/kernel.cpp b/src/a2a3/platform/sim/aicore/kernel.cpp index fb723b9a1..033682d2b 100644 --- a/src/a2a3/platform/sim/aicore/kernel.cpp +++ b/src/a2a3/platform/sim/aicore/kernel.cpp @@ -33,6 +33,9 @@ static pthread_key_t g_reg_base_key; static pthread_key_t g_core_id_key; static pthread_key_t g_aicore_profiling_flag_key; +// Slot pointer (NOT the dereferenced rotation address) — see +// aicore_profiling_state.h for the lazy-deref contract. +static pthread_key_t g_aicore_rotation_slot_key; static pthread_key_t g_aicore_rotation_key; static pthread_once_t g_tls_once = PTHREAD_ONCE_INIT; @@ -40,6 +43,7 @@ static void create_tls_keys() { pthread_key_create(&g_reg_base_key, nullptr); pthread_key_create(&g_core_id_key, nullptr); pthread_key_create(&g_aicore_profiling_flag_key, nullptr); + pthread_key_create(&g_aicore_rotation_slot_key, nullptr); pthread_key_create(&g_aicore_rotation_key, nullptr); } @@ -61,11 +65,19 @@ __aicore__ uint32_t get_aicore_profiling_flag() { return static_cast(reinterpret_cast(pthread_getspecific(g_aicore_profiling_flag_key))); } -__aicore__ void set_aicore_rotation(__gm__ AicoreRotation *rotation) { - pthread_setspecific(g_aicore_rotation_key, reinterpret_cast(rotation)); +__aicore__ void set_aicore_rotation_slot(__gm__ uint64_t *slot_ptr) { + pthread_setspecific(g_aicore_rotation_slot_key, reinterpret_cast(slot_ptr)); + pthread_setspecific(g_aicore_rotation_key, nullptr); // force lazy resolve on next get } __aicore__ __gm__ AicoreRotation *get_aicore_rotation() { - return reinterpret_cast<__gm__ AicoreRotation *>(pthread_getspecific(g_aicore_rotation_key)); + auto *cached = reinterpret_cast<__gm__ AicoreRotation *>(pthread_getspecific(g_aicore_rotation_key)); + if (cached != nullptr) return cached; + auto *slot = reinterpret_cast<__gm__ uint64_t *>(pthread_getspecific(g_aicore_rotation_slot_key)); + if (slot == nullptr) return nullptr; + // Lazy first-call resolve — see aicore_profiling_state.h. + cached = reinterpret_cast<__gm__ AicoreRotation *>(*slot); + pthread_setspecific(g_aicore_rotation_key, reinterpret_cast(cached)); + return cached; } // Core identity setter function pointers — set by DeviceRunner after dlopen. @@ -107,10 +119,13 @@ extern "C" void aicore_execute_wrapper( // Publish per-core profiling state before the executor runs. set_aicore_profiling_flag(enable_profiling_flag); if (aicore_ring_addr != 0) { + // Stash only the slot pointer; deref happens lazily inside + // get_aicore_rotation() once AICPU has populated the table. See + // aicore_profiling_state.h. uint64_t *rotation_table = reinterpret_cast(aicore_ring_addr); - set_aicore_rotation(reinterpret_cast<__gm__ AicoreRotation *>(rotation_table[block_idx])); + set_aicore_rotation_slot(reinterpret_cast<__gm__ uint64_t *>(&rotation_table[block_idx])); } else { - set_aicore_rotation(nullptr); + set_aicore_rotation_slot(nullptr); } // Set core identity for pto-isa TPUSH/TPOP simulation. diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp index 3de9e359b..a6a78bdc2 100644 --- a/src/a2a3/platform/sim/host/device_runner.cpp +++ b/src/a2a3/platform/sim/host/device_runner.cpp @@ -273,6 +273,13 @@ int DeviceRunner::ensure_binaries_loaded() { return -1; } + set_platform_aicore_rotation_table_func_ = + reinterpret_cast(dlsym(aicpu_so_handle_, "set_platform_aicore_rotation_table")); + if (set_platform_aicore_rotation_table_func_ == nullptr) { + LOG_ERROR("dlsym failed for set_platform_aicore_rotation_table: %s", dlerror()); + return -1; + } + set_l2_swimlane_enabled_func_ = reinterpret_cast(dlsym(aicpu_so_handle_, "set_l2_swimlane_enabled")); if (set_l2_swimlane_enabled_func_ == nullptr) { @@ -652,6 +659,7 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { set_platform_dump_base_func_(kernel_args_.dump_data_base); set_dump_tensor_enabled_func_(enable_dump_tensor_); set_platform_l2_perf_base_func_(kernel_args_.l2_perf_data_base); + set_platform_aicore_rotation_table_func_(kernel_args_.aicore_ring_addr); set_l2_swimlane_enabled_func_(enable_l2_swimlane_); set_platform_pmu_base_func_(kernel_args_.pmu_data_base); set_platform_pmu_reg_addrs_func_(kernel_args_.pmu_reg_addrs); @@ -844,6 +852,7 @@ void DeviceRunner::unload_executor_binaries() { set_platform_dump_base_func_ = nullptr; set_dump_tensor_enabled_func_ = nullptr; set_platform_l2_perf_base_func_ = nullptr; + set_platform_aicore_rotation_table_func_ = nullptr; set_l2_swimlane_enabled_func_ = nullptr; set_platform_pmu_base_func_ = nullptr; set_platform_pmu_reg_addrs_func_ = nullptr; diff --git a/src/a2a3/platform/sim/host/device_runner.h b/src/a2a3/platform/sim/host/device_runner.h index d0b29a765..f1a44e59b 100644 --- a/src/a2a3/platform/sim/host/device_runner.h +++ b/src/a2a3/platform/sim/host/device_runner.h @@ -382,6 +382,7 @@ class DeviceRunner { void (*set_platform_dump_base_func_)(uint64_t){nullptr}; void (*set_dump_tensor_enabled_func_)(bool){nullptr}; void (*set_platform_l2_perf_base_func_)(uint64_t){nullptr}; + void (*set_platform_aicore_rotation_table_func_)(uint64_t){nullptr}; void (*set_l2_swimlane_enabled_func_)(bool){nullptr}; void (*set_platform_pmu_base_func_)(uint64_t){nullptr}; void (*set_platform_pmu_reg_addrs_func_)(uint64_t){nullptr}; diff --git a/src/a2a3/platform/src/aicpu/l2_perf_collector_aicpu.cpp b/src/a2a3/platform/src/aicpu/l2_perf_collector_aicpu.cpp index c5a3c3143..7ab5b7498 100644 --- a/src/a2a3/platform/src/aicpu/l2_perf_collector_aicpu.cpp +++ b/src/a2a3/platform/src/aicpu/l2_perf_collector_aicpu.cpp @@ -73,10 +73,19 @@ static uint64_t g_platform_l2_perf_base = 0; static bool g_enable_l2_swimlane = false; static L2PerfLevel g_l2_perf_level = L2PerfLevel::DISABLED; +// AICore rotation-table device pointer (= KernelArgs::aicore_ring_addr). +// Published by the host (sim: dlsym'd setter; onboard: from k_args via the +// kernel entry); AICPU init walks it to fill per-core &rotation addresses. +static uint64_t g_platform_aicore_rotation_table = 0; + extern "C" void set_platform_l2_perf_base(uint64_t l2_perf_data_base) { g_platform_l2_perf_base = l2_perf_data_base; } extern "C" uint64_t get_platform_l2_perf_base() { return g_platform_l2_perf_base; } extern "C" void set_l2_swimlane_enabled(bool enable) { g_enable_l2_swimlane = enable; } extern "C" bool is_l2_swimlane_enabled() { return g_enable_l2_swimlane; } +extern "C" void set_platform_aicore_rotation_table(uint64_t table_addr) { + g_platform_aicore_rotation_table = table_addr; +} +extern "C" uint64_t get_platform_aicore_rotation_table() { return g_platform_aicore_rotation_table; } L2PerfLevel get_l2_perf_level() { return g_l2_perf_level; } /** @@ -132,6 +141,16 @@ void l2_perf_aicpu_init(int worker_count) { static_cast(g_l2_perf_level) ); + // Populate the per-core AicoreRotation device-address table. AICore reads + // `aicore_ring_addr[block_idx]` from KernelArgs to find its rotation + // channel; the table itself is host-allocated, but the entries are + // device-internal addresses (`&ac_state->rotation`) that the host would + // otherwise have to translate from host-mapped to device-mapped. AICPU + // already runs on the device, so it can write the addresses directly + // without any translation — that keeps the host side decoupled from the + // AICore shared-memory layout. + uint64_t *rotation_table = reinterpret_cast(g_platform_aicore_rotation_table); + // Pop first buffer from free_queue for each core for (int i = 0; i < worker_count; i++) { L2PerfBufferState *state = get_perf_buffer_state(l2_perf_base, i); @@ -140,6 +159,10 @@ void l2_perf_aicpu_init(int worker_count) { s_perf_buffer_states[i] = state; s_aicore_buffer_states[i] = ac_state; + if (rotation_table != nullptr) { + rotation_table[i] = reinterpret_cast(&ac_state->rotation); + } + // Pop first buffer from free_queue rmb(); uint32_t head = state->free_queue.head; diff --git a/src/a2a3/platform/src/host/l2_perf_collector.cpp b/src/a2a3/platform/src/host/l2_perf_collector.cpp index d9b978ec5..98c72a928 100644 --- a/src/a2a3/platform/src/host/l2_perf_collector.cpp +++ b/src/a2a3/platform/src/host/l2_perf_collector.cpp @@ -240,10 +240,13 @@ int L2PerfCollector::initialize( PLATFORM_PROF_BUFFERS_PER_CORE, PLATFORM_AICORE_BUFFERS_PER_CORE ); - // Step 5c: Standalone uint64_t[num_aicore] table holding per-core - // AicoreRotation device addresses (= &ac_state->rotation). AICore reads - // rotation_table[block_idx] via KernelArgs::aicore_ring_addr and feeds it - // into the platform's set_aicore_rotation(). + // Step 5c: Standalone uint64_t[num_aicore] table that will hold per-core + // AicoreRotation device addresses. Host only allocates the bytes and + // hands the device pointer to AICPU via KernelArgs::aicore_ring_addr; + // AICPU itself fills the entries inside `l2_perf_aicpu_init` (it has + // direct access to `&ac_state->rotation` device addresses, no + // host-to-device translation needed). AICore reads + // rotation_table[block_idx] at kernel entry. { size_t table_bytes = static_cast(num_aicore) * sizeof(uint64_t); void *rotation_table_host = nullptr; @@ -252,20 +255,6 @@ int L2PerfCollector::initialize( LOG_ERROR("Failed to allocate aicore_ring_addr (rotation) table (%zu bytes)", table_bytes); return -1; } - uint64_t *rotation_table = reinterpret_cast(rotation_table_host); - - // Compute the per-core device address of &state->rotation. We have - // the host-mapped shm region; the device equivalent is at the same - // offset from perf_dev_ptr. - auto host_to_dev = [&](void *host_addr) -> uint64_t { - uintptr_t offset = reinterpret_cast(host_addr) - reinterpret_cast(perf_host_ptr); - return reinterpret_cast(perf_dev_ptr) + offset; - }; - - for (int i = 0; i < num_aicore; i++) { - L2PerfAicoreBufferState *ac_state = get_aicore_buffer_state(perf_host_ptr, num_aicore, i); - rotation_table[i] = host_to_dev(&ac_state->rotation); - } aicore_ring_addr_table_dev_ = rotation_table_dev; } diff --git a/src/a2a3/runtime/host_build_graph/aicore/aicore_executor.cpp b/src/a2a3/runtime/host_build_graph/aicore/aicore_executor.cpp index ceb16a794..c66b11ed9 100644 --- a/src/a2a3/runtime/host_build_graph/aicore/aicore_executor.cpp +++ b/src/a2a3/runtime/host_build_graph/aicore/aicore_executor.cpp @@ -60,7 +60,9 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in bool pmu_enabled = GET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_PMU); // Per-core AicoreRotation channel; see tensormap_and_ringbuffer/.../aicore_executor.cpp. - __gm__ AicoreRotation *l2_perf_rotation = l2_perf_enabled ? get_aicore_rotation() : nullptr; + // Deferred until first task so AICPU's init has populated the rotation + // table (the dispatch itself proves init is done). + __gm__ AicoreRotation *l2_perf_rotation = nullptr; AicoreLocalState l2_perf_local = {nullptr, 0, 0}; volatile uint32_t task_id = AICPU_IDLE_TASK_ID; @@ -83,6 +85,11 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in uint32_t actual_task_id = task_id; write_reg(RegId::COND, MAKE_ACK_VALUE(actual_task_id)); + // First-task lazy resolve of the rotation channel. + if (l2_perf_enabled && l2_perf_rotation == nullptr) { + l2_perf_rotation = get_aicore_rotation(); + } + __gm__ Task *task_ptr = &(runtime->tasks[actual_task_id]); uint64_t start_time = get_sys_cnt_aicore(); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp index 25580ad9c..94e18b35e 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp @@ -102,11 +102,14 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in bool dump_tensor_enabled = GET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_DUMP_TENSOR); bool pmu_enabled = GET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_PMU); - // Per-core AicoreRotation channel is published once at kernel entry from - // KernelArgs::aicore_ring_addr. AICore reads it per task (cheap relative - // to the per-task dcci(payload, ENTIRE_DATA_CACHE)) to pick up the - // current L2PerfAicoreBuffer — see l2_perf_collector_aicore.h. - __gm__ AicoreRotation *l2_perf_rotation = l2_perf_enabled ? get_aicore_rotation() : nullptr; + // Per-core AicoreRotation channel. The pointer to THIS core's rotation + // is stored in `KernelArgs::aicore_ring_addr[block_idx]`, but AICPU + // populates that table inside `l2_perf_aicpu_init` which runs + // concurrently with this kernel's entry — so we cannot deref at startup. + // Defer the deref via `get_aicore_rotation()` until the first task is + // dispatched; by then AICPU's init has completed (the very dispatch is + // proof of that). + __gm__ AicoreRotation *l2_perf_rotation = nullptr; AicoreLocalState l2_perf_local = {nullptr, 0, 0}; // Phase 4: Main execution loop - poll register for tasks until exit signal @@ -131,6 +134,13 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in { uint32_t task_id = reg_val; // Decode: register holds task_id directly + // First-task lazy resolve of the rotation channel — see comment + // above. `get_aicore_rotation()` caches after first call so this + // costs nothing on subsequent tasks. + if (l2_perf_enabled && l2_perf_rotation == nullptr) { + l2_perf_rotation = get_aicore_rotation(); + } + // Select dual-buffer slot: same bit as AICPU used when writing payload __gm__ PTO2DispatchPayload *exec_payload = payload + (task_id & 1u);