diff --git a/src/a2a3/platform/include/aicore/aicore_profiling_state.h b/src/a2a3/platform/include/aicore/aicore_profiling_state.h index 0a4f6b6fd..b41a60dbb 100644 --- a/src/a2a3/platform/include/aicore/aicore_profiling_state.h +++ b/src/a2a3/platform/include/aicore/aicore_profiling_state.h @@ -25,12 +25,18 @@ * * Lifecycle: * 1. Host fills `KernelArgs::enable_profiling_flag` and - * `KernelArgs::aicore_ring_addr` (now points to a per-core - * `AicoreRotation` table). - * 2. AICore kernel entry indexes `aicore_ring_addr[block_idx]` for this - * core's `AicoreRotation*` and calls `set_aicore_profiling_flag()` + - * `set_aicore_rotation()` before invoking `aicore_execute`. - * 3. `aicore_execute` and downstream profiling helpers read via getters. + * `KernelArgs::aicore_ring_addr` (points to a per-core `AicoreRotation` + * device-address table). Host allocates the table bytes; AICPU populates + * the entries inside `l2_perf_aicpu_init`. + * 2. AICore kernel entry stashes `&aicore_ring_addr[block_idx]` (the slot + * pointer — NOT the dereferenced rotation pointer yet) via + * `set_aicore_rotation_slot()`, and calls `set_aicore_profiling_flag()`, + * before invoking `aicore_execute`. + * 3. `get_aicore_rotation()` lazily dereferences the slot the first time + * it is called. Callers must defer the call until AFTER AICPU has + * dispatched the first task (so AICPU init has had a chance to populate + * the table). The executor handles this by calling it inside the main + * loop's first-task branch. */ #ifndef PLATFORM_AICORE_AICORE_PROFILING_STATE_H_ @@ -50,12 +56,22 @@ __aicore__ void set_aicore_profiling_flag(uint32_t flag); __aicore__ uint32_t get_aicore_profiling_flag(); /** - * Per-core AICore rotation channel. Set once at kernel entry from - * `((uint64_t*)k_args->aicore_ring_addr)[block_idx]`; nullptr when the L2 - * swimlane bit is off or the address table itself is null. AICore reads - * this cache line per task to pick up the current L2PerfAicoreBuffer. + * Per-core AICore rotation channel. + * + * `set_aicore_rotation_slot(slot)` stashes the address of THIS core's slot + * in the rotation-address table — `&((uint64_t*)k_args->aicore_ring_addr)[block_idx]`. + * No dereference happens here, because at kernel entry the AICPU side may + * not yet have populated the table (the host launches both kernels and + * AICPU's init runs concurrently with AICore's entry). + * + * `get_aicore_rotation()` lazily dereferences the stashed slot on first use, + * caches the result, and returns it on subsequent calls. Callers MUST defer + * the first call until after AICPU has dispatched the first task — by then + * AICPU's init has completed and the slot holds a valid device address. + * The executor's main loop honours this by reading the rotation only inside + * the first-task branch of the dispatch poll. */ -__aicore__ void set_aicore_rotation(__gm__ AicoreRotation *rotation); +__aicore__ void set_aicore_rotation_slot(__gm__ uint64_t *slot_ptr); __aicore__ __gm__ AicoreRotation *get_aicore_rotation(); #endif // PLATFORM_AICORE_AICORE_PROFILING_STATE_H_ diff --git a/src/a2a3/platform/include/aicpu/l2_perf_collector_aicpu.h b/src/a2a3/platform/include/aicpu/l2_perf_collector_aicpu.h index 341b426fd..03465e02b 100644 --- a/src/a2a3/platform/include/aicpu/l2_perf_collector_aicpu.h +++ b/src/a2a3/platform/include/aicpu/l2_perf_collector_aicpu.h @@ -46,6 +46,16 @@ extern "C" uint64_t get_platform_l2_perf_base(); extern "C" void set_l2_swimlane_enabled(bool enable); extern "C" bool is_l2_swimlane_enabled(); +// AICore rotation-table device pointer (= KernelArgs::aicore_ring_addr). +// Published by the host before AICPU init runs; AICPU init fills the table +// with the per-core `&L2PerfAicoreBufferState::rotation` device addresses so +// AICore can index `aicore_ring_addr[block_idx]` to find its rotation channel. +// Moved from host into AICPU so the host stays decoupled from the AICore-side +// shared-memory layout (host previously did host-to-device address translation +// + reached into get_aicore_buffer_state to fill this). +extern "C" void set_platform_aicore_rotation_table(uint64_t table_addr); +extern "C" uint64_t get_platform_aicore_rotation_table(); + // Typed getter for the granular perf_level (promoted from the shared-memory // header inside l2_perf_aicpu_init). Gate sites should use this so the // comparison RHS is a named L2PerfLevel constant. diff --git a/src/a2a3/platform/onboard/aicore/kernel.cpp b/src/a2a3/platform/onboard/aicore/kernel.cpp index dff633920..e87cb81ef 100644 --- a/src/a2a3/platform/onboard/aicore/kernel.cpp +++ b/src/a2a3/platform/onboard/aicore/kernel.cpp @@ -43,15 +43,28 @@ // linker dedup the otherwise-duplicate symbol definitions across the two // compilation units. [[block_local]] static uint32_t s_aicore_profiling_flag; +// Slot pointer (NOT the dereferenced rotation address) — see +// aicore_profiling_state.h for the lazy-deref contract. +[[block_local]] static __gm__ uint64_t *s_aicore_rotation_slot; [[block_local]] static __gm__ AicoreRotation *s_aicore_rotation; __attribute__((weak)) __aicore__ void set_aicore_profiling_flag(uint32_t flag) { s_aicore_profiling_flag = flag; } __attribute__((weak)) __aicore__ uint32_t get_aicore_profiling_flag() { return s_aicore_profiling_flag; } -__attribute__((weak)) __aicore__ void set_aicore_rotation(__gm__ AicoreRotation *rotation) { - s_aicore_rotation = rotation; +__attribute__((weak)) __aicore__ void set_aicore_rotation_slot(__gm__ uint64_t *slot_ptr) { + s_aicore_rotation_slot = slot_ptr; + s_aicore_rotation = nullptr; // force lazy resolution on next get +} +__attribute__((weak)) __aicore__ __gm__ AicoreRotation *get_aicore_rotation() { + // Lazy first-call resolve: AICPU init populates `*s_aicore_rotation_slot` + // before dispatching the first task, so by the time the executor reaches + // for the rotation (inside the first-task branch of the dispatch poll) + // the slot holds a valid device address. + if (s_aicore_rotation == nullptr && s_aicore_rotation_slot != nullptr) { + s_aicore_rotation = reinterpret_cast<__gm__ AicoreRotation *>(*s_aicore_rotation_slot); + } + return s_aicore_rotation; } -__attribute__((weak)) __aicore__ __gm__ AicoreRotation *get_aicore_rotation() { return s_aicore_rotation; } extern __aicore__ void aicore_execute(__gm__ Runtime *runtime, int block_idx, CoreType core_type); @@ -91,8 +104,14 @@ extern "C" __global__ __aicore__ void KERNEL_ENTRY(aicore_kernel)(__gm__ KernelA // get_aicore_rotation() — never touches Handshake for profiling. set_aicore_profiling_flag(k_args->enable_profiling_flag); if (GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_L2_SWIMLANE)) { + // Stash only the slot pointer. The slot CONTENTS are written by + // AICPU's `l2_perf_aicpu_init` which runs concurrently with this + // entry; dereferencing here would race with AICPU's write. The + // executor defers the deref via `get_aicore_rotation()` until inside + // the first-task branch — by then AICPU has dispatched, so init is + // done and the slot is populated. __gm__ uint64_t *rotation_table = reinterpret_cast<__gm__ uint64_t *>(k_args->aicore_ring_addr); - set_aicore_rotation(reinterpret_cast<__gm__ AicoreRotation *>(rotation_table[block_idx])); + set_aicore_rotation_slot(rotation_table != nullptr ? &rotation_table[block_idx] : nullptr); } aicore_execute(k_args->runtime_args, block_idx, core_type); diff --git a/src/a2a3/platform/onboard/aicpu/kernel.cpp b/src/a2a3/platform/onboard/aicpu/kernel.cpp index 291cafc88..7926fa0e7 100644 --- a/src/a2a3/platform/onboard/aicpu/kernel.cpp +++ b/src/a2a3/platform/onboard/aicpu/kernel.cpp @@ -110,6 +110,7 @@ extern "C" __attribute__((visibility("default"))) int simpler_aicpu_exec(void *a set_platform_dump_base(k_args->dump_data_base); set_dump_tensor_enabled(GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_DUMP_TENSOR)); set_platform_l2_perf_base(k_args->l2_perf_data_base); + set_platform_aicore_rotation_table(k_args->aicore_ring_addr); set_l2_swimlane_enabled(GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_L2_SWIMLANE)); set_platform_pmu_base(k_args->pmu_data_base); set_platform_pmu_reg_addrs(k_args->pmu_reg_addrs); diff --git a/src/a2a3/platform/sim/aicore/kernel.cpp b/src/a2a3/platform/sim/aicore/kernel.cpp index fb723b9a1..033682d2b 100644 --- a/src/a2a3/platform/sim/aicore/kernel.cpp +++ b/src/a2a3/platform/sim/aicore/kernel.cpp @@ -33,6 +33,9 @@ static pthread_key_t g_reg_base_key; static pthread_key_t g_core_id_key; static pthread_key_t g_aicore_profiling_flag_key; +// Slot pointer (NOT the dereferenced rotation address) — see +// aicore_profiling_state.h for the lazy-deref contract. +static pthread_key_t g_aicore_rotation_slot_key; static pthread_key_t g_aicore_rotation_key; static pthread_once_t g_tls_once = PTHREAD_ONCE_INIT; @@ -40,6 +43,7 @@ static void create_tls_keys() { pthread_key_create(&g_reg_base_key, nullptr); pthread_key_create(&g_core_id_key, nullptr); pthread_key_create(&g_aicore_profiling_flag_key, nullptr); + pthread_key_create(&g_aicore_rotation_slot_key, nullptr); pthread_key_create(&g_aicore_rotation_key, nullptr); } @@ -61,11 +65,19 @@ __aicore__ uint32_t get_aicore_profiling_flag() { return static_cast(reinterpret_cast(pthread_getspecific(g_aicore_profiling_flag_key))); } -__aicore__ void set_aicore_rotation(__gm__ AicoreRotation *rotation) { - pthread_setspecific(g_aicore_rotation_key, reinterpret_cast(rotation)); +__aicore__ void set_aicore_rotation_slot(__gm__ uint64_t *slot_ptr) { + pthread_setspecific(g_aicore_rotation_slot_key, reinterpret_cast(slot_ptr)); + pthread_setspecific(g_aicore_rotation_key, nullptr); // force lazy resolve on next get } __aicore__ __gm__ AicoreRotation *get_aicore_rotation() { - return reinterpret_cast<__gm__ AicoreRotation *>(pthread_getspecific(g_aicore_rotation_key)); + auto *cached = reinterpret_cast<__gm__ AicoreRotation *>(pthread_getspecific(g_aicore_rotation_key)); + if (cached != nullptr) return cached; + auto *slot = reinterpret_cast<__gm__ uint64_t *>(pthread_getspecific(g_aicore_rotation_slot_key)); + if (slot == nullptr) return nullptr; + // Lazy first-call resolve — see aicore_profiling_state.h. + cached = reinterpret_cast<__gm__ AicoreRotation *>(*slot); + pthread_setspecific(g_aicore_rotation_key, reinterpret_cast(cached)); + return cached; } // Core identity setter function pointers — set by DeviceRunner after dlopen. @@ -107,10 +119,13 @@ extern "C" void aicore_execute_wrapper( // Publish per-core profiling state before the executor runs. set_aicore_profiling_flag(enable_profiling_flag); if (aicore_ring_addr != 0) { + // Stash only the slot pointer; deref happens lazily inside + // get_aicore_rotation() once AICPU has populated the table. See + // aicore_profiling_state.h. uint64_t *rotation_table = reinterpret_cast(aicore_ring_addr); - set_aicore_rotation(reinterpret_cast<__gm__ AicoreRotation *>(rotation_table[block_idx])); + set_aicore_rotation_slot(reinterpret_cast<__gm__ uint64_t *>(&rotation_table[block_idx])); } else { - set_aicore_rotation(nullptr); + set_aicore_rotation_slot(nullptr); } // Set core identity for pto-isa TPUSH/TPOP simulation. diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp index 3de9e359b..a6a78bdc2 100644 --- a/src/a2a3/platform/sim/host/device_runner.cpp +++ b/src/a2a3/platform/sim/host/device_runner.cpp @@ -273,6 +273,13 @@ int DeviceRunner::ensure_binaries_loaded() { return -1; } + set_platform_aicore_rotation_table_func_ = + reinterpret_cast(dlsym(aicpu_so_handle_, "set_platform_aicore_rotation_table")); + if (set_platform_aicore_rotation_table_func_ == nullptr) { + LOG_ERROR("dlsym failed for set_platform_aicore_rotation_table: %s", dlerror()); + return -1; + } + set_l2_swimlane_enabled_func_ = reinterpret_cast(dlsym(aicpu_so_handle_, "set_l2_swimlane_enabled")); if (set_l2_swimlane_enabled_func_ == nullptr) { @@ -652,6 +659,7 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { set_platform_dump_base_func_(kernel_args_.dump_data_base); set_dump_tensor_enabled_func_(enable_dump_tensor_); set_platform_l2_perf_base_func_(kernel_args_.l2_perf_data_base); + set_platform_aicore_rotation_table_func_(kernel_args_.aicore_ring_addr); set_l2_swimlane_enabled_func_(enable_l2_swimlane_); set_platform_pmu_base_func_(kernel_args_.pmu_data_base); set_platform_pmu_reg_addrs_func_(kernel_args_.pmu_reg_addrs); @@ -844,6 +852,7 @@ void DeviceRunner::unload_executor_binaries() { set_platform_dump_base_func_ = nullptr; set_dump_tensor_enabled_func_ = nullptr; set_platform_l2_perf_base_func_ = nullptr; + set_platform_aicore_rotation_table_func_ = nullptr; set_l2_swimlane_enabled_func_ = nullptr; set_platform_pmu_base_func_ = nullptr; set_platform_pmu_reg_addrs_func_ = nullptr; diff --git a/src/a2a3/platform/sim/host/device_runner.h b/src/a2a3/platform/sim/host/device_runner.h index d0b29a765..f1a44e59b 100644 --- a/src/a2a3/platform/sim/host/device_runner.h +++ b/src/a2a3/platform/sim/host/device_runner.h @@ -382,6 +382,7 @@ class DeviceRunner { void (*set_platform_dump_base_func_)(uint64_t){nullptr}; void (*set_dump_tensor_enabled_func_)(bool){nullptr}; void (*set_platform_l2_perf_base_func_)(uint64_t){nullptr}; + void (*set_platform_aicore_rotation_table_func_)(uint64_t){nullptr}; void (*set_l2_swimlane_enabled_func_)(bool){nullptr}; void (*set_platform_pmu_base_func_)(uint64_t){nullptr}; void (*set_platform_pmu_reg_addrs_func_)(uint64_t){nullptr}; diff --git a/src/a2a3/platform/src/aicpu/l2_perf_collector_aicpu.cpp b/src/a2a3/platform/src/aicpu/l2_perf_collector_aicpu.cpp index c5a3c3143..7ab5b7498 100644 --- a/src/a2a3/platform/src/aicpu/l2_perf_collector_aicpu.cpp +++ b/src/a2a3/platform/src/aicpu/l2_perf_collector_aicpu.cpp @@ -73,10 +73,19 @@ static uint64_t g_platform_l2_perf_base = 0; static bool g_enable_l2_swimlane = false; static L2PerfLevel g_l2_perf_level = L2PerfLevel::DISABLED; +// AICore rotation-table device pointer (= KernelArgs::aicore_ring_addr). +// Published by the host (sim: dlsym'd setter; onboard: from k_args via the +// kernel entry); AICPU init walks it to fill per-core &rotation addresses. +static uint64_t g_platform_aicore_rotation_table = 0; + extern "C" void set_platform_l2_perf_base(uint64_t l2_perf_data_base) { g_platform_l2_perf_base = l2_perf_data_base; } extern "C" uint64_t get_platform_l2_perf_base() { return g_platform_l2_perf_base; } extern "C" void set_l2_swimlane_enabled(bool enable) { g_enable_l2_swimlane = enable; } extern "C" bool is_l2_swimlane_enabled() { return g_enable_l2_swimlane; } +extern "C" void set_platform_aicore_rotation_table(uint64_t table_addr) { + g_platform_aicore_rotation_table = table_addr; +} +extern "C" uint64_t get_platform_aicore_rotation_table() { return g_platform_aicore_rotation_table; } L2PerfLevel get_l2_perf_level() { return g_l2_perf_level; } /** @@ -132,6 +141,16 @@ void l2_perf_aicpu_init(int worker_count) { static_cast(g_l2_perf_level) ); + // Populate the per-core AicoreRotation device-address table. AICore reads + // `aicore_ring_addr[block_idx]` from KernelArgs to find its rotation + // channel; the table itself is host-allocated, but the entries are + // device-internal addresses (`&ac_state->rotation`) that the host would + // otherwise have to translate from host-mapped to device-mapped. AICPU + // already runs on the device, so it can write the addresses directly + // without any translation — that keeps the host side decoupled from the + // AICore shared-memory layout. + uint64_t *rotation_table = reinterpret_cast(g_platform_aicore_rotation_table); + // Pop first buffer from free_queue for each core for (int i = 0; i < worker_count; i++) { L2PerfBufferState *state = get_perf_buffer_state(l2_perf_base, i); @@ -140,6 +159,10 @@ void l2_perf_aicpu_init(int worker_count) { s_perf_buffer_states[i] = state; s_aicore_buffer_states[i] = ac_state; + if (rotation_table != nullptr) { + rotation_table[i] = reinterpret_cast(&ac_state->rotation); + } + // Pop first buffer from free_queue rmb(); uint32_t head = state->free_queue.head; diff --git a/src/a2a3/platform/src/host/l2_perf_collector.cpp b/src/a2a3/platform/src/host/l2_perf_collector.cpp index d9b978ec5..98c72a928 100644 --- a/src/a2a3/platform/src/host/l2_perf_collector.cpp +++ b/src/a2a3/platform/src/host/l2_perf_collector.cpp @@ -240,10 +240,13 @@ int L2PerfCollector::initialize( PLATFORM_PROF_BUFFERS_PER_CORE, PLATFORM_AICORE_BUFFERS_PER_CORE ); - // Step 5c: Standalone uint64_t[num_aicore] table holding per-core - // AicoreRotation device addresses (= &ac_state->rotation). AICore reads - // rotation_table[block_idx] via KernelArgs::aicore_ring_addr and feeds it - // into the platform's set_aicore_rotation(). + // Step 5c: Standalone uint64_t[num_aicore] table that will hold per-core + // AicoreRotation device addresses. Host only allocates the bytes and + // hands the device pointer to AICPU via KernelArgs::aicore_ring_addr; + // AICPU itself fills the entries inside `l2_perf_aicpu_init` (it has + // direct access to `&ac_state->rotation` device addresses, no + // host-to-device translation needed). AICore reads + // rotation_table[block_idx] at kernel entry. { size_t table_bytes = static_cast(num_aicore) * sizeof(uint64_t); void *rotation_table_host = nullptr; @@ -252,20 +255,6 @@ int L2PerfCollector::initialize( LOG_ERROR("Failed to allocate aicore_ring_addr (rotation) table (%zu bytes)", table_bytes); return -1; } - uint64_t *rotation_table = reinterpret_cast(rotation_table_host); - - // Compute the per-core device address of &state->rotation. We have - // the host-mapped shm region; the device equivalent is at the same - // offset from perf_dev_ptr. - auto host_to_dev = [&](void *host_addr) -> uint64_t { - uintptr_t offset = reinterpret_cast(host_addr) - reinterpret_cast(perf_host_ptr); - return reinterpret_cast(perf_dev_ptr) + offset; - }; - - for (int i = 0; i < num_aicore; i++) { - L2PerfAicoreBufferState *ac_state = get_aicore_buffer_state(perf_host_ptr, num_aicore, i); - rotation_table[i] = host_to_dev(&ac_state->rotation); - } aicore_ring_addr_table_dev_ = rotation_table_dev; } diff --git a/src/a2a3/runtime/host_build_graph/aicore/aicore_executor.cpp b/src/a2a3/runtime/host_build_graph/aicore/aicore_executor.cpp index ceb16a794..c66b11ed9 100644 --- a/src/a2a3/runtime/host_build_graph/aicore/aicore_executor.cpp +++ b/src/a2a3/runtime/host_build_graph/aicore/aicore_executor.cpp @@ -60,7 +60,9 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in bool pmu_enabled = GET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_PMU); // Per-core AicoreRotation channel; see tensormap_and_ringbuffer/.../aicore_executor.cpp. - __gm__ AicoreRotation *l2_perf_rotation = l2_perf_enabled ? get_aicore_rotation() : nullptr; + // Deferred until first task so AICPU's init has populated the rotation + // table (the dispatch itself proves init is done). + __gm__ AicoreRotation *l2_perf_rotation = nullptr; AicoreLocalState l2_perf_local = {nullptr, 0, 0}; volatile uint32_t task_id = AICPU_IDLE_TASK_ID; @@ -83,6 +85,11 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in uint32_t actual_task_id = task_id; write_reg(RegId::COND, MAKE_ACK_VALUE(actual_task_id)); + // First-task lazy resolve of the rotation channel. + if (l2_perf_enabled && l2_perf_rotation == nullptr) { + l2_perf_rotation = get_aicore_rotation(); + } + __gm__ Task *task_ptr = &(runtime->tasks[actual_task_id]); uint64_t start_time = get_sys_cnt_aicore(); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp index 25580ad9c..94e18b35e 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp @@ -102,11 +102,14 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in bool dump_tensor_enabled = GET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_DUMP_TENSOR); bool pmu_enabled = GET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_PMU); - // Per-core AicoreRotation channel is published once at kernel entry from - // KernelArgs::aicore_ring_addr. AICore reads it per task (cheap relative - // to the per-task dcci(payload, ENTIRE_DATA_CACHE)) to pick up the - // current L2PerfAicoreBuffer — see l2_perf_collector_aicore.h. - __gm__ AicoreRotation *l2_perf_rotation = l2_perf_enabled ? get_aicore_rotation() : nullptr; + // Per-core AicoreRotation channel. The pointer to THIS core's rotation + // is stored in `KernelArgs::aicore_ring_addr[block_idx]`, but AICPU + // populates that table inside `l2_perf_aicpu_init` which runs + // concurrently with this kernel's entry — so we cannot deref at startup. + // Defer the deref via `get_aicore_rotation()` until the first task is + // dispatched; by then AICPU's init has completed (the very dispatch is + // proof of that). + __gm__ AicoreRotation *l2_perf_rotation = nullptr; AicoreLocalState l2_perf_local = {nullptr, 0, 0}; // Phase 4: Main execution loop - poll register for tasks until exit signal @@ -131,6 +134,13 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in { uint32_t task_id = reg_val; // Decode: register holds task_id directly + // First-task lazy resolve of the rotation channel — see comment + // above. `get_aicore_rotation()` caches after first call so this + // costs nothing on subsequent tasks. + if (l2_perf_enabled && l2_perf_rotation == nullptr) { + l2_perf_rotation = get_aicore_rotation(); + } + // Select dual-buffer slot: same bit as AICPU used when writing payload __gm__ PTO2DispatchPayload *exec_payload = payload + (task_id & 1u);