From a1ecccff6192b8282717dd479083bac53aab3688 Mon Sep 17 00:00:00 2001 From: s00831018 Date: Tue, 19 May 2026 15:24:17 +0200 Subject: [PATCH 01/15] Adding the capability of re-running a task within a single kernel run. This is important for accurate timing --- .../aicpu/aicpu_executor.cpp | 166 ++++++++++++------ .../host/runtime_maker.cpp | 36 ++++ .../runtime/runtime.h | 9 + .../runtime/shared/runtime.cpp | 2 + 4 files changed, 155 insertions(+), 58 deletions(-) diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index 851015e21..a72e27c36 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -113,7 +113,7 @@ struct AicpuExecutor { bool orch_to_sched_{false}; // ===== Thread management state ===== - std::atomic thread_idx_{0}; + std::atomic thread_idx_accumulator{0}; std::atomic initialized_{false}; std::atomic init_done_{false}; std::atomic init_failed_{false}; @@ -121,6 +121,10 @@ struct AicpuExecutor { int32_t thread_num_{0}; + // Barrier counters for synchronization (timing) across threads + std::atomic barrier_counter_in_{0}; + std::atomic barrier_counter_out_{0}; + // ===== Task queue state (managed by scheduler ready queues) ===== std::atomic finished_count_{0}; @@ -142,6 +146,19 @@ struct AicpuExecutor { int32_t init(Runtime *runtime); int32_t run(Runtime *runtime); void deinit(Runtime *runtime); + int32_t getThreadId() { return thread_idx_accumulator.fetch_add(1); } + + // Barrier function to synchronize threads + inline void barrier() + { + // Two phase barrier (in_out), guarantees that all threads are retained in the rendezvous point, until all of them have arrived + barrier_counter_in_.fetch_add(1); + while (barrier_counter_in_.load(std::memory_order_relaxed) % thread_num_ != 0); + + barrier_counter_out_.fetch_add(1); + while 
(barrier_counter_out_.load(std::memory_order_relaxed) % thread_num_ != 0); + } + ~AicpuExecutor() { // Process-wide teardown (the single static instance dies here). Every @@ -157,6 +174,7 @@ struct AicpuExecutor { }; static AicpuExecutor g_aicpu_executor; +thread_local int32_t my_thread_idx_; // ===== AicpuExecutor Method Implementations ===== @@ -202,16 +220,18 @@ int32_t AicpuExecutor::init(Runtime *runtime) { * Shutdown AICore - Send exit signal via registers to all AICore kernels */ int32_t AicpuExecutor::run(Runtime *runtime) { - int32_t thread_idx = thread_idx_++; int32_t run_rc = 0; - LOG_INFO_V0("Thread %d: Start", thread_idx); + LOG_INFO_V0("Thread %d: at AicpuExecutor::Run", my_thread_idx_); // Orchestrator check - if (thread_idx >= sched_thread_num_) { + if (my_thread_idx_ >= sched_thread_num_) { #if PTO2_PROFILING uint64_t orch_cycle_start = 0; int32_t pto2_submitted_tasks = -1; #endif + + LOG_INFO_V0("Thread %d: Orchestrator Running", my_thread_idx_); + // Orchestrator thread: load + run the device orchestration SO. The braces // scope the per-callable dlopen / SO-table locals to this block. 
{ @@ -221,7 +241,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { const int32_t callable_id = runtime->get_active_callable_id(); if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) { LOG_ERROR( - "Thread %d: invalid callable_id %d (limit=%d)", thread_idx, callable_id, MAX_REGISTERED_CALLABLE_IDS + "Thread %d: invalid callable_id %d (limit=%d)", my_thread_idx_, callable_id, MAX_REGISTERED_CALLABLE_IDS ); runtime_init_ready_.store(true, std::memory_order_release); return -1; @@ -232,9 +252,10 @@ int32_t AicpuExecutor::run(Runtime *runtime) { DeviceOrchestrationBindRuntimeFunc *p_bind = &orch_so_table_[callable_id].bind; DeviceOrchestrationConfigFunc *p_config_func = &orch_so_table_[callable_id].config_func; const bool reload_so = runtime->register_new_callable_id(); + const bool so_in_use = orch_so_table_[callable_id].in_use; - if (reload_so) { - LOG_INFO_V0("Thread %d: New orch SO detected (callable_id=%d), (re)loading", thread_idx, callable_id); + if (reload_so && so_in_use == false) { + LOG_INFO_V0("Thread %d: New orch SO detected (callable_id=%d), (re)loading", my_thread_idx_, callable_id); if (*p_handle != nullptr) { dlclose(*p_handle); *p_handle = nullptr; @@ -253,7 +274,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { size_t so_size = runtime->get_dev_orch_so_size(); if (so_data == nullptr || so_size == 0) { - LOG_ERROR("Thread %d: Device orchestration SO not set", thread_idx); + LOG_ERROR("Thread %d: Device orchestration SO not set", my_thread_idx_); // Unblock scheduler threads before returning so they don't spin forever. 
runtime_init_ready_.store(true, std::memory_order_release); return -1; @@ -271,7 +292,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { int32_t fd = create_orch_so_file(candidate_dirs[i], callable_id, so_path, sizeof(so_path)); if (fd < 0) { LOG_INFO_V0( - "Thread %d: Cannot create SO at %s (errno=%d), trying next path", thread_idx, so_path, errno + "Thread %d: Cannot create SO at %s (errno=%d), trying next path", my_thread_idx_, so_path, errno ); continue; } @@ -279,17 +300,17 @@ int32_t AicpuExecutor::run(Runtime *runtime) { close(fd); if (written != static_cast(so_size)) { LOG_INFO_V0( - "Thread %d: Cannot write SO to %s (errno=%d), trying next path", thread_idx, so_path, errno + "Thread %d: Cannot write SO to %s (errno=%d), trying next path", my_thread_idx_, so_path, errno ); unlink(so_path); continue; } file_created = true; - LOG_INFO_V0("Thread %d: Created SO file at %s (%zu bytes)", thread_idx, so_path, so_size); + LOG_INFO_V0("Thread %d: Created SO file at %s (%zu bytes)", my_thread_idx_, so_path, so_size); } if (!file_created) { - LOG_ERROR("Thread %d: Failed to create SO file in any candidate path", thread_idx); + LOG_ERROR("Thread %d: Failed to create SO file in any candidate path", my_thread_idx_); // Unblock scheduler threads before returning so they don't spin forever. runtime_init_ready_.store(true, std::memory_order_release); return -1; @@ -299,13 +320,13 @@ int32_t AicpuExecutor::run(Runtime *runtime) { void *handle = dlopen(so_path, RTLD_LAZY | RTLD_LOCAL); const char *dlopen_err = dlerror(); if (handle == nullptr) { - LOG_ERROR("Thread %d: dlopen failed: %s", thread_idx, dlopen_err ? dlopen_err : "unknown"); + LOG_ERROR("Thread %d: dlopen failed: %s", my_thread_idx_, dlopen_err ? dlopen_err : "unknown"); unlink(so_path); // Unblock scheduler threads before returning so they don't spin forever. 
runtime_init_ready_.store(true, std::memory_order_release); return -1; } - LOG_INFO_V0("Thread %d: dlopen succeeded, handle=%p", thread_idx, handle); + LOG_INFO_V0("Thread %d: dlopen succeeded, handle=%p", my_thread_idx_, handle); // Unlink the on-disk SO immediately: dlopen has already mmap'd // the image, so the kernel keeps the inode alive until the @@ -330,7 +351,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { const char *entry_dlsym_error = dlerror(); if (entry_dlsym_error != nullptr) { LOG_ERROR( - "Thread %d: dlsym failed for entry symbol '%s': %s", thread_idx, entry_symbol, entry_dlsym_error + "Thread %d: dlsym failed for entry symbol '%s': %s", my_thread_idx_, entry_symbol, entry_dlsym_error ); dlclose(handle); unlink(so_path); @@ -339,7 +360,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { return -1; } if (orch_func == nullptr) { - LOG_ERROR("Thread %d: dlsym returned NULL for entry symbol '%s'", thread_idx, entry_symbol); + LOG_ERROR("Thread %d: dlsym returned NULL for entry symbol '%s'", my_thread_idx_, entry_symbol); dlclose(handle); unlink(so_path); // Unblock scheduler threads before returning so they don't spin forever. @@ -352,7 +373,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { const char *config_dlsym_error = dlerror(); if (config_dlsym_error != nullptr || config_func == nullptr) { LOG_ERROR( - "Thread %d: dlsym failed for config symbol '%s': %s", thread_idx, config_symbol, + "Thread %d: dlsym failed for config symbol '%s': %s", my_thread_idx_, config_symbol, config_dlsym_error ? 
config_dlsym_error : "NULL function pointer" ); config_func = nullptr; @@ -363,7 +384,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { reinterpret_cast(dlsym(handle, "framework_bind_runtime")); const char *bind_runtime_error = dlerror(); if (bind_runtime_error != nullptr) { - LOG_ERROR("Thread %d: dlsym failed for framework_bind_runtime: %s", thread_idx, bind_runtime_error); + LOG_ERROR("Thread %d: dlsym failed for framework_bind_runtime: %s", my_thread_idx_, bind_runtime_error); bind_runtime_func = nullptr; } @@ -375,11 +396,11 @@ int32_t AicpuExecutor::run(Runtime *runtime) { orch_so_table_[callable_id].in_use = true; } else { LOG_INFO_V0( - "Thread %d: Reusing cached orch SO handle=%p (callable_id=%d)", thread_idx, *p_handle, callable_id + "Thread %d: Reusing cached orch SO handle=%p (callable_id=%d)", my_thread_idx_, *p_handle, callable_id ); if (*p_handle == nullptr || *p_func == nullptr) { LOG_ERROR( - "Thread %d: reload=false but no cached SO handle/func for callable_id=%d", thread_idx, + "Thread %d: reload=false but no cached SO handle/func for callable_id=%d", my_thread_idx_, callable_id ); // Unblock scheduler threads before returning so they don't spin forever. @@ -391,13 +412,13 @@ int32_t AicpuExecutor::run(Runtime *runtime) { // Validate arg count on every run (reload or cache hit). 
if (*p_config_func != nullptr) { PTO2OrchestrationConfig cfg = (*p_config_func)(runtime->get_orch_args()); - LOG_INFO_V0("Thread %d: Config: expected_args=%d", thread_idx, cfg.expected_arg_count); + LOG_INFO_V0("Thread %d: Config: expected_args=%d", my_thread_idx_, cfg.expected_arg_count); if (cfg.expected_arg_count > 0) { const ChipStorageTaskArgs &args_validate = runtime->get_orch_args(); int32_t actual_arg_count = args_validate.tensor_count() + args_validate.scalar_count(); if (actual_arg_count < cfg.expected_arg_count) { LOG_ERROR( - "Thread %d: arg_count %d < expected %d", thread_idx, actual_arg_count, + "Thread %d: arg_count %d < expected %d", my_thread_idx_, actual_arg_count, cfg.expected_arg_count ); // Clean up cached state so a subsequent run does a full reload. @@ -419,7 +440,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { } } } else { - LOG_INFO_V0("Thread %d: No config function, using defaults", thread_idx); + LOG_INFO_V0("Thread %d: No config function, using defaults", my_thread_idx_); } // sm_handle / rt are bound to *this* run's memory and must be @@ -427,17 +448,17 @@ int32_t AicpuExecutor::run(Runtime *runtime) { // reused above. 
const ChipStorageTaskArgs &args = runtime->get_orch_args(); int32_t arg_count = args.tensor_count() + args.scalar_count(); - LOG_INFO_V0("Thread %d: sm_ptr=%p, arg_count=%d", thread_idx, runtime->get_gm_sm_ptr(), arg_count); + LOG_INFO_V0("Thread %d: sm_ptr=%p, arg_count=%d", my_thread_idx_, runtime->get_gm_sm_ptr(), arg_count); for (int32_t i = 0; i < args.tensor_count() && i < 20; i++) { const ContinuousTensor &t = args.tensor(i); LOG_INFO_V0( - "Thread %d: orch_args[%d] = TENSOR(data=0x%lx, ndims=%u, dtype=%u)", thread_idx, i, + "Thread %d: orch_args[%d] = TENSOR(data=0x%lx, ndims=%u, dtype=%u)", my_thread_idx_, i, static_cast(t.data), t.ndims, static_cast(t.dtype) ); } for (int32_t i = 0; i < args.scalar_count() && (args.tensor_count() + i) < 20; i++) { LOG_INFO_V0( - "Thread %d: orch_args[%d] = SCALAR(0x%lx)", thread_idx, args.tensor_count() + i, + "Thread %d: orch_args[%d] = SCALAR(0x%lx)", my_thread_idx_, args.tensor_count() + i, static_cast(args.scalar(i)) ); } @@ -456,7 +477,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { dep_pool_capacity = static_cast(runtime->dep_pool_size); } LOG_INFO_V0( - "Thread %d: Ring sizes: task_window=%lu, heap=%lu, dep_pool=%d", thread_idx, + "Thread %d: Ring sizes: task_window=%lu, heap=%lu, dep_pool=%d", my_thread_idx_, static_cast(task_window_size), static_cast(heap_size), dep_pool_capacity ); @@ -467,7 +488,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { PTO2SharedMemoryHandle *sm_handle = PTO2SharedMemoryHandle::create_from_buffer(sm_ptr, sm_size, task_window_size, heap_size); if (!sm_handle) { - LOG_ERROR("Thread %d: Failed to create shared memory handle", thread_idx); + LOG_ERROR("Thread %d: Failed to create shared memory handle", my_thread_idx_); // Unblock scheduler threads before returning so they don't spin forever. 
runtime_init_ready_.store(true, std::memory_order_release); return -1; @@ -475,7 +496,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { rt = runtime_create_from_sm(PTO2_MODE_EXECUTE, sm_handle, gm_heap, heap_size, dep_pool_capacity); if (!rt) { - LOG_ERROR("Thread %d: Failed to create PTO2Runtime", thread_idx); + LOG_ERROR("Thread %d: Failed to create PTO2Runtime", my_thread_idx_); sm_handle->destroy(); // Unblock scheduler threads before returning so they don't spin forever. runtime_init_ready_.store(true, std::memory_order_release); @@ -506,7 +527,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { #if PTO2_PROFILING if (is_l2_swimlane_enabled()) { - l2_perf_aicpu_set_orch_thread_idx(thread_idx); + l2_perf_aicpu_set_orch_thread_idx(my_thread_idx_); } #endif @@ -514,7 +535,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { // set the per-thread queue index and pop the initial buffer before any // submit_task can fire inside orch_func_. if (is_dep_gen_enabled()) { - dep_gen_aicpu_set_orch_thread_idx(thread_idx); + dep_gen_aicpu_set_orch_thread_idx(my_thread_idx_); dep_gen_aicpu_init(); } @@ -546,57 +567,57 @@ int32_t AicpuExecutor::run(Runtime *runtime) { p.sync_cycle + p.alloc_cycle + p.args_cycle + p.lookup_cycle + p.insert_cycle + p.fanin_cycle; if (total == 0) total = 1; // avoid div-by-zero LOG_INFO_V9( - "Thread %d: === Orchestrator Profiling: %" PRId64 " tasks, total=%.3fus ===", thread_idx, + "Thread %d: === Orchestrator Profiling: %" PRId64 " tasks, total=%.3fus ===", my_thread_idx_, static_cast(p.submit_count), cycles_to_us(total) ); LOG_INFO_V9( "Thread %d: task+heap_alloc: %.3fus (%.1f%%) work=%.3fus wait=%.3fus atomics=%" PRIu64 "", - thread_idx, cycles_to_us(p.alloc_cycle), p.alloc_cycle * 100.0 / total, + my_thread_idx_, cycles_to_us(p.alloc_cycle), p.alloc_cycle * 100.0 / total, cycles_to_us(p.alloc_cycle - p.alloc_wait_cycle), cycles_to_us(p.alloc_wait_cycle), static_cast(p.alloc_atomic_count) ); LOG_INFO_V9( - "Thread %d: sync_tensormap : 
%.3fus (%.1f%%)", thread_idx, cycles_to_us(p.sync_cycle), + "Thread %d: sync_tensormap : %.3fus (%.1f%%)", my_thread_idx_, cycles_to_us(p.sync_cycle), p.sync_cycle * 100.0 / total ); LOG_INFO_V9( - "Thread %d: lookup+dep : %.3fus (%.1f%%)", thread_idx, cycles_to_us(p.lookup_cycle), + "Thread %d: lookup+dep : %.3fus (%.1f%%)", my_thread_idx_, cycles_to_us(p.lookup_cycle), p.lookup_cycle * 100.0 / total ); LOG_INFO_V9( - "Thread %d: tensormap_ins : %.3fus (%.1f%%)", thread_idx, cycles_to_us(p.insert_cycle), + "Thread %d: tensormap_ins : %.3fus (%.1f%%)", my_thread_idx_, cycles_to_us(p.insert_cycle), p.insert_cycle * 100.0 / total ); LOG_INFO_V9( - "Thread %d: param_copy : %.3fus (%.1f%%) atomics=%" PRIu64 "", thread_idx, + "Thread %d: param_copy : %.3fus (%.1f%%) atomics=%" PRIu64 "", my_thread_idx_, cycles_to_us(p.args_cycle), p.args_cycle * 100.0 / total, static_cast(p.args_atomic_count) ); LOG_INFO_V9( "Thread %d: fanin+ready : %.3fus (%.1f%%) work=%.3fus wait=%.3fus atomics=%" PRIu64 "", - thread_idx, cycles_to_us(p.fanin_cycle), p.fanin_cycle * 100.0 / total, + my_thread_idx_, cycles_to_us(p.fanin_cycle), p.fanin_cycle * 100.0 / total, cycles_to_us(p.fanin_cycle - p.fanin_wait_cycle), cycles_to_us(p.fanin_wait_cycle), static_cast(p.fanin_atomic_count) ); LOG_INFO_V9( - "Thread %d: avg/task : %.3fus", thread_idx, + "Thread %d: avg/task : %.3fus", my_thread_idx_, p.submit_count > 0 ? 
cycles_to_us(total) / p.submit_count : 0.0 ); #if PTO2_TENSORMAP_PROFILING PTO2TensorMapProfilingData tp = pto2_tensormap_get_profiling(); - LOG_INFO_V9("Thread %d: === TensorMap Lookup Stats ===", thread_idx); + LOG_INFO_V9("Thread %d: === TensorMap Lookup Stats ===", my_thread_idx_); LOG_INFO_V9( - "Thread %d: lookups : %" PRIu64 ", inserts: %" PRIu64 "", thread_idx, + "Thread %d: lookups : %" PRIu64 ", inserts: %" PRIu64 "", my_thread_idx_, static_cast(tp.lookup_count), static_cast(tp.insert_count) ); LOG_INFO_V9( - "Thread %d: chain walked : total=%" PRIu64 ", avg=%.1f, max=%d", thread_idx, + "Thread %d: chain walked : total=%" PRIu64 ", avg=%.1f, max=%d", my_thread_idx_, static_cast(tp.lookup_chain_total), tp.lookup_count > 0 ? static_cast(tp.lookup_chain_total) / tp.lookup_count : 0.0, tp.lookup_chain_max ); LOG_INFO_V9( - "Thread %d: overlap checks : %" PRIu64 ", hits=%" PRIu64 " (%.1f%%)", thread_idx, + "Thread %d: overlap checks : %" PRIu64 ", hits=%" PRIu64 " (%.1f%%)", my_thread_idx_, static_cast(tp.overlap_checks), static_cast(tp.overlap_hits), tp.overlap_checks > 0 ? 
tp.overlap_hits * 100.0 / tp.overlap_checks : 0.0 ); @@ -637,12 +658,12 @@ int32_t AicpuExecutor::run(Runtime *runtime) { pto2_submitted_tasks = total_tasks; #endif - sched_ctx_.on_orchestration_done(runtime, rt, thread_idx, total_tasks); + sched_ctx_.on_orchestration_done(runtime, rt, my_thread_idx_, total_tasks); } #if PTO2_PROFILING uint64_t orch_end_ts = get_sys_cnt_aicpu(); LOG_INFO_V9( - "Thread %d: orch_start=%" PRIu64 " orch_end=%" PRIu64 " orch_cost=%.3fus", thread_idx, + "Thread %d: orch_start=%" PRIu64 " orch_end=%" PRIu64 " orch_cost=%.3fus", my_thread_idx_, static_cast(orch_cycle_start), static_cast(orch_end_ts), cycles_to_us(orch_end_ts - orch_cycle_start) ); @@ -653,42 +674,42 @@ int32_t AicpuExecutor::run(Runtime *runtime) { ); } #endif - LOG_INFO_V0("Thread %d: Orchestrator completed", thread_idx); + LOG_INFO_V0("Thread %d: Orchestrator completed", my_thread_idx_); } // Scheduler thread (orchestrator threads skip dispatch when orch_to_sched_ is false) - if (!sched_ctx_.is_completed() && (thread_idx < sched_thread_num_ || orch_to_sched_)) { + if (!sched_ctx_.is_completed() && (my_thread_idx_ < sched_thread_num_ || orch_to_sched_)) { // Device orchestration: wait for the primary orchestrator to initialize the SM header while (!runtime_init_ready_.load(std::memory_order_acquire)) { SPIN_WAIT_HINT(); } if (rt == nullptr) { - LOG_ERROR("Thread %d: rt is null after orchestrator error, skipping dispatch", thread_idx); + LOG_ERROR("Thread %d: rt is null after orchestrator error, skipping dispatch", my_thread_idx_); } else { sched_ctx_.bind_runtime(rt); - int32_t completed = sched_ctx_.resolve_and_dispatch(runtime, thread_idx); + int32_t completed = sched_ctx_.resolve_and_dispatch(runtime, my_thread_idx_); if (completed < 0) { - LOG_ERROR("Thread %d: Scheduler failed with rc=%d", thread_idx, completed); + LOG_ERROR("Thread %d: Scheduler failed with rc=%d", my_thread_idx_, completed); run_rc = completed; } else { - LOG_INFO_V0("Thread %d: Executed %d tasks 
from runtime", thread_idx, completed); + LOG_INFO_V0("Thread %d: Executed %d tasks from runtime", my_thread_idx_, completed); } } } + LOG_INFO_V0("Thread %d: Scheduling Completed", my_thread_idx_); + // Always shutdown AICore — even if sched_ctx_.completed_ was already true. // platform_deinit_aicore_regs is idempotent; orchestrator threads have - // core_trackers_[thread_idx].core_num() == 0 so they skip the loop harmlessly. - int32_t shutdown_rc = sched_ctx_.shutdown(thread_idx); + // core_trackers_[my_thread_idx_].core_num() == 0 so they skip the loop harmlessly. + int32_t shutdown_rc = sched_ctx_.shutdown(my_thread_idx_); if (shutdown_rc != 0 && run_rc == 0) { run_rc = shutdown_rc; } - LOG_INFO_V0("Thread %d: Completed", thread_idx); - // Check if this is the last thread to finish int32_t prev_finished = finished_count_.fetch_add(1, std::memory_order_acq_rel); - if (prev_finished + 1 == thread_num_) { + if ((prev_finished + 1) % thread_num_ == 0) { finished_.store(true, std::memory_order_release); // Destroy PTO2 runtime. 
sm_handle / rt are recreated every run so we // always tear them down here, but we keep the per-cid orch SO entries @@ -742,7 +763,7 @@ void AicpuExecutor::deinit(Runtime *runtime) { initialized_.store(false, std::memory_order_release); init_done_.store(false, std::memory_order_release); init_failed_.store(false, std::memory_order_release); - thread_idx_.store(0, std::memory_order_release); + thread_idx_accumulator.store(0, std::memory_order_release); finished_.store(false, std::memory_order_release); LOG_INFO_V0("DeInit: AicpuExecutor reset complete"); @@ -769,6 +790,8 @@ extern "C" int32_t aicpu_execute(Runtime *runtime) { return -1; } + my_thread_idx_ = g_aicpu_executor.getThreadId(); + LOG_INFO_V0("%s", "aicpu_execute: Starting AICPU kernel execution"); g_aicpu_executor.init(runtime); @@ -780,7 +803,34 @@ extern "C" int32_t aicpu_execute(Runtime *runtime) { } } - int32_t rc = g_aicpu_executor.run(runtime); + // Return code for running the kernel. It must be zero (no error) for all runs + int32_t rc = 0; + + // Adding timing runs, exclusively for performance evaluation: + const auto warmupIterationCount = runtime->get_warmup_iteration_count(); + const auto timingIterationCount = runtime->get_timing_iteration_count(); + + // First, perform warmup to disregard any cold cache effects and thread initialization times + for (int32_t i = 0; i < warmupIterationCount; i++) + { + rc |= g_aicpu_executor.run(runtime); + + // Waiting for threads to come back before re-running. + g_aicpu_executor.barrier(); + } + + // Second, perform timed runs (the ones that count) + for (int32_t i = 0; i < timingIterationCount; i++) + { + rc |= g_aicpu_executor.run(runtime); + + // Waiting for threads to come back before re-running. 
+ g_aicpu_executor.barrier(); + } + + // Perform actual kernel run + rc |= g_aicpu_executor.run(runtime); + if (rc != 0) { LOG_ERROR("aicpu_execute: Thread execution failed with rc=%d", rc); } diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp index ab3cf838b..a0ecb1311 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp @@ -224,6 +224,42 @@ extern "C" int bind_prepared_to_runtime_impl( } int64_t t_args_end = _now_ms(); + // (Timing) Specify whether to perform warmup runs inside the aicpu executor + { + const char *env_warmup_iterations = std::getenv("PTO2_WARMUP_ITERATION_COUNT"); + if (env_warmup_iterations) { + char *endptr; + int64_t val = strtol(env_warmup_iterations, &endptr, 10); + if (endptr != env_warmup_iterations && *endptr == '\0') { + runtime->warmup_iteration_count = static_cast(val); + } else { + LOG_WARN( + "PTO2_WARMUP_ITERATION_COUNT=%s is invalid, using default %d", env_warmup_iterations, RUNTIME_DEFAULT_WARMUP_ITERATION_COUNT + ); + runtime->warmup_iteration_count = RUNTIME_DEFAULT_WARMUP_ITERATION_COUNT; + } + } + LOG_INFO_V0("Warmup iteration count: %d", runtime->warmup_iteration_count); + } + + // (Timing) Specify whether to perform timing runs inside the aicpu executor + { + const char *env_timing_iterations = std::getenv("PTO2_TIMING_ITERATION_COUNT"); + if (env_timing_iterations) { + char *endptr; + int64_t val = strtol(env_timing_iterations, &endptr, 10); + if (endptr != env_timing_iterations && *endptr == '\0') { + runtime->timing_iteration_count = static_cast(val); + } else { + LOG_WARN( + "PTO2_TIMING_ITERATION_COUNT=%s is invalid, using default %d", env_timing_iterations, RUNTIME_DEFAULT_TIMING_ITERATION_COUNT + ); + runtime->timing_iteration_count = RUNTIME_DEFAULT_TIMING_ITERATION_COUNT; + } + } + LOG_INFO_V0("Timing iteration count: %d", 
runtime->timing_iteration_count); + } + // Read ready queue shard count from environment for AICPU scheduler { const char *env_shards = std::getenv("PTO2_READY_QUEUE_SHARDS"); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h index bd9d08ae8..2b4637193 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -54,6 +54,8 @@ // Default ready queue shards: one shard per worker thread (total minus orchestrator) constexpr int RUNTIME_DEFAULT_READY_QUEUE_SHARDS = PLATFORM_MAX_AICPU_THREADS - 1; +constexpr int RUNTIME_DEFAULT_WARMUP_ITERATION_COUNT = 0; +constexpr int RUNTIME_DEFAULT_TIMING_ITERATION_COUNT = 0; // ============================================================================= // Data Structures @@ -167,6 +169,10 @@ class Runtime { int sche_cpu_num; // Number of AICPU threads for scheduling int ready_queue_shards; // Number of ready queue shards (1..MAX_AICPU_THREADS, default MAX-1) + // Timing parameters (for precise performance estimation) + int warmup_iteration_count; + int timing_iteration_count; + // Ring buffer size overrides (0 = use compile-time defaults) uint64_t task_window_size; uint64_t heap_size; @@ -216,6 +222,9 @@ class Runtime { // Performance Profiling // ========================================================================= + inline int32_t get_warmup_iteration_count() const { return warmup_iteration_count; }; + inline int32_t get_timing_iteration_count() const { return timing_iteration_count; }; + // ========================================================================= // Device orchestration (for AICPU thread 3) // ========================================================================= diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp index d64a2b7d4..b43a222df 
100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp @@ -34,6 +34,8 @@ Runtime::Runtime() { worker_count = 0; sche_cpu_num = 1; ready_queue_shards = RUNTIME_DEFAULT_READY_QUEUE_SHARDS; + warmup_iteration_count = RUNTIME_DEFAULT_WARMUP_ITERATION_COUNT; + timing_iteration_count = RUNTIME_DEFAULT_TIMING_ITERATION_COUNT; task_window_size = 0; heap_size = 0; dep_pool_size = 0; From 847b64c3f1e5df6ea3328e931a9ca6fa2bd82563 Mon Sep 17 00:00:00 2001 From: s00831018 Date: Wed, 20 May 2026 09:53:31 +0200 Subject: [PATCH 02/15] Adding timing --- .../aicpu/aicpu_executor.cpp | 35 ++++++++++--------- .../host/runtime_maker.cpp | 32 +++++++++++++---- .../runtime/runtime.h | 2 ++ .../runtime/scheduler/scheduler_dispatch.cpp | 3 ++ .../runtime/shared/runtime.cpp | 1 + 5 files changed, 50 insertions(+), 23 deletions(-) diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index a72e27c36..0460563b6 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -806,27 +806,28 @@ extern "C" int32_t aicpu_execute(Runtime *runtime) { // Return code for running the kernel. 
It must be zero (no error) for all runs int32_t rc = 0; - // Adding timing runs, exclusively for performance evaluation: - const auto warmupIterationCount = runtime->get_warmup_iteration_count(); - const auto timingIterationCount = runtime->get_timing_iteration_count(); - - // First, perform warmup to disregard any cold cache effects and thread initialization times - for (int32_t i = 0; i < warmupIterationCount; i++) + // Performing timing evaluation, exclusively for performance evaluation: + const auto isTimingEnabled = runtime->get_timing_enabled(); + if (isTimingEnabled == true) { - rc |= g_aicpu_executor.run(runtime); + const auto warmupIterationCount = runtime->get_warmup_iteration_count(); + const auto timingIterationCount = runtime->get_timing_iteration_count(); - // Waiting for threads to come back before re-running. - g_aicpu_executor.barrier(); - } + // First, perform warmup to disregard any cold cache effects and thread initialization times + for (int32_t i = 0; i < warmupIterationCount; i++) rc |= g_aicpu_executor.run(runtime); - // Second, perform timed runs (the ones that count) - for (int32_t i = 0; i < timingIterationCount; i++) - { - rc |= g_aicpu_executor.run(runtime); + // Second, perform timed runs (the ones that count) + for (int32_t i = 0; i < timingIterationCount; i++) + { + // Waiting for threads to arrive for before-timing. + g_aicpu_executor.barrier(); - // Waiting for threads to come back before re-running. - g_aicpu_executor.barrier(); - } + rc |= g_aicpu_executor.run(runtime); + + // Waiting for threads to come back for after-timing. 
+ g_aicpu_executor.barrier(); + } + } // Perform actual kernel run rc |= g_aicpu_executor.run(runtime); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp index a0ecb1311..13a4f99ef 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp @@ -224,9 +224,28 @@ extern "C" int bind_prepared_to_runtime_impl( } int64_t t_args_end = _now_ms(); - // (Timing) Specify whether to perform warmup runs inside the aicpu executor + + // (Timing) Specify whether to perform timing analysis inside the aicpu executor { - const char *env_warmup_iterations = std::getenv("PTO2_WARMUP_ITERATION_COUNT"); + const char *env_timing_iterations = std::getenv("PTO2_KERNEL_TIMING_ENABLED"); + if (env_timing_iterations) { + std::string env_timing_iterations_string = std::string(env_timing_iterations); + bool isValidValue = false; + if (env_timing_iterations_string == "True") { runtime->is_timing_enabled = true; isValidValue = true; } + if (env_timing_iterations_string == "False") { runtime->is_timing_enabled = false; isValidValue = true; } + if (isValidValue == false) + { + LOG_WARN("PTO2_KERNEL_TIMING_ENABLED=%s is invalid, using default: \"False\"", env_timing_iterations); + runtime->is_timing_enabled = false; + } + } + LOG_INFO_V0("Is kernel timing enabled? %s", runtime->is_timing_enabled ? 
"True" : "False"); + } + + // (Timing) Specify how many warmup runs inside the aicpu executor, if timing is enabled + if (runtime->is_timing_enabled == true) + { + const char *env_warmup_iterations = std::getenv("PTO2_KERNEL_TIMING_WARMUP_COUNT"); if (env_warmup_iterations) { char *endptr; int64_t val = strtol(env_warmup_iterations, &endptr, 10); @@ -234,7 +253,7 @@ extern "C" int bind_prepared_to_runtime_impl( runtime->warmup_iteration_count = static_cast(val); } else { LOG_WARN( - "PTO2_WARMUP_ITERATION_COUNT=%s is invalid, using default %d", env_warmup_iterations, RUNTIME_DEFAULT_WARMUP_ITERATION_COUNT + "PTO2_KERNEL_TIMING_WARMUP_COUNT=%s is invalid, using default %d", env_warmup_iterations, RUNTIME_DEFAULT_WARMUP_ITERATION_COUNT ); runtime->warmup_iteration_count = RUNTIME_DEFAULT_WARMUP_ITERATION_COUNT; } @@ -242,9 +261,10 @@ extern "C" int bind_prepared_to_runtime_impl( LOG_INFO_V0("Warmup iteration count: %d", runtime->warmup_iteration_count); } - // (Timing) Specify whether to perform timing runs inside the aicpu executor + // (Timing) Specify how many timing runs inside the aicpu executor, if timing is enabled + if (runtime->is_timing_enabled == true) { - const char *env_timing_iterations = std::getenv("PTO2_TIMING_ITERATION_COUNT"); + const char *env_timing_iterations = std::getenv("PTO2_KERNEL_TIMING_ITERATION_COUNT"); if (env_timing_iterations) { char *endptr; int64_t val = strtol(env_timing_iterations, &endptr, 10); @@ -252,7 +272,7 @@ extern "C" int bind_prepared_to_runtime_impl( runtime->timing_iteration_count = static_cast(val); } else { LOG_WARN( - "PTO2_TIMING_ITERATION_COUNT=%s is invalid, using default %d", env_timing_iterations, RUNTIME_DEFAULT_TIMING_ITERATION_COUNT + "PTO2_KERNEL_TIMING_ITERATION_COUNT=%s is invalid, using default %d", env_timing_iterations, RUNTIME_DEFAULT_TIMING_ITERATION_COUNT ); runtime->timing_iteration_count = RUNTIME_DEFAULT_TIMING_ITERATION_COUNT; } diff --git 
a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h index 2b4637193..ea518eca8 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -170,6 +170,7 @@ class Runtime { int ready_queue_shards; // Number of ready queue shards (1..MAX_AICPU_THREADS, default MAX-1) // Timing parameters (for precise performance estimation) + bool is_timing_enabled; int warmup_iteration_count; int timing_iteration_count; @@ -222,6 +223,7 @@ class Runtime { // Performance Profiling // ========================================================================= + inline bool get_timing_enabled() const { return is_timing_enabled; }; inline int32_t get_warmup_iteration_count() const { return warmup_iteration_count; }; inline int32_t get_timing_iteration_count() const { return timing_iteration_count; }; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp index a028fd138..54415364d 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp @@ -314,6 +314,8 @@ void SchedulerContext::dispatch_shape( // ============================================================================= int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_idx) { + LOG_INFO_V0("Thread %d: At resolve_and_dispatch", thread_idx); + always_assert(sched_ != nullptr); CoreTracker &tracker = core_trackers_[thread_idx]; LOG_INFO_V0("Thread %d: resolve_and_dispatch entry", thread_idx); @@ -392,6 +394,7 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ l2_perf.sched_start_ts = get_sys_cnt_aicpu(); #endif + LOG_INFO_V0("Thread %d: Scheduling Start. 
(Completed: %s)", thread_idx, completed_.load() ? "True" : "False"); while (true) { if (completed_.load(std::memory_order_acquire)) { break; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp index b43a222df..0bc984871 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp @@ -34,6 +34,7 @@ Runtime::Runtime() { worker_count = 0; sche_cpu_num = 1; ready_queue_shards = RUNTIME_DEFAULT_READY_QUEUE_SHARDS; + is_timing_enabled = false; warmup_iteration_count = RUNTIME_DEFAULT_WARMUP_ITERATION_COUNT; timing_iteration_count = RUNTIME_DEFAULT_TIMING_ITERATION_COUNT; task_window_size = 0; From 38e50089d8d823e689c8cdd8d89396dd1170d20d Mon Sep 17 00:00:00 2001 From: s00831018 Date: Wed, 20 May 2026 10:58:44 +0200 Subject: [PATCH 03/15] Progress --- .../aicpu/aicpu_executor.cpp | 74 +++++++++++++------ .../runtime/scheduler/scheduler_cold_path.cpp | 46 ++++++++++++ .../runtime/scheduler/scheduler_context.h | 3 + .../runtime/scheduler/scheduler_dispatch.cpp | 2 - 4 files changed, 101 insertions(+), 24 deletions(-) diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index 0460563b6..e71da8341 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -145,8 +145,10 @@ struct AicpuExecutor { // ===== Methods ===== int32_t init(Runtime *runtime); int32_t run(Runtime *runtime); + int32_t performTimingRuns(Runtime *runtime); void deinit(Runtime *runtime); int32_t getThreadId() { return thread_idx_accumulator.fetch_add(1); } + void resetSchedulerContext() { sched_ctx_.reset(); } // Barrier function to synchronize threads inline void barrier() @@ -769,6 +771,54 @@ void 
AicpuExecutor::deinit(Runtime *runtime) { LOG_INFO_V0("DeInit: AicpuExecutor reset complete"); } +int32_t AicpuExecutor::performTimingRuns(Runtime *runtime) +{ + const auto warmupIterationCount = runtime->get_warmup_iteration_count(); + const auto timingIterationCount = runtime->get_timing_iteration_count(); + + // Return code for running the kernel. It must be zero (no error) for all runs + int32_t rc = 0; + + // First, perform warmup to disregard any cold cache effects and thread initialization times + for (int32_t i = 0; i < warmupIterationCount; i++) + { + barrier(); + uint64_t t0_ts = get_sys_cnt_aicpu(); + rc |= run(runtime); + barrier(); + uint64_t t1_ts = get_sys_cnt_aicpu(); + LOG_INFO_V9("Thread %d: Warmup %d/%d Time: %luns", my_thread_idx_, i, warmupIterationCount, t1_ts - t0_ts); + + // Waiting for threads to come back for after-timing. + // deinit(runtime); + // init(runtime); + } + + // Second, perform timed runs (the ones that count) + for (int32_t i = 0; i < timingIterationCount; i++) + { + // Waiting for threads to arrive for before-timing. + barrier(); + uint64_t t0_ts = get_sys_cnt_aicpu(); + rc |= run(runtime); + barrier(); + uint64_t t1_ts = get_sys_cnt_aicpu(); + LOG_INFO_V9("Thread %d: Timing %d/%d time: %luns", my_thread_idx_, i, timingIterationCount, t1_ts - t0_ts); + + // Waiting for threads to come back for after-timing. + // deinit(runtime); + // init(runtime); + } + + // A barier to make sure all threads agree at this point before running the actual run + barrier(); + + if (rc != 0) LOG_ERROR("Thread %d - Timed runs failed with rc=%d", my_thread_idx_, rc); + + // Return code + return rc; +} + // ===== Public Entry Point ===== /** @@ -803,31 +853,11 @@ extern "C" int32_t aicpu_execute(Runtime *runtime) { } } - // Return code for running the kernel. It must be zero (no error) for all runs + // Return code. 
Must be zero for all runs int32_t rc = 0; // Performing timing evaluation, exclusively for performance evaluation: - const auto isTimingEnabled = runtime->get_timing_enabled(); - if (isTimingEnabled == true) - { - const auto warmupIterationCount = runtime->get_warmup_iteration_count(); - const auto timingIterationCount = runtime->get_timing_iteration_count(); - - // First, perform warmup to disregard any cold cache effects and thread initialization times - for (int32_t i = 0; i < warmupIterationCount; i++) rc |= g_aicpu_executor.run(runtime); - - // Second, perform timed runs (the ones that count) - for (int32_t i = 0; i < timingIterationCount; i++) - { - // Waiting for threads to arrive for before-timing. - g_aicpu_executor.barrier(); - - rc |= g_aicpu_executor.run(runtime); - - // Waiting for threads to come back for after-timing. - g_aicpu_executor.barrier(); - } - } + if (runtime->get_timing_enabled() == true) rc |= g_aicpu_executor.performTimingRuns(runtime); // Perform actual kernel run rc |= g_aicpu_executor.run(runtime); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp index 608efbc60..c1651de13 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp @@ -863,6 +863,52 @@ int32_t SchedulerContext::init( return 0; } +void SchedulerContext::reset() +{ + // Reset all per-core execution state + for (int32_t i = 0; i < RUNTIME_MAX_WORKER; i++) { + core_exec_states_[i] = {}; + core_exec_states_[i].running_reg_task_id = AICPU_TASK_INVALID; + core_exec_states_[i].pending_reg_task_id = AICPU_TASK_INVALID; + } + + // Clear per-core dispatch payloads + memset(payload_per_core_, 0, sizeof(payload_per_core_)); + memset(deferred_slab_per_core_, 0, sizeof(deferred_slab_per_core_)); + + // Reset sync-start drain 
coordination — a previous run that aborted mid-drain + // would otherwise leave dirty pending/elected/ack state for the next reuse. + drain_state_.sync_start_pending.store(0, std::memory_order_release); + drain_state_.drain_worker_elected.store(0, std::memory_order_release); + drain_state_.drain_ack_mask.store(0, std::memory_order_release); + drain_state_.pending_task = nullptr; + + // Reset task counters and orchestrator state + completed_tasks_.store(0, std::memory_order_release); + total_tasks_ = 0; + orchestrator_done_ = false; + pto2_init_done_.store(false, std::memory_order_release); + pto2_init_complete_.store(false, std::memory_order_release); + + // Reset core transition state + transition_requested_.store(false, std::memory_order_release); + wait_reassign_.store(0, std::memory_order_release); + reassigned_.store(false, std::memory_order_release); + completed_.store(false, std::memory_order_release); + + // Reset core discovery and assignment state + aic_count_ = 0; + aiv_count_ = 0; + cores_total_num_ = 0; + thread_num_ = 0; + sched_thread_num_ = 0; + orch_to_sched_ = false; + active_sched_threads_ = 0; + for (int32_t t = 0; t < MAX_AICPU_THREADS; t++) { + core_trackers_[t] = CoreTracker{}; + } +} + void SchedulerContext::deinit() { // Reset all per-core execution state for (int32_t i = 0; i < RUNTIME_MAX_WORKER; i++) { diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h index 79fc6b648..5908c8615 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h @@ -62,6 +62,9 @@ class SchedulerContext { int32_t init(Runtime *runtime, int32_t thread_num, int32_t sched_thread_num, bool orch_to_sched, uint64_t regs_base); + // Resets all scheduling progress to before execution + void reset(); + // Reset all 
SchedulerContext-owned state to its post-construction defaults. // Called by AicpuExecutor::deinit() during per-run teardown. void deinit(); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp index 54415364d..224d17aab 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp @@ -314,8 +314,6 @@ void SchedulerContext::dispatch_shape( // ============================================================================= int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_idx) { - LOG_INFO_V0("Thread %d: At resolve_and_dispatch", thread_idx); - always_assert(sched_ != nullptr); CoreTracker &tracker = core_trackers_[thread_idx]; LOG_INFO_V0("Thread %d: resolve_and_dispatch entry", thread_idx); From f1953c504821c2b02a6f32111e73a37c989bb188 Mon Sep 17 00:00:00 2001 From: s00831018 Date: Wed, 20 May 2026 11:51:57 +0200 Subject: [PATCH 04/15] Simplifying --- .../aicpu/aicpu_executor.cpp | 16 +++---- .../runtime/scheduler/scheduler_cold_path.cpp | 46 ------------------- .../runtime/scheduler/scheduler_context.h | 3 -- 3 files changed, 6 insertions(+), 59 deletions(-) diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index e71da8341..aa8c66939 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -148,7 +148,6 @@ struct AicpuExecutor { int32_t performTimingRuns(Runtime *runtime); void deinit(Runtime *runtime); int32_t getThreadId() { return thread_idx_accumulator.fetch_add(1); } - void resetSchedulerContext() { sched_ctx_.reset(); } // Barrier function to synchronize threads inline void barrier() @@ 
-789,9 +788,9 @@ int32_t AicpuExecutor::performTimingRuns(Runtime *runtime) uint64_t t1_ts = get_sys_cnt_aicpu(); LOG_INFO_V9("Thread %d: Warmup %d/%d Time: %luns", my_thread_idx_, i, warmupIterationCount, t1_ts - t0_ts); - // Waiting for threads to come back for after-timing. - // deinit(runtime); - // init(runtime); + // Resetting execution back to start + deinit(runtime); + init(runtime); } // Second, perform timed runs (the ones that count) @@ -805,14 +804,11 @@ int32_t AicpuExecutor::performTimingRuns(Runtime *runtime) uint64_t t1_ts = get_sys_cnt_aicpu(); LOG_INFO_V9("Thread %d: Timing %d/%d time: %luns", my_thread_idx_, i, timingIterationCount, t1_ts - t0_ts); - // Waiting for threads to come back for after-timing. - // deinit(runtime); - // init(runtime); + // Resetting execution back to start + deinit(runtime); + init(runtime); } - // A barier to make sure all threads agree at this point before running the actual run - barrier(); - if (rc != 0) LOG_ERROR("Thread %d - Timed runs failed with rc=%d", my_thread_idx_, rc); // Return code diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp index c1651de13..608efbc60 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp @@ -863,52 +863,6 @@ int32_t SchedulerContext::init( return 0; } -void SchedulerContext::reset() -{ - // Reset all per-core execution state - for (int32_t i = 0; i < RUNTIME_MAX_WORKER; i++) { - core_exec_states_[i] = {}; - core_exec_states_[i].running_reg_task_id = AICPU_TASK_INVALID; - core_exec_states_[i].pending_reg_task_id = AICPU_TASK_INVALID; - } - - // Clear per-core dispatch payloads - memset(payload_per_core_, 0, sizeof(payload_per_core_)); - memset(deferred_slab_per_core_, 0, sizeof(deferred_slab_per_core_)); - - // Reset 
sync-start drain coordination — a previous run that aborted mid-drain - // would otherwise leave dirty pending/elected/ack state for the next reuse. - drain_state_.sync_start_pending.store(0, std::memory_order_release); - drain_state_.drain_worker_elected.store(0, std::memory_order_release); - drain_state_.drain_ack_mask.store(0, std::memory_order_release); - drain_state_.pending_task = nullptr; - - // Reset task counters and orchestrator state - completed_tasks_.store(0, std::memory_order_release); - total_tasks_ = 0; - orchestrator_done_ = false; - pto2_init_done_.store(false, std::memory_order_release); - pto2_init_complete_.store(false, std::memory_order_release); - - // Reset core transition state - transition_requested_.store(false, std::memory_order_release); - wait_reassign_.store(0, std::memory_order_release); - reassigned_.store(false, std::memory_order_release); - completed_.store(false, std::memory_order_release); - - // Reset core discovery and assignment state - aic_count_ = 0; - aiv_count_ = 0; - cores_total_num_ = 0; - thread_num_ = 0; - sched_thread_num_ = 0; - orch_to_sched_ = false; - active_sched_threads_ = 0; - for (int32_t t = 0; t < MAX_AICPU_THREADS; t++) { - core_trackers_[t] = CoreTracker{}; - } -} - void SchedulerContext::deinit() { // Reset all per-core execution state for (int32_t i = 0; i < RUNTIME_MAX_WORKER; i++) { diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h index 5908c8615..79fc6b648 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h @@ -62,9 +62,6 @@ class SchedulerContext { int32_t init(Runtime *runtime, int32_t thread_num, int32_t sched_thread_num, bool orch_to_sched, uint64_t regs_base); - // Resets all scheduling progress to before execution - void reset(); - // Reset all 
SchedulerContext-owned state to its post-construction defaults. // Called by AicpuExecutor::deinit() during per-run teardown. void deinit(); From 3bf8780281c669762c0b396f8991d9eae19a92ce Mon Sep 17 00:00:00 2001 From: s00831018 Date: Wed, 20 May 2026 12:10:54 +0200 Subject: [PATCH 05/15] Addressing some agent suggestions --- .../aicpu/aicpu_executor.cpp | 18 +++++++++++------- .../host/runtime_maker.cpp | 2 +- .../tensormap_and_ringbuffer/runtime/runtime.h | 10 +++++----- 3 files changed, 17 insertions(+), 13 deletions(-) diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index 717d3b83f..ba0187bd6 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -154,10 +154,10 @@ struct AicpuExecutor { { // Two phase barrier (in_out), guarantees that all threads are retained in the rendezvous point, until all of them have arrived barrier_counter_in_.fetch_add(1); - while (barrier_counter_in_.load(std::memory_order_relaxed) % thread_num_ != 0); + while (barrier_counter_in_.load(std::memory_order_acquire) % thread_num_ != 0); barrier_counter_out_.fetch_add(1); - while (barrier_counter_out_.load(std::memory_order_relaxed) % thread_num_ != 0); + while (barrier_counter_out_.load(std::memory_order_acquire) % thread_num_ != 0); } @@ -698,7 +698,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { // Check if this is the last thread to finish int32_t prev_finished = finished_count_.fetch_add(1, std::memory_order_acq_rel); - if (prev_finished + 1 % thread_num_ == 0) { + if ((prev_finished + 1) % thread_num_ == 0) { finished_.store(true, std::memory_order_release); // Destroy PTO2 runtime. 
sm_handle / rt are recreated every run so we // always tear them down here, but we keep the per-cid orch SO entries @@ -777,8 +777,10 @@ int32_t AicpuExecutor::performTimingRuns(Runtime *runtime) LOG_INFO_V9("Thread %d: Warmup %d/%d Time: %luns", my_thread_idx_, i, warmupIterationCount, t1_ts - t0_ts); // Resetting execution back to start - deinit(runtime); - init(runtime); + if (my_thread_idx_ == 0) { + deinit(runtime); + init(runtime); + } } // Second, perform timed runs (the ones that count) @@ -793,8 +795,10 @@ int32_t AicpuExecutor::performTimingRuns(Runtime *runtime) LOG_INFO_V9("Thread %d: Timing %d/%d time: %luns", my_thread_idx_, i, timingIterationCount, t1_ts - t0_ts); // Resetting execution back to start - deinit(runtime); - init(runtime); + if (my_thread_idx_ == 0) { + deinit(runtime); + init(runtime); + } } if (rc != 0) LOG_ERROR("Thread %d - Timed runs failed with rc=%d", my_thread_idx_, rc); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp index 1209dece2..1abb6e855 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp @@ -277,7 +277,7 @@ extern "C" int bind_prepared_to_runtime_impl( runtime->timing_iteration_count = RUNTIME_DEFAULT_TIMING_ITERATION_COUNT; } } - LOG_INFO_V0("Warmup iteration count: %d", runtime->timing_iteration_count); + LOG_INFO_V0("Timing iteration count: %d", runtime->timing_iteration_count); } // Read ready queue shard count from environment for AICPU scheduler diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h index efdce1321..4bd245bb5 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -176,11 +176,6 @@ class Runtime { int sche_cpu_num; // Number of AICPU threads for 
scheduling int ready_queue_shards; // Number of ready queue shards (1..MAX_AICPU_THREADS, default MAX-1) - // Timing parameters (for precise performance estimation) - bool is_timing_enabled; - int warmup_iteration_count; - int timing_iteration_count; - // Ring buffer size overrides (0 = use compile-time defaults) uint64_t task_window_size; uint64_t heap_size; @@ -220,6 +215,11 @@ class Runtime { char device_orch_func_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME]; char device_orch_config_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME]; + // Timing parameters (for precise performance estimation) + bool is_timing_enabled; + int warmup_iteration_count; + int timing_iteration_count; + public: /** * Constructor - zero-initialize all arrays From f9f25c3b44b98484a5281a35a518df379f23d0dc Mon Sep 17 00:00:00 2001 From: s00831018 Date: Wed, 20 May 2026 12:14:18 +0200 Subject: [PATCH 06/15] Fixes --- .../tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp | 6 +++--- .../tensormap_and_ringbuffer/runtime/runtime.h | 11 +++++------ 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index ba0187bd6..0f4dc79b4 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -528,7 +528,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { #if PTO2_PROFILING if (get_l2_perf_level() >= L2PerfLevel::ORCH_PHASES) { - l2_perf_aicpu_set_orch_thread_idx(thread_idx); + l2_perf_aicpu_set_orch_thread_idx(my_thread_idx_); } #endif @@ -594,7 +594,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { cycles_to_us(p.args_cycle), p.args_cycle * 100.0 / total, static_cast(p.args_atomic_count) ); LOG_INFO_V9( - "Thread %d: fanin+ready : %.3fus (%.1f%%) work=%.3fus wait=%.3fus", thread_idx, + "Thread %d: fanin+ready : %.3fus (%.1f%%) work=%.3fus wait=%.3fus", my_thread_idx_, 
cycles_to_us(p.fanin_cycle), p.fanin_cycle * 100.0 / total, cycles_to_us(p.fanin_cycle - p.fanin_wait_cycle), cycles_to_us(p.fanin_wait_cycle) ); @@ -647,7 +647,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { // Signal completion to the orchestrator state machine rt_orchestration_done(rt); - sched_ctx_.on_orchestration_done(runtime, rt, thread_idx, total_tasks); + sched_ctx_.on_orchestration_done(runtime, rt, my_thread_idx_, total_tasks); } #if PTO2_PROFILING uint64_t orch_end_ts = get_sys_cnt_aicpu(); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h index 4bd245bb5..0da9b550b 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -190,6 +190,11 @@ class Runtime { // When false (default), orchestrator threads exit after orchestration without dispatching tasks. // Controlled via PTO2_ORCH_TO_SCHED environment variable. 
bool orch_to_sched; + + // Timing parameters (for precise performance estimation) + bool is_timing_enabled; + int warmup_iteration_count; + int timing_iteration_count; private: // Kernel binary tracking for cleanup @@ -214,12 +219,6 @@ class Runtime { bool register_new_callable_id_; char device_orch_func_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME]; char device_orch_config_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME]; - - // Timing parameters (for precise performance estimation) - bool is_timing_enabled; - int warmup_iteration_count; - int timing_iteration_count; - public: /** * Constructor - zero-initialize all arrays From 220d1d9faa6e4de60a4b63cbd9d74efb085e03ac Mon Sep 17 00:00:00 2001 From: s00831018 Date: Wed, 20 May 2026 13:05:11 +0200 Subject: [PATCH 07/15] Fix --- .../tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp | 7 +++---- .../runtime/tensormap_and_ringbuffer/runtime/runtime.h | 1 + .../tensormap_and_ringbuffer/runtime/shared/runtime.cpp | 1 + 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index 0f4dc79b4..f840f24b6 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -253,10 +253,11 @@ int32_t AicpuExecutor::run(Runtime *runtime) { DeviceOrchestrationBindRuntimeFunc *p_bind = &orch_so_table_[callable_id].bind; DeviceOrchestrationConfigFunc *p_config_func = &orch_so_table_[callable_id].config_func; const bool reload_so = runtime->register_new_callable_id(); - const bool so_in_use = orch_so_table_[callable_id].in_use; - if (reload_so && so_in_use == false) { + if (reload_so) { LOG_INFO_V0("Thread %d: New orch SO detected (callable_id=%d), (re)loading", my_thread_idx_, callable_id); + runtime->notify_callable_id_registered(); + if (*p_handle != nullptr) { dlclose(*p_handle); *p_handle = nullptr; @@ -394,7 +395,6 @@ 
int32_t AicpuExecutor::run(Runtime *runtime) { *p_bind = bind_runtime_func; *p_config_func = config_func; snprintf(p_path, 256, "%s", so_path); - orch_so_table_[callable_id].in_use = true; } else { LOG_INFO_V0( "Thread %d: Reusing cached orch SO handle=%p (callable_id=%d)", my_thread_idx_, *p_handle, callable_id @@ -434,7 +434,6 @@ int32_t AicpuExecutor::run(Runtime *runtime) { *p_func = nullptr; *p_bind = nullptr; *p_config_func = nullptr; - orch_so_table_[callable_id].in_use = false; // Unblock scheduler threads before returning so they don't spin forever. runtime_init_ready_.store(true, std::memory_order_release); return -1; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h index 0da9b550b..271e50a7a 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -256,6 +256,7 @@ class Runtime { void set_active_callable_id(int32_t callable_id, bool is_new); int32_t get_active_callable_id() const; bool register_new_callable_id() const; + void notify_callable_id_registered(); void set_device_orch_func_name(const char *name); const char *get_device_orch_func_name() const; void set_device_orch_config_name(const char *name); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp index 0bc984871..4e6f25435 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp @@ -96,6 +96,7 @@ void Runtime::set_active_callable_id(int32_t callable_id, bool is_new) { int32_t Runtime::get_active_callable_id() const { return active_callable_id_; } bool Runtime::register_new_callable_id() const { return register_new_callable_id_; } +void Runtime::notify_callable_id_registered() { register_new_callable_id_ = false; } void 
Runtime::set_device_orch_func_name(const char *name) { if (name == nullptr) { From 192ef42a57efd85fc18658aadc4071bcac2652f7 Mon Sep 17 00:00:00 2001 From: s00831018 Date: Thu, 21 May 2026 10:14:06 +0200 Subject: [PATCH 08/15] Succesffuly running two consecutive inner runs --- .../aicpu/aicpu_executor.cpp | 90 +++++++++++-------- .../runtime/scheduler/pto_scheduler.cpp | 18 ++-- .../runtime/scheduler/scheduler_dispatch.cpp | 8 +- 3 files changed, 71 insertions(+), 45 deletions(-) diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index f840f24b6..d60a43601 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -145,6 +145,7 @@ struct AicpuExecutor { // ===== Methods ===== int32_t init(Runtime *runtime); int32_t run(Runtime *runtime); + int32_t shutdown(Runtime *runtime); int32_t performTimingRuns(Runtime *runtime); void deinit(Runtime *runtime); int32_t getThreadId() { return thread_idx_accumulator.fetch_add(1); } @@ -687,35 +688,6 @@ int32_t AicpuExecutor::run(Runtime *runtime) { LOG_INFO_V0("Thread %d: Scheduling Completed", my_thread_idx_); - // Always shutdown AICore — even if sched_ctx_.completed_ was already true. - // platform_deinit_aicore_regs is idempotent; orchestrator threads have - // core_trackers_[my_thread_idx_].core_num() == 0 so they skip the loop harmlessly. - int32_t shutdown_rc = sched_ctx_.shutdown(my_thread_idx_); - if (shutdown_rc != 0 && run_rc == 0) { - run_rc = shutdown_rc; - } - - // Check if this is the last thread to finish - int32_t prev_finished = finished_count_.fetch_add(1, std::memory_order_acq_rel); - if ((prev_finished + 1) % thread_num_ == 0) { - finished_.store(true, std::memory_order_release); - // Destroy PTO2 runtime. 
sm_handle / rt are recreated every run so we - // always tear them down here, but we keep the per-cid orch SO entries - // alive for the next run's cache-hit reuse (see run() reload_so branch). - if (rt != nullptr) { - // Clear g_current_runtime in this DSO and in the orchestration SO before destroying rt. - const int32_t callable_id = runtime->get_active_callable_id(); - framework_bind_runtime(nullptr); - if (callable_id >= 0 && callable_id < MAX_REGISTERED_CALLABLE_IDS) { - DeviceOrchestrationBindRuntimeFunc bind = orch_so_table_[callable_id].bind; - if (bind != nullptr) { - bind(nullptr); - } - } - runtime_destroy(rt); - } - } - return run_rc; } @@ -757,6 +729,37 @@ void AicpuExecutor::deinit(Runtime *runtime) { LOG_INFO_V0("DeInit: AicpuExecutor reset complete"); } +int32_t AicpuExecutor::shutdown(Runtime *runtime) +{ + // Always shutdown AICore — even if sched_ctx_.completed_ was already true. + // platform_deinit_aicore_regs is idempotent; orchestrator threads have + // core_trackers_[my_thread_idx_].core_num() == 0 so they skip the loop harmlessly. + int32_t shutdown_rc = sched_ctx_.shutdown(my_thread_idx_); + + // Check if this is the last thread to finish + int32_t prev_finished = finished_count_.fetch_add(1, std::memory_order_acq_rel); + if ((prev_finished + 1) % thread_num_ == 0) { + finished_.store(true, std::memory_order_release); + // Destroy PTO2 runtime. sm_handle / rt are recreated every run so we + // always tear them down here, but we keep the per-cid orch SO entries + // alive for the next run's cache-hit reuse (see run() reload_so branch). + if (rt != nullptr) { + // Clear g_current_runtime in this DSO and in the orchestration SO before destroying rt. 
+ const int32_t callable_id = runtime->get_active_callable_id(); + framework_bind_runtime(nullptr); + if (callable_id >= 0 && callable_id < MAX_REGISTERED_CALLABLE_IDS) { + DeviceOrchestrationBindRuntimeFunc bind = orch_so_table_[callable_id].bind; + if (bind != nullptr) { + bind(nullptr); + } + } + runtime_destroy(rt); + } + } + + return shutdown_rc; +} + int32_t AicpuExecutor::performTimingRuns(Runtime *runtime) { const auto warmupIterationCount = runtime->get_warmup_iteration_count(); @@ -844,16 +847,38 @@ extern "C" int32_t aicpu_execute(Runtime *runtime) { int32_t rc = 0; // Performing timing evaluation, exclusively for performance evaluation: - if (runtime->get_timing_enabled() == true) rc |= g_aicpu_executor.performTimingRuns(runtime); + // if (runtime->get_timing_enabled() == true) rc |= g_aicpu_executor.performTimingRuns(runtime); // Perform actual kernel run rc |= g_aicpu_executor.run(runtime); + g_aicpu_executor.barrier(); + if (my_thread_idx_ == 0) + { + g_aicpu_executor.deinit(runtime); + g_aicpu_executor.init(runtime); + } + g_aicpu_executor.barrier(); + + rc |= g_aicpu_executor.run(runtime); + if (rc != 0) { LOG_ERROR("aicpu_execute: Thread execution failed with rc=%d", rc); } int32_t runtime_rc = read_pto2_runtime_status(runtime); + + if (runtime_rc != 0) { + LOG_ERROR("aicpu_execute: PTO2 runtime failed with rc=%d", runtime_rc); + return runtime_rc; + } + + // Shutting down + int32_t shutdown_rc = g_aicpu_executor.shutdown(runtime); + if (shutdown_rc != 0) { + LOG_ERROR("aicpu_execute: shutdown failed with rc=%d", shutdown_rc); + return shutdown_rc; + } // Last thread cleans up if (g_aicpu_executor.finished_.load(std::memory_order_acquire)) { @@ -861,11 +886,6 @@ extern "C" int32_t aicpu_execute(Runtime *runtime) { g_aicpu_executor.deinit(runtime); } - if (runtime_rc != 0) { - LOG_ERROR("aicpu_execute: PTO2 runtime failed with rc=%d", runtime_rc); - return runtime_rc; - } - if (rc != 0) { return rc; } diff --git 
a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp index 428897f3d..48066ac9d 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp @@ -224,14 +224,14 @@ void PTO2SchedulerState::destroy() { void PTO2SchedulerState::print_stats() { PTO2SchedulerState *sched = this; - LOG_INFO_V0("=== Scheduler Statistics ==="); + LOG_INFO_V9("=== Scheduler Statistics ==="); for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { if (sched->ring_sched_states[r].last_task_alive > 0) { - LOG_INFO_V0("Ring %d:", r); - LOG_INFO_V0(" last_task_alive: %d", sched->ring_sched_states[r].last_task_alive); + LOG_INFO_V9("Ring %d:", r); + LOG_INFO_V9(" last_task_alive: %d", sched->ring_sched_states[r].last_task_alive); auto &dp = sched->ring_sched_states[r].dep_pool; if (dp.top > 0) { - LOG_INFO_V0( + LOG_INFO_V9( " dep_pool: top=%d tail=%d used=%d high_water=%d capacity=%d", dp.top, dp.tail, dp.top - dp.tail, dp.high_water, dp.capacity ); @@ -242,19 +242,19 @@ void PTO2SchedulerState::print_stats() { LOG_INFO_V0("tasks_completed: %lld", (long long)sched->tasks_completed.load(std::memory_order_relaxed)); LOG_INFO_V0("tasks_consumed: %lld", (long long)sched->tasks_consumed.load(std::memory_order_relaxed)); #endif - LOG_INFO_V0("============================"); + LOG_INFO_V9("============================"); } void PTO2SchedulerState::print_queues() { PTO2SchedulerState *sched = this; - LOG_INFO_V0("=== Ready Queues ==="); + LOG_INFO_V9("=== Ready Queues ==="); const char *shape_names[] = {"AIC", "AIV", "MIX"}; for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { - LOG_INFO_V0(" %s: count=%" PRIu64, shape_names[i], sched->ready_queues[i].size()); + LOG_INFO_V9(" %s: count=%" PRIu64, shape_names[i], sched->ready_queues[i].size()); } - LOG_INFO_V0(" DUMMY: count=%" PRIu64, 
sched->dummy_ready_queue.size()); + LOG_INFO_V9(" DUMMY: count=%" PRIu64, sched->dummy_ready_queue.size()); - LOG_INFO_V0("===================="); + LOG_INFO_V9("===================="); } diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp index eccd96af1..e2970dbb3 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp @@ -238,6 +238,7 @@ void SchedulerContext::dispatch_shape( int32_t thread_idx, PTO2ResourceShape shape, CoreTracker::DispatchPhase phase, PTO2LocalReadyBuffer &local_buf, CoreTracker &tracker, bool &entered_drain, bool &made_progress, bool &try_pushed ) { + #if PTO2_SCHED_PROFILING auto &l2_perf = sched_l2_perf_[thread_idx]; #endif @@ -260,15 +261,18 @@ void SchedulerContext::dispatch_shape( if (slot_state->active_mask.requires_sync_start()) { if (is_pending) { sched_->ready_queues[static_cast(shape)].push(slot_state); + LOG_INFO_V9("Thread %d - Pushed task", thread_idx); continue; } int32_t available = cores.count(); if (available < slot_state->logical_block_num) { if (!enter_drain_mode(slot_state, slot_state->logical_block_num)) { sched_->ready_queues[static_cast(shape)].push(slot_state); + LOG_INFO_V9("Thread %d - Pushed task", thread_idx); } for (int rem = bi + 1; rem < got; rem++) { sched_->ready_queues[static_cast(shape)].push(batch[rem]); + LOG_INFO_V9("Thread %d - Pushed task", thread_idx); } entered_drain = true; break; @@ -277,6 +281,7 @@ void SchedulerContext::dispatch_shape( if (!cores.has_value()) { sched_->ready_queues[static_cast(shape)].push_batch(&batch[bi], got - bi); + LOG_INFO_V9("Thread %d - Pushed task", thread_idx); break; } @@ -293,6 +298,7 @@ void SchedulerContext::dispatch_shape( if (slot_state->next_block_idx < slot_state->logical_block_num) { 
sched_->ready_queues[static_cast(shape)].push(slot_state); + LOG_INFO_V9("Thread %d - Pushed task", thread_idx); } made_progress = true; #if PTO2_SCHED_PROFILING @@ -391,7 +397,6 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ l2_perf.sched_start_ts = get_sys_cnt_aicpu(); #endif - LOG_INFO_V0("Thread %d: Scheduling Start. (Completed: %s)", thread_idx, completed_.load() ? "True" : "False"); while (true) { if (completed_.load(std::memory_order_acquire)) { break; @@ -578,6 +583,7 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ #endif if (local_buf.count > 0) { ready_queue.push_batch(local_buf.slot_states, local_buf.count); + LOG_INFO_V9("Thread %d - Pushed task", thread_idx); local_buf.count = 0; } } From 108bfc4f7ff472803dcecd4ea957bdd8c663693a Mon Sep 17 00:00:00 2001 From: s00831018 Date: Thu, 21 May 2026 10:27:57 +0200 Subject: [PATCH 09/15] Recovering timing runs --- .../aicpu/aicpu_executor.cpp | 23 +++++++------------ 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index d60a43601..e2c1320b6 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -843,25 +843,18 @@ extern "C" int32_t aicpu_execute(Runtime *runtime) { } } - // Return code. 
Must be zero for all runs - int32_t rc = 0; - // Performing timing evaluation, exclusively for performance evaluation: - // if (runtime->get_timing_enabled() == true) rc |= g_aicpu_executor.performTimingRuns(runtime); - - // Perform actual kernel run - rc |= g_aicpu_executor.run(runtime); - - g_aicpu_executor.barrier(); - if (my_thread_idx_ == 0) + if (runtime->get_timing_enabled() == true) { - g_aicpu_executor.deinit(runtime); - g_aicpu_executor.init(runtime); + int32_t timing_rc = g_aicpu_executor.performTimingRuns(runtime); + if (timing_rc != 0) { + LOG_ERROR("aicpu_execute: timing run failed with rc=%d", timing_rc); + return timing_rc; + } } - g_aicpu_executor.barrier(); - - rc |= g_aicpu_executor.run(runtime); + // Perform actual kernel run + int32_t rc = g_aicpu_executor.run(runtime); if (rc != 0) { LOG_ERROR("aicpu_execute: Thread execution failed with rc=%d", rc); } From 55580d624b0d104d5e3c3148a4d0296af7e9e2a9 Mon Sep 17 00:00:00 2001 From: s00831018 Date: Thu, 21 May 2026 11:19:16 +0200 Subject: [PATCH 10/15] Separated orchestration loading from actual run --- .../aicpu/aicpu_executor.cpp | 773 +++++++++--------- .../runtime/shared/runtime.cpp | 1 - 2 files changed, 396 insertions(+), 378 deletions(-) diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index e2c1320b6..f86e7f5eb 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -131,7 +131,7 @@ struct AicpuExecutor { std::atomic runtime_init_ready_{false}; // Cached orch args pointer set by the orchestration thread before scheduler - // init; consumed by the (*p_func)(*orch_args_cached_) invocation below. + // init; consumed by the (*p_func_)(*orch_args_cached_) invocation below. const ChipStorageTaskArgs *orch_args_cached_{nullptr}; // Per-callable_id table. 
Single orch thread today, so first-write/read @@ -142,10 +142,18 @@ struct AicpuExecutor { // ===== Scheduler context (owns all dispatch/completion/drain state) ===== SchedulerContext sched_ctx_; + // ===== Orchestrator dynamically loaded structures + void **p_handle_{nullptr}; + char *p_path_{nullptr}; + DeviceOrchestrationFunc *p_func_{nullptr}; + DeviceOrchestrationBindRuntimeFunc *p_bind_{nullptr}; + DeviceOrchestrationConfigFunc *p_config_func_{nullptr}; + // ===== Methods ===== int32_t init(Runtime *runtime); int32_t run(Runtime *runtime); int32_t shutdown(Runtime *runtime); + int32_t loadOrchestrator(Runtime* runtime); int32_t performTimingRuns(Runtime *runtime); void deinit(Runtime *runtime); int32_t getThreadId() { return thread_idx_accumulator.fetch_add(1); } @@ -213,442 +221,453 @@ int32_t AicpuExecutor::init(Runtime *runtime) { finished_count_.store(0, std::memory_order_release); + // Loading orchestrator + int32_t load_orch_rc = loadOrchestrator(runtime); + if (load_orch_rc != 0) + { + LOG_ERROR("Thread %d: Failed to load orchestrator", my_thread_idx_); + return load_orch_rc; + } + init_done_.store(true, std::memory_order_release); LOG_INFO_V0("AicpuExecutor: Init complete"); return 0; } -/** - * Shutdown AICore - Send exit signal via registers to all AICore kernels - */ -int32_t AicpuExecutor::run(Runtime *runtime) { - int32_t run_rc = 0; - LOG_INFO_V0("Thread %d: at AicpuExecutor::Run", my_thread_idx_); +int32_t AicpuExecutor::loadOrchestrator(Runtime* runtime) +{ + LOG_INFO_V0("Thread %d: Orchestrator Loading", my_thread_idx_); + + // Per-callable_id dispatch: the orch SO state lives in + // `orch_so_table_[callable_id]` keyed by registration order; + // reload is governed by `register_new_callable_id_`. 
+ const int32_t callable_id = runtime->get_active_callable_id(); + if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) { + LOG_ERROR( + "Thread %d: invalid callable_id %d (limit=%d)", my_thread_idx_, callable_id, MAX_REGISTERED_CALLABLE_IDS + ); + runtime_init_ready_.store(true, std::memory_order_release); + return -1; + } + p_handle_ = &orch_so_table_[callable_id].handle; + p_path_ = orch_so_table_[callable_id].path; + p_func_ = &orch_so_table_[callable_id].func; + p_bind_ = &orch_so_table_[callable_id].bind; + p_config_func_ = &orch_so_table_[callable_id].config_func; + const bool reload_so = runtime->register_new_callable_id(); + + if (reload_so) { + LOG_INFO_V0("Thread %d: New orch SO detected (callable_id=%d), (re)loading", my_thread_idx_, callable_id); + + if (*p_handle_ != nullptr) { + dlclose(*p_handle_); + *p_handle_ = nullptr; + *p_func_ = nullptr; + *p_bind_ = nullptr; + if (p_path_[0] != '\0') { + // Unlink the old file so the new open() lands on a + // fresh inode — protects against SIGBUS / ETXTBSY when + // the kernel still has the old mapping pinned. + unlink(p_path_); + p_path_[0] = '\0'; + } + } - // Orchestrator check - if (my_thread_idx_ >= sched_thread_num_) { -#if PTO2_PROFILING - uint64_t orch_cycle_start = 0; - int32_t pto2_submitted_tasks = -1; -#endif + const void *so_data = reinterpret_cast(runtime->get_dev_orch_so_addr()); + size_t so_size = runtime->get_dev_orch_so_size(); - LOG_INFO_V0("Thread %d: Orchestrator Running", my_thread_idx_); + if (so_data == nullptr || so_size == 0) { + LOG_ERROR("Thread %d: Device orchestration SO not set", my_thread_idx_); + // Unblock scheduler threads before returning so they don't spin forever. + runtime_init_ready_.store(true, std::memory_order_release); + return -1; + } - // Orchestrator thread: load + run the device orchestration SO. The braces - // scope the per-callable dlopen / SO-table locals to this block. 
- { - // Per-callable_id dispatch: the orch SO state lives in - // `orch_so_table_[callable_id]` keyed by registration order; - // reload is governed by `register_new_callable_id_`. - const int32_t callable_id = runtime->get_active_callable_id(); - if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) { - LOG_ERROR( - "Thread %d: invalid callable_id %d (limit=%d)", my_thread_idx_, callable_id, MAX_REGISTERED_CALLABLE_IDS + // Try multiple paths that may allow execution on AICPU + char so_path[256]; + bool file_created = false; + const char *candidate_dirs[] = { + "/usr/lib64/aicpu_kernels/0/aicpu_kernels_device", "/usr/lib64", "/lib64", "/var/tmp", "/tmp" + }; + const int32_t num_candidates = sizeof(candidate_dirs) / sizeof(candidate_dirs[0]); + + for (int32_t i = 0; i < num_candidates && !file_created; i++) { + int32_t fd = create_orch_so_file(candidate_dirs[i], callable_id, so_path, sizeof(so_path)); + if (fd < 0) { + LOG_INFO_V0( + "Thread %d: Cannot create SO at %s (errno=%d), trying next path", my_thread_idx_, so_path, errno ); - runtime_init_ready_.store(true, std::memory_order_release); - return -1; + continue; } - void **p_handle = &orch_so_table_[callable_id].handle; - char *p_path = orch_so_table_[callable_id].path; - DeviceOrchestrationFunc *p_func = &orch_so_table_[callable_id].func; - DeviceOrchestrationBindRuntimeFunc *p_bind = &orch_so_table_[callable_id].bind; - DeviceOrchestrationConfigFunc *p_config_func = &orch_so_table_[callable_id].config_func; - const bool reload_so = runtime->register_new_callable_id(); - - if (reload_so) { - LOG_INFO_V0("Thread %d: New orch SO detected (callable_id=%d), (re)loading", my_thread_idx_, callable_id); - runtime->notify_callable_id_registered(); - - if (*p_handle != nullptr) { - dlclose(*p_handle); - *p_handle = nullptr; - *p_func = nullptr; - *p_bind = nullptr; - if (p_path[0] != '\0') { - // Unlink the old file so the new open() lands on a - // fresh inode — protects against SIGBUS / ETXTBSY when - 
// the kernel still has the old mapping pinned. - unlink(p_path); - p_path[0] = '\0'; - } - } - - const void *so_data = reinterpret_cast(runtime->get_dev_orch_so_addr()); - size_t so_size = runtime->get_dev_orch_so_size(); - - if (so_data == nullptr || so_size == 0) { - LOG_ERROR("Thread %d: Device orchestration SO not set", my_thread_idx_); - // Unblock scheduler threads before returning so they don't spin forever. - runtime_init_ready_.store(true, std::memory_order_release); - return -1; - } - - // Try multiple paths that may allow execution on AICPU - char so_path[256]; - bool file_created = false; - const char *candidate_dirs[] = { - "/usr/lib64/aicpu_kernels/0/aicpu_kernels_device", "/usr/lib64", "/lib64", "/var/tmp", "/tmp" - }; - const int32_t num_candidates = sizeof(candidate_dirs) / sizeof(candidate_dirs[0]); - - for (int32_t i = 0; i < num_candidates && !file_created; i++) { - int32_t fd = create_orch_so_file(candidate_dirs[i], callable_id, so_path, sizeof(so_path)); - if (fd < 0) { - LOG_INFO_V0( - "Thread %d: Cannot create SO at %s (errno=%d), trying next path", my_thread_idx_, so_path, errno - ); - continue; - } - ssize_t written = write(fd, so_data, so_size); - close(fd); - if (written != static_cast(so_size)) { - LOG_INFO_V0( - "Thread %d: Cannot write SO to %s (errno=%d), trying next path", my_thread_idx_, so_path, errno - ); - unlink(so_path); - continue; - } - file_created = true; - LOG_INFO_V0("Thread %d: Created SO file at %s (%zu bytes)", my_thread_idx_, so_path, so_size); - } + ssize_t written = write(fd, so_data, so_size); + close(fd); + if (written != static_cast(so_size)) { + LOG_INFO_V0( + "Thread %d: Cannot write SO to %s (errno=%d), trying next path", my_thread_idx_, so_path, errno + ); + unlink(so_path); + continue; + } + file_created = true; + LOG_INFO_V0("Thread %d: Created SO file at %s (%zu bytes)", my_thread_idx_, so_path, so_size); + } - if (!file_created) { - LOG_ERROR("Thread %d: Failed to create SO file in any candidate path", 
my_thread_idx_); - // Unblock scheduler threads before returning so they don't spin forever. - runtime_init_ready_.store(true, std::memory_order_release); - return -1; - } + if (!file_created) { + LOG_ERROR("Thread %d: Failed to create SO file in any candidate path", my_thread_idx_); + // Unblock scheduler threads before returning so they don't spin forever. + runtime_init_ready_.store(true, std::memory_order_release); + return -1; + } - dlerror(); - void *handle = dlopen(so_path, RTLD_LAZY | RTLD_LOCAL); - const char *dlopen_err = dlerror(); - if (handle == nullptr) { - LOG_ERROR("Thread %d: dlopen failed: %s", my_thread_idx_, dlopen_err ? dlopen_err : "unknown"); - unlink(so_path); - // Unblock scheduler threads before returning so they don't spin forever. - runtime_init_ready_.store(true, std::memory_order_release); - return -1; - } - LOG_INFO_V0("Thread %d: dlopen succeeded, handle=%p", my_thread_idx_, handle); - - // Unlink the on-disk SO immediately: dlopen has already mmap'd - // the image, so the kernel keeps the inode alive until the - // matching dlclose / process exit. This prevents stale - // libdevice_orch__.so files from accumulating in - // /tmp when child processes exit via os._exit(0), which skips - // ~AicpuExecutor (worker.py: _sub/_chip/_child loops). - unlink(so_path); + dlerror(); + void *handle = dlopen(so_path, RTLD_LAZY | RTLD_LOCAL); + const char *dlopen_err = dlerror(); + if (handle == nullptr) { + LOG_ERROR("Thread %d: dlopen failed: %s", my_thread_idx_, dlopen_err ? dlopen_err : "unknown"); + unlink(so_path); + // Unblock scheduler threads before returning so they don't spin forever. + runtime_init_ready_.store(true, std::memory_order_release); + return -1; + } + LOG_INFO_V0("Thread %d: dlopen succeeded, handle=%p", my_thread_idx_, handle); + + // Unlink the on-disk SO immediately: dlopen has already mmap'd + // the image, so the kernel keeps the inode alive until the + // matching dlclose / process exit. 
This prevents stale + // libdevice_orch__.so files from accumulating in + // /tmp when child processes exit via os._exit(0), which skips + // ~AicpuExecutor (worker.py: _sub/_chip/_child loops). + unlink(so_path); + + const char *entry_symbol = runtime->get_device_orch_func_name(); + if (entry_symbol == nullptr || entry_symbol[0] == '\0') { + entry_symbol = DEFAULT_ORCH_ENTRY_SYMBOL; + } + const char *config_symbol = runtime->get_device_orch_config_name(); + if (config_symbol == nullptr || config_symbol[0] == '\0') { + config_symbol = DEFAULT_ORCH_CONFIG_SYMBOL; + } - const char *entry_symbol = runtime->get_device_orch_func_name(); - if (entry_symbol == nullptr || entry_symbol[0] == '\0') { - entry_symbol = DEFAULT_ORCH_ENTRY_SYMBOL; - } - const char *config_symbol = runtime->get_device_orch_config_name(); - if (config_symbol == nullptr || config_symbol[0] == '\0') { - config_symbol = DEFAULT_ORCH_CONFIG_SYMBOL; - } + dlerror(); + DeviceOrchestrationFunc orch_func = + reinterpret_cast(dlsym(handle, entry_symbol)); + const char *entry_dlsym_error = dlerror(); + if (entry_dlsym_error != nullptr) { + LOG_ERROR( + "Thread %d: dlsym failed for entry symbol '%s': %s", my_thread_idx_, entry_symbol, entry_dlsym_error + ); + dlclose(handle); + unlink(so_path); + // Unblock scheduler threads before returning so they don't spin forever. + runtime_init_ready_.store(true, std::memory_order_release); + return -1; + } + if (orch_func == nullptr) { + LOG_ERROR("Thread %d: dlsym returned NULL for entry symbol '%s'", my_thread_idx_, entry_symbol); + dlclose(handle); + unlink(so_path); + // Unblock scheduler threads before returning so they don't spin forever. 
+ runtime_init_ready_.store(true, std::memory_order_release); + return -1; + } - dlerror(); - DeviceOrchestrationFunc orch_func = - reinterpret_cast(dlsym(handle, entry_symbol)); - const char *entry_dlsym_error = dlerror(); - if (entry_dlsym_error != nullptr) { - LOG_ERROR( - "Thread %d: dlsym failed for entry symbol '%s': %s", my_thread_idx_, entry_symbol, entry_dlsym_error - ); - dlclose(handle); - unlink(so_path); - // Unblock scheduler threads before returning so they don't spin forever. - runtime_init_ready_.store(true, std::memory_order_release); - return -1; - } - if (orch_func == nullptr) { - LOG_ERROR("Thread %d: dlsym returned NULL for entry symbol '%s'", my_thread_idx_, entry_symbol); - dlclose(handle); - unlink(so_path); - // Unblock scheduler threads before returning so they don't spin forever. - runtime_init_ready_.store(true, std::memory_order_release); - return -1; - } + dlerror(); + auto config_func = reinterpret_cast(dlsym(handle, config_symbol)); + const char *config_dlsym_error = dlerror(); + if (config_dlsym_error != nullptr || config_func == nullptr) { + LOG_ERROR( + "Thread %d: dlsym failed for config symbol '%s': %s", my_thread_idx_, config_symbol, + config_dlsym_error ? config_dlsym_error : "NULL function pointer" + ); + config_func = nullptr; + } - dlerror(); - auto config_func = reinterpret_cast(dlsym(handle, config_symbol)); - const char *config_dlsym_error = dlerror(); - if (config_dlsym_error != nullptr || config_func == nullptr) { - LOG_ERROR( - "Thread %d: dlsym failed for config symbol '%s': %s", my_thread_idx_, config_symbol, - config_dlsym_error ? 
config_dlsym_error : "NULL function pointer" - ); - config_func = nullptr; - } + dlerror(); + auto bind_runtime_func = + reinterpret_cast(dlsym(handle, "framework_bind_runtime")); + const char *bind_runtime_error = dlerror(); + if (bind_runtime_error != nullptr) { + LOG_ERROR("Thread %d: dlsym failed for framework_bind_runtime: %s", my_thread_idx_, bind_runtime_error); + bind_runtime_func = nullptr; + } - dlerror(); - auto bind_runtime_func = - reinterpret_cast(dlsym(handle, "framework_bind_runtime")); - const char *bind_runtime_error = dlerror(); - if (bind_runtime_error != nullptr) { - LOG_ERROR("Thread %d: dlsym failed for framework_bind_runtime: %s", my_thread_idx_, bind_runtime_error); - bind_runtime_func = nullptr; - } + *p_handle_ = handle; + *p_func_ = orch_func; + *p_bind_ = bind_runtime_func; + *p_config_func_ = config_func; + snprintf(p_path_, 256, "%s", so_path); + } else { + LOG_INFO_V0( + "Thread %d: Reusing cached orch SO handle=%p (callable_id=%d)", my_thread_idx_, *p_handle_, callable_id + ); + if (*p_handle_ == nullptr || *p_func_ == nullptr) { + LOG_ERROR( + "Thread %d: reload=false but no cached SO handle/func for callable_id=%d", my_thread_idx_, + callable_id + ); + // Unblock scheduler threads before returning so they don't spin forever. + runtime_init_ready_.store(true, std::memory_order_release); + return -1; + } + } - *p_handle = handle; - *p_func = orch_func; - *p_bind = bind_runtime_func; - *p_config_func = config_func; - snprintf(p_path, 256, "%s", so_path); - } else { - LOG_INFO_V0( - "Thread %d: Reusing cached orch SO handle=%p (callable_id=%d)", my_thread_idx_, *p_handle, callable_id + // Validate arg count on every run (reload or cache hit). 
+ if (*p_config_func_ != nullptr) { + PTO2OrchestrationConfig cfg = (*p_config_func_)(runtime->get_orch_args()); + LOG_INFO_V0("Thread %d: Config: expected_args=%d", my_thread_idx_, cfg.expected_arg_count); + if (cfg.expected_arg_count > 0) { + const ChipStorageTaskArgs &args_validate = runtime->get_orch_args(); + int32_t actual_arg_count = args_validate.tensor_count() + args_validate.scalar_count(); + if (actual_arg_count < cfg.expected_arg_count) { + LOG_ERROR( + "Thread %d: arg_count %d < expected %d", my_thread_idx_, actual_arg_count, + cfg.expected_arg_count ); - if (*p_handle == nullptr || *p_func == nullptr) { - LOG_ERROR( - "Thread %d: reload=false but no cached SO handle/func for callable_id=%d", my_thread_idx_, - callable_id - ); - // Unblock scheduler threads before returning so they don't spin forever. - runtime_init_ready_.store(true, std::memory_order_release); - return -1; + // Clean up cached state so a subsequent run does a full reload. + if (*p_handle_ != nullptr) { + dlclose(*p_handle_); + *p_handle_ = nullptr; } - } - - // Validate arg count on every run (reload or cache hit). - if (*p_config_func != nullptr) { - PTO2OrchestrationConfig cfg = (*p_config_func)(runtime->get_orch_args()); - LOG_INFO_V0("Thread %d: Config: expected_args=%d", my_thread_idx_, cfg.expected_arg_count); - if (cfg.expected_arg_count > 0) { - const ChipStorageTaskArgs &args_validate = runtime->get_orch_args(); - int32_t actual_arg_count = args_validate.tensor_count() + args_validate.scalar_count(); - if (actual_arg_count < cfg.expected_arg_count) { - LOG_ERROR( - "Thread %d: arg_count %d < expected %d", my_thread_idx_, actual_arg_count, - cfg.expected_arg_count - ); - // Clean up cached state so a subsequent run does a full reload. 
- if (*p_handle != nullptr) { - dlclose(*p_handle); - *p_handle = nullptr; - } - if (p_path[0] != '\0') { - unlink(p_path); - p_path[0] = '\0'; - } - *p_func = nullptr; - *p_bind = nullptr; - *p_config_func = nullptr; - // Unblock scheduler threads before returning so they don't spin forever. - runtime_init_ready_.store(true, std::memory_order_release); - return -1; - } + if (p_path_[0] != '\0') { + unlink(p_path_); + p_path_[0] = '\0'; } - } else { - LOG_INFO_V0("Thread %d: No config function, using defaults", my_thread_idx_); + *p_func_ = nullptr; + *p_bind_ = nullptr; + *p_config_func_ = nullptr; + // Unblock scheduler threads before returning so they don't spin forever. + runtime_init_ready_.store(true, std::memory_order_release); + return -1; } + } + } else { + LOG_INFO_V0("Thread %d: No config function, using defaults", my_thread_idx_); + } - // sm_handle / rt are bound to *this* run's memory and must be - // (re)created every run, regardless of whether the SO itself was - // reused above. 
- const ChipStorageTaskArgs &args = runtime->get_orch_args(); - int32_t arg_count = args.tensor_count() + args.scalar_count(); - LOG_INFO_V0("Thread %d: sm_ptr=%p, arg_count=%d", my_thread_idx_, runtime->get_gm_sm_ptr(), arg_count); - for (int32_t i = 0; i < args.tensor_count() && i < 20; i++) { - const ContinuousTensor &t = args.tensor(i); - LOG_INFO_V0( - "Thread %d: orch_args[%d] = TENSOR(data=0x%lx, ndims=%u, dtype=%u)", my_thread_idx_, i, - static_cast(t.data), t.ndims, static_cast(t.dtype) - ); - } - for (int32_t i = 0; i < args.scalar_count() && (args.tensor_count() + i) < 20; i++) { - LOG_INFO_V0( - "Thread %d: orch_args[%d] = SCALAR(0x%lx)", my_thread_idx_, args.tensor_count() + i, - static_cast(args.scalar(i)) - ); - } + return 0; +} - uint64_t task_window_size = PTO2_TASK_WINDOW_SIZE; - uint64_t heap_size = PTO2_HEAP_SIZE; +/** + * Shutdown AICore - Send exit signal via registers to all AICore kernels + */ +int32_t AicpuExecutor::run(Runtime *runtime) { + int32_t run_rc = 0; + LOG_INFO_V0("Thread %d: at AicpuExecutor::Run", my_thread_idx_); - if (runtime->task_window_size > 0) { - task_window_size = runtime->task_window_size; - } - if (runtime->heap_size > 0) { - heap_size = runtime->heap_size; - } - int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE; - if (runtime->dep_pool_size > 0) { - dep_pool_capacity = static_cast(runtime->dep_pool_size); - } + // Orchestrator check + if (my_thread_idx_ >= sched_thread_num_) { +#if PTO2_PROFILING + uint64_t orch_cycle_start = 0; + int32_t pto2_submitted_tasks = -1; +#endif + + // Orchestrator thread: load + run the device orchestration SO. + LOG_INFO_V0("Thread %d: Orchestrator Running", my_thread_idx_); + + // sm_handle / rt are bound to *this* run's memory and must be + // (re)created every run, regardless of whether the SO itself was + // reused above. 
+ const ChipStorageTaskArgs &args = runtime->get_orch_args(); + int32_t arg_count = args.tensor_count() + args.scalar_count(); + LOG_INFO_V0("Thread %d: sm_ptr=%p, arg_count=%d", my_thread_idx_, runtime->get_gm_sm_ptr(), arg_count); + for (int32_t i = 0; i < args.tensor_count() && i < 20; i++) { + const ContinuousTensor &t = args.tensor(i); + LOG_INFO_V0( + "Thread %d: orch_args[%d] = TENSOR(data=0x%lx, ndims=%u, dtype=%u)", my_thread_idx_, i, + static_cast(t.data), t.ndims, static_cast(t.dtype) + ); + } + for (int32_t i = 0; i < args.scalar_count() && (args.tensor_count() + i) < 20; i++) { LOG_INFO_V0( - "Thread %d: Ring sizes: task_window=%lu, heap=%lu, dep_pool=%d", my_thread_idx_, - static_cast(task_window_size), static_cast(heap_size), dep_pool_capacity + "Thread %d: orch_args[%d] = SCALAR(0x%lx)", my_thread_idx_, args.tensor_count() + i, + static_cast(args.scalar(i)) ); + } - void *sm_ptr = runtime->get_gm_sm_ptr(); - void *gm_heap = runtime->get_gm_heap_ptr(); + uint64_t task_window_size = PTO2_TASK_WINDOW_SIZE; + uint64_t heap_size = PTO2_HEAP_SIZE; - uint64_t sm_size = PTO2SharedMemoryHandle::calculate_size(task_window_size); - PTO2SharedMemoryHandle *sm_handle = - PTO2SharedMemoryHandle::create_from_buffer(sm_ptr, sm_size, task_window_size, heap_size); - if (!sm_handle) { - LOG_ERROR("Thread %d: Failed to create shared memory handle", my_thread_idx_); - // Unblock scheduler threads before returning so they don't spin forever. 
- runtime_init_ready_.store(true, std::memory_order_release); - return -1; - } + if (runtime->task_window_size > 0) { + task_window_size = runtime->task_window_size; + } + if (runtime->heap_size > 0) { + heap_size = runtime->heap_size; + } + int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE; + if (runtime->dep_pool_size > 0) { + dep_pool_capacity = static_cast(runtime->dep_pool_size); + } + LOG_INFO_V0( + "Thread %d: Ring sizes: task_window=%lu, heap=%lu, dep_pool=%d", my_thread_idx_, + static_cast(task_window_size), static_cast(heap_size), dep_pool_capacity + ); - rt = runtime_create_from_sm(PTO2_MODE_EXECUTE, sm_handle, gm_heap, heap_size, dep_pool_capacity); - if (!rt) { - LOG_ERROR("Thread %d: Failed to create PTO2Runtime", my_thread_idx_); - sm_handle->destroy(); - // Unblock scheduler threads before returning so they don't spin forever. - runtime_init_ready_.store(true, std::memory_order_release); - return -1; - } + void *sm_ptr = runtime->get_gm_sm_ptr(); + void *gm_heap = runtime->get_gm_heap_ptr(); + + uint64_t sm_size = PTO2SharedMemoryHandle::calculate_size(task_window_size); + PTO2SharedMemoryHandle *sm_handle = + PTO2SharedMemoryHandle::create_from_buffer(sm_ptr, sm_size, task_window_size, heap_size); + if (!sm_handle) { + LOG_ERROR("Thread %d: Failed to create shared memory handle", my_thread_idx_); + // Unblock scheduler threads before returning so they don't spin forever. + runtime_init_ready_.store(true, std::memory_order_release); + return -1; + } + + rt = runtime_create_from_sm(PTO2_MODE_EXECUTE, sm_handle, gm_heap, heap_size, dep_pool_capacity); + if (!rt) { + LOG_ERROR("Thread %d: Failed to create PTO2Runtime", my_thread_idx_); + sm_handle->destroy(); + // Unblock scheduler threads before returning so they don't spin forever. 
+ runtime_init_ready_.store(true, std::memory_order_release); + return -1; + } #if PTO2_PROFILING - rt->orchestrator.l2_perf_level = get_l2_perf_level(); + rt->orchestrator.l2_perf_level = get_l2_perf_level(); #endif - // Total core counts = aic_count_ / aiv_count_ (set once at runtime init). - rt->orchestrator.total_cluster_count = sched_ctx_.aic_count(); - rt->orchestrator.total_aiv_count = sched_ctx_.aiv_count(); + // Total core counts = aic_count_ / aiv_count_ (set once at runtime init). + rt->orchestrator.total_cluster_count = sched_ctx_.aic_count(); + rt->orchestrator.total_aiv_count = sched_ctx_.aiv_count(); - // With multi-ring, slot_states are per-ring inside the scheduler. - runtime->set_slot_states_ptr(nullptr); + // With multi-ring, slot_states are per-ring inside the scheduler. + runtime->set_slot_states_ptr(nullptr); - orch_args_cached_ = &args; + orch_args_cached_ = &args; - // Wire scheduler context to the newly created PTO2Runtime before - // releasing scheduler threads from runtime_init_ready_. - sched_ctx_.bind_runtime(rt); + // Wire scheduler context to the newly created PTO2Runtime before + // releasing scheduler threads from runtime_init_ready_. + sched_ctx_.bind_runtime(rt); - runtime_init_ready_.store(true, std::memory_order_release); + runtime_init_ready_.store(true, std::memory_order_release); - // Wait for scheduler's one-time init to complete - sched_ctx_.wait_pto2_init_complete(); + // Wait for scheduler's one-time init to complete + sched_ctx_.wait_pto2_init_complete(); #if PTO2_PROFILING - if (get_l2_perf_level() >= L2PerfLevel::ORCH_PHASES) { - l2_perf_aicpu_set_orch_thread_idx(my_thread_idx_); - } + if (get_l2_perf_level() >= L2PerfLevel::ORCH_PHASES) { + l2_perf_aicpu_set_orch_thread_idx(my_thread_idx_); + } #endif - // dep_gen plugs into the orchestrator thread (single-instance subsystem): - // set the per-thread queue index and pop the initial buffer before any - // submit_task can fire inside orch_func_. 
- if (is_dep_gen_enabled()) { - dep_gen_aicpu_set_orch_thread_idx(my_thread_idx_); - dep_gen_aicpu_init(); - } + // dep_gen plugs into the orchestrator thread (single-instance subsystem): + // set the per-thread queue index and pop the initial buffer before any + // submit_task can fire inside orch_func_. + if (is_dep_gen_enabled()) { + dep_gen_aicpu_set_orch_thread_idx(my_thread_idx_); + dep_gen_aicpu_init(); + } #if PTO2_PROFILING - orch_cycle_start = get_sys_cnt_aicpu(); + orch_cycle_start = get_sys_cnt_aicpu(); #endif - framework_bind_runtime(rt); - if (*p_bind != nullptr) { - (*p_bind)(rt); - } - rt_scope_begin(rt); - (*p_func)(*orch_args_cached_); - rt_scope_end(rt); - - // Flush the (potentially partially-filled) DepGenBuffer so the host - // collector can pick it up before this orchestrator thread joins. - if (is_dep_gen_enabled()) { - dep_gen_aicpu_flush(); - } + framework_bind_runtime(rt); + if (*p_bind_ != nullptr) { + (*p_bind_)(rt); + } + rt_scope_begin(rt); + (*p_func_)(*orch_args_cached_); + rt_scope_end(rt); + + // Flush the (potentially partially-filled) DepGenBuffer so the host + // collector can pick it up before this orchestrator thread joins. 
+ if (is_dep_gen_enabled()) { + dep_gen_aicpu_flush(); + } #if PTO2_PROFILING - uint64_t orch_cycle_end = get_sys_cnt_aicpu(); - (void)orch_cycle_end; + uint64_t orch_cycle_end = get_sys_cnt_aicpu(); + (void)orch_cycle_end; #endif - // Print orchestrator profiling data + // Print orchestrator profiling data #if PTO2_ORCH_PROFILING - PTO2OrchProfilingData p = orchestrator_get_profiling(); - uint64_t total = - p.sync_cycle + p.alloc_cycle + p.args_cycle + p.lookup_cycle + p.insert_cycle + p.fanin_cycle; - if (total == 0) total = 1; // avoid div-by-zero - LOG_INFO_V9( - "Thread %d: === Orchestrator Profiling: %" PRId64 " tasks, total=%.3fus ===", my_thread_idx_, - static_cast(p.submit_count), cycles_to_us(total) - ); - LOG_INFO_V9( - "Thread %d: task+heap_alloc: %.3fus (%.1f%%) work=%.3fus wait=%.3fus atomics=%" PRIu64 "", - my_thread_idx_, cycles_to_us(p.alloc_cycle), p.alloc_cycle * 100.0 / total, - cycles_to_us(p.alloc_cycle - p.alloc_wait_cycle), cycles_to_us(p.alloc_wait_cycle), - static_cast(p.alloc_atomic_count) - ); - LOG_INFO_V9( - "Thread %d: sync_tensormap : %.3fus (%.1f%%)", my_thread_idx_, cycles_to_us(p.sync_cycle), - p.sync_cycle * 100.0 / total - ); - LOG_INFO_V9( - "Thread %d: lookup+dep : %.3fus (%.1f%%)", my_thread_idx_, cycles_to_us(p.lookup_cycle), - p.lookup_cycle * 100.0 / total - ); - LOG_INFO_V9( - "Thread %d: tensormap_ins : %.3fus (%.1f%%)", my_thread_idx_, cycles_to_us(p.insert_cycle), - p.insert_cycle * 100.0 / total - ); - LOG_INFO_V9( - "Thread %d: param_copy : %.3fus (%.1f%%) atomics=%" PRIu64 "", my_thread_idx_, - cycles_to_us(p.args_cycle), p.args_cycle * 100.0 / total, static_cast(p.args_atomic_count) - ); - LOG_INFO_V9( - "Thread %d: fanin+ready : %.3fus (%.1f%%) work=%.3fus wait=%.3fus", my_thread_idx_, - cycles_to_us(p.fanin_cycle), p.fanin_cycle * 100.0 / total, - cycles_to_us(p.fanin_cycle - p.fanin_wait_cycle), cycles_to_us(p.fanin_wait_cycle) - ); - LOG_INFO_V9( - "Thread %d: avg/task : %.3fus", my_thread_idx_, - 
p.submit_count > 0 ? cycles_to_us(total) / p.submit_count : 0.0 - ); + PTO2OrchProfilingData p = orchestrator_get_profiling(); + uint64_t total = + p.sync_cycle + p.alloc_cycle + p.args_cycle + p.lookup_cycle + p.insert_cycle + p.fanin_cycle; + if (total == 0) total = 1; // avoid div-by-zero + LOG_INFO_V9( + "Thread %d: === Orchestrator Profiling: %" PRId64 " tasks, total=%.3fus ===", my_thread_idx_, + static_cast(p.submit_count), cycles_to_us(total) + ); + LOG_INFO_V9( + "Thread %d: task+heap_alloc: %.3fus (%.1f%%) work=%.3fus wait=%.3fus atomics=%" PRIu64 "", + my_thread_idx_, cycles_to_us(p.alloc_cycle), p.alloc_cycle * 100.0 / total, + cycles_to_us(p.alloc_cycle - p.alloc_wait_cycle), cycles_to_us(p.alloc_wait_cycle), + static_cast(p.alloc_atomic_count) + ); + LOG_INFO_V9( + "Thread %d: sync_tensormap : %.3fus (%.1f%%)", my_thread_idx_, cycles_to_us(p.sync_cycle), + p.sync_cycle * 100.0 / total + ); + LOG_INFO_V9( + "Thread %d: lookup+dep : %.3fus (%.1f%%)", my_thread_idx_, cycles_to_us(p.lookup_cycle), + p.lookup_cycle * 100.0 / total + ); + LOG_INFO_V9( + "Thread %d: tensormap_ins : %.3fus (%.1f%%)", my_thread_idx_, cycles_to_us(p.insert_cycle), + p.insert_cycle * 100.0 / total + ); + LOG_INFO_V9( + "Thread %d: param_copy : %.3fus (%.1f%%) atomics=%" PRIu64 "", my_thread_idx_, + cycles_to_us(p.args_cycle), p.args_cycle * 100.0 / total, static_cast(p.args_atomic_count) + ); + LOG_INFO_V9( + "Thread %d: fanin+ready : %.3fus (%.1f%%) work=%.3fus wait=%.3fus", my_thread_idx_, + cycles_to_us(p.fanin_cycle), p.fanin_cycle * 100.0 / total, + cycles_to_us(p.fanin_cycle - p.fanin_wait_cycle), cycles_to_us(p.fanin_wait_cycle) + ); + LOG_INFO_V9( + "Thread %d: avg/task : %.3fus", my_thread_idx_, + p.submit_count > 0 ? 
cycles_to_us(total) / p.submit_count : 0.0 + ); #if PTO2_TENSORMAP_PROFILING - PTO2TensorMapProfilingData tp = pto2_tensormap_get_profiling(); - LOG_INFO_V9("Thread %d: === TensorMap Lookup Stats ===", my_thread_idx_); - LOG_INFO_V9( - "Thread %d: lookups : %" PRIu64 ", inserts: %" PRIu64 "", my_thread_idx_, - static_cast(tp.lookup_count), static_cast(tp.insert_count) - ); - LOG_INFO_V9( - "Thread %d: chain walked : total=%" PRIu64 ", avg=%.1f, max=%d", my_thread_idx_, - static_cast(tp.lookup_chain_total), - tp.lookup_count > 0 ? static_cast(tp.lookup_chain_total) / tp.lookup_count : 0.0, - tp.lookup_chain_max - ); - LOG_INFO_V9( - "Thread %d: overlap checks : %" PRIu64 ", hits=%" PRIu64 " (%.1f%%)", my_thread_idx_, - static_cast(tp.overlap_checks), static_cast(tp.overlap_hits), - tp.overlap_checks > 0 ? tp.overlap_hits * 100.0 / tp.overlap_checks : 0.0 - ); + PTO2TensorMapProfilingData tp = pto2_tensormap_get_profiling(); + LOG_INFO_V9("Thread %d: === TensorMap Lookup Stats ===", my_thread_idx_); + LOG_INFO_V9( + "Thread %d: lookups : %" PRIu64 ", inserts: %" PRIu64 "", my_thread_idx_, + static_cast(tp.lookup_count), static_cast(tp.insert_count) + ); + LOG_INFO_V9( + "Thread %d: chain walked : total=%" PRIu64 ", avg=%.1f, max=%d", my_thread_idx_, + static_cast(tp.lookup_chain_total), + tp.lookup_count > 0 ? static_cast(tp.lookup_chain_total) / tp.lookup_count : 0.0, + tp.lookup_chain_max + ); + LOG_INFO_V9( + "Thread %d: overlap checks : %" PRIu64 ", hits=%" PRIu64 " (%.1f%%)", my_thread_idx_, + static_cast(tp.overlap_checks), static_cast(tp.overlap_hits), + tp.overlap_checks > 0 ? tp.overlap_hits * 100.0 / tp.overlap_checks : 0.0 + ); #endif #endif // PTO2_ORCH_PROFILING - // Latch task count from PTO2 shared memory to hand off to the - // scheduler. 
The orchestrator's run window (start_time / end_time / - // submit_count) is no longer published to shared memory — the - // device LOG_INFO_V9 "orch_start=… orch_end=… orch_cost=…" line - // below carries the same envelope info for debugging, and - // host-side swimlane derives per-phase timing from the per-event - // AicpuPhaseRecord[] stream that already covers everything inside - // submit_task(). - int32_t total_tasks = 0; - if (rt->orchestrator.sm_header) { - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - total_tasks += - rt->orchestrator.sm_header->rings[r].fc.current_task_index.load(std::memory_order_acquire); - } + // Latch task count from PTO2 shared memory to hand off to the + // scheduler. The orchestrator's run window (start_time / end_time / + // submit_count) is no longer published to shared memory — the + // device LOG_INFO_V9 "orch_start=… orch_end=… orch_cost=…" line + // below carries the same envelope info for debugging, and + // host-side swimlane derives per-phase timing from the per-event + // AicpuPhaseRecord[] stream that already covers everything inside + // submit_task(). 
+ int32_t total_tasks = 0; + if (rt->orchestrator.sm_header) { + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + total_tasks += + rt->orchestrator.sm_header->rings[r].fc.current_task_index.load(std::memory_order_acquire); } + } #if PTO2_PROFILING - pto2_submitted_tasks = total_tasks; + pto2_submitted_tasks = total_tasks; #endif - // Signal completion to the orchestrator state machine - rt_orchestration_done(rt); + // Signal completion to the orchestrator state machine + rt_orchestration_done(rt); - sched_ctx_.on_orchestration_done(runtime, rt, my_thread_idx_, total_tasks); - } + sched_ctx_.on_orchestration_done(runtime, rt, my_thread_idx_, total_tasks); #if PTO2_PROFILING uint64_t orch_end_ts = get_sys_cnt_aicpu(); LOG_INFO_V9( diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp index 4e6f25435..0bc984871 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp @@ -96,7 +96,6 @@ void Runtime::set_active_callable_id(int32_t callable_id, bool is_new) { int32_t Runtime::get_active_callable_id() const { return active_callable_id_; } bool Runtime::register_new_callable_id() const { return register_new_callable_id_; } -void Runtime::notify_callable_id_registered() { register_new_callable_id_ = false; } void Runtime::set_device_orch_func_name(const char *name) { if (name == nullptr) { From 570660f83e392ba2f2aec72f1c7555211b6340fb Mon Sep 17 00:00:00 2001 From: s00831018 Date: Thu, 21 May 2026 11:31:36 +0200 Subject: [PATCH 11/15] Separating orchestration from scheduling activities --- .../aicpu/aicpu_executor.cpp | 456 +++++++++--------- 1 file changed, 234 insertions(+), 222 deletions(-) diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index f86e7f5eb..9b8b4110c 100644 ---
a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -151,7 +151,8 @@ struct AicpuExecutor { // ===== Methods ===== int32_t init(Runtime *runtime); - int32_t run(Runtime *runtime); + int32_t runScheduling(Runtime *runtime); + int32_t runOrchestration(Runtime* runtime); int32_t shutdown(Runtime *runtime); int32_t loadOrchestrator(Runtime* runtime); int32_t performTimingRuns(Runtime *runtime); @@ -450,264 +451,269 @@ int32_t AicpuExecutor::loadOrchestrator(Runtime* runtime) /** * Shutdown AICore - Send exit signal via registers to all AICore kernels */ -int32_t AicpuExecutor::run(Runtime *runtime) { +int32_t AicpuExecutor::runScheduling(Runtime *runtime) { int32_t run_rc = 0; - LOG_INFO_V0("Thread %d: at AicpuExecutor::Run", my_thread_idx_); + LOG_INFO_V0("Thread %d: at AicpuExecutor::runScheduling", my_thread_idx_); - // Orchestrator check - if (my_thread_idx_ >= sched_thread_num_) { -#if PTO2_PROFILING - uint64_t orch_cycle_start = 0; - int32_t pto2_submitted_tasks = -1; -#endif - - // Orchestrator thread: load + run the device orchestration SO. - LOG_INFO_V0("Thread %d: Orchestrator Running", my_thread_idx_); - - // sm_handle / rt are bound to *this* run's memory and must be - // (re)created every run, regardless of whether the SO itself was - // reused above. 
- const ChipStorageTaskArgs &args = runtime->get_orch_args(); - int32_t arg_count = args.tensor_count() + args.scalar_count(); - LOG_INFO_V0("Thread %d: sm_ptr=%p, arg_count=%d", my_thread_idx_, runtime->get_gm_sm_ptr(), arg_count); - for (int32_t i = 0; i < args.tensor_count() && i < 20; i++) { - const ContinuousTensor &t = args.tensor(i); - LOG_INFO_V0( - "Thread %d: orch_args[%d] = TENSOR(data=0x%lx, ndims=%u, dtype=%u)", my_thread_idx_, i, - static_cast(t.data), t.ndims, static_cast(t.dtype) - ); + // Scheduler thread (orchestrator threads skip dispatch when orch_to_sched_ is false) + if (!sched_ctx_.is_completed() && (my_thread_idx_ < sched_thread_num_ || orch_to_sched_)) { + // Device orchestration: wait for the primary orchestrator to initialize the SM header + while (!runtime_init_ready_.load(std::memory_order_acquire)) { + SPIN_WAIT_HINT(); } - for (int32_t i = 0; i < args.scalar_count() && (args.tensor_count() + i) < 20; i++) { - LOG_INFO_V0( - "Thread %d: orch_args[%d] = SCALAR(0x%lx)", my_thread_idx_, args.tensor_count() + i, - static_cast(args.scalar(i)) - ); + if (rt == nullptr) { + LOG_ERROR("Thread %d: rt is null after orchestrator error, skipping dispatch", my_thread_idx_); + } else { + sched_ctx_.bind_runtime(rt); + int32_t completed = sched_ctx_.resolve_and_dispatch(runtime, my_thread_idx_); + if (completed < 0) { + LOG_ERROR("Thread %d: Scheduler failed with rc=%d", my_thread_idx_, completed); + run_rc = completed; + } else { + LOG_INFO_V0("Thread %d: Executed %d tasks from runtime", my_thread_idx_, completed); + } } + } - uint64_t task_window_size = PTO2_TASK_WINDOW_SIZE; - uint64_t heap_size = PTO2_HEAP_SIZE; + LOG_INFO_V0("Thread %d: Scheduling Completed", my_thread_idx_); - if (runtime->task_window_size > 0) { - task_window_size = runtime->task_window_size; - } - if (runtime->heap_size > 0) { - heap_size = runtime->heap_size; - } - int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE; - if (runtime->dep_pool_size > 0) { - dep_pool_capacity = 
static_cast(runtime->dep_pool_size); - } + return run_rc; +} + +int32_t AicpuExecutor::runOrchestration(Runtime* runtime) +{ + // Only the orchestrator thread runs this + if (my_thread_idx_ < sched_thread_num_) return 0; + +#if PTO2_PROFILING + uint64_t orch_cycle_start = 0; + int32_t pto2_submitted_tasks = -1; +#endif + + // Orchestrator thread: load + run the device orchestration SO. + LOG_INFO_V0("Thread %d: Orchestrator Running", my_thread_idx_); + + // sm_handle / rt are bound to *this* run's memory and must be + // (re)created every run, regardless of whether the SO itself was + // reused above. + const ChipStorageTaskArgs &args = runtime->get_orch_args(); + int32_t arg_count = args.tensor_count() + args.scalar_count(); + LOG_INFO_V0("Thread %d: sm_ptr=%p, arg_count=%d", my_thread_idx_, runtime->get_gm_sm_ptr(), arg_count); + for (int32_t i = 0; i < args.tensor_count() && i < 20; i++) { + const ContinuousTensor &t = args.tensor(i); LOG_INFO_V0( - "Thread %d: Ring sizes: task_window=%lu, heap=%lu, dep_pool=%d", my_thread_idx_, - static_cast(task_window_size), static_cast(heap_size), dep_pool_capacity + "Thread %d: orch_args[%d] = TENSOR(data=0x%lx, ndims=%u, dtype=%u)", my_thread_idx_, i, + static_cast(t.data), t.ndims, static_cast(t.dtype) ); + } + for (int32_t i = 0; i < args.scalar_count() && (args.tensor_count() + i) < 20; i++) { + LOG_INFO_V0( + "Thread %d: orch_args[%d] = SCALAR(0x%lx)", my_thread_idx_, args.tensor_count() + i, + static_cast(args.scalar(i)) + ); + } - void *sm_ptr = runtime->get_gm_sm_ptr(); - void *gm_heap = runtime->get_gm_heap_ptr(); + uint64_t task_window_size = PTO2_TASK_WINDOW_SIZE; + uint64_t heap_size = PTO2_HEAP_SIZE; - uint64_t sm_size = PTO2SharedMemoryHandle::calculate_size(task_window_size); - PTO2SharedMemoryHandle *sm_handle = - PTO2SharedMemoryHandle::create_from_buffer(sm_ptr, sm_size, task_window_size, heap_size); - if (!sm_handle) { - LOG_ERROR("Thread %d: Failed to create shared memory handle", my_thread_idx_); - // 
Unblock scheduler threads before returning so they don't spin forever. - runtime_init_ready_.store(true, std::memory_order_release); - return -1; - } + if (runtime->task_window_size > 0) { + task_window_size = runtime->task_window_size; + } + if (runtime->heap_size > 0) { + heap_size = runtime->heap_size; + } + int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE; + if (runtime->dep_pool_size > 0) { + dep_pool_capacity = static_cast(runtime->dep_pool_size); + } + LOG_INFO_V0( + "Thread %d: Ring sizes: task_window=%lu, heap=%lu, dep_pool=%d", my_thread_idx_, + static_cast(task_window_size), static_cast(heap_size), dep_pool_capacity + ); + + void *sm_ptr = runtime->get_gm_sm_ptr(); + void *gm_heap = runtime->get_gm_heap_ptr(); + + uint64_t sm_size = PTO2SharedMemoryHandle::calculate_size(task_window_size); + PTO2SharedMemoryHandle *sm_handle = + PTO2SharedMemoryHandle::create_from_buffer(sm_ptr, sm_size, task_window_size, heap_size); + if (!sm_handle) { + LOG_ERROR("Thread %d: Failed to create shared memory handle", my_thread_idx_); + // Unblock scheduler threads before returning so they don't spin forever. + runtime_init_ready_.store(true, std::memory_order_release); + return -1; + } - rt = runtime_create_from_sm(PTO2_MODE_EXECUTE, sm_handle, gm_heap, heap_size, dep_pool_capacity); - if (!rt) { - LOG_ERROR("Thread %d: Failed to create PTO2Runtime", my_thread_idx_); - sm_handle->destroy(); - // Unblock scheduler threads before returning so they don't spin forever. - runtime_init_ready_.store(true, std::memory_order_release); - return -1; - } + rt = runtime_create_from_sm(PTO2_MODE_EXECUTE, sm_handle, gm_heap, heap_size, dep_pool_capacity); + if (!rt) { + LOG_ERROR("Thread %d: Failed to create PTO2Runtime", my_thread_idx_); + sm_handle->destroy(); + // Unblock scheduler threads before returning so they don't spin forever. 
+ runtime_init_ready_.store(true, std::memory_order_release); + return -1; + } #if PTO2_PROFILING - rt->orchestrator.l2_perf_level = get_l2_perf_level(); + rt->orchestrator.l2_perf_level = get_l2_perf_level(); #endif - // Total core counts = aic_count_ / aiv_count_ (set once at runtime init). - rt->orchestrator.total_cluster_count = sched_ctx_.aic_count(); - rt->orchestrator.total_aiv_count = sched_ctx_.aiv_count(); + // Total core counts = aic_count_ / aiv_count_ (set once at runtime init). + rt->orchestrator.total_cluster_count = sched_ctx_.aic_count(); + rt->orchestrator.total_aiv_count = sched_ctx_.aiv_count(); - // With multi-ring, slot_states are per-ring inside the scheduler. - runtime->set_slot_states_ptr(nullptr); + // With multi-ring, slot_states are per-ring inside the scheduler. + runtime->set_slot_states_ptr(nullptr); - orch_args_cached_ = &args; + orch_args_cached_ = &args; - // Wire scheduler context to the newly created PTO2Runtime before - // releasing scheduler threads from runtime_init_ready_. - sched_ctx_.bind_runtime(rt); + // Wire scheduler context to the newly created PTO2Runtime before + // releasing scheduler threads from runtime_init_ready_. + sched_ctx_.bind_runtime(rt); - runtime_init_ready_.store(true, std::memory_order_release); + runtime_init_ready_.store(true, std::memory_order_release); - // Wait for scheduler's one-time init to complete - sched_ctx_.wait_pto2_init_complete(); + // Wait for scheduler's one-time init to complete + sched_ctx_.wait_pto2_init_complete(); #if PTO2_PROFILING - if (get_l2_perf_level() >= L2PerfLevel::ORCH_PHASES) { - l2_perf_aicpu_set_orch_thread_idx(my_thread_idx_); - } + if (get_l2_perf_level() >= L2PerfLevel::ORCH_PHASES) { + l2_perf_aicpu_set_orch_thread_idx(my_thread_idx_); + } #endif - // dep_gen plugs into the orchestrator thread (single-instance subsystem): - // set the per-thread queue index and pop the initial buffer before any - // submit_task can fire inside orch_func_. 
- if (is_dep_gen_enabled()) { - dep_gen_aicpu_set_orch_thread_idx(my_thread_idx_); - dep_gen_aicpu_init(); - } + // dep_gen plugs into the orchestrator thread (single-instance subsystem): + // set the per-thread queue index and pop the initial buffer before any + // submit_task can fire inside orch_func_. + if (is_dep_gen_enabled()) { + dep_gen_aicpu_set_orch_thread_idx(my_thread_idx_); + dep_gen_aicpu_init(); + } #if PTO2_PROFILING - orch_cycle_start = get_sys_cnt_aicpu(); + orch_cycle_start = get_sys_cnt_aicpu(); #endif - framework_bind_runtime(rt); - if (*p_bind_ != nullptr) { - (*p_bind_)(rt); - } - rt_scope_begin(rt); - (*p_func_)(*orch_args_cached_); - rt_scope_end(rt); - - // Flush the (potentially partially-filled) DepGenBuffer so the host - // collector can pick it up before this orchestrator thread joins. - if (is_dep_gen_enabled()) { - dep_gen_aicpu_flush(); - } + framework_bind_runtime(rt); + if (*p_bind_ != nullptr) { + (*p_bind_)(rt); + } + rt_scope_begin(rt); + (*p_func_)(*orch_args_cached_); + rt_scope_end(rt); + + // Flush the (potentially partially-filled) DepGenBuffer so the host + // collector can pick it up before this orchestrator thread joins. 
+ if (is_dep_gen_enabled()) { + dep_gen_aicpu_flush(); + } #if PTO2_PROFILING - uint64_t orch_cycle_end = get_sys_cnt_aicpu(); - (void)orch_cycle_end; + uint64_t orch_cycle_end = get_sys_cnt_aicpu(); + (void)orch_cycle_end; #endif - // Print orchestrator profiling data + // Print orchestrator profiling data #if PTO2_ORCH_PROFILING - PTO2OrchProfilingData p = orchestrator_get_profiling(); - uint64_t total = - p.sync_cycle + p.alloc_cycle + p.args_cycle + p.lookup_cycle + p.insert_cycle + p.fanin_cycle; - if (total == 0) total = 1; // avoid div-by-zero - LOG_INFO_V9( - "Thread %d: === Orchestrator Profiling: %" PRId64 " tasks, total=%.3fus ===", my_thread_idx_, - static_cast(p.submit_count), cycles_to_us(total) - ); - LOG_INFO_V9( - "Thread %d: task+heap_alloc: %.3fus (%.1f%%) work=%.3fus wait=%.3fus atomics=%" PRIu64 "", - my_thread_idx_, cycles_to_us(p.alloc_cycle), p.alloc_cycle * 100.0 / total, - cycles_to_us(p.alloc_cycle - p.alloc_wait_cycle), cycles_to_us(p.alloc_wait_cycle), - static_cast(p.alloc_atomic_count) - ); - LOG_INFO_V9( - "Thread %d: sync_tensormap : %.3fus (%.1f%%)", my_thread_idx_, cycles_to_us(p.sync_cycle), - p.sync_cycle * 100.0 / total - ); - LOG_INFO_V9( - "Thread %d: lookup+dep : %.3fus (%.1f%%)", my_thread_idx_, cycles_to_us(p.lookup_cycle), - p.lookup_cycle * 100.0 / total - ); - LOG_INFO_V9( - "Thread %d: tensormap_ins : %.3fus (%.1f%%)", my_thread_idx_, cycles_to_us(p.insert_cycle), - p.insert_cycle * 100.0 / total - ); - LOG_INFO_V9( - "Thread %d: param_copy : %.3fus (%.1f%%) atomics=%" PRIu64 "", my_thread_idx_, - cycles_to_us(p.args_cycle), p.args_cycle * 100.0 / total, static_cast(p.args_atomic_count) - ); - LOG_INFO_V9( - "Thread %d: fanin+ready : %.3fus (%.1f%%) work=%.3fus wait=%.3fus", my_thread_idx_, - cycles_to_us(p.fanin_cycle), p.fanin_cycle * 100.0 / total, - cycles_to_us(p.fanin_cycle - p.fanin_wait_cycle), cycles_to_us(p.fanin_wait_cycle) - ); - LOG_INFO_V9( - "Thread %d: avg/task : %.3fus", my_thread_idx_, - 
p.submit_count > 0 ? cycles_to_us(total) / p.submit_count : 0.0 - ); + PTO2OrchProfilingData p = orchestrator_get_profiling(); + uint64_t total = + p.sync_cycle + p.alloc_cycle + p.args_cycle + p.lookup_cycle + p.insert_cycle + p.fanin_cycle; + if (total == 0) total = 1; // avoid div-by-zero + LOG_INFO_V9( + "Thread %d: === Orchestrator Profiling: %" PRId64 " tasks, total=%.3fus ===", my_thread_idx_, + static_cast(p.submit_count), cycles_to_us(total) + ); + LOG_INFO_V9( + "Thread %d: task+heap_alloc: %.3fus (%.1f%%) work=%.3fus wait=%.3fus atomics=%" PRIu64 "", + my_thread_idx_, cycles_to_us(p.alloc_cycle), p.alloc_cycle * 100.0 / total, + cycles_to_us(p.alloc_cycle - p.alloc_wait_cycle), cycles_to_us(p.alloc_wait_cycle), + static_cast(p.alloc_atomic_count) + ); + LOG_INFO_V9( + "Thread %d: sync_tensormap : %.3fus (%.1f%%)", my_thread_idx_, cycles_to_us(p.sync_cycle), + p.sync_cycle * 100.0 / total + ); + LOG_INFO_V9( + "Thread %d: lookup+dep : %.3fus (%.1f%%)", my_thread_idx_, cycles_to_us(p.lookup_cycle), + p.lookup_cycle * 100.0 / total + ); + LOG_INFO_V9( + "Thread %d: tensormap_ins : %.3fus (%.1f%%)", my_thread_idx_, cycles_to_us(p.insert_cycle), + p.insert_cycle * 100.0 / total + ); + LOG_INFO_V9( + "Thread %d: param_copy : %.3fus (%.1f%%) atomics=%" PRIu64 "", my_thread_idx_, + cycles_to_us(p.args_cycle), p.args_cycle * 100.0 / total, static_cast(p.args_atomic_count) + ); + LOG_INFO_V9( + "Thread %d: fanin+ready : %.3fus (%.1f%%) work=%.3fus wait=%.3fus", my_thread_idx_, + cycles_to_us(p.fanin_cycle), p.fanin_cycle * 100.0 / total, + cycles_to_us(p.fanin_cycle - p.fanin_wait_cycle), cycles_to_us(p.fanin_wait_cycle) + ); + LOG_INFO_V9( + "Thread %d: avg/task : %.3fus", my_thread_idx_, + p.submit_count > 0 ? 
cycles_to_us(total) / p.submit_count : 0.0 + ); #if PTO2_TENSORMAP_PROFILING - PTO2TensorMapProfilingData tp = pto2_tensormap_get_profiling(); - LOG_INFO_V9("Thread %d: === TensorMap Lookup Stats ===", my_thread_idx_); - LOG_INFO_V9( - "Thread %d: lookups : %" PRIu64 ", inserts: %" PRIu64 "", my_thread_idx_, - static_cast(tp.lookup_count), static_cast(tp.insert_count) - ); - LOG_INFO_V9( - "Thread %d: chain walked : total=%" PRIu64 ", avg=%.1f, max=%d", my_thread_idx_, - static_cast(tp.lookup_chain_total), - tp.lookup_count > 0 ? static_cast(tp.lookup_chain_total) / tp.lookup_count : 0.0, - tp.lookup_chain_max - ); - LOG_INFO_V9( - "Thread %d: overlap checks : %" PRIu64 ", hits=%" PRIu64 " (%.1f%%)", my_thread_idx_, - static_cast(tp.overlap_checks), static_cast(tp.overlap_hits), - tp.overlap_checks > 0 ? tp.overlap_hits * 100.0 / tp.overlap_checks : 0.0 - ); + PTO2TensorMapProfilingData tp = pto2_tensormap_get_profiling(); + LOG_INFO_V9("Thread %d: === TensorMap Lookup Stats ===", my_thread_idx_); + LOG_INFO_V9( + "Thread %d: lookups : %" PRIu64 ", inserts: %" PRIu64 "", my_thread_idx_, + static_cast(tp.lookup_count), static_cast(tp.insert_count) + ); + LOG_INFO_V9( + "Thread %d: chain walked : total=%" PRIu64 ", avg=%.1f, max=%d", my_thread_idx_, + static_cast(tp.lookup_chain_total), + tp.lookup_count > 0 ? static_cast(tp.lookup_chain_total) / tp.lookup_count : 0.0, + tp.lookup_chain_max + ); + LOG_INFO_V9( + "Thread %d: overlap checks : %" PRIu64 ", hits=%" PRIu64 " (%.1f%%)", my_thread_idx_, + static_cast(tp.overlap_checks), static_cast(tp.overlap_hits), + tp.overlap_checks > 0 ? tp.overlap_hits * 100.0 / tp.overlap_checks : 0.0 + ); #endif #endif // PTO2_ORCH_PROFILING - // Latch task count from PTO2 shared memory to hand off to the - // scheduler. 
The orchestrator's run window (start_time / end_time / - // submit_count) is no longer published to shared memory — the - // device LOG_INFO_V9 "orch_start=… orch_end=… orch_cost=…" line - // below carries the same envelope info for debugging, and - // host-side swimlane derives per-phase timing from the per-event - // AicpuPhaseRecord[] stream that already covers everything inside - // submit_task(). - int32_t total_tasks = 0; - if (rt->orchestrator.sm_header) { - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - total_tasks += - rt->orchestrator.sm_header->rings[r].fc.current_task_index.load(std::memory_order_acquire); - } + // Latch task count from PTO2 shared memory to hand off to the + // scheduler. The orchestrator's run window (start_time / end_time / + // submit_count) is no longer published to shared memory — the + // device LOG_INFO_V9 "orch_start=… orch_end=… orch_cost=…" line + // below carries the same envelope info for debugging, and + // host-side swimlane derives per-phase timing from the per-event + // AicpuPhaseRecord[] stream that already covers everything inside + // submit_task(). 
+ int32_t total_tasks = 0; + if (rt->orchestrator.sm_header) { + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + total_tasks += + rt->orchestrator.sm_header->rings[r].fc.current_task_index.load(std::memory_order_acquire); } + } #if PTO2_PROFILING - pto2_submitted_tasks = total_tasks; + pto2_submitted_tasks = total_tasks; #endif - // Signal completion to the orchestrator state machine - rt_orchestration_done(rt); + // Signal completion to the orchestrator state machine + rt_orchestration_done(rt); - sched_ctx_.on_orchestration_done(runtime, rt, my_thread_idx_, total_tasks); + sched_ctx_.on_orchestration_done(runtime, rt, my_thread_idx_, total_tasks); #if PTO2_PROFILING - uint64_t orch_end_ts = get_sys_cnt_aicpu(); + uint64_t orch_end_ts = get_sys_cnt_aicpu(); + LOG_INFO_V9( + "Thread %d: orch_start=%" PRIu64 " orch_end=%" PRIu64 " orch_cost=%.3fus", my_thread_idx_, + static_cast(orch_cycle_start), static_cast(orch_end_ts), + cycles_to_us(orch_end_ts - orch_cycle_start) + ); + if (pto2_submitted_tasks >= 0) { LOG_INFO_V9( - "Thread %d: orch_start=%" PRIu64 " orch_end=%" PRIu64 " orch_cost=%.3fus", my_thread_idx_, - static_cast(orch_cycle_start), static_cast(orch_end_ts), - cycles_to_us(orch_end_ts - orch_cycle_start) + "PTO2 total submitted tasks = %d, already executed %d tasks", pto2_submitted_tasks, + sched_ctx_.completed_tasks_count() ); - if (pto2_submitted_tasks >= 0) { - LOG_INFO_V9( - "PTO2 total submitted tasks = %d, already executed %d tasks", pto2_submitted_tasks, - sched_ctx_.completed_tasks_count() - ); - } -#endif - LOG_INFO_V0("Thread %d: Orchestrator completed", my_thread_idx_); - } - - // Scheduler thread (orchestrator threads skip dispatch when orch_to_sched_ is false) - if (!sched_ctx_.is_completed() && (my_thread_idx_ < sched_thread_num_ || orch_to_sched_)) { - // Device orchestration: wait for the primary orchestrator to initialize the SM header - while (!runtime_init_ready_.load(std::memory_order_acquire)) { - SPIN_WAIT_HINT(); - } - if (rt == 
nullptr) { - LOG_ERROR("Thread %d: rt is null after orchestrator error, skipping dispatch", my_thread_idx_); - } else { - sched_ctx_.bind_runtime(rt); - int32_t completed = sched_ctx_.resolve_and_dispatch(runtime, my_thread_idx_); - if (completed < 0) { - LOG_ERROR("Thread %d: Scheduler failed with rc=%d", my_thread_idx_, completed); - run_rc = completed; - } else { - LOG_INFO_V0("Thread %d: Executed %d tasks from runtime", my_thread_idx_, completed); - } - } } +#endif + LOG_INFO_V0("Thread %d: Orchestrator completed", my_thread_idx_); - LOG_INFO_V0("Thread %d: Scheduling Completed", my_thread_idx_); - - return run_rc; + return 0; } void AicpuExecutor::deinit(Runtime *runtime) { @@ -792,7 +798,8 @@ int32_t AicpuExecutor::performTimingRuns(Runtime *runtime) { barrier(); uint64_t t0_ts = get_sys_cnt_aicpu(); - rc |= run(runtime); + rc |= runOrchestration(runtime); + rc |= runScheduling(runtime); barrier(); uint64_t t1_ts = get_sys_cnt_aicpu(); LOG_INFO_V9("Thread %d: Warmup %d/%d Time: %luns", my_thread_idx_, i, warmupIterationCount, t1_ts - t0_ts); @@ -810,7 +817,8 @@ int32_t AicpuExecutor::performTimingRuns(Runtime *runtime) // Waiting for threads to arrive for before-timing. 
barrier(); uint64_t t0_ts = get_sys_cnt_aicpu(); - rc |= run(runtime); + rc |= runOrchestration(runtime); + rc |= runScheduling(runtime); barrier(); uint64_t t1_ts = get_sys_cnt_aicpu(); LOG_INFO_V9("Thread %d: Timing %d/%d time: %luns", my_thread_idx_, i, timingIterationCount, t1_ts - t0_ts); @@ -872,10 +880,18 @@ extern "C" int32_t aicpu_execute(Runtime *runtime) { } } - // Perform actual kernel run - int32_t rc = g_aicpu_executor.run(runtime); - if (rc != 0) { - LOG_ERROR("aicpu_execute: Thread execution failed with rc=%d", rc); + // Perform orchestration step + int32_t orch_rc = g_aicpu_executor.runOrchestration(runtime); + if (orch_rc != 0) { + LOG_ERROR("aicpu_execute: Orhestration execution failed with rc=%d", orch_rc); + return orch_rc; + } + + // Perform scheduling step + int32_t sched_rc = g_aicpu_executor.runScheduling(runtime); + if (sched_rc != 0) { + LOG_ERROR("aicpu_execute: Scheduling execution failed with rc=%d", sched_rc); + return sched_rc; } int32_t runtime_rc = read_pto2_runtime_status(runtime); @@ -898,10 +914,6 @@ extern "C" int32_t aicpu_execute(Runtime *runtime) { g_aicpu_executor.deinit(runtime); } - if (rc != 0) { - return rc; - } - LOG_INFO_V0("%s", "aicpu_execute: Kernel execution completed successfully"); return 0; } From 20140dd634ddf810ed1874dd8c778874782a4878 Mon Sep 17 00:00:00 2001 From: s00831018 Date: Thu, 21 May 2026 13:00:24 +0200 Subject: [PATCH 12/15] Adding timing statistics --- .../aicpu/aicpu_executor.cpp | 91 +++++++++++++++---- 1 file changed, 73 insertions(+), 18 deletions(-) diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index 9b8b4110c..fc34e216e 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #ifdef __linux__ #include #endif @@ -475,7 +476,7 @@ int32_t 
AicpuExecutor::runScheduling(Runtime *runtime) { } } - LOG_INFO_V0("Thread %d: Scheduling Completed", my_thread_idx_); + LOG_INFO_V9("Thread %d: Scheduling Completed", my_thread_idx_); return run_rc; } @@ -711,7 +712,7 @@ int32_t AicpuExecutor::runOrchestration(Runtime* runtime) ); } #endif - LOG_INFO_V0("Thread %d: Orchestrator completed", my_thread_idx_); + LOG_INFO_V9("Thread %d: Orchestrator completed", my_thread_idx_); return 0; } @@ -797,12 +798,12 @@ int32_t AicpuExecutor::performTimingRuns(Runtime *runtime) for (int32_t i = 0; i < warmupIterationCount; i++) { barrier(); - uint64_t t0_ts = get_sys_cnt_aicpu(); + uint64_t t0 = get_sys_cnt_aicpu(); rc |= runOrchestration(runtime); rc |= runScheduling(runtime); barrier(); - uint64_t t1_ts = get_sys_cnt_aicpu(); - LOG_INFO_V9("Thread %d: Warmup %d/%d Time: %luns", my_thread_idx_, i, warmupIterationCount, t1_ts - t0_ts); + uint64_t tf = get_sys_cnt_aicpu(); + LOG_INFO_V9("Thread %d: Warmup %d/%d Time: %luns", my_thread_idx_, i, warmupIterationCount, tf - t0); // Resetting execution back to start if (my_thread_idx_ == 0) { @@ -812,28 +813,81 @@ int32_t AicpuExecutor::performTimingRuns(Runtime *runtime) } // Second, perform timed runs (the ones that count) + std::vector orchTimes; + std::vector schedTimes; + std::vector runTimes; for (int32_t i = 0; i < timingIterationCount; i++) { // Waiting for threads to arrive for before-timing. 
barrier(); - uint64_t t0_ts = get_sys_cnt_aicpu(); + uint64_t t0 = get_sys_cnt_aicpu(); rc |= runOrchestration(runtime); + + uint64_t t1 = get_sys_cnt_aicpu(); rc |= runScheduling(runtime); barrier(); - uint64_t t1_ts = get_sys_cnt_aicpu(); - LOG_INFO_V9("Thread %d: Timing %d/%d time: %luns", my_thread_idx_, i, timingIterationCount, t1_ts - t0_ts); + uint64_t tf = get_sys_cnt_aicpu(); - // Resetting execution back to start + // Calculating time segments and adding them to the timing vector + const uint64_t orchTime = t1 - t0; + const uint64_t schedTime = tf - t1; + const uint64_t runTime = tf - t0; + orchTimes.push_back(orchTime); + schedTimes.push_back(schedTime); + runTimes.push_back(runTime); + + LOG_INFO_V9("Thread %d: Timing %d/%d Total Time: %luns (Orch: %luns + Sched: %luns)", my_thread_idx_, i, timingIterationCount, runTime, orchTime, schedTime); + + // Resetting execution back to start before the next run if (my_thread_idx_ == 0) { deinit(runtime); init(runtime); } } - if (rc != 0) LOG_ERROR("Thread %d - Timed runs failed with rc=%d", my_thread_idx_, rc); + if (rc != 0) + { + LOG_ERROR("Thread %d - Timed runs failed with rc=%d", my_thread_idx_, rc); + return rc; + } + + // The orchestrator thread now calculates and reports timing + if (my_thread_idx_ == sched_thread_num_) + { + uint64_t orchSum = 0; + uint64_t schedSum = 0; + uint64_t runSum = 0; + + for (const auto t : orchTimes) orchSum += t; + for (const auto t : schedTimes) schedSum += t; + for (const auto t : runTimes) runSum += t; + + // Calculating averages + const auto runCount = runTimes.size(); + double orchAvg = (double)orchSum / (double)runCount; + double schedAvg = (double)schedSum / (double)runCount; + double runAvg = (double)runSum / (double)runCount; + + // Calculating L2 norms + double orchDiff = 0; + double schedDiff = 0; + double runDiff = 0; - // Return code - return rc; + for (const auto t : orchTimes) orchDiff += ((double)t - orchAvg) * ((double)t - orchAvg) ; + for (const auto t : 
schedTimes) schedDiff += ((double)t - schedAvg) * ((double)t - schedAvg); + for (const auto t : runTimes) runDiff += ((double)t - runAvg) * ((double)t - runAvg) ; + + double orchStdDev = runCount == 1 ? 0.0 : std::sqrt(orchDiff / (double)(runCount-1)); + double schedStdDev = runCount == 1 ? 0.0 : std::sqrt(schedDiff / (double)(runCount-1)); + double runStdDev = runCount == 1 ? 0.0 : std::sqrt(runDiff / (double)(runCount-1)); + + LOG_INFO_V9("Thread %d: [Timing] Average Runtime over %d timed runs: ", my_thread_idx_, timingIterationCount); + LOG_INFO_V9("Thread %d: [Timing] + Orchestration: %10.0fns +- %6.0fns", my_thread_idx_, orchAvg, orchStdDev); + LOG_INFO_V9("Thread %d: [Timing] + Scheduling: %10.0fns +- %6.0fns", my_thread_idx_, schedAvg, schedStdDev); + LOG_INFO_V9("Thread %d: [Timing] + Run Total: %10.0fns +- %6.0fns", my_thread_idx_, runAvg, runStdDev); + } + + return 0; } // ===== Public Entry Point ===== @@ -894,13 +948,19 @@ extern "C" int32_t aicpu_execute(Runtime *runtime) { return sched_rc; } + // Reading PTO2 runtime status int32_t runtime_rc = read_pto2_runtime_status(runtime); - if (runtime_rc != 0) { LOG_ERROR("aicpu_execute: PTO2 runtime failed with rc=%d", runtime_rc); return runtime_rc; } + // Last thread cleans up + if (g_aicpu_executor.finished_.load(std::memory_order_acquire)) { + LOG_INFO_V0("aicpu_execute: Last thread finished, cleaning up"); + g_aicpu_executor.deinit(runtime); + } + // Shutting down int32_t shutdown_rc = g_aicpu_executor.shutdown(runtime); if (shutdown_rc != 0) { @@ -908,11 +968,6 @@ extern "C" int32_t aicpu_execute(Runtime *runtime) { return shutdown_rc; } - // Last thread cleans up - if (g_aicpu_executor.finished_.load(std::memory_order_acquire)) { - LOG_INFO_V0("aicpu_execute: Last thread finished, cleaning up"); - g_aicpu_executor.deinit(runtime); - } LOG_INFO_V0("%s", "aicpu_execute: Kernel execution completed successfully"); return 0; From 454f8755d98333f47c53958d1d36e4167f59c98b Mon Sep 17 00:00:00 2001 From: 
s00831018 Date: Thu, 21 May 2026 13:50:02 +0200 Subject: [PATCH 13/15] Moving initialization routine outside main loop --- .../aicpu/aicpu_executor.cpp | 4 +- .../runtime/scheduler/scheduler_context.h | 2 + .../runtime/scheduler/scheduler_dispatch.cpp | 56 ++++++++++--------- 3 files changed, 35 insertions(+), 27 deletions(-) diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index fc34e216e..59a73e465 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -223,6 +223,8 @@ int32_t AicpuExecutor::init(Runtime *runtime) { finished_count_.store(0, std::memory_order_release); + sched_ctx_.initializePerfCounters(); + // Loading orchestrator int32_t load_orch_rc = loadOrchestrator(runtime); if (load_orch_rc != 0) @@ -825,7 +827,7 @@ int32_t AicpuExecutor::performTimingRuns(Runtime *runtime) uint64_t t1 = get_sys_cnt_aicpu(); rc |= runScheduling(runtime); - barrier(); + uint64_t tf = get_sys_cnt_aicpu(); // Calculating time segments and adding them to the timing vector diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h index 943f3ed06..2b810c70f 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h @@ -67,6 +67,8 @@ class SchedulerContext { // Called by AicpuExecutor::deinit() during per-run teardown. 
void deinit(); + void initializePerfCounters(); + // ========================================================================= // Per-thread execution entry points (called by AicpuExecutor::run) // ========================================================================= diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp index e2970dbb3..1537d7292 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp @@ -314,6 +314,36 @@ void SchedulerContext::dispatch_shape( } } +void SchedulerContext::initializePerfCounters() +{ + // One-time init: assign perf buffers (one thread does it; others wait) + if (!pto2_init_done_.exchange(true, std::memory_order_acq_rel)) + { + LOG_INFO_V0("Initializing scheduler perf counters"); + +#if PTO2_PROFILING + if (is_dump_tensor_enabled()) { + dump_tensor_init(orch_to_sched_ ? 
thread_num_ : sched_thread_num_); + } +#endif + +#if PTO2_PROFILING + // Initialize PMU: program events, start counters, and pop initial buffers + if (is_pmu_enabled()) { + pmu_aicpu_init(physical_core_ids_, cores_total_num_); + LOG_INFO_V0("PMU profiling started on %d cores", cores_total_num_); + } +#endif + + LOG_INFO_V0("Initialized scheduler perf counters"); + pto2_init_complete_.store(true, std::memory_order_release); + } else { + while (!pto2_init_complete_.load(std::memory_order_acquire)) { + SPIN_WAIT_HINT(); + } + } +} + // ============================================================================= // Main scheduler dispatch loop // ============================================================================= @@ -340,32 +370,6 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ static_cast(header->rings[0].task_window_size) ); - // One-time init: assign perf buffers (one thread does it; others wait) - if (!pto2_init_done_.exchange(true, std::memory_order_acq_rel)) { - LOG_INFO_V0("Thread %d: doing one-time init", thread_idx); - -#if PTO2_PROFILING - if (is_dump_tensor_enabled()) { - dump_tensor_init(orch_to_sched_ ? 
thread_num_ : sched_thread_num_); - } -#endif - -#if PTO2_PROFILING - // Initialize PMU: program events, start counters, and pop initial buffers - if (is_pmu_enabled()) { - pmu_aicpu_init(physical_core_ids_, cores_total_num_); - LOG_INFO_V0("PMU profiling started on %d cores", cores_total_num_); - } -#endif - - LOG_INFO_V0("Thread %d: one-time init done", thread_idx); - pto2_init_complete_.store(true, std::memory_order_release); - } else { - while (!pto2_init_complete_.load(std::memory_order_acquire)) { - SPIN_WAIT_HINT(); - } - } - LOG_INFO_V0("Thread %d: PTO2 dispatch starting with %d cores", thread_idx, core_trackers_[thread_idx].core_num()); int32_t cur_thread_completed = 0; int32_t idle_iterations = 0; From 4da588ac243e4d10bea3c8d3ffcd84f73d63cdcf Mon Sep 17 00:00:00 2001 From: s00831018 Date: Thu, 21 May 2026 15:47:29 +0200 Subject: [PATCH 14/15] Now producing avg+stddev --- .../aicpu/aicpu_executor.cpp | 51 +++++-------------- 1 file changed, 14 insertions(+), 37 deletions(-) diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index 59a73e465..08a885f76 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -122,12 +122,11 @@ struct AicpuExecutor { int32_t thread_num_{0}; - // Barrier counters for synchronization (timing) across threads + // ====== Barrier counters for synchronization (timing) across threads std::atomic barrier_counter_in_{0}; std::atomic barrier_counter_out_{0}; // ===== Task queue state (managed by scheduler ready queues) ===== - std::atomic finished_count_{0}; std::atomic runtime_init_ready_{false}; @@ -186,6 +185,8 @@ struct AicpuExecutor { }; static AicpuExecutor g_aicpu_executor; + +// Thread-local identifier thread_local int32_t my_thread_idx_; // ===== AicpuExecutor Method Implementations ===== @@ -478,7 +479,7 @@ int32_t 
AicpuExecutor::runScheduling(Runtime *runtime) { } } - LOG_INFO_V9("Thread %d: Scheduling Completed", my_thread_idx_); + LOG_INFO_V0("Thread %d: Scheduling Completed", my_thread_idx_); return run_rc; } @@ -576,7 +577,7 @@ int32_t AicpuExecutor::runOrchestration(Runtime* runtime) runtime_init_ready_.store(true, std::memory_order_release); // Wait for scheduler's one-time init to complete - sched_ctx_.wait_pto2_init_complete(); + // sched_ctx_.wait_pto2_init_complete(); #if PTO2_PROFILING if (get_l2_perf_level() >= L2PerfLevel::ORCH_PHASES) { @@ -714,7 +715,7 @@ int32_t AicpuExecutor::runOrchestration(Runtime* runtime) ); } #endif - LOG_INFO_V9("Thread %d: Orchestrator completed", my_thread_idx_); + LOG_INFO_V0("Thread %d: Orchestrator completed", my_thread_idx_); return 0; } @@ -805,7 +806,7 @@ int32_t AicpuExecutor::performTimingRuns(Runtime *runtime) rc |= runScheduling(runtime); barrier(); uint64_t tf = get_sys_cnt_aicpu(); - LOG_INFO_V9("Thread %d: Warmup %d/%d Time: %luns", my_thread_idx_, i, warmupIterationCount, tf - t0); + LOG_INFO_V0("Thread %d: Warmup %d/%d Time: %luns", my_thread_idx_, i, warmupIterationCount, tf - t0); // Resetting execution back to start if (my_thread_idx_ == 0) { @@ -815,8 +816,6 @@ int32_t AicpuExecutor::performTimingRuns(Runtime *runtime) } // Second, perform timed runs (the ones that count) - std::vector orchTimes; - std::vector schedTimes; std::vector runTimes; for (int32_t i = 0; i < timingIterationCount; i++) { @@ -824,21 +823,15 @@ int32_t AicpuExecutor::performTimingRuns(Runtime *runtime) barrier(); uint64_t t0 = get_sys_cnt_aicpu(); rc |= runOrchestration(runtime); - - uint64_t t1 = get_sys_cnt_aicpu(); rc |= runScheduling(runtime); - + barrier(); uint64_t tf = get_sys_cnt_aicpu(); // Calculating time segments and adding them to the timing vector - const uint64_t orchTime = t1 - t0; - const uint64_t schedTime = tf - t1; const uint64_t runTime = tf - t0; - orchTimes.push_back(orchTime); - schedTimes.push_back(schedTime); 
runTimes.push_back(runTime); - LOG_INFO_V9("Thread %d: Timing %d/%d Total Time: %luns (Orch: %luns + Sched: %luns)", my_thread_idx_, i, timingIterationCount, runTime, orchTime, schedTime); + LOG_INFO_V0("Thread %d: Timing %d/%d Total Time: %luns", my_thread_idx_, i, timingIterationCount, runTime); // Resetting execution back to start before the next run if (my_thread_idx_ == 0) { @@ -856,37 +849,21 @@ int32_t AicpuExecutor::performTimingRuns(Runtime *runtime) // The orchestrator thread now calculates and reports timing if (my_thread_idx_ == sched_thread_num_) { - uint64_t orchSum = 0; - uint64_t schedSum = 0; + // Calculating timing sum over all runs uint64_t runSum = 0; - - for (const auto t : orchTimes) orchSum += t; - for (const auto t : schedTimes) schedSum += t; for (const auto t : runTimes) runSum += t; - // Calculating averages + // Calculating average const auto runCount = runTimes.size(); - double orchAvg = (double)orchSum / (double)runCount; - double schedAvg = (double)schedSum / (double)runCount; double runAvg = (double)runSum / (double)runCount; - // Calculating L2 norms - double orchDiff = 0; - double schedDiff = 0; + // Calculating stddev double runDiff = 0; - - for (const auto t : orchTimes) orchDiff += ((double)t - orchAvg) * ((double)t - orchAvg) ; - for (const auto t : schedTimes) schedDiff += ((double)t - schedAvg) * ((double)t - schedAvg); for (const auto t : runTimes) runDiff += ((double)t - runAvg) * ((double)t - runAvg) ; - - double orchStdDev = runCount == 1 ? 0.0 : std::sqrt(orchDiff / (double)(runCount-1)); - double schedStdDev = runCount == 1 ? 0.0 : std::sqrt(schedDiff / (double)(runCount-1)); double runStdDev = runCount == 1 ? 
0.0 : std::sqrt(runDiff / (double)(runCount-1)); - LOG_INFO_V9("Thread %d: [Timing] Average Runtime over %d timed runs: ", my_thread_idx_, timingIterationCount); - LOG_INFO_V9("Thread %d: [Timing] + Orchestration: %10.0fns +- %6.0fns", my_thread_idx_, orchAvg, orchStdDev); - LOG_INFO_V9("Thread %d: [Timing] + Scheduling: %10.0fns +- %6.0fns", my_thread_idx_, schedAvg, schedStdDev); - LOG_INFO_V9("Thread %d: [Timing] + Run Total: %10.0fns +- %6.0fns", my_thread_idx_, runAvg, runStdDev); + // Printing + LOG_INFO_V9("Thread %d: [Timing] Average Runtime over %d timed runs: %10.0fns +- %6.0fns", my_thread_idx_, timingIterationCount, runAvg, runStdDev); } return 0; From 726c3bc2894db524ce8b70f89b633c90882d9430 Mon Sep 17 00:00:00 2001 From: s00831018 Date: Thu, 21 May 2026 16:08:26 +0200 Subject: [PATCH 15/15] Adding missing barrier --- .../runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index 08a885f76..3d7249fcf 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -838,6 +838,9 @@ int32_t AicpuExecutor::performTimingRuns(Runtime *runtime) deinit(runtime); init(runtime); } + + // Synchronizing before actual run + barrier(); } if (rc != 0)