From a1ecccff6192b8282717dd479083bac53aab3688 Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Tue, 19 May 2026 15:24:17 +0200
Subject: [PATCH 01/15] Adding the capability of re-running a task within a
 single kernel run. This is important for accurate timing

---
 .../aicpu/aicpu_executor.cpp                  | 166 ++++++++++++------
 .../host/runtime_maker.cpp                    |  36 ++++
 .../runtime/runtime.h                         |   9 +
 .../runtime/shared/runtime.cpp                |   2 +
 4 files changed, 155 insertions(+), 58 deletions(-)
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
index 851015e21..a72e27c36 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -113,7 +113,7 @@ struct AicpuExecutor {
     bool orch_to_sched_{false};
 
     // ===== Thread management state =====
-    std::atomic<int32_t> thread_idx_{0};
+    std::atomic<int32_t> thread_idx_accumulator{0};
     std::atomic<bool> initialized_{false};
     std::atomic<bool> init_done_{false};
     std::atomic<bool> init_failed_{false};
@@ -121,6 +121,10 @@ struct AicpuExecutor {
 
     int32_t thread_num_{0};
 
+    // Barrier counters for synchronization (timing) across threads
+    std::atomic<uint64_t> barrier_counter_in_{0};
+    std::atomic<uint64_t> barrier_counter_out_{0};
+
     // ===== Task queue state (managed by scheduler ready queues) =====
 
     std::atomic<int32_t> finished_count_{0};
@@ -142,6 +146,19 @@ struct AicpuExecutor {
     int32_t init(Runtime *runtime);
     int32_t run(Runtime *runtime);
     void deinit(Runtime *runtime);
+    int32_t getThreadId() { return thread_idx_accumulator.fetch_add(1); }
+
+    // Two-phase (in/out) spin barrier: returns once all thread_num_ threads have called it; the "out" phase stops a
+    // fast thread from re-entering the "in" phase of the next barrier() call while others are still leaving this one.
+    inline void barrier()
+    {
+        barrier_counter_in_.fetch_add(1);
+        // Acquire (not relaxed) loads: writes made by the other threads before arriving are visible after the spin.
+        while (barrier_counter_in_.load(std::memory_order_acquire) % thread_num_ != 0) { SPIN_WAIT_HINT(); }
+        barrier_counter_out_.fetch_add(1);
+        while (barrier_counter_out_.load(std::memory_order_acquire) % thread_num_ != 0) { SPIN_WAIT_HINT(); }
+    }
+
 
     ~AicpuExecutor() {
         // Process-wide teardown (the single static instance dies here). Every
@@ -157,6 +174,7 @@ struct AicpuExecutor {
 };
 
 static AicpuExecutor g_aicpu_executor;
+thread_local int32_t my_thread_idx_;
 
 // ===== AicpuExecutor Method Implementations =====
 
@@ -202,16 +220,18 @@ int32_t AicpuExecutor::init(Runtime *runtime) {
  * Shutdown AICore - Send exit signal via registers to all AICore kernels
  */
 int32_t AicpuExecutor::run(Runtime *runtime) {
-    int32_t thread_idx = thread_idx_++;
     int32_t run_rc = 0;
-    LOG_INFO_V0("Thread %d: Start", thread_idx);
+    LOG_INFO_V0("Thread %d: at AicpuExecutor::Run", my_thread_idx_);
 
     // Orchestrator check
-    if (thread_idx >= sched_thread_num_) {
+    if (my_thread_idx_ >= sched_thread_num_) {
 #if PTO2_PROFILING
         uint64_t orch_cycle_start = 0;
         int32_t pto2_submitted_tasks = -1;
 #endif
+
+        LOG_INFO_V0("Thread %d: Orchestrator Running", my_thread_idx_);
+
         // Orchestrator thread: load + run the device orchestration SO. The braces
         // scope the per-callable dlopen / SO-table locals to this block.
         {
@@ -221,7 +241,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
             const int32_t callable_id = runtime->get_active_callable_id();
             if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) {
                 LOG_ERROR(
-                    "Thread %d: invalid callable_id %d (limit=%d)", thread_idx, callable_id, MAX_REGISTERED_CALLABLE_IDS
+                    "Thread %d: invalid callable_id %d (limit=%d)", my_thread_idx_, callable_id, MAX_REGISTERED_CALLABLE_IDS
                 );
                 runtime_init_ready_.store(true, std::memory_order_release);
                 return -1;
@@ -232,9 +252,10 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
             DeviceOrchestrationBindRuntimeFunc *p_bind = &orch_so_table_[callable_id].bind;
             DeviceOrchestrationConfigFunc *p_config_func = &orch_so_table_[callable_id].config_func;
             const bool reload_so = runtime->register_new_callable_id();
+            const bool so_in_use = orch_so_table_[callable_id].in_use;
 
-            if (reload_so) {
-                LOG_INFO_V0("Thread %d: New orch SO detected (callable_id=%d), (re)loading", thread_idx, callable_id);
+            if (reload_so && so_in_use == false) {
+                LOG_INFO_V0("Thread %d: New orch SO detected (callable_id=%d), (re)loading", my_thread_idx_, callable_id);
                 if (*p_handle != nullptr) {
                     dlclose(*p_handle);
                     *p_handle = nullptr;
@@ -253,7 +274,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
                 size_t so_size = runtime->get_dev_orch_so_size();
 
                 if (so_data == nullptr || so_size == 0) {
-                    LOG_ERROR("Thread %d: Device orchestration SO not set", thread_idx);
+                    LOG_ERROR("Thread %d: Device orchestration SO not set", my_thread_idx_);
                     // Unblock scheduler threads before returning so they don't spin forever.
                     runtime_init_ready_.store(true, std::memory_order_release);
                     return -1;
@@ -271,7 +292,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
                     int32_t fd = create_orch_so_file(candidate_dirs[i], callable_id, so_path, sizeof(so_path));
                     if (fd < 0) {
                         LOG_INFO_V0(
-                            "Thread %d: Cannot create SO at %s (errno=%d), trying next path", thread_idx, so_path, errno
+                            "Thread %d: Cannot create SO at %s (errno=%d), trying next path", my_thread_idx_, so_path, errno
                         );
                         continue;
                     }
@@ -279,17 +300,17 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
                     close(fd);
                     if (written != static_cast<ssize_t>(so_size)) {
                         LOG_INFO_V0(
-                            "Thread %d: Cannot write SO to %s (errno=%d), trying next path", thread_idx, so_path, errno
+                            "Thread %d: Cannot write SO to %s (errno=%d), trying next path", my_thread_idx_, so_path, errno
                         );
                         unlink(so_path);
                         continue;
                     }
                     file_created = true;
-                    LOG_INFO_V0("Thread %d: Created SO file at %s (%zu bytes)", thread_idx, so_path, so_size);
+                    LOG_INFO_V0("Thread %d: Created SO file at %s (%zu bytes)", my_thread_idx_, so_path, so_size);
                 }
 
                 if (!file_created) {
-                    LOG_ERROR("Thread %d: Failed to create SO file in any candidate path", thread_idx);
+                    LOG_ERROR("Thread %d: Failed to create SO file in any candidate path", my_thread_idx_);
                     // Unblock scheduler threads before returning so they don't spin forever.
                     runtime_init_ready_.store(true, std::memory_order_release);
                     return -1;
@@ -299,13 +320,13 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
                 void *handle = dlopen(so_path, RTLD_LAZY | RTLD_LOCAL);
                 const char *dlopen_err = dlerror();
                 if (handle == nullptr) {
-                    LOG_ERROR("Thread %d: dlopen failed: %s", thread_idx, dlopen_err ? dlopen_err : "unknown");
+                    LOG_ERROR("Thread %d: dlopen failed: %s", my_thread_idx_, dlopen_err ? dlopen_err : "unknown");
                     unlink(so_path);
                     // Unblock scheduler threads before returning so they don't spin forever.
                     runtime_init_ready_.store(true, std::memory_order_release);
                     return -1;
                 }
-                LOG_INFO_V0("Thread %d: dlopen succeeded, handle=%p", thread_idx, handle);
+                LOG_INFO_V0("Thread %d: dlopen succeeded, handle=%p", my_thread_idx_, handle);
 
                 // Unlink the on-disk SO immediately: dlopen has already mmap'd
                 // the image, so the kernel keeps the inode alive until the
@@ -330,7 +351,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
                 const char *entry_dlsym_error = dlerror();
                 if (entry_dlsym_error != nullptr) {
                     LOG_ERROR(
-                        "Thread %d: dlsym failed for entry symbol '%s': %s", thread_idx, entry_symbol, entry_dlsym_error
+                        "Thread %d: dlsym failed for entry symbol '%s': %s", my_thread_idx_, entry_symbol, entry_dlsym_error
                     );
                     dlclose(handle);
                     unlink(so_path);
@@ -339,7 +360,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
                     return -1;
                 }
                 if (orch_func == nullptr) {
-                    LOG_ERROR("Thread %d: dlsym returned NULL for entry symbol '%s'", thread_idx, entry_symbol);
+                    LOG_ERROR("Thread %d: dlsym returned NULL for entry symbol '%s'", my_thread_idx_, entry_symbol);
                     dlclose(handle);
                     unlink(so_path);
                     // Unblock scheduler threads before returning so they don't spin forever.
@@ -352,7 +373,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
                 const char *config_dlsym_error = dlerror();
                 if (config_dlsym_error != nullptr || config_func == nullptr) {
                     LOG_ERROR(
-                        "Thread %d: dlsym failed for config symbol '%s': %s", thread_idx, config_symbol,
+                        "Thread %d: dlsym failed for config symbol '%s': %s", my_thread_idx_, config_symbol,
                         config_dlsym_error ? config_dlsym_error : "NULL function pointer"
                     );
                     config_func = nullptr;
@@ -363,7 +384,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
                     reinterpret_cast<DeviceOrchestrationBindRuntimeFunc>(dlsym(handle, "framework_bind_runtime"));
                 const char *bind_runtime_error = dlerror();
                 if (bind_runtime_error != nullptr) {
-                    LOG_ERROR("Thread %d: dlsym failed for framework_bind_runtime: %s", thread_idx, bind_runtime_error);
+                    LOG_ERROR("Thread %d: dlsym failed for framework_bind_runtime: %s", my_thread_idx_, bind_runtime_error);
                     bind_runtime_func = nullptr;
                 }
 
@@ -375,11 +396,11 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
                 orch_so_table_[callable_id].in_use = true;
             } else {
                 LOG_INFO_V0(
-                    "Thread %d: Reusing cached orch SO handle=%p (callable_id=%d)", thread_idx, *p_handle, callable_id
+                    "Thread %d: Reusing cached orch SO handle=%p (callable_id=%d)", my_thread_idx_, *p_handle, callable_id
                 );
                 if (*p_handle == nullptr || *p_func == nullptr) {
                     LOG_ERROR(
-                        "Thread %d: reload=false but no cached SO handle/func for callable_id=%d", thread_idx,
+                        "Thread %d: reload=false but no cached SO handle/func for callable_id=%d", my_thread_idx_,
                         callable_id
                     );
                     // Unblock scheduler threads before returning so they don't spin forever.
@@ -391,13 +412,13 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
             // Validate arg count on every run (reload or cache hit).
             if (*p_config_func != nullptr) {
                 PTO2OrchestrationConfig cfg = (*p_config_func)(runtime->get_orch_args());
-                LOG_INFO_V0("Thread %d: Config: expected_args=%d", thread_idx, cfg.expected_arg_count);
+                LOG_INFO_V0("Thread %d: Config: expected_args=%d", my_thread_idx_, cfg.expected_arg_count);
                 if (cfg.expected_arg_count > 0) {
                     const ChipStorageTaskArgs &args_validate = runtime->get_orch_args();
                     int32_t actual_arg_count = args_validate.tensor_count() + args_validate.scalar_count();
                     if (actual_arg_count < cfg.expected_arg_count) {
                         LOG_ERROR(
-                            "Thread %d: arg_count %d < expected %d", thread_idx, actual_arg_count,
+                            "Thread %d: arg_count %d < expected %d", my_thread_idx_, actual_arg_count,
                             cfg.expected_arg_count
                         );
                         // Clean up cached state so a subsequent run does a full reload.
@@ -419,7 +440,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
                     }
                 }
             } else {
-                LOG_INFO_V0("Thread %d: No config function, using defaults", thread_idx);
+                LOG_INFO_V0("Thread %d: No config function, using defaults", my_thread_idx_);
             }
 
             // sm_handle / rt are bound to *this* run's memory and must be
@@ -427,17 +448,17 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
             // reused above.
             const ChipStorageTaskArgs &args = runtime->get_orch_args();
             int32_t arg_count = args.tensor_count() + args.scalar_count();
-            LOG_INFO_V0("Thread %d: sm_ptr=%p, arg_count=%d", thread_idx, runtime->get_gm_sm_ptr(), arg_count);
+            LOG_INFO_V0("Thread %d: sm_ptr=%p, arg_count=%d", my_thread_idx_, runtime->get_gm_sm_ptr(), arg_count);
             for (int32_t i = 0; i < args.tensor_count() && i < 20; i++) {
                 const ContinuousTensor &t = args.tensor(i);
                 LOG_INFO_V0(
-                    "Thread %d: orch_args[%d] = TENSOR(data=0x%lx, ndims=%u, dtype=%u)", thread_idx, i,
+                    "Thread %d: orch_args[%d] = TENSOR(data=0x%lx, ndims=%u, dtype=%u)", my_thread_idx_, i,
                     static_cast<uint64_t>(t.data), t.ndims, static_cast<unsigned>(t.dtype)
                 );
             }
             for (int32_t i = 0; i < args.scalar_count() && (args.tensor_count() + i) < 20; i++) {
                 LOG_INFO_V0(
-                    "Thread %d: orch_args[%d] = SCALAR(0x%lx)", thread_idx, args.tensor_count() + i,
+                    "Thread %d: orch_args[%d] = SCALAR(0x%lx)", my_thread_idx_, args.tensor_count() + i,
                     static_cast<uint64_t>(args.scalar(i))
                 );
             }
@@ -456,7 +477,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
                 dep_pool_capacity = static_cast<int32_t>(runtime->dep_pool_size);
             }
             LOG_INFO_V0(
-                "Thread %d: Ring sizes: task_window=%lu, heap=%lu, dep_pool=%d", thread_idx,
+                "Thread %d: Ring sizes: task_window=%lu, heap=%lu, dep_pool=%d", my_thread_idx_,
                 static_cast<uint64_t>(task_window_size), static_cast<uint64_t>(heap_size), dep_pool_capacity
             );
 
@@ -467,7 +488,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
             PTO2SharedMemoryHandle *sm_handle =
                 PTO2SharedMemoryHandle::create_from_buffer(sm_ptr, sm_size, task_window_size, heap_size);
             if (!sm_handle) {
-                LOG_ERROR("Thread %d: Failed to create shared memory handle", thread_idx);
+                LOG_ERROR("Thread %d: Failed to create shared memory handle", my_thread_idx_);
                 // Unblock scheduler threads before returning so they don't spin forever.
                 runtime_init_ready_.store(true, std::memory_order_release);
                 return -1;
@@ -475,7 +496,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
 
             rt = runtime_create_from_sm(PTO2_MODE_EXECUTE, sm_handle, gm_heap, heap_size, dep_pool_capacity);
             if (!rt) {
-                LOG_ERROR("Thread %d: Failed to create PTO2Runtime", thread_idx);
+                LOG_ERROR("Thread %d: Failed to create PTO2Runtime", my_thread_idx_);
                 sm_handle->destroy();
                 // Unblock scheduler threads before returning so they don't spin forever.
                 runtime_init_ready_.store(true, std::memory_order_release);
@@ -506,7 +527,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
 
 #if PTO2_PROFILING
             if (is_l2_swimlane_enabled()) {
-                l2_perf_aicpu_set_orch_thread_idx(thread_idx);
+                l2_perf_aicpu_set_orch_thread_idx(my_thread_idx_);
             }
 #endif
 
@@ -514,7 +535,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
             // set the per-thread queue index and pop the initial buffer before any
             // submit_task can fire inside orch_func_.
             if (is_dep_gen_enabled()) {
-                dep_gen_aicpu_set_orch_thread_idx(thread_idx);
+                dep_gen_aicpu_set_orch_thread_idx(my_thread_idx_);
                 dep_gen_aicpu_init();
             }
 
@@ -546,57 +567,57 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
                 p.sync_cycle + p.alloc_cycle + p.args_cycle + p.lookup_cycle + p.insert_cycle + p.fanin_cycle;
             if (total == 0) total = 1;  // avoid div-by-zero
             LOG_INFO_V9(
-                "Thread %d: === Orchestrator Profiling: %" PRId64 " tasks, total=%.3fus ===", thread_idx,
+                "Thread %d: === Orchestrator Profiling: %" PRId64 " tasks, total=%.3fus ===", my_thread_idx_,
                 static_cast<int64_t>(p.submit_count), cycles_to_us(total)
             );
             LOG_INFO_V9(
                 "Thread %d:   task+heap_alloc: %.3fus (%.1f%%)  work=%.3fus wait=%.3fus  atomics=%" PRIu64 "",
-                thread_idx, cycles_to_us(p.alloc_cycle), p.alloc_cycle * 100.0 / total,
+                my_thread_idx_, cycles_to_us(p.alloc_cycle), p.alloc_cycle * 100.0 / total,
                 cycles_to_us(p.alloc_cycle - p.alloc_wait_cycle), cycles_to_us(p.alloc_wait_cycle),
                 static_cast<uint64_t>(p.alloc_atomic_count)
             );
             LOG_INFO_V9(
-                "Thread %d:   sync_tensormap : %.3fus (%.1f%%)", thread_idx, cycles_to_us(p.sync_cycle),
+                "Thread %d:   sync_tensormap : %.3fus (%.1f%%)", my_thread_idx_, cycles_to_us(p.sync_cycle),
                 p.sync_cycle * 100.0 / total
             );
             LOG_INFO_V9(
-                "Thread %d:   lookup+dep     : %.3fus (%.1f%%)", thread_idx, cycles_to_us(p.lookup_cycle),
+                "Thread %d:   lookup+dep     : %.3fus (%.1f%%)", my_thread_idx_, cycles_to_us(p.lookup_cycle),
                 p.lookup_cycle * 100.0 / total
             );
             LOG_INFO_V9(
-                "Thread %d:   tensormap_ins  : %.3fus (%.1f%%)", thread_idx, cycles_to_us(p.insert_cycle),
+                "Thread %d:   tensormap_ins  : %.3fus (%.1f%%)", my_thread_idx_, cycles_to_us(p.insert_cycle),
                 p.insert_cycle * 100.0 / total
             );
             LOG_INFO_V9(
-                "Thread %d:   param_copy     : %.3fus (%.1f%%)  atomics=%" PRIu64 "", thread_idx,
+                "Thread %d:   param_copy     : %.3fus (%.1f%%)  atomics=%" PRIu64 "", my_thread_idx_,
                 cycles_to_us(p.args_cycle), p.args_cycle * 100.0 / total, static_cast<uint64_t>(p.args_atomic_count)
             );
             LOG_INFO_V9(
                 "Thread %d:   fanin+ready    : %.3fus (%.1f%%)  work=%.3fus wait=%.3fus  atomics=%" PRIu64 "",
-                thread_idx, cycles_to_us(p.fanin_cycle), p.fanin_cycle * 100.0 / total,
+                my_thread_idx_, cycles_to_us(p.fanin_cycle), p.fanin_cycle * 100.0 / total,
                 cycles_to_us(p.fanin_cycle - p.fanin_wait_cycle), cycles_to_us(p.fanin_wait_cycle),
                 static_cast<uint64_t>(p.fanin_atomic_count)
             );
             LOG_INFO_V9(
-                "Thread %d:   avg/task       : %.3fus", thread_idx,
+                "Thread %d:   avg/task       : %.3fus", my_thread_idx_,
                 p.submit_count > 0 ? cycles_to_us(total) / p.submit_count : 0.0
             );
 
 #if PTO2_TENSORMAP_PROFILING
             PTO2TensorMapProfilingData tp = pto2_tensormap_get_profiling();
-            LOG_INFO_V9("Thread %d: === TensorMap Lookup Stats ===", thread_idx);
+            LOG_INFO_V9("Thread %d: === TensorMap Lookup Stats ===", my_thread_idx_);
             LOG_INFO_V9(
-                "Thread %d:   lookups        : %" PRIu64 ", inserts: %" PRIu64 "", thread_idx,
+                "Thread %d:   lookups        : %" PRIu64 ", inserts: %" PRIu64 "", my_thread_idx_,
                 static_cast<uint64_t>(tp.lookup_count), static_cast<uint64_t>(tp.insert_count)
             );
             LOG_INFO_V9(
-                "Thread %d:   chain walked   : total=%" PRIu64 ", avg=%.1f, max=%d", thread_idx,
+                "Thread %d:   chain walked   : total=%" PRIu64 ", avg=%.1f, max=%d", my_thread_idx_,
                 static_cast<uint64_t>(tp.lookup_chain_total),
                 tp.lookup_count > 0 ? static_cast<double>(tp.lookup_chain_total) / tp.lookup_count : 0.0,
                 tp.lookup_chain_max
             );
             LOG_INFO_V9(
-                "Thread %d:   overlap checks : %" PRIu64 ", hits=%" PRIu64 " (%.1f%%)", thread_idx,
+                "Thread %d:   overlap checks : %" PRIu64 ", hits=%" PRIu64 " (%.1f%%)", my_thread_idx_,
                 static_cast<uint64_t>(tp.overlap_checks), static_cast<uint64_t>(tp.overlap_hits),
                 tp.overlap_checks > 0 ? tp.overlap_hits * 100.0 / tp.overlap_checks : 0.0
             );
@@ -637,12 +658,12 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
             pto2_submitted_tasks = total_tasks;
 #endif
 
-            sched_ctx_.on_orchestration_done(runtime, rt, thread_idx, total_tasks);
+            sched_ctx_.on_orchestration_done(runtime, rt, my_thread_idx_, total_tasks);
         }
 #if PTO2_PROFILING
         uint64_t orch_end_ts = get_sys_cnt_aicpu();
         LOG_INFO_V9(
-            "Thread %d: orch_start=%" PRIu64 " orch_end=%" PRIu64 " orch_cost=%.3fus", thread_idx,
+            "Thread %d: orch_start=%" PRIu64 " orch_end=%" PRIu64 " orch_cost=%.3fus", my_thread_idx_,
             static_cast<uint64_t>(orch_cycle_start), static_cast<uint64_t>(orch_end_ts),
             cycles_to_us(orch_end_ts - orch_cycle_start)
         );
@@ -653,42 +674,42 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
             );
         }
 #endif
-        LOG_INFO_V0("Thread %d: Orchestrator completed", thread_idx);
+        LOG_INFO_V0("Thread %d: Orchestrator completed", my_thread_idx_);
     }
 
     // Scheduler thread (orchestrator threads skip dispatch when orch_to_sched_ is false)
-    if (!sched_ctx_.is_completed() && (thread_idx < sched_thread_num_ || orch_to_sched_)) {
+    if (!sched_ctx_.is_completed() && (my_thread_idx_ < sched_thread_num_ || orch_to_sched_)) {
         // Device orchestration: wait for the primary orchestrator to initialize the SM header
         while (!runtime_init_ready_.load(std::memory_order_acquire)) {
             SPIN_WAIT_HINT();
         }
         if (rt == nullptr) {
-            LOG_ERROR("Thread %d: rt is null after orchestrator error, skipping dispatch", thread_idx);
+            LOG_ERROR("Thread %d: rt is null after orchestrator error, skipping dispatch", my_thread_idx_);
         } else {
             sched_ctx_.bind_runtime(rt);
-            int32_t completed = sched_ctx_.resolve_and_dispatch(runtime, thread_idx);
+            int32_t completed = sched_ctx_.resolve_and_dispatch(runtime, my_thread_idx_);
             if (completed < 0) {
-                LOG_ERROR("Thread %d: Scheduler failed with rc=%d", thread_idx, completed);
+                LOG_ERROR("Thread %d: Scheduler failed with rc=%d", my_thread_idx_, completed);
                 run_rc = completed;
             } else {
-                LOG_INFO_V0("Thread %d: Executed %d tasks from runtime", thread_idx, completed);
+                LOG_INFO_V0("Thread %d: Executed %d tasks from runtime", my_thread_idx_, completed);
             }
         }
     }
 
+    LOG_INFO_V0("Thread %d: Scheduling Completed", my_thread_idx_);
+
     // Always shutdown AICore — even if sched_ctx_.completed_ was already true.
     // platform_deinit_aicore_regs is idempotent; orchestrator threads have
-    // core_trackers_[thread_idx].core_num() == 0 so they skip the loop harmlessly.
-    int32_t shutdown_rc = sched_ctx_.shutdown(thread_idx);
+    // core_trackers_[my_thread_idx_].core_num() == 0 so they skip the loop harmlessly.
+    int32_t shutdown_rc = sched_ctx_.shutdown(my_thread_idx_);
     if (shutdown_rc != 0 && run_rc == 0) {
         run_rc = shutdown_rc;
     }
 
-    LOG_INFO_V0("Thread %d: Completed", thread_idx);
-
     // Check if this is the last thread to finish
     int32_t prev_finished = finished_count_.fetch_add(1, std::memory_order_acq_rel);
-    if (prev_finished + 1 == thread_num_) {
+    if ((prev_finished + 1) % thread_num_ == 0) {  // parenthesized: '%' binds tighter than '+'
         finished_.store(true, std::memory_order_release);
         // Destroy PTO2 runtime. sm_handle / rt are recreated every run so we
         // always tear them down here, but we keep the per-cid orch SO entries
@@ -742,7 +763,7 @@ void AicpuExecutor::deinit(Runtime *runtime) {
     initialized_.store(false, std::memory_order_release);
     init_done_.store(false, std::memory_order_release);
     init_failed_.store(false, std::memory_order_release);
-    thread_idx_.store(0, std::memory_order_release);
+    thread_idx_accumulator.store(0, std::memory_order_release);
     finished_.store(false, std::memory_order_release);
 
     LOG_INFO_V0("DeInit: AicpuExecutor reset complete");
@@ -769,6 +790,8 @@ extern "C" int32_t aicpu_execute(Runtime *runtime) {
         return -1;
     }
 
+    my_thread_idx_ = g_aicpu_executor.getThreadId();
+
     LOG_INFO_V0("%s", "aicpu_execute: Starting AICPU kernel execution");
 
     g_aicpu_executor.init(runtime);
@@ -780,7 +803,34 @@ extern "C" int32_t aicpu_execute(Runtime *runtime) {
         }
     }
 
-    int32_t rc = g_aicpu_executor.run(runtime);
+    // Return code for running the kernel. It must be zero (no error) for all runs
+    int32_t rc = 0;
+
+    // Adding timing runs, exclusively for performance evaluation:
+    const auto warmupIterationCount = runtime->get_warmup_iteration_count(); 
+    const auto timingIterationCount = runtime->get_timing_iteration_count();
+
+    // First, perform warmup to disregard any cold cache effects and thread initialization times
+    for (int32_t i = 0; i < warmupIterationCount; i++)
+    {
+        rc |= g_aicpu_executor.run(runtime);
+
+        // Waiting for threads to come back before re-running.
+        g_aicpu_executor.barrier();
+    }   
+
+    // Second, perform timed runs (the ones that count)
+    for (int32_t i = 0; i < timingIterationCount; i++)
+    {
+         rc |= g_aicpu_executor.run(runtime);
+
+        // Waiting for threads to come back before re-running.
+        g_aicpu_executor.barrier();
+    }   
+
+    // Perform actual kernel run
+    rc |= g_aicpu_executor.run(runtime);
+
     if (rc != 0) {
         LOG_ERROR("aicpu_execute: Thread execution failed with rc=%d", rc);
     }
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
index ab3cf838b..a0ecb1311 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
@@ -224,6 +224,42 @@ extern "C" int bind_prepared_to_runtime_impl(
     }
     int64_t t_args_end = _now_ms();
 
+    // (Timing) Number of warmup runs inside the aicpu executor, read from PTO2_WARMUP_ITERATION_COUNT
+    {
+        const char *env_warmup_iterations = std::getenv("PTO2_WARMUP_ITERATION_COUNT");
+        if (env_warmup_iterations) {
+            char *endptr = nullptr;
+            const int64_t val = strtol(env_warmup_iterations, &endptr, 10);
+            // Accept only a fully parsed, non-negative value that survives narrowing to int; else use the default.
+            if (endptr != env_warmup_iterations && *endptr == '\0' && val >= 0 && val == static_cast<int>(val)) {
+                runtime->warmup_iteration_count = static_cast<int>(val);
+            } else {
+                LOG_WARN("PTO2_WARMUP_ITERATION_COUNT=%s is invalid, using default %d", env_warmup_iterations,
+                         RUNTIME_DEFAULT_WARMUP_ITERATION_COUNT);
+                runtime->warmup_iteration_count = RUNTIME_DEFAULT_WARMUP_ITERATION_COUNT;
+            }
+        }
+        LOG_INFO_V0("Warmup iteration count: %d", runtime->warmup_iteration_count);
+    }
+
+    // (Timing) Number of timed runs inside the aicpu executor, read from PTO2_TIMING_ITERATION_COUNT
+    {
+        const char *env_timing_iterations = std::getenv("PTO2_TIMING_ITERATION_COUNT");
+        if (env_timing_iterations) {
+            char *endptr = nullptr;
+            const int64_t val = strtol(env_timing_iterations, &endptr, 10);
+            // Accept only a fully parsed, non-negative value that survives narrowing to int; else use the default.
+            if (endptr != env_timing_iterations && *endptr == '\0' && val >= 0 && val == static_cast<int>(val)) {
+                runtime->timing_iteration_count = static_cast<int>(val);
+            } else {
+                LOG_WARN("PTO2_TIMING_ITERATION_COUNT=%s is invalid, using default %d", env_timing_iterations,
+                         RUNTIME_DEFAULT_TIMING_ITERATION_COUNT);
+                runtime->timing_iteration_count = RUNTIME_DEFAULT_TIMING_ITERATION_COUNT;
+            }
+        }
+        LOG_INFO_V0("Timing iteration count: %d", runtime->timing_iteration_count);
+    }
+
     // Read ready queue shard count from environment for AICPU scheduler
     {
         const char *env_shards = std::getenv("PTO2_READY_QUEUE_SHARDS");
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
index bd9d08ae8..2b4637193 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
@@ -54,6 +54,8 @@
 
 // Default ready queue shards: one shard per worker thread (total minus orchestrator)
 constexpr int RUNTIME_DEFAULT_READY_QUEUE_SHARDS = PLATFORM_MAX_AICPU_THREADS - 1;
+constexpr int RUNTIME_DEFAULT_WARMUP_ITERATION_COUNT = 0;
+constexpr int RUNTIME_DEFAULT_TIMING_ITERATION_COUNT = 0;
 
 // =============================================================================
 // Data Structures
@@ -167,6 +169,10 @@ class Runtime {
     int sche_cpu_num;        // Number of AICPU threads for scheduling
     int ready_queue_shards;  // Number of ready queue shards (1..MAX_AICPU_THREADS, default MAX-1)
 
+    // Timing parameters (for precise performance estimation)
+    int warmup_iteration_count;
+    int timing_iteration_count;
+
     // Ring buffer size overrides (0 = use compile-time defaults)
     uint64_t task_window_size;
     uint64_t heap_size;
@@ -216,6 +222,9 @@ class Runtime {
     // Performance Profiling
     // =========================================================================
 
+    inline int32_t get_warmup_iteration_count() const { return warmup_iteration_count; };
+    inline int32_t get_timing_iteration_count() const { return timing_iteration_count; };
+
     // =========================================================================
     // Device orchestration (for AICPU thread 3)
     // =========================================================================
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
index d64a2b7d4..b43a222df 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
@@ -34,6 +34,8 @@ Runtime::Runtime() {
     worker_count = 0;
     sche_cpu_num = 1;
     ready_queue_shards = RUNTIME_DEFAULT_READY_QUEUE_SHARDS;
+    warmup_iteration_count = RUNTIME_DEFAULT_WARMUP_ITERATION_COUNT;
+    timing_iteration_count = RUNTIME_DEFAULT_TIMING_ITERATION_COUNT;
     task_window_size = 0;
     heap_size = 0;
     dep_pool_size = 0;

From 847b64c3f1e5df6ea3328e931a9ca6fa2bd82563 Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Wed, 20 May 2026 09:53:31 +0200
Subject: [PATCH 02/15] Adding PTO2_KERNEL_TIMING_ENABLED switch to gate warmup and timing runs

---
 .../aicpu/aicpu_executor.cpp                  | 35 ++++++++++---------
 .../host/runtime_maker.cpp                    | 32 +++++++++++++----
 .../runtime/runtime.h                         |  2 ++
 .../runtime/scheduler/scheduler_dispatch.cpp  |  3 ++
 .../runtime/shared/runtime.cpp                |  1 +
 5 files changed, 50 insertions(+), 23 deletions(-)

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
index a72e27c36..0460563b6 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -806,27 +806,28 @@ extern "C" int32_t aicpu_execute(Runtime *runtime) {
     // Return code for running the kernel. It must be zero (no error) for all runs
     int32_t rc = 0;
 
-    // Adding timing runs, exclusively for performance evaluation:
-    const auto warmupIterationCount = runtime->get_warmup_iteration_count(); 
-    const auto timingIterationCount = runtime->get_timing_iteration_count();
-
-    // First, perform warmup to disregard any cold cache effects and thread initialization times
-    for (int32_t i = 0; i < warmupIterationCount; i++)
+    // Performing timing evaluation, exclusively for performance evaluation:
+    const auto isTimingEnabled = runtime->get_timing_enabled(); 
+    if (isTimingEnabled == true)
     {
-        rc |= g_aicpu_executor.run(runtime);
+        const auto warmupIterationCount = runtime->get_warmup_iteration_count(); 
+        const auto timingIterationCount = runtime->get_timing_iteration_count();
 
-        // Waiting for threads to come back before re-running.
-        g_aicpu_executor.barrier();
-    }   
+        // First, perform warmup to disregard any cold cache effects and thread initialization times
+        for (int32_t i = 0; i < warmupIterationCount; i++) rc |= g_aicpu_executor.run(runtime);
 
-    // Second, perform timed runs (the ones that count)
-    for (int32_t i = 0; i < timingIterationCount; i++)
-    {
-         rc |= g_aicpu_executor.run(runtime);
+        // Second, perform timed runs (the ones that count)
+        for (int32_t i = 0; i < timingIterationCount; i++)
+        {
+            // Waiting for threads to arrive for before-timing.
+            g_aicpu_executor.barrier();
 
-        // Waiting for threads to come back before re-running.
-        g_aicpu_executor.barrier();
-    }   
+            rc |= g_aicpu_executor.run(runtime);
+
+            // Waiting for threads to come back for after-timing.
+            g_aicpu_executor.barrier();
+        }   
+    }
 
     // Perform actual kernel run
     rc |= g_aicpu_executor.run(runtime);
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
index a0ecb1311..13a4f99ef 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
@@ -224,9 +224,28 @@ extern "C" int bind_prepared_to_runtime_impl(
     }
     int64_t t_args_end = _now_ms();
 
-    // (Timing) Specify whether to perform warmup runs inside the aicpu executor
+    
+    // (Timing) Specify whether to perform timing analysis inside the aicpu executor
     {
-        const char *env_warmup_iterations = std::getenv("PTO2_WARMUP_ITERATION_COUNT");
+        const char *env_timing_iterations = std::getenv("PTO2_KERNEL_TIMING_ENABLED");
+        if (env_timing_iterations) {
+            std::string env_timing_iterations_string = std::string(env_timing_iterations);
+            bool isValidValue = false;
+            if (env_timing_iterations_string == "True") { runtime->is_timing_enabled = true; isValidValue = true; }
+            if (env_timing_iterations_string == "False") { runtime->is_timing_enabled = false; isValidValue = true; }
+            if (isValidValue == false) 
+            {
+                LOG_WARN("PTO2_KERNEL_TIMING_ENABLED=%s is invalid, using default: \"False\"", env_timing_iterations);
+                runtime->is_timing_enabled = false;
+            }
+        }
+        LOG_INFO_V0("Is kernel timing enabled? %s", runtime->is_timing_enabled ? "True" : "False");
+    }
+
+    // (Timing) Specify how many warmup runs inside the aicpu executor, if timing is enabled
+    if (runtime->is_timing_enabled == true)
+    {
+        const char *env_warmup_iterations = std::getenv("PTO2_KERNEL_TIMING_WARMUP_COUNT");
         if (env_warmup_iterations) {
             char *endptr;
             int64_t val = strtol(env_warmup_iterations, &endptr, 10);
@@ -234,7 +253,7 @@ extern "C" int bind_prepared_to_runtime_impl(
                 runtime->warmup_iteration_count = static_cast<int>(val);
             } else {
                 LOG_WARN(
-                    "PTO2_WARMUP_ITERATION_COUNT=%s is invalid, using default %d", env_warmup_iterations, RUNTIME_DEFAULT_WARMUP_ITERATION_COUNT
+                    "PTO2_KERNEL_TIMING_WARMUP_COUNT=%s is invalid, using default %d", env_warmup_iterations, RUNTIME_DEFAULT_WARMUP_ITERATION_COUNT
                 );
                 runtime->warmup_iteration_count = RUNTIME_DEFAULT_WARMUP_ITERATION_COUNT;
             }
@@ -242,9 +261,10 @@ extern "C" int bind_prepared_to_runtime_impl(
         LOG_INFO_V0("Warmup iteration count: %d", runtime->warmup_iteration_count);
     }
 
-    // (Timing) Specify whether to perform timing runs inside the aicpu executor
+    // (Timing) Specify how many timing runs inside the aicpu executor, if timing is enabled
+    if (runtime->is_timing_enabled == true)
     {
-        const char *env_timing_iterations = std::getenv("PTO2_TIMING_ITERATION_COUNT");
+        const char *env_timing_iterations = std::getenv("PTO2_KERNEL_TIMING_ITERATION_COUNT");
         if (env_timing_iterations) {
             char *endptr;
             int64_t val = strtol(env_timing_iterations, &endptr, 10);
@@ -252,7 +272,7 @@ extern "C" int bind_prepared_to_runtime_impl(
                 runtime->timing_iteration_count = static_cast<int>(val);
             } else {
                 LOG_WARN(
-                    "PTO2_TIMING_ITERATION_COUNT=%s is invalid, using default %d", env_timing_iterations, RUNTIME_DEFAULT_TIMING_ITERATION_COUNT
+                    "PTO2_KERNEL_TIMING_ITERATION_COUNT=%s is invalid, using default %d", env_timing_iterations, RUNTIME_DEFAULT_TIMING_ITERATION_COUNT
                 );
                 runtime->timing_iteration_count = RUNTIME_DEFAULT_TIMING_ITERATION_COUNT;
             }
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
index 2b4637193..ea518eca8 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
@@ -170,6 +170,7 @@ class Runtime {
     int ready_queue_shards;  // Number of ready queue shards (1..MAX_AICPU_THREADS, default MAX-1)
 
     // Timing parameters (for precise performance estimation)
+    bool is_timing_enabled;
     int warmup_iteration_count;
     int timing_iteration_count;
 
@@ -222,6 +223,7 @@ class Runtime {
     // Performance Profiling
     // =========================================================================
 
+    inline bool    get_timing_enabled() const { return is_timing_enabled; };
     inline int32_t get_warmup_iteration_count() const { return warmup_iteration_count; };
     inline int32_t get_timing_iteration_count() const { return timing_iteration_count; };
 
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
index a028fd138..54415364d 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
@@ -314,6 +314,8 @@ void SchedulerContext::dispatch_shape(
 // =============================================================================
 
 int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_idx) {
+    LOG_INFO_V0("Thread %d: At resolve_and_dispatch", thread_idx);
+
     always_assert(sched_ != nullptr);
     CoreTracker &tracker = core_trackers_[thread_idx];
     LOG_INFO_V0("Thread %d: resolve_and_dispatch entry", thread_idx);
@@ -392,6 +394,7 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
     l2_perf.sched_start_ts = get_sys_cnt_aicpu();
 #endif
 
+    LOG_INFO_V0("Thread %d: Scheduling Start. (Completed: %s)", thread_idx, completed_.load() ? "True" : "False");
     while (true) {
         if (completed_.load(std::memory_order_acquire)) {
             break;
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
index b43a222df..0bc984871 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
@@ -34,6 +34,7 @@ Runtime::Runtime() {
     worker_count = 0;
     sche_cpu_num = 1;
     ready_queue_shards = RUNTIME_DEFAULT_READY_QUEUE_SHARDS;
+    is_timing_enabled = false;
     warmup_iteration_count = RUNTIME_DEFAULT_WARMUP_ITERATION_COUNT;
     timing_iteration_count = RUNTIME_DEFAULT_TIMING_ITERATION_COUNT;
     task_window_size = 0;

From 38e50089d8d823e689c8cdd8d89396dd1170d20d Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Wed, 20 May 2026 10:58:44 +0200
Subject: [PATCH 03/15] Adding performTimingRuns and SchedulerContext::reset

---
 .../aicpu/aicpu_executor.cpp                  | 74 +++++++++++++------
 .../runtime/scheduler/scheduler_cold_path.cpp | 46 ++++++++++++
 .../runtime/scheduler/scheduler_context.h     |  3 +
 .../runtime/scheduler/scheduler_dispatch.cpp  |  2 -
 4 files changed, 101 insertions(+), 24 deletions(-)

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
index 0460563b6..e71da8341 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -145,8 +145,10 @@ struct AicpuExecutor {
     // ===== Methods =====
     int32_t init(Runtime *runtime);
     int32_t run(Runtime *runtime);
+    int32_t performTimingRuns(Runtime *runtime);
     void deinit(Runtime *runtime);
     int32_t getThreadId() { return thread_idx_accumulator.fetch_add(1); }
+    void resetSchedulerContext() { sched_ctx_.reset(); }
 
     // Barrier function to synchronize threads
     inline void barrier()
@@ -769,6 +771,54 @@ void AicpuExecutor::deinit(Runtime *runtime) {
     LOG_INFO_V0("DeInit: AicpuExecutor reset complete");
 }
 
+int32_t AicpuExecutor::performTimingRuns(Runtime *runtime)
+{
+    const auto warmupIterationCount = runtime->get_warmup_iteration_count(); 
+    const auto timingIterationCount = runtime->get_timing_iteration_count();
+
+    // Return code for running the kernel. It must be zero (no error) for all runs
+    int32_t rc = 0;
+
+    // First, perform warmup to disregard any cold cache effects and thread initialization times
+    for (int32_t i = 0; i < warmupIterationCount; i++)
+    {
+        barrier();
+        uint64_t t0_ts = get_sys_cnt_aicpu();
+        rc |= run(runtime);
+        barrier();
+        uint64_t t1_ts = get_sys_cnt_aicpu();
+        LOG_INFO_V9("Thread %d: Warmup %d/%d Time: %luns", my_thread_idx_, i, warmupIterationCount, t1_ts - t0_ts);
+
+        // Waiting for threads to come back for after-timing.
+        // deinit(runtime);
+        // init(runtime);
+    } 
+
+    // Second, perform timed runs (the ones that count)
+    for (int32_t i = 0; i < timingIterationCount; i++)
+    {
+        // Waiting for threads to arrive for before-timing.
+        barrier();
+        uint64_t t0_ts = get_sys_cnt_aicpu();
+        rc |= run(runtime);
+        barrier();
+        uint64_t t1_ts = get_sys_cnt_aicpu();
+        LOG_INFO_V9("Thread %d: Timing %d/%d time: %luns", my_thread_idx_, i, timingIterationCount, t1_ts - t0_ts);
+
+        // Waiting for threads to come back for after-timing.
+        // deinit(runtime);
+        // init(runtime);
+    }   
+
+    // A barier to make sure all threads agree at this point before running the actual run
+    barrier();
+
+    if (rc != 0) LOG_ERROR("Thread %d - Timed runs failed with rc=%d", my_thread_idx_, rc);
+
+    // Return code
+    return rc;
+}
+
 // ===== Public Entry Point =====
 
 /**
@@ -803,31 +853,11 @@ extern "C" int32_t aicpu_execute(Runtime *runtime) {
         }
     }
 
-    // Return code for running the kernel. It must be zero (no error) for all runs
+    // Return code. Must be zero for all runs
     int32_t rc = 0;
 
     // Performing timing evaluation, exclusively for performance evaluation:
-    const auto isTimingEnabled = runtime->get_timing_enabled(); 
-    if (isTimingEnabled == true)
-    {
-        const auto warmupIterationCount = runtime->get_warmup_iteration_count(); 
-        const auto timingIterationCount = runtime->get_timing_iteration_count();
-
-        // First, perform warmup to disregard any cold cache effects and thread initialization times
-        for (int32_t i = 0; i < warmupIterationCount; i++) rc |= g_aicpu_executor.run(runtime);
-
-        // Second, perform timed runs (the ones that count)
-        for (int32_t i = 0; i < timingIterationCount; i++)
-        {
-            // Waiting for threads to arrive for before-timing.
-            g_aicpu_executor.barrier();
-
-            rc |= g_aicpu_executor.run(runtime);
-
-            // Waiting for threads to come back for after-timing.
-            g_aicpu_executor.barrier();
-        }   
-    }
+    if (runtime->get_timing_enabled() == true) rc |= g_aicpu_executor.performTimingRuns(runtime);
 
     // Perform actual kernel run
     rc |= g_aicpu_executor.run(runtime);
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
index 608efbc60..c1651de13 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
@@ -863,6 +863,52 @@ int32_t SchedulerContext::init(
     return 0;
 }
 
+void SchedulerContext::reset()
+{
+    // Reset all per-core execution state
+    for (int32_t i = 0; i < RUNTIME_MAX_WORKER; i++) {
+        core_exec_states_[i] = {};
+        core_exec_states_[i].running_reg_task_id = AICPU_TASK_INVALID;
+        core_exec_states_[i].pending_reg_task_id = AICPU_TASK_INVALID;
+    }
+
+    // Clear per-core dispatch payloads
+    memset(payload_per_core_, 0, sizeof(payload_per_core_));
+    memset(deferred_slab_per_core_, 0, sizeof(deferred_slab_per_core_));
+
+    // Reset sync-start drain coordination — a previous run that aborted mid-drain
+    // would otherwise leave dirty pending/elected/ack state for the next reuse.
+    drain_state_.sync_start_pending.store(0, std::memory_order_release);
+    drain_state_.drain_worker_elected.store(0, std::memory_order_release);
+    drain_state_.drain_ack_mask.store(0, std::memory_order_release);
+    drain_state_.pending_task = nullptr;
+
+    // Reset task counters and orchestrator state
+    completed_tasks_.store(0, std::memory_order_release);
+    total_tasks_ = 0;
+    orchestrator_done_ = false;
+    pto2_init_done_.store(false, std::memory_order_release);
+    pto2_init_complete_.store(false, std::memory_order_release);
+
+    // Reset core transition state
+    transition_requested_.store(false, std::memory_order_release);
+    wait_reassign_.store(0, std::memory_order_release);
+    reassigned_.store(false, std::memory_order_release);
+    completed_.store(false, std::memory_order_release);
+
+    // Reset core discovery and assignment state
+    aic_count_ = 0;
+    aiv_count_ = 0;
+    cores_total_num_ = 0;
+    thread_num_ = 0;
+    sched_thread_num_ = 0;
+    orch_to_sched_ = false;
+    active_sched_threads_ = 0;
+    for (int32_t t = 0; t < MAX_AICPU_THREADS; t++) {
+        core_trackers_[t] = CoreTracker{};
+    }
+}
+
 void SchedulerContext::deinit() {
     // Reset all per-core execution state
     for (int32_t i = 0; i < RUNTIME_MAX_WORKER; i++) {
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
index 79fc6b648..5908c8615 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
@@ -62,6 +62,9 @@ class SchedulerContext {
     int32_t
     init(Runtime *runtime, int32_t thread_num, int32_t sched_thread_num, bool orch_to_sched, uint64_t regs_base);
 
+    // Resets all scheduling progress to before execution
+    void reset();
+
     // Reset all SchedulerContext-owned state to its post-construction defaults.
     // Called by AicpuExecutor::deinit() during per-run teardown.
     void deinit();
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
index 54415364d..224d17aab 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
@@ -314,8 +314,6 @@ void SchedulerContext::dispatch_shape(
 // =============================================================================
 
 int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_idx) {
-    LOG_INFO_V0("Thread %d: At resolve_and_dispatch", thread_idx);
-
     always_assert(sched_ != nullptr);
     CoreTracker &tracker = core_trackers_[thread_idx];
     LOG_INFO_V0("Thread %d: resolve_and_dispatch entry", thread_idx);

From f1953c504821c2b02a6f32111e73a37c989bb188 Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Wed, 20 May 2026 11:51:57 +0200
Subject: [PATCH 04/15] Simplifying: reset via deinit/init, drop SchedulerContext::reset

---
 .../aicpu/aicpu_executor.cpp                  | 16 +++----
 .../runtime/scheduler/scheduler_cold_path.cpp | 46 -------------------
 .../runtime/scheduler/scheduler_context.h     |  3 --
 3 files changed, 6 insertions(+), 59 deletions(-)

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
index e71da8341..aa8c66939 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -148,7 +148,6 @@ struct AicpuExecutor {
     int32_t performTimingRuns(Runtime *runtime);
     void deinit(Runtime *runtime);
     int32_t getThreadId() { return thread_idx_accumulator.fetch_add(1); }
-    void resetSchedulerContext() { sched_ctx_.reset(); }
 
     // Barrier function to synchronize threads
     inline void barrier()
@@ -789,9 +788,9 @@ int32_t AicpuExecutor::performTimingRuns(Runtime *runtime)
         uint64_t t1_ts = get_sys_cnt_aicpu();
         LOG_INFO_V9("Thread %d: Warmup %d/%d Time: %luns", my_thread_idx_, i, warmupIterationCount, t1_ts - t0_ts);
 
-        // Waiting for threads to come back for after-timing.
-        // deinit(runtime);
-        // init(runtime);
+        // Resetting execution back to start
+        deinit(runtime);
+        init(runtime);
     } 
 
     // Second, perform timed runs (the ones that count)
@@ -805,14 +804,11 @@ int32_t AicpuExecutor::performTimingRuns(Runtime *runtime)
         uint64_t t1_ts = get_sys_cnt_aicpu();
         LOG_INFO_V9("Thread %d: Timing %d/%d time: %luns", my_thread_idx_, i, timingIterationCount, t1_ts - t0_ts);
 
-        // Waiting for threads to come back for after-timing.
-        // deinit(runtime);
-        // init(runtime);
+        // Resetting execution back to start
+        deinit(runtime);
+        init(runtime);
     }   
 
-    // A barier to make sure all threads agree at this point before running the actual run
-    barrier();
-
     if (rc != 0) LOG_ERROR("Thread %d - Timed runs failed with rc=%d", my_thread_idx_, rc);
 
     // Return code
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
index c1651de13..608efbc60 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
@@ -863,52 +863,6 @@ int32_t SchedulerContext::init(
     return 0;
 }
 
-void SchedulerContext::reset()
-{
-    // Reset all per-core execution state
-    for (int32_t i = 0; i < RUNTIME_MAX_WORKER; i++) {
-        core_exec_states_[i] = {};
-        core_exec_states_[i].running_reg_task_id = AICPU_TASK_INVALID;
-        core_exec_states_[i].pending_reg_task_id = AICPU_TASK_INVALID;
-    }
-
-    // Clear per-core dispatch payloads
-    memset(payload_per_core_, 0, sizeof(payload_per_core_));
-    memset(deferred_slab_per_core_, 0, sizeof(deferred_slab_per_core_));
-
-    // Reset sync-start drain coordination — a previous run that aborted mid-drain
-    // would otherwise leave dirty pending/elected/ack state for the next reuse.
-    drain_state_.sync_start_pending.store(0, std::memory_order_release);
-    drain_state_.drain_worker_elected.store(0, std::memory_order_release);
-    drain_state_.drain_ack_mask.store(0, std::memory_order_release);
-    drain_state_.pending_task = nullptr;
-
-    // Reset task counters and orchestrator state
-    completed_tasks_.store(0, std::memory_order_release);
-    total_tasks_ = 0;
-    orchestrator_done_ = false;
-    pto2_init_done_.store(false, std::memory_order_release);
-    pto2_init_complete_.store(false, std::memory_order_release);
-
-    // Reset core transition state
-    transition_requested_.store(false, std::memory_order_release);
-    wait_reassign_.store(0, std::memory_order_release);
-    reassigned_.store(false, std::memory_order_release);
-    completed_.store(false, std::memory_order_release);
-
-    // Reset core discovery and assignment state
-    aic_count_ = 0;
-    aiv_count_ = 0;
-    cores_total_num_ = 0;
-    thread_num_ = 0;
-    sched_thread_num_ = 0;
-    orch_to_sched_ = false;
-    active_sched_threads_ = 0;
-    for (int32_t t = 0; t < MAX_AICPU_THREADS; t++) {
-        core_trackers_[t] = CoreTracker{};
-    }
-}
-
 void SchedulerContext::deinit() {
     // Reset all per-core execution state
     for (int32_t i = 0; i < RUNTIME_MAX_WORKER; i++) {
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
index 5908c8615..79fc6b648 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
@@ -62,9 +62,6 @@ class SchedulerContext {
     int32_t
     init(Runtime *runtime, int32_t thread_num, int32_t sched_thread_num, bool orch_to_sched, uint64_t regs_base);
 
-    // Resets all scheduling progress to before execution
-    void reset();
-
     // Reset all SchedulerContext-owned state to its post-construction defaults.
     // Called by AicpuExecutor::deinit() during per-run teardown.
     void deinit();

From 3bf8780281c669762c0b396f8991d9eae19a92ce Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Wed, 20 May 2026 12:10:54 +0200
Subject: [PATCH 05/15] Addressing some agent suggestions

---
 .../aicpu/aicpu_executor.cpp                   | 18 +++++++++++-------
 .../host/runtime_maker.cpp                     |  2 +-
 .../tensormap_and_ringbuffer/runtime/runtime.h | 10 +++++-----
 3 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
index 717d3b83f..ba0187bd6 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -154,10 +154,10 @@ struct AicpuExecutor {
     {
         // Two phase barrier (in_out), guarantees that all threads are retained in the rendezvous point, until all of them have arrived
        barrier_counter_in_.fetch_add(1);
-       while (barrier_counter_in_.load(std::memory_order_relaxed) % thread_num_ != 0);
+       while (barrier_counter_in_.load(std::memory_order_acquire) % thread_num_ != 0);
        
        barrier_counter_out_.fetch_add(1);
-       while (barrier_counter_out_.load(std::memory_order_relaxed) % thread_num_ != 0);
+       while (barrier_counter_out_.load(std::memory_order_acquire) % thread_num_ != 0);
     }
 
 
@@ -698,7 +698,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
 
     // Check if this is the last thread to finish
     int32_t prev_finished = finished_count_.fetch_add(1, std::memory_order_acq_rel);
-    if (prev_finished + 1 % thread_num_ == 0) {
+    if ((prev_finished + 1) % thread_num_ == 0) {
         finished_.store(true, std::memory_order_release);
         // Destroy PTO2 runtime. sm_handle / rt are recreated every run so we
         // always tear them down here, but we keep the per-cid orch SO entries
@@ -777,8 +777,10 @@ int32_t AicpuExecutor::performTimingRuns(Runtime *runtime)
         LOG_INFO_V9("Thread %d: Warmup %d/%d Time: %luns", my_thread_idx_, i, warmupIterationCount, t1_ts - t0_ts);
 
         // Resetting execution back to start
-        deinit(runtime);
-        init(runtime);
+        if (my_thread_idx_ == 0) {
+            deinit(runtime);
+            init(runtime);
+        }
     } 
 
     // Second, perform timed runs (the ones that count)
@@ -793,8 +795,10 @@ int32_t AicpuExecutor::performTimingRuns(Runtime *runtime)
         LOG_INFO_V9("Thread %d: Timing %d/%d time: %luns", my_thread_idx_, i, timingIterationCount, t1_ts - t0_ts);
 
         // Resetting execution back to start
-        deinit(runtime);
-        init(runtime);
+        if (my_thread_idx_ == 0) {
+            deinit(runtime);
+            init(runtime);
+        }
     }   
 
     if (rc != 0) LOG_ERROR("Thread %d - Timed runs failed with rc=%d", my_thread_idx_, rc);
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
index 1209dece2..1abb6e855 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
@@ -277,7 +277,7 @@ extern "C" int bind_prepared_to_runtime_impl(
                 runtime->timing_iteration_count = RUNTIME_DEFAULT_TIMING_ITERATION_COUNT;
             }
         }
-        LOG_INFO_V0("Warmup iteration count: %d", runtime->timing_iteration_count);
+        LOG_INFO_V0("Timing iteration count: %d", runtime->timing_iteration_count);
     }
 
     // Read ready queue shard count from environment for AICPU scheduler
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
index efdce1321..4bd245bb5 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
@@ -176,11 +176,6 @@ class Runtime {
     int sche_cpu_num;        // Number of AICPU threads for scheduling
     int ready_queue_shards;  // Number of ready queue shards (1..MAX_AICPU_THREADS, default MAX-1)
 
-    // Timing parameters (for precise performance estimation)
-    bool is_timing_enabled;
-    int warmup_iteration_count;
-    int timing_iteration_count;
-
     // Ring buffer size overrides (0 = use compile-time defaults)
     uint64_t task_window_size;
     uint64_t heap_size;
@@ -220,6 +215,11 @@ class Runtime {
     char device_orch_func_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME];
     char device_orch_config_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME];
 
+    // Timing parameters (for precise performance estimation)
+    bool is_timing_enabled;
+    int warmup_iteration_count;
+    int timing_iteration_count;
+
 public:
     /**
      * Constructor - zero-initialize all arrays

From f9f25c3b44b98484a5281a35a518df379f23d0dc Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Wed, 20 May 2026 12:14:18 +0200
Subject: [PATCH 06/15] Fixes: use my_thread_idx_ in run(), move timing fields out of private section

---
 .../tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp |  6 +++---
 .../tensormap_and_ringbuffer/runtime/runtime.h        | 11 +++++------
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
index ba0187bd6..0f4dc79b4 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -528,7 +528,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
 
 #if PTO2_PROFILING
             if (get_l2_perf_level() >= L2PerfLevel::ORCH_PHASES) {
-                l2_perf_aicpu_set_orch_thread_idx(thread_idx);
+                l2_perf_aicpu_set_orch_thread_idx(my_thread_idx_);
             }
 #endif
 
@@ -594,7 +594,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
                 cycles_to_us(p.args_cycle), p.args_cycle * 100.0 / total, static_cast<uint64_t>(p.args_atomic_count)
             );
             LOG_INFO_V9(
-                "Thread %d:   fanin+ready    : %.3fus (%.1f%%)  work=%.3fus wait=%.3fus", thread_idx,
+                "Thread %d:   fanin+ready    : %.3fus (%.1f%%)  work=%.3fus wait=%.3fus", my_thread_idx_,
                 cycles_to_us(p.fanin_cycle), p.fanin_cycle * 100.0 / total,
                 cycles_to_us(p.fanin_cycle - p.fanin_wait_cycle), cycles_to_us(p.fanin_wait_cycle)
             );
@@ -647,7 +647,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
             // Signal completion to the orchestrator state machine
             rt_orchestration_done(rt);
 
-            sched_ctx_.on_orchestration_done(runtime, rt, thread_idx, total_tasks);
+            sched_ctx_.on_orchestration_done(runtime, rt, my_thread_idx_, total_tasks);
         }
 #if PTO2_PROFILING
         uint64_t orch_end_ts = get_sys_cnt_aicpu();
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
index 4bd245bb5..0da9b550b 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
@@ -190,6 +190,11 @@ class Runtime {
     // When false (default), orchestrator threads exit after orchestration without dispatching tasks.
     // Controlled via PTO2_ORCH_TO_SCHED environment variable.
     bool orch_to_sched;
+    
+    // Timing parameters (for precise performance estimation)
+    bool is_timing_enabled;
+    int warmup_iteration_count;
+    int timing_iteration_count;
 
 private:
     // Kernel binary tracking for cleanup
@@ -214,12 +219,6 @@ class Runtime {
     bool register_new_callable_id_;
     char device_orch_func_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME];
     char device_orch_config_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME];
-
-    // Timing parameters (for precise performance estimation)
-    bool is_timing_enabled;
-    int warmup_iteration_count;
-    int timing_iteration_count;
-
 public:
     /**
      * Constructor - zero-initialize all arrays

From 220d1d9faa6e4de60a4b63cbd9d74efb085e03ac Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Wed, 20 May 2026 13:05:11 +0200
Subject: [PATCH 07/15] Fix orch SO reload: reset new-callable flag on reload

---
 .../tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp      | 7 +++----
 .../runtime/tensormap_and_ringbuffer/runtime/runtime.h     | 1 +
 .../tensormap_and_ringbuffer/runtime/shared/runtime.cpp    | 1 +
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
index 0f4dc79b4..f840f24b6 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -253,10 +253,11 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
             DeviceOrchestrationBindRuntimeFunc *p_bind = &orch_so_table_[callable_id].bind;
             DeviceOrchestrationConfigFunc *p_config_func = &orch_so_table_[callable_id].config_func;
             const bool reload_so = runtime->register_new_callable_id();
-            const bool so_in_use = orch_so_table_[callable_id].in_use;
 
-            if (reload_so && so_in_use == false) {
+            if (reload_so) {
                 LOG_INFO_V0("Thread %d: New orch SO detected (callable_id=%d), (re)loading", my_thread_idx_, callable_id);
+                runtime->notify_callable_id_registered();
+                
                 if (*p_handle != nullptr) {
                     dlclose(*p_handle);
                     *p_handle = nullptr;
@@ -394,7 +395,6 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
                 *p_bind = bind_runtime_func;
                 *p_config_func = config_func;
                 snprintf(p_path, 256, "%s", so_path);
-                orch_so_table_[callable_id].in_use = true;
             } else {
                 LOG_INFO_V0(
                     "Thread %d: Reusing cached orch SO handle=%p (callable_id=%d)", my_thread_idx_, *p_handle, callable_id
@@ -434,7 +434,6 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
                         *p_func = nullptr;
                         *p_bind = nullptr;
                         *p_config_func = nullptr;
-                        orch_so_table_[callable_id].in_use = false;
                         // Unblock scheduler threads before returning so they don't spin forever.
                         runtime_init_ready_.store(true, std::memory_order_release);
                         return -1;
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
index 0da9b550b..271e50a7a 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
@@ -256,6 +256,7 @@ class Runtime {
     void set_active_callable_id(int32_t callable_id, bool is_new);
     int32_t get_active_callable_id() const;
     bool register_new_callable_id() const;
+    void notify_callable_id_registered();
     void set_device_orch_func_name(const char *name);
     const char *get_device_orch_func_name() const;
     void set_device_orch_config_name(const char *name);
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
index 0bc984871..4e6f25435 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
@@ -96,6 +96,7 @@ void Runtime::set_active_callable_id(int32_t callable_id, bool is_new) {
 int32_t Runtime::get_active_callable_id() const { return active_callable_id_; }
 
 bool Runtime::register_new_callable_id() const { return register_new_callable_id_; }
+void Runtime::notify_callable_id_registered() { register_new_callable_id_ = false; }
 
 void Runtime::set_device_orch_func_name(const char *name) {
     if (name == nullptr) {

From 192ef42a57efd85fc18658aadc4071bcac2652f7 Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Thu, 21 May 2026 10:14:06 +0200
Subject: [PATCH 08/15] Successfully running two consecutive inner runs

---
 .../aicpu/aicpu_executor.cpp                  | 90 +++++++++++--------
 .../runtime/scheduler/pto_scheduler.cpp       | 18 ++--
 .../runtime/scheduler/scheduler_dispatch.cpp  |  8 +-
 3 files changed, 71 insertions(+), 45 deletions(-)

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
index f840f24b6..d60a43601 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -145,6 +145,7 @@ struct AicpuExecutor {
     // ===== Methods =====
     int32_t init(Runtime *runtime);
     int32_t run(Runtime *runtime);
+    int32_t shutdown(Runtime *runtime);
     int32_t performTimingRuns(Runtime *runtime);
     void deinit(Runtime *runtime);
     int32_t getThreadId() { return thread_idx_accumulator.fetch_add(1); }
@@ -687,35 +688,6 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
 
     LOG_INFO_V0("Thread %d: Scheduling Completed", my_thread_idx_);
 
-    // Always shutdown AICore — even if sched_ctx_.completed_ was already true.
-    // platform_deinit_aicore_regs is idempotent; orchestrator threads have
-    // core_trackers_[my_thread_idx_].core_num() == 0 so they skip the loop harmlessly.
-    int32_t shutdown_rc = sched_ctx_.shutdown(my_thread_idx_);
-    if (shutdown_rc != 0 && run_rc == 0) {
-        run_rc = shutdown_rc;
-    }
-
-    // Check if this is the last thread to finish
-    int32_t prev_finished = finished_count_.fetch_add(1, std::memory_order_acq_rel);
-    if ((prev_finished + 1) % thread_num_ == 0) {
-        finished_.store(true, std::memory_order_release);
-        // Destroy PTO2 runtime. sm_handle / rt are recreated every run so we
-        // always tear them down here, but we keep the per-cid orch SO entries
-        // alive for the next run's cache-hit reuse (see run() reload_so branch).
-        if (rt != nullptr) {
-            // Clear g_current_runtime in this DSO and in the orchestration SO before destroying rt.
-            const int32_t callable_id = runtime->get_active_callable_id();
-            framework_bind_runtime(nullptr);
-            if (callable_id >= 0 && callable_id < MAX_REGISTERED_CALLABLE_IDS) {
-                DeviceOrchestrationBindRuntimeFunc bind = orch_so_table_[callable_id].bind;
-                if (bind != nullptr) {
-                    bind(nullptr);
-                }
-            }
-            runtime_destroy(rt);
-        }
-    }
-
     return run_rc;
 }
 
@@ -757,6 +729,37 @@ void AicpuExecutor::deinit(Runtime *runtime) {
     LOG_INFO_V0("DeInit: AicpuExecutor reset complete");
 }
 
+int32_t AicpuExecutor::shutdown(Runtime *runtime)
+{
+    // Always shutdown AICore — even if sched_ctx_.completed_ was already true.
+    // platform_deinit_aicore_regs is idempotent; orchestrator threads have
+    // core_trackers_[my_thread_idx_].core_num() == 0 so they skip the loop harmlessly.
+    int32_t shutdown_rc = sched_ctx_.shutdown(my_thread_idx_);
+
+    // Check if this is the last thread to finish
+    int32_t prev_finished = finished_count_.fetch_add(1, std::memory_order_acq_rel);
+    if ((prev_finished + 1) % thread_num_ == 0) {
+        finished_.store(true, std::memory_order_release);
+        // Destroy PTO2 runtime. sm_handle / rt are recreated every run so we
+        // always tear them down here, but we keep the per-cid orch SO entries
+        // alive for the next run's cache-hit reuse (see run() reload_so branch).
+        if (rt != nullptr) {
+            // Clear g_current_runtime in this DSO and in the orchestration SO before destroying rt.
+            const int32_t callable_id = runtime->get_active_callable_id();
+            framework_bind_runtime(nullptr);
+            if (callable_id >= 0 && callable_id < MAX_REGISTERED_CALLABLE_IDS) {
+                DeviceOrchestrationBindRuntimeFunc bind = orch_so_table_[callable_id].bind;
+                if (bind != nullptr) {
+                    bind(nullptr);
+                }
+            }
+            runtime_destroy(rt);
+        }
+    }
+
+    return shutdown_rc;
+}
+
 int32_t AicpuExecutor::performTimingRuns(Runtime *runtime)
 {
     const auto warmupIterationCount = runtime->get_warmup_iteration_count(); 
@@ -844,16 +847,38 @@ extern "C" int32_t aicpu_execute(Runtime *runtime) {
     int32_t rc = 0;
 
     // Performing timing evaluation, exclusively for performance evaluation:
-    if (runtime->get_timing_enabled() == true) rc |= g_aicpu_executor.performTimingRuns(runtime);
+    // if (runtime->get_timing_enabled() == true) rc |= g_aicpu_executor.performTimingRuns(runtime);
 
     // Perform actual kernel run
     rc |= g_aicpu_executor.run(runtime);
 
+    g_aicpu_executor.barrier();
+    if (my_thread_idx_ == 0)
+    {
+        g_aicpu_executor.deinit(runtime);
+        g_aicpu_executor.init(runtime);
+    }
+    g_aicpu_executor.barrier();
+
+    rc |= g_aicpu_executor.run(runtime);
+
     if (rc != 0) {
         LOG_ERROR("aicpu_execute: Thread execution failed with rc=%d", rc);
     }
 
     int32_t runtime_rc = read_pto2_runtime_status(runtime);
+    
+    if (runtime_rc != 0) {
+        LOG_ERROR("aicpu_execute: PTO2 runtime failed with rc=%d", runtime_rc);
+        return runtime_rc;
+    }
+
+    // Shutting down
+    int32_t shutdown_rc = g_aicpu_executor.shutdown(runtime);
+    if (shutdown_rc != 0) {
+        LOG_ERROR("aicpu_execute: shutdown failed with rc=%d", shutdown_rc);
+        return shutdown_rc;
+    }
 
     // Last thread cleans up
     if (g_aicpu_executor.finished_.load(std::memory_order_acquire)) {
@@ -861,11 +886,6 @@ extern "C" int32_t aicpu_execute(Runtime *runtime) {
         g_aicpu_executor.deinit(runtime);
     }
 
-    if (runtime_rc != 0) {
-        LOG_ERROR("aicpu_execute: PTO2 runtime failed with rc=%d", runtime_rc);
-        return runtime_rc;
-    }
-
     if (rc != 0) {
         return rc;
     }
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp
index 428897f3d..48066ac9d 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp
@@ -224,14 +224,14 @@ void PTO2SchedulerState::destroy() {
 
 void PTO2SchedulerState::print_stats() {
     PTO2SchedulerState *sched = this;
-    LOG_INFO_V0("=== Scheduler Statistics ===");
+    LOG_INFO_V9("=== Scheduler Statistics ===");
     for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
         if (sched->ring_sched_states[r].last_task_alive > 0) {
-            LOG_INFO_V0("Ring %d:", r);
-            LOG_INFO_V0("  last_task_alive: %d", sched->ring_sched_states[r].last_task_alive);
+            LOG_INFO_V9("Ring %d:", r);
+            LOG_INFO_V9("  last_task_alive: %d", sched->ring_sched_states[r].last_task_alive);
             auto &dp = sched->ring_sched_states[r].dep_pool;
             if (dp.top > 0) {
-                LOG_INFO_V0(
+                LOG_INFO_V9(
                     "  dep_pool: top=%d tail=%d used=%d high_water=%d capacity=%d", dp.top, dp.tail, dp.top - dp.tail,
                     dp.high_water, dp.capacity
                 );
@@ -242,19 +242,19 @@ void PTO2SchedulerState::print_stats() {
     LOG_INFO_V0("tasks_completed:   %lld", (long long)sched->tasks_completed.load(std::memory_order_relaxed));
     LOG_INFO_V0("tasks_consumed:    %lld", (long long)sched->tasks_consumed.load(std::memory_order_relaxed));
 #endif
-    LOG_INFO_V0("============================");
+    LOG_INFO_V9("============================");
 }
 
 void PTO2SchedulerState::print_queues() {
     PTO2SchedulerState *sched = this;
-    LOG_INFO_V0("=== Ready Queues ===");
+    LOG_INFO_V9("=== Ready Queues ===");
 
     const char *shape_names[] = {"AIC", "AIV", "MIX"};
 
     for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
-        LOG_INFO_V0("  %s: count=%" PRIu64, shape_names[i], sched->ready_queues[i].size());
+        LOG_INFO_V9("  %s: count=%" PRIu64, shape_names[i], sched->ready_queues[i].size());
     }
-    LOG_INFO_V0("  DUMMY: count=%" PRIu64, sched->dummy_ready_queue.size());
+    LOG_INFO_V9("  DUMMY: count=%" PRIu64, sched->dummy_ready_queue.size());
 
-    LOG_INFO_V0("====================");
+    LOG_INFO_V9("====================");
 }
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
index eccd96af1..e2970dbb3 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
@@ -238,6 +238,7 @@ void SchedulerContext::dispatch_shape(
     int32_t thread_idx, PTO2ResourceShape shape, CoreTracker::DispatchPhase phase, PTO2LocalReadyBuffer &local_buf,
     CoreTracker &tracker, bool &entered_drain, bool &made_progress, bool &try_pushed
 ) {
+
 #if PTO2_SCHED_PROFILING
     auto &l2_perf = sched_l2_perf_[thread_idx];
 #endif
@@ -260,15 +261,18 @@ void SchedulerContext::dispatch_shape(
             if (slot_state->active_mask.requires_sync_start()) {
                 if (is_pending) {
                     sched_->ready_queues[static_cast<int32_t>(shape)].push(slot_state);
+                    LOG_INFO_V9("Thread %d - Pushed task", thread_idx);
                     continue;
                 }
                 int32_t available = cores.count();
                 if (available < slot_state->logical_block_num) {
                     if (!enter_drain_mode(slot_state, slot_state->logical_block_num)) {
                         sched_->ready_queues[static_cast<int32_t>(shape)].push(slot_state);
+                        LOG_INFO_V9("Thread %d - Pushed task", thread_idx);
                     }
                     for (int rem = bi + 1; rem < got; rem++) {
                         sched_->ready_queues[static_cast<int32_t>(shape)].push(batch[rem]);
+                        LOG_INFO_V9("Thread %d - Pushed task", thread_idx);
                     }
                     entered_drain = true;
                     break;
@@ -277,6 +281,7 @@ void SchedulerContext::dispatch_shape(
 
             if (!cores.has_value()) {
                 sched_->ready_queues[static_cast<int32_t>(shape)].push_batch(&batch[bi], got - bi);
+                LOG_INFO_V9("Thread %d - Pushed task", thread_idx);
                 break;
             }
 
@@ -293,6 +298,7 @@ void SchedulerContext::dispatch_shape(
 
             if (slot_state->next_block_idx < slot_state->logical_block_num) {
                 sched_->ready_queues[static_cast<int32_t>(shape)].push(slot_state);
+                LOG_INFO_V9("Thread %d - Pushed task", thread_idx);
             }
             made_progress = true;
 #if PTO2_SCHED_PROFILING
@@ -391,7 +397,6 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
     l2_perf.sched_start_ts = get_sys_cnt_aicpu();
 #endif
 
-    LOG_INFO_V0("Thread %d: Scheduling Start. (Completed: %s)", thread_idx, completed_.load() ? "True" : "False");
     while (true) {
         if (completed_.load(std::memory_order_acquire)) {
             break;
@@ -578,6 +583,7 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
 #endif
             if (local_buf.count > 0) {
                 ready_queue.push_batch(local_buf.slot_states, local_buf.count);
+                LOG_INFO_V9("Thread %d - Pushed task", thread_idx);
                 local_buf.count = 0;
             }
         }

From 108bfc4f7ff472803dcecd4ea957bdd8c663693a Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Thu, 21 May 2026 10:27:57 +0200
Subject: [PATCH 09/15] Recovering timing runs

---
 .../aicpu/aicpu_executor.cpp                  | 23 +++++++------------
 1 file changed, 8 insertions(+), 15 deletions(-)

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
index d60a43601..e2c1320b6 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -843,25 +843,18 @@ extern "C" int32_t aicpu_execute(Runtime *runtime) {
         }
     }
 
-    // Return code. Must be zero for all runs
-    int32_t rc = 0;
-
     // Performing timing evaluation, exclusively for performance evaluation:
-    // if (runtime->get_timing_enabled() == true) rc |= g_aicpu_executor.performTimingRuns(runtime);
-
-    // Perform actual kernel run
-    rc |= g_aicpu_executor.run(runtime);
-
-    g_aicpu_executor.barrier();
-    if (my_thread_idx_ == 0)
+    if (runtime->get_timing_enabled() == true)
     {
-        g_aicpu_executor.deinit(runtime);
-        g_aicpu_executor.init(runtime);
+        int32_t timing_rc = g_aicpu_executor.performTimingRuns(runtime);
+        if (timing_rc != 0) {
+            LOG_ERROR("aicpu_execute: timing run failed with rc=%d", timing_rc);
+            return timing_rc;
+        }
     }
-    g_aicpu_executor.barrier();
-
-    rc |= g_aicpu_executor.run(runtime);
 
+    // Perform actual kernel run
+    int32_t rc = g_aicpu_executor.run(runtime);
     if (rc != 0) {
         LOG_ERROR("aicpu_execute: Thread execution failed with rc=%d", rc);
     }

From 55580d624b0d104d5e3c3148a4d0296af7e9e2a9 Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Thu, 21 May 2026 11:19:16 +0200
Subject: [PATCH 10/15] Separated orchestration loading from actual run

---
 .../aicpu/aicpu_executor.cpp                  | 773 +++++++++---------
 .../runtime/shared/runtime.cpp                |   1 -
 2 files changed, 396 insertions(+), 378 deletions(-)

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
index e2c1320b6..f86e7f5eb 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -131,7 +131,7 @@ struct AicpuExecutor {
     std::atomic<bool> runtime_init_ready_{false};
 
     // Cached orch args pointer set by the orchestration thread before scheduler
-    // init; consumed by the (*p_func)(*orch_args_cached_) invocation below.
+    // init; consumed by the (*p_func_)(*orch_args_cached_) invocation below.
     const ChipStorageTaskArgs *orch_args_cached_{nullptr};
 
     // Per-callable_id table. Single orch thread today, so first-write/read
@@ -142,10 +142,18 @@ struct AicpuExecutor {
     // ===== Scheduler context (owns all dispatch/completion/drain state) =====
     SchedulerContext sched_ctx_;
 
+    // ===== Orchestrator dynamically loaded structures
+    void **p_handle_{nullptr};
+    char *p_path_{nullptr};
+    DeviceOrchestrationFunc *p_func_{nullptr};
+    DeviceOrchestrationBindRuntimeFunc *p_bind_{nullptr};
+    DeviceOrchestrationConfigFunc *p_config_func_{nullptr};
+
     // ===== Methods =====
     int32_t init(Runtime *runtime);
     int32_t run(Runtime *runtime);
     int32_t shutdown(Runtime *runtime);
+    int32_t loadOrchestrator(Runtime* runtime);
     int32_t performTimingRuns(Runtime *runtime);
     void deinit(Runtime *runtime);
     int32_t getThreadId() { return thread_idx_accumulator.fetch_add(1); }
@@ -213,442 +221,453 @@ int32_t AicpuExecutor::init(Runtime *runtime) {
 
     finished_count_.store(0, std::memory_order_release);
 
+    // Loading orchestrator
+    int32_t load_orch_rc = loadOrchestrator(runtime);
+    if (load_orch_rc != 0)
+    {
+        LOG_ERROR("Thread %d: Failed to load orchestrator", my_thread_idx_);
+        return load_orch_rc;
+    }
+
     init_done_.store(true, std::memory_order_release);
     LOG_INFO_V0("AicpuExecutor: Init complete");
     return 0;
 }
 
-/**
- * Shutdown AICore - Send exit signal via registers to all AICore kernels
- */
-int32_t AicpuExecutor::run(Runtime *runtime) {
-    int32_t run_rc = 0;
-    LOG_INFO_V0("Thread %d: at AicpuExecutor::Run", my_thread_idx_);
+int32_t AicpuExecutor::loadOrchestrator(Runtime* runtime)
+{
+    LOG_INFO_V0("Thread %d: Orchestrator Loading", my_thread_idx_);
+
+    // Per-callable_id dispatch: the orch SO state lives in
+    // `orch_so_table_[callable_id]` keyed by registration order;
+    // reload is governed by `register_new_callable_id_`.
+    const int32_t callable_id = runtime->get_active_callable_id();
+    if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) {
+        LOG_ERROR(
+            "Thread %d: invalid callable_id %d (limit=%d)", my_thread_idx_, callable_id, MAX_REGISTERED_CALLABLE_IDS
+        );
+        runtime_init_ready_.store(true, std::memory_order_release);
+        return -1;
+    }
+    p_handle_ = &orch_so_table_[callable_id].handle;
+    p_path_ = orch_so_table_[callable_id].path;
+    p_func_ = &orch_so_table_[callable_id].func;
+    p_bind_ = &orch_so_table_[callable_id].bind;
+    p_config_func_ = &orch_so_table_[callable_id].config_func;
+    const bool reload_so = runtime->register_new_callable_id();
+
+    if (reload_so) {
+        LOG_INFO_V0("Thread %d: New orch SO detected (callable_id=%d), (re)loading", my_thread_idx_, callable_id);
+        
+        if (*p_handle_ != nullptr) {
+            dlclose(*p_handle_);
+            *p_handle_ = nullptr;
+            *p_func_ = nullptr;
+            *p_bind_ = nullptr;
+            if (p_path_[0] != '\0') {
+                // Unlink the old file so the new open() lands on a
+                // fresh inode — protects against SIGBUS / ETXTBSY when
+                // the kernel still has the old mapping pinned.
+                unlink(p_path_);
+                p_path_[0] = '\0';
+            }
+        }
 
-    // Orchestrator check
-    if (my_thread_idx_ >= sched_thread_num_) {
-#if PTO2_PROFILING
-        uint64_t orch_cycle_start = 0;
-        int32_t pto2_submitted_tasks = -1;
-#endif
+        const void *so_data = reinterpret_cast<const void *>(runtime->get_dev_orch_so_addr());
+        size_t so_size = runtime->get_dev_orch_so_size();
 
-        LOG_INFO_V0("Thread %d: Orchestrator Running", my_thread_idx_);
+        if (so_data == nullptr || so_size == 0) {
+            LOG_ERROR("Thread %d: Device orchestration SO not set", my_thread_idx_);
+            // Unblock scheduler threads before returning so they don't spin forever.
+            runtime_init_ready_.store(true, std::memory_order_release);
+            return -1;
+        }
 
-        // Orchestrator thread: load + run the device orchestration SO. The braces
-        // scope the per-callable dlopen / SO-table locals to this block.
-        {
-            // Per-callable_id dispatch: the orch SO state lives in
-            // `orch_so_table_[callable_id]` keyed by registration order;
-            // reload is governed by `register_new_callable_id_`.
-            const int32_t callable_id = runtime->get_active_callable_id();
-            if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) {
-                LOG_ERROR(
-                    "Thread %d: invalid callable_id %d (limit=%d)", my_thread_idx_, callable_id, MAX_REGISTERED_CALLABLE_IDS
+        // Try multiple paths that may allow execution on AICPU
+        char so_path[256];
+        bool file_created = false;
+        const char *candidate_dirs[] = {
+            "/usr/lib64/aicpu_kernels/0/aicpu_kernels_device", "/usr/lib64", "/lib64", "/var/tmp", "/tmp"
+        };
+        const int32_t num_candidates = sizeof(candidate_dirs) / sizeof(candidate_dirs[0]);
+
+        for (int32_t i = 0; i < num_candidates && !file_created; i++) {
+            int32_t fd = create_orch_so_file(candidate_dirs[i], callable_id, so_path, sizeof(so_path));
+            if (fd < 0) {
+                LOG_INFO_V0(
+                    "Thread %d: Cannot create SO at %s (errno=%d), trying next path", my_thread_idx_, so_path, errno
                 );
-                runtime_init_ready_.store(true, std::memory_order_release);
-                return -1;
+                continue;
             }
-            void **p_handle = &orch_so_table_[callable_id].handle;
-            char *p_path = orch_so_table_[callable_id].path;
-            DeviceOrchestrationFunc *p_func = &orch_so_table_[callable_id].func;
-            DeviceOrchestrationBindRuntimeFunc *p_bind = &orch_so_table_[callable_id].bind;
-            DeviceOrchestrationConfigFunc *p_config_func = &orch_so_table_[callable_id].config_func;
-            const bool reload_so = runtime->register_new_callable_id();
-
-            if (reload_so) {
-                LOG_INFO_V0("Thread %d: New orch SO detected (callable_id=%d), (re)loading", my_thread_idx_, callable_id);
-                runtime->notify_callable_id_registered();
-                
-                if (*p_handle != nullptr) {
-                    dlclose(*p_handle);
-                    *p_handle = nullptr;
-                    *p_func = nullptr;
-                    *p_bind = nullptr;
-                    if (p_path[0] != '\0') {
-                        // Unlink the old file so the new open() lands on a
-                        // fresh inode — protects against SIGBUS / ETXTBSY when
-                        // the kernel still has the old mapping pinned.
-                        unlink(p_path);
-                        p_path[0] = '\0';
-                    }
-                }
-
-                const void *so_data = reinterpret_cast<const void *>(runtime->get_dev_orch_so_addr());
-                size_t so_size = runtime->get_dev_orch_so_size();
-
-                if (so_data == nullptr || so_size == 0) {
-                    LOG_ERROR("Thread %d: Device orchestration SO not set", my_thread_idx_);
-                    // Unblock scheduler threads before returning so they don't spin forever.
-                    runtime_init_ready_.store(true, std::memory_order_release);
-                    return -1;
-                }
-
-                // Try multiple paths that may allow execution on AICPU
-                char so_path[256];
-                bool file_created = false;
-                const char *candidate_dirs[] = {
-                    "/usr/lib64/aicpu_kernels/0/aicpu_kernels_device", "/usr/lib64", "/lib64", "/var/tmp", "/tmp"
-                };
-                const int32_t num_candidates = sizeof(candidate_dirs) / sizeof(candidate_dirs[0]);
-
-                for (int32_t i = 0; i < num_candidates && !file_created; i++) {
-                    int32_t fd = create_orch_so_file(candidate_dirs[i], callable_id, so_path, sizeof(so_path));
-                    if (fd < 0) {
-                        LOG_INFO_V0(
-                            "Thread %d: Cannot create SO at %s (errno=%d), trying next path", my_thread_idx_, so_path, errno
-                        );
-                        continue;
-                    }
-                    ssize_t written = write(fd, so_data, so_size);
-                    close(fd);
-                    if (written != static_cast<ssize_t>(so_size)) {
-                        LOG_INFO_V0(
-                            "Thread %d: Cannot write SO to %s (errno=%d), trying next path", my_thread_idx_, so_path, errno
-                        );
-                        unlink(so_path);
-                        continue;
-                    }
-                    file_created = true;
-                    LOG_INFO_V0("Thread %d: Created SO file at %s (%zu bytes)", my_thread_idx_, so_path, so_size);
-                }
+            ssize_t written = write(fd, so_data, so_size);
+            close(fd);
+            if (written != static_cast<ssize_t>(so_size)) {
+                LOG_INFO_V0(
+                    "Thread %d: Cannot write SO to %s (errno=%d), trying next path", my_thread_idx_, so_path, errno
+                );
+                unlink(so_path);
+                continue;
+            }
+            file_created = true;
+            LOG_INFO_V0("Thread %d: Created SO file at %s (%zu bytes)", my_thread_idx_, so_path, so_size);
+        }
 
-                if (!file_created) {
-                    LOG_ERROR("Thread %d: Failed to create SO file in any candidate path", my_thread_idx_);
-                    // Unblock scheduler threads before returning so they don't spin forever.
-                    runtime_init_ready_.store(true, std::memory_order_release);
-                    return -1;
-                }
+        if (!file_created) {
+            LOG_ERROR("Thread %d: Failed to create SO file in any candidate path", my_thread_idx_);
+            // Unblock scheduler threads before returning so they don't spin forever.
+            runtime_init_ready_.store(true, std::memory_order_release);
+            return -1;
+        }
 
-                dlerror();
-                void *handle = dlopen(so_path, RTLD_LAZY | RTLD_LOCAL);
-                const char *dlopen_err = dlerror();
-                if (handle == nullptr) {
-                    LOG_ERROR("Thread %d: dlopen failed: %s", my_thread_idx_, dlopen_err ? dlopen_err : "unknown");
-                    unlink(so_path);
-                    // Unblock scheduler threads before returning so they don't spin forever.
-                    runtime_init_ready_.store(true, std::memory_order_release);
-                    return -1;
-                }
-                LOG_INFO_V0("Thread %d: dlopen succeeded, handle=%p", my_thread_idx_, handle);
-
-                // Unlink the on-disk SO immediately: dlopen has already mmap'd
-                // the image, so the kernel keeps the inode alive until the
-                // matching dlclose / process exit. This prevents stale
-                // libdevice_orch_<pid>_<cid>.so files from accumulating in
-                // /tmp when child processes exit via os._exit(0), which skips
-                // ~AicpuExecutor (worker.py: _sub/_chip/_child loops).
-                unlink(so_path);
+        dlerror();
+        void *handle = dlopen(so_path, RTLD_LAZY | RTLD_LOCAL);
+        const char *dlopen_err = dlerror();
+        if (handle == nullptr) {
+            LOG_ERROR("Thread %d: dlopen failed: %s", my_thread_idx_, dlopen_err ? dlopen_err : "unknown");
+            unlink(so_path);
+            // Unblock scheduler threads before returning so they don't spin forever.
+            runtime_init_ready_.store(true, std::memory_order_release);
+            return -1;
+        }
+        LOG_INFO_V0("Thread %d: dlopen succeeded, handle=%p", my_thread_idx_, handle);
+
+        // Unlink the on-disk SO immediately: dlopen has already mmap'd
+        // the image, so the kernel keeps the inode alive until the
+        // matching dlclose / process exit. This prevents stale
+        // libdevice_orch_<pid>_<cid>.so files from accumulating in
+        // /tmp when child processes exit via os._exit(0), which skips
+        // ~AicpuExecutor (worker.py: _sub/_chip/_child loops).
+        unlink(so_path);
+
+        const char *entry_symbol = runtime->get_device_orch_func_name();
+        if (entry_symbol == nullptr || entry_symbol[0] == '\0') {
+            entry_symbol = DEFAULT_ORCH_ENTRY_SYMBOL;
+        }
+        const char *config_symbol = runtime->get_device_orch_config_name();
+        if (config_symbol == nullptr || config_symbol[0] == '\0') {
+            config_symbol = DEFAULT_ORCH_CONFIG_SYMBOL;
+        }
 
-                const char *entry_symbol = runtime->get_device_orch_func_name();
-                if (entry_symbol == nullptr || entry_symbol[0] == '\0') {
-                    entry_symbol = DEFAULT_ORCH_ENTRY_SYMBOL;
-                }
-                const char *config_symbol = runtime->get_device_orch_config_name();
-                if (config_symbol == nullptr || config_symbol[0] == '\0') {
-                    config_symbol = DEFAULT_ORCH_CONFIG_SYMBOL;
-                }
+        dlerror();
+        DeviceOrchestrationFunc orch_func =
+            reinterpret_cast<DeviceOrchestrationFunc>(dlsym(handle, entry_symbol));
+        const char *entry_dlsym_error = dlerror();
+        if (entry_dlsym_error != nullptr) {
+            LOG_ERROR(
+                "Thread %d: dlsym failed for entry symbol '%s': %s", my_thread_idx_, entry_symbol, entry_dlsym_error
+            );
+            dlclose(handle);
+            unlink(so_path);
+            // Unblock scheduler threads before returning so they don't spin forever.
+            runtime_init_ready_.store(true, std::memory_order_release);
+            return -1;
+        }
+        if (orch_func == nullptr) {
+            LOG_ERROR("Thread %d: dlsym returned NULL for entry symbol '%s'", my_thread_idx_, entry_symbol);
+            dlclose(handle);
+            unlink(so_path);
+            // Unblock scheduler threads before returning so they don't spin forever.
+            runtime_init_ready_.store(true, std::memory_order_release);
+            return -1;
+        }
 
-                dlerror();
-                DeviceOrchestrationFunc orch_func =
-                    reinterpret_cast<DeviceOrchestrationFunc>(dlsym(handle, entry_symbol));
-                const char *entry_dlsym_error = dlerror();
-                if (entry_dlsym_error != nullptr) {
-                    LOG_ERROR(
-                        "Thread %d: dlsym failed for entry symbol '%s': %s", my_thread_idx_, entry_symbol, entry_dlsym_error
-                    );
-                    dlclose(handle);
-                    unlink(so_path);
-                    // Unblock scheduler threads before returning so they don't spin forever.
-                    runtime_init_ready_.store(true, std::memory_order_release);
-                    return -1;
-                }
-                if (orch_func == nullptr) {
-                    LOG_ERROR("Thread %d: dlsym returned NULL for entry symbol '%s'", my_thread_idx_, entry_symbol);
-                    dlclose(handle);
-                    unlink(so_path);
-                    // Unblock scheduler threads before returning so they don't spin forever.
-                    runtime_init_ready_.store(true, std::memory_order_release);
-                    return -1;
-                }
+        dlerror();
+        auto config_func = reinterpret_cast<DeviceOrchestrationConfigFunc>(dlsym(handle, config_symbol));
+        const char *config_dlsym_error = dlerror();
+        if (config_dlsym_error != nullptr || config_func == nullptr) {
+            LOG_ERROR(
+                "Thread %d: dlsym failed for config symbol '%s': %s", my_thread_idx_, config_symbol,
+                config_dlsym_error ? config_dlsym_error : "NULL function pointer"
+            );
+            config_func = nullptr;
+        }
 
-                dlerror();
-                auto config_func = reinterpret_cast<DeviceOrchestrationConfigFunc>(dlsym(handle, config_symbol));
-                const char *config_dlsym_error = dlerror();
-                if (config_dlsym_error != nullptr || config_func == nullptr) {
-                    LOG_ERROR(
-                        "Thread %d: dlsym failed for config symbol '%s': %s", my_thread_idx_, config_symbol,
-                        config_dlsym_error ? config_dlsym_error : "NULL function pointer"
-                    );
-                    config_func = nullptr;
-                }
+        dlerror();
+        auto bind_runtime_func =
+            reinterpret_cast<DeviceOrchestrationBindRuntimeFunc>(dlsym(handle, "framework_bind_runtime"));
+        const char *bind_runtime_error = dlerror();
+        if (bind_runtime_error != nullptr) {
+            LOG_ERROR("Thread %d: dlsym failed for framework_bind_runtime: %s", my_thread_idx_, bind_runtime_error);
+            bind_runtime_func = nullptr;
+        }
 
-                dlerror();
-                auto bind_runtime_func =
-                    reinterpret_cast<DeviceOrchestrationBindRuntimeFunc>(dlsym(handle, "framework_bind_runtime"));
-                const char *bind_runtime_error = dlerror();
-                if (bind_runtime_error != nullptr) {
-                    LOG_ERROR("Thread %d: dlsym failed for framework_bind_runtime: %s", my_thread_idx_, bind_runtime_error);
-                    bind_runtime_func = nullptr;
-                }
+        *p_handle_ = handle;
+        *p_func_ = orch_func;
+        *p_bind_ = bind_runtime_func;
+        *p_config_func_ = config_func;
+        snprintf(p_path_, 256, "%s", so_path);
+    } else {
+        LOG_INFO_V0(
+            "Thread %d: Reusing cached orch SO handle=%p (callable_id=%d)", my_thread_idx_, *p_handle_, callable_id
+        );
+        if (*p_handle_ == nullptr || *p_func_ == nullptr) {
+            LOG_ERROR(
+                "Thread %d: reload=false but no cached SO handle/func for callable_id=%d", my_thread_idx_,
+                callable_id
+            );
+            // Unblock scheduler threads before returning so they don't spin forever.
+            runtime_init_ready_.store(true, std::memory_order_release);
+            return -1;
+        }
+    }
 
-                *p_handle = handle;
-                *p_func = orch_func;
-                *p_bind = bind_runtime_func;
-                *p_config_func = config_func;
-                snprintf(p_path, 256, "%s", so_path);
-            } else {
-                LOG_INFO_V0(
-                    "Thread %d: Reusing cached orch SO handle=%p (callable_id=%d)", my_thread_idx_, *p_handle, callable_id
+    // Validate arg count on every run (reload or cache hit).
+    if (*p_config_func_ != nullptr) {
+        PTO2OrchestrationConfig cfg = (*p_config_func_)(runtime->get_orch_args());
+        LOG_INFO_V0("Thread %d: Config: expected_args=%d", my_thread_idx_, cfg.expected_arg_count);
+        if (cfg.expected_arg_count > 0) {
+            const ChipStorageTaskArgs &args_validate = runtime->get_orch_args();
+            int32_t actual_arg_count = args_validate.tensor_count() + args_validate.scalar_count();
+            if (actual_arg_count < cfg.expected_arg_count) {
+                LOG_ERROR(
+                    "Thread %d: arg_count %d < expected %d", my_thread_idx_, actual_arg_count,
+                    cfg.expected_arg_count
                 );
-                if (*p_handle == nullptr || *p_func == nullptr) {
-                    LOG_ERROR(
-                        "Thread %d: reload=false but no cached SO handle/func for callable_id=%d", my_thread_idx_,
-                        callable_id
-                    );
-                    // Unblock scheduler threads before returning so they don't spin forever.
-                    runtime_init_ready_.store(true, std::memory_order_release);
-                    return -1;
+                // Clean up cached state so a subsequent run does a full reload.
+                if (*p_handle_ != nullptr) {
+                    dlclose(*p_handle_);
+                    *p_handle_ = nullptr;
                 }
-            }
-
-            // Validate arg count on every run (reload or cache hit).
-            if (*p_config_func != nullptr) {
-                PTO2OrchestrationConfig cfg = (*p_config_func)(runtime->get_orch_args());
-                LOG_INFO_V0("Thread %d: Config: expected_args=%d", my_thread_idx_, cfg.expected_arg_count);
-                if (cfg.expected_arg_count > 0) {
-                    const ChipStorageTaskArgs &args_validate = runtime->get_orch_args();
-                    int32_t actual_arg_count = args_validate.tensor_count() + args_validate.scalar_count();
-                    if (actual_arg_count < cfg.expected_arg_count) {
-                        LOG_ERROR(
-                            "Thread %d: arg_count %d < expected %d", my_thread_idx_, actual_arg_count,
-                            cfg.expected_arg_count
-                        );
-                        // Clean up cached state so a subsequent run does a full reload.
-                        if (*p_handle != nullptr) {
-                            dlclose(*p_handle);
-                            *p_handle = nullptr;
-                        }
-                        if (p_path[0] != '\0') {
-                            unlink(p_path);
-                            p_path[0] = '\0';
-                        }
-                        *p_func = nullptr;
-                        *p_bind = nullptr;
-                        *p_config_func = nullptr;
-                        // Unblock scheduler threads before returning so they don't spin forever.
-                        runtime_init_ready_.store(true, std::memory_order_release);
-                        return -1;
-                    }
+                if (p_path_[0] != '\0') {
+                    unlink(p_path_);
+                    p_path_[0] = '\0';
                 }
-            } else {
-                LOG_INFO_V0("Thread %d: No config function, using defaults", my_thread_idx_);
+                *p_func_ = nullptr;
+                *p_bind_ = nullptr;
+                *p_config_func_ = nullptr;
+                // Unblock scheduler threads before returning so they don't spin forever.
+                runtime_init_ready_.store(true, std::memory_order_release);
+                return -1;
             }
+        }
+    } else {
+        LOG_INFO_V0("Thread %d: No config function, using defaults", my_thread_idx_);
+    }
 
-            // sm_handle / rt are bound to *this* run's memory and must be
-            // (re)created every run, regardless of whether the SO itself was
-            // reused above.
-            const ChipStorageTaskArgs &args = runtime->get_orch_args();
-            int32_t arg_count = args.tensor_count() + args.scalar_count();
-            LOG_INFO_V0("Thread %d: sm_ptr=%p, arg_count=%d", my_thread_idx_, runtime->get_gm_sm_ptr(), arg_count);
-            for (int32_t i = 0; i < args.tensor_count() && i < 20; i++) {
-                const ContinuousTensor &t = args.tensor(i);
-                LOG_INFO_V0(
-                    "Thread %d: orch_args[%d] = TENSOR(data=0x%lx, ndims=%u, dtype=%u)", my_thread_idx_, i,
-                    static_cast<uint64_t>(t.data), t.ndims, static_cast<unsigned>(t.dtype)
-                );
-            }
-            for (int32_t i = 0; i < args.scalar_count() && (args.tensor_count() + i) < 20; i++) {
-                LOG_INFO_V0(
-                    "Thread %d: orch_args[%d] = SCALAR(0x%lx)", my_thread_idx_, args.tensor_count() + i,
-                    static_cast<uint64_t>(args.scalar(i))
-                );
-            }
+    return 0;
+}
 
-            uint64_t task_window_size = PTO2_TASK_WINDOW_SIZE;
-            uint64_t heap_size = PTO2_HEAP_SIZE;
+/**
+ * Shutdown AICore - Send exit signal via registers to all AICore kernels
+ */
+int32_t AicpuExecutor::run(Runtime *runtime) {
+    int32_t run_rc = 0;
+    LOG_INFO_V0("Thread %d: at AicpuExecutor::Run", my_thread_idx_);
 
-            if (runtime->task_window_size > 0) {
-                task_window_size = runtime->task_window_size;
-            }
-            if (runtime->heap_size > 0) {
-                heap_size = runtime->heap_size;
-            }
-            int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE;
-            if (runtime->dep_pool_size > 0) {
-                dep_pool_capacity = static_cast<int32_t>(runtime->dep_pool_size);
-            }
+    // Orchestrator check
+    if (my_thread_idx_ >= sched_thread_num_) {
+#if PTO2_PROFILING
+        uint64_t orch_cycle_start = 0;
+        int32_t pto2_submitted_tasks = -1;
+#endif
+
+        // Orchestrator thread: load + run the device orchestration SO. 
+        LOG_INFO_V0("Thread %d: Orchestrator Running", my_thread_idx_);
+
+        // sm_handle / rt are bound to *this* run's memory and must be
+        // (re)created every run, regardless of whether the SO itself was
+        // reused above.
+        const ChipStorageTaskArgs &args = runtime->get_orch_args();
+        int32_t arg_count = args.tensor_count() + args.scalar_count();
+        LOG_INFO_V0("Thread %d: sm_ptr=%p, arg_count=%d", my_thread_idx_, runtime->get_gm_sm_ptr(), arg_count);
+        for (int32_t i = 0; i < args.tensor_count() && i < 20; i++) {
+            const ContinuousTensor &t = args.tensor(i);
+            LOG_INFO_V0(
+                "Thread %d: orch_args[%d] = TENSOR(data=0x%lx, ndims=%u, dtype=%u)", my_thread_idx_, i,
+                static_cast<uint64_t>(t.data), t.ndims, static_cast<unsigned>(t.dtype)
+            );
+        }
+        for (int32_t i = 0; i < args.scalar_count() && (args.tensor_count() + i) < 20; i++) {
             LOG_INFO_V0(
-                "Thread %d: Ring sizes: task_window=%lu, heap=%lu, dep_pool=%d", my_thread_idx_,
-                static_cast<uint64_t>(task_window_size), static_cast<uint64_t>(heap_size), dep_pool_capacity
+                "Thread %d: orch_args[%d] = SCALAR(0x%lx)", my_thread_idx_, args.tensor_count() + i,
+                static_cast<uint64_t>(args.scalar(i))
             );
+        }
 
-            void *sm_ptr = runtime->get_gm_sm_ptr();
-            void *gm_heap = runtime->get_gm_heap_ptr();
+        uint64_t task_window_size = PTO2_TASK_WINDOW_SIZE;
+        uint64_t heap_size = PTO2_HEAP_SIZE;
 
-            uint64_t sm_size = PTO2SharedMemoryHandle::calculate_size(task_window_size);
-            PTO2SharedMemoryHandle *sm_handle =
-                PTO2SharedMemoryHandle::create_from_buffer(sm_ptr, sm_size, task_window_size, heap_size);
-            if (!sm_handle) {
-                LOG_ERROR("Thread %d: Failed to create shared memory handle", my_thread_idx_);
-                // Unblock scheduler threads before returning so they don't spin forever.
-                runtime_init_ready_.store(true, std::memory_order_release);
-                return -1;
-            }
+        if (runtime->task_window_size > 0) {
+            task_window_size = runtime->task_window_size;
+        }
+        if (runtime->heap_size > 0) {
+            heap_size = runtime->heap_size;
+        }
+        int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE;
+        if (runtime->dep_pool_size > 0) {
+            dep_pool_capacity = static_cast<int32_t>(runtime->dep_pool_size);
+        }
+        LOG_INFO_V0(
+            "Thread %d: Ring sizes: task_window=%lu, heap=%lu, dep_pool=%d", my_thread_idx_,
+            static_cast<uint64_t>(task_window_size), static_cast<uint64_t>(heap_size), dep_pool_capacity
+        );
 
-            rt = runtime_create_from_sm(PTO2_MODE_EXECUTE, sm_handle, gm_heap, heap_size, dep_pool_capacity);
-            if (!rt) {
-                LOG_ERROR("Thread %d: Failed to create PTO2Runtime", my_thread_idx_);
-                sm_handle->destroy();
-                // Unblock scheduler threads before returning so they don't spin forever.
-                runtime_init_ready_.store(true, std::memory_order_release);
-                return -1;
-            }
+        void *sm_ptr = runtime->get_gm_sm_ptr();
+        void *gm_heap = runtime->get_gm_heap_ptr();
+
+        uint64_t sm_size = PTO2SharedMemoryHandle::calculate_size(task_window_size);
+        PTO2SharedMemoryHandle *sm_handle =
+            PTO2SharedMemoryHandle::create_from_buffer(sm_ptr, sm_size, task_window_size, heap_size);
+        if (!sm_handle) {
+            LOG_ERROR("Thread %d: Failed to create shared memory handle", my_thread_idx_);
+            // Unblock scheduler threads before returning so they don't spin forever.
+            runtime_init_ready_.store(true, std::memory_order_release);
+            return -1;
+        }
+
+        rt = runtime_create_from_sm(PTO2_MODE_EXECUTE, sm_handle, gm_heap, heap_size, dep_pool_capacity);
+        if (!rt) {
+            LOG_ERROR("Thread %d: Failed to create PTO2Runtime", my_thread_idx_);
+            sm_handle->destroy();
+            // Unblock scheduler threads before returning so they don't spin forever.
+            runtime_init_ready_.store(true, std::memory_order_release);
+            return -1;
+        }
 
 #if PTO2_PROFILING
-            rt->orchestrator.l2_perf_level = get_l2_perf_level();
+        rt->orchestrator.l2_perf_level = get_l2_perf_level();
 #endif
 
-            // Total core counts = aic_count_ / aiv_count_ (set once at runtime init).
-            rt->orchestrator.total_cluster_count = sched_ctx_.aic_count();
-            rt->orchestrator.total_aiv_count = sched_ctx_.aiv_count();
+        // Total core counts = aic_count_ / aiv_count_ (set once at runtime init).
+        rt->orchestrator.total_cluster_count = sched_ctx_.aic_count();
+        rt->orchestrator.total_aiv_count = sched_ctx_.aiv_count();
 
-            // With multi-ring, slot_states are per-ring inside the scheduler.
-            runtime->set_slot_states_ptr(nullptr);
+        // With multi-ring, slot_states are per-ring inside the scheduler.
+        runtime->set_slot_states_ptr(nullptr);
 
-            orch_args_cached_ = &args;
+        orch_args_cached_ = &args;
 
-            // Wire scheduler context to the newly created PTO2Runtime before
-            // releasing scheduler threads from runtime_init_ready_.
-            sched_ctx_.bind_runtime(rt);
+        // Wire scheduler context to the newly created PTO2Runtime before
+        // releasing scheduler threads from runtime_init_ready_.
+        sched_ctx_.bind_runtime(rt);
 
-            runtime_init_ready_.store(true, std::memory_order_release);
+        runtime_init_ready_.store(true, std::memory_order_release);
 
-            // Wait for scheduler's one-time init to complete
-            sched_ctx_.wait_pto2_init_complete();
+        // Wait for scheduler's one-time init to complete
+        sched_ctx_.wait_pto2_init_complete();
 
 #if PTO2_PROFILING
-            if (get_l2_perf_level() >= L2PerfLevel::ORCH_PHASES) {
-                l2_perf_aicpu_set_orch_thread_idx(my_thread_idx_);
-            }
+        if (get_l2_perf_level() >= L2PerfLevel::ORCH_PHASES) {
+            l2_perf_aicpu_set_orch_thread_idx(my_thread_idx_);
+        }
 #endif
 
-            // dep_gen plugs into the orchestrator thread (single-instance subsystem):
-            // set the per-thread queue index and pop the initial buffer before any
-            // submit_task can fire inside orch_func_.
-            if (is_dep_gen_enabled()) {
-                dep_gen_aicpu_set_orch_thread_idx(my_thread_idx_);
-                dep_gen_aicpu_init();
-            }
+        // dep_gen plugs into the orchestrator thread (single-instance subsystem):
+        // set the per-thread queue index and pop the initial buffer before any
+        // submit_task can fire inside orch_func_.
+        if (is_dep_gen_enabled()) {
+            dep_gen_aicpu_set_orch_thread_idx(my_thread_idx_);
+            dep_gen_aicpu_init();
+        }
 
 #if PTO2_PROFILING
-            orch_cycle_start = get_sys_cnt_aicpu();
+        orch_cycle_start = get_sys_cnt_aicpu();
 #endif
-            framework_bind_runtime(rt);
-            if (*p_bind != nullptr) {
-                (*p_bind)(rt);
-            }
-            rt_scope_begin(rt);
-            (*p_func)(*orch_args_cached_);
-            rt_scope_end(rt);
-
-            // Flush the (potentially partially-filled) DepGenBuffer so the host
-            // collector can pick it up before this orchestrator thread joins.
-            if (is_dep_gen_enabled()) {
-                dep_gen_aicpu_flush();
-            }
+        framework_bind_runtime(rt);
+        if (*p_bind_ != nullptr) {
+            (*p_bind_)(rt);
+        }
+        rt_scope_begin(rt);
+        (*p_func_)(*orch_args_cached_);
+        rt_scope_end(rt);
+
+        // Flush the (potentially partially-filled) DepGenBuffer so the host
+        // collector can pick it up before this orchestrator thread joins.
+        if (is_dep_gen_enabled()) {
+            dep_gen_aicpu_flush();
+        }
 #if PTO2_PROFILING
-            uint64_t orch_cycle_end = get_sys_cnt_aicpu();
-            (void)orch_cycle_end;
+        uint64_t orch_cycle_end = get_sys_cnt_aicpu();
+        (void)orch_cycle_end;
 #endif
 
-            // Print orchestrator profiling data
+        // Print orchestrator profiling data
 #if PTO2_ORCH_PROFILING
-            PTO2OrchProfilingData p = orchestrator_get_profiling();
-            uint64_t total =
-                p.sync_cycle + p.alloc_cycle + p.args_cycle + p.lookup_cycle + p.insert_cycle + p.fanin_cycle;
-            if (total == 0) total = 1;  // avoid div-by-zero
-            LOG_INFO_V9(
-                "Thread %d: === Orchestrator Profiling: %" PRId64 " tasks, total=%.3fus ===", my_thread_idx_,
-                static_cast<int64_t>(p.submit_count), cycles_to_us(total)
-            );
-            LOG_INFO_V9(
-                "Thread %d:   task+heap_alloc: %.3fus (%.1f%%)  work=%.3fus wait=%.3fus  atomics=%" PRIu64 "",
-                my_thread_idx_, cycles_to_us(p.alloc_cycle), p.alloc_cycle * 100.0 / total,
-                cycles_to_us(p.alloc_cycle - p.alloc_wait_cycle), cycles_to_us(p.alloc_wait_cycle),
-                static_cast<uint64_t>(p.alloc_atomic_count)
-            );
-            LOG_INFO_V9(
-                "Thread %d:   sync_tensormap : %.3fus (%.1f%%)", my_thread_idx_, cycles_to_us(p.sync_cycle),
-                p.sync_cycle * 100.0 / total
-            );
-            LOG_INFO_V9(
-                "Thread %d:   lookup+dep     : %.3fus (%.1f%%)", my_thread_idx_, cycles_to_us(p.lookup_cycle),
-                p.lookup_cycle * 100.0 / total
-            );
-            LOG_INFO_V9(
-                "Thread %d:   tensormap_ins  : %.3fus (%.1f%%)", my_thread_idx_, cycles_to_us(p.insert_cycle),
-                p.insert_cycle * 100.0 / total
-            );
-            LOG_INFO_V9(
-                "Thread %d:   param_copy     : %.3fus (%.1f%%)  atomics=%" PRIu64 "", my_thread_idx_,
-                cycles_to_us(p.args_cycle), p.args_cycle * 100.0 / total, static_cast<uint64_t>(p.args_atomic_count)
-            );
-            LOG_INFO_V9(
-                "Thread %d:   fanin+ready    : %.3fus (%.1f%%)  work=%.3fus wait=%.3fus", my_thread_idx_,
-                cycles_to_us(p.fanin_cycle), p.fanin_cycle * 100.0 / total,
-                cycles_to_us(p.fanin_cycle - p.fanin_wait_cycle), cycles_to_us(p.fanin_wait_cycle)
-            );
-            LOG_INFO_V9(
-                "Thread %d:   avg/task       : %.3fus", my_thread_idx_,
-                p.submit_count > 0 ? cycles_to_us(total) / p.submit_count : 0.0
-            );
+        PTO2OrchProfilingData p = orchestrator_get_profiling();
+        uint64_t total =
+            p.sync_cycle + p.alloc_cycle + p.args_cycle + p.lookup_cycle + p.insert_cycle + p.fanin_cycle;
+        if (total == 0) total = 1;  // avoid div-by-zero
+        LOG_INFO_V9(
+            "Thread %d: === Orchestrator Profiling: %" PRId64 " tasks, total=%.3fus ===", my_thread_idx_,
+            static_cast<int64_t>(p.submit_count), cycles_to_us(total)
+        );
+        LOG_INFO_V9(
+            "Thread %d:   task+heap_alloc: %.3fus (%.1f%%)  work=%.3fus wait=%.3fus  atomics=%" PRIu64 "",
+            my_thread_idx_, cycles_to_us(p.alloc_cycle), p.alloc_cycle * 100.0 / total,
+            cycles_to_us(p.alloc_cycle - p.alloc_wait_cycle), cycles_to_us(p.alloc_wait_cycle),
+            static_cast<uint64_t>(p.alloc_atomic_count)
+        );
+        LOG_INFO_V9(
+            "Thread %d:   sync_tensormap : %.3fus (%.1f%%)", my_thread_idx_, cycles_to_us(p.sync_cycle),
+            p.sync_cycle * 100.0 / total
+        );
+        LOG_INFO_V9(
+            "Thread %d:   lookup+dep     : %.3fus (%.1f%%)", my_thread_idx_, cycles_to_us(p.lookup_cycle),
+            p.lookup_cycle * 100.0 / total
+        );
+        LOG_INFO_V9(
+            "Thread %d:   tensormap_ins  : %.3fus (%.1f%%)", my_thread_idx_, cycles_to_us(p.insert_cycle),
+            p.insert_cycle * 100.0 / total
+        );
+        LOG_INFO_V9(
+            "Thread %d:   param_copy     : %.3fus (%.1f%%)  atomics=%" PRIu64 "", my_thread_idx_,
+            cycles_to_us(p.args_cycle), p.args_cycle * 100.0 / total, static_cast<uint64_t>(p.args_atomic_count)
+        );
+        LOG_INFO_V9(
+            "Thread %d:   fanin+ready    : %.3fus (%.1f%%)  work=%.3fus wait=%.3fus", my_thread_idx_,
+            cycles_to_us(p.fanin_cycle), p.fanin_cycle * 100.0 / total,
+            cycles_to_us(p.fanin_cycle - p.fanin_wait_cycle), cycles_to_us(p.fanin_wait_cycle)
+        );
+        LOG_INFO_V9(
+            "Thread %d:   avg/task       : %.3fus", my_thread_idx_,
+            p.submit_count > 0 ? cycles_to_us(total) / p.submit_count : 0.0
+        );
 
 #if PTO2_TENSORMAP_PROFILING
-            PTO2TensorMapProfilingData tp = pto2_tensormap_get_profiling();
-            LOG_INFO_V9("Thread %d: === TensorMap Lookup Stats ===", my_thread_idx_);
-            LOG_INFO_V9(
-                "Thread %d:   lookups        : %" PRIu64 ", inserts: %" PRIu64 "", my_thread_idx_,
-                static_cast<uint64_t>(tp.lookup_count), static_cast<uint64_t>(tp.insert_count)
-            );
-            LOG_INFO_V9(
-                "Thread %d:   chain walked   : total=%" PRIu64 ", avg=%.1f, max=%d", my_thread_idx_,
-                static_cast<uint64_t>(tp.lookup_chain_total),
-                tp.lookup_count > 0 ? static_cast<double>(tp.lookup_chain_total) / tp.lookup_count : 0.0,
-                tp.lookup_chain_max
-            );
-            LOG_INFO_V9(
-                "Thread %d:   overlap checks : %" PRIu64 ", hits=%" PRIu64 " (%.1f%%)", my_thread_idx_,
-                static_cast<uint64_t>(tp.overlap_checks), static_cast<uint64_t>(tp.overlap_hits),
-                tp.overlap_checks > 0 ? tp.overlap_hits * 100.0 / tp.overlap_checks : 0.0
-            );
+        PTO2TensorMapProfilingData tp = pto2_tensormap_get_profiling();
+        LOG_INFO_V9("Thread %d: === TensorMap Lookup Stats ===", my_thread_idx_);
+        LOG_INFO_V9(
+            "Thread %d:   lookups        : %" PRIu64 ", inserts: %" PRIu64 "", my_thread_idx_,
+            static_cast<uint64_t>(tp.lookup_count), static_cast<uint64_t>(tp.insert_count)
+        );
+        LOG_INFO_V9(
+            "Thread %d:   chain walked   : total=%" PRIu64 ", avg=%.1f, max=%d", my_thread_idx_,
+            static_cast<uint64_t>(tp.lookup_chain_total),
+            tp.lookup_count > 0 ? static_cast<double>(tp.lookup_chain_total) / tp.lookup_count : 0.0,
+            tp.lookup_chain_max
+        );
+        LOG_INFO_V9(
+            "Thread %d:   overlap checks : %" PRIu64 ", hits=%" PRIu64 " (%.1f%%)", my_thread_idx_,
+            static_cast<uint64_t>(tp.overlap_checks), static_cast<uint64_t>(tp.overlap_hits),
+            tp.overlap_checks > 0 ? tp.overlap_hits * 100.0 / tp.overlap_checks : 0.0
+        );
 #endif
 #endif  // PTO2_ORCH_PROFILING
 
-            // Latch task count from PTO2 shared memory to hand off to the
-            // scheduler. The orchestrator's run window (start_time / end_time /
-            // submit_count) is no longer published to shared memory — the
-            // device LOG_INFO_V9 "orch_start=… orch_end=… orch_cost=…" line
-            // below carries the same envelope info for debugging, and
-            // host-side swimlane derives per-phase timing from the per-event
-            // AicpuPhaseRecord[] stream that already covers everything inside
-            // submit_task().
-            int32_t total_tasks = 0;
-            if (rt->orchestrator.sm_header) {
-                for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-                    total_tasks +=
-                        rt->orchestrator.sm_header->rings[r].fc.current_task_index.load(std::memory_order_acquire);
-                }
+        // Latch task count from PTO2 shared memory to hand off to the
+        // scheduler. The orchestrator's run window (start_time / end_time /
+        // submit_count) is no longer published to shared memory — the
+        // device LOG_INFO_V9 "orch_start=… orch_end=… orch_cost=…" line
+        // below carries the same envelope info for debugging, and
+        // host-side swimlane derives per-phase timing from the per-event
+        // AicpuPhaseRecord[] stream that already covers everything inside
+        // submit_task().
+        int32_t total_tasks = 0;
+        if (rt->orchestrator.sm_header) {
+            for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+                total_tasks +=
+                    rt->orchestrator.sm_header->rings[r].fc.current_task_index.load(std::memory_order_acquire);
             }
+        }
 
 #if PTO2_PROFILING
-            pto2_submitted_tasks = total_tasks;
+        pto2_submitted_tasks = total_tasks;
 #endif
 
-            // Signal completion to the orchestrator state machine
-            rt_orchestration_done(rt);
+        // Signal completion to the orchestrator state machine
+        rt_orchestration_done(rt);
 
-            sched_ctx_.on_orchestration_done(runtime, rt, my_thread_idx_, total_tasks);
-        }
+        sched_ctx_.on_orchestration_done(runtime, rt, my_thread_idx_, total_tasks);
 #if PTO2_PROFILING
         uint64_t orch_end_ts = get_sys_cnt_aicpu();
         LOG_INFO_V9(
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
index 4e6f25435..0bc984871 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
@@ -96,7 +96,6 @@ void Runtime::set_active_callable_id(int32_t callable_id, bool is_new) {
 int32_t Runtime::get_active_callable_id() const { return active_callable_id_; }
 
 bool Runtime::register_new_callable_id() const { return register_new_callable_id_; }
-void Runtime::notify_callable_id_registered() { register_new_callable_id_ = false; }
 
 void Runtime::set_device_orch_func_name(const char *name) {
     if (name == nullptr) {

From 570660f83e392ba2f2aec72f1c7555211b6340fb Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Thu, 21 May 2026 11:31:36 +0200
Subject: [PATCH 11/15] Separating orchestration from scheduling activities

---
 .../aicpu/aicpu_executor.cpp                  | 456 +++++++++---------
 1 file changed, 234 insertions(+), 222 deletions(-)

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
index f86e7f5eb..9b8b4110c 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -151,7 +151,8 @@ struct AicpuExecutor {
 
     // ===== Methods =====
     int32_t init(Runtime *runtime);
-    int32_t run(Runtime *runtime);
+    int32_t runScheduling(Runtime *runtime);
+    int32_t runOrchestration(Runtime* runtime);
     int32_t shutdown(Runtime *runtime);
     int32_t loadOrchestrator(Runtime* runtime);
     int32_t performTimingRuns(Runtime *runtime);
@@ -450,264 +451,269 @@ int32_t AicpuExecutor::loadOrchestrator(Runtime* runtime)
 /**
  * Shutdown AICore - Send exit signal via registers to all AICore kernels
  */
-int32_t AicpuExecutor::run(Runtime *runtime) {
+int32_t AicpuExecutor::runScheduling(Runtime *runtime) {
     int32_t run_rc = 0;
-    LOG_INFO_V0("Thread %d: at AicpuExecutor::Run", my_thread_idx_);
+    LOG_INFO_V0("Thread %d: at AicpuExecutor::runScheduling", my_thread_idx_);
 
-    // Orchestrator check
-    if (my_thread_idx_ >= sched_thread_num_) {
-#if PTO2_PROFILING
-        uint64_t orch_cycle_start = 0;
-        int32_t pto2_submitted_tasks = -1;
-#endif
-
-        // Orchestrator thread: load + run the device orchestration SO. 
-        LOG_INFO_V0("Thread %d: Orchestrator Running", my_thread_idx_);
-
-        // sm_handle / rt are bound to *this* run's memory and must be
-        // (re)created every run, regardless of whether the SO itself was
-        // reused above.
-        const ChipStorageTaskArgs &args = runtime->get_orch_args();
-        int32_t arg_count = args.tensor_count() + args.scalar_count();
-        LOG_INFO_V0("Thread %d: sm_ptr=%p, arg_count=%d", my_thread_idx_, runtime->get_gm_sm_ptr(), arg_count);
-        for (int32_t i = 0; i < args.tensor_count() && i < 20; i++) {
-            const ContinuousTensor &t = args.tensor(i);
-            LOG_INFO_V0(
-                "Thread %d: orch_args[%d] = TENSOR(data=0x%lx, ndims=%u, dtype=%u)", my_thread_idx_, i,
-                static_cast<uint64_t>(t.data), t.ndims, static_cast<unsigned>(t.dtype)
-            );
+    // Scheduler thread (orchestrator threads skip dispatch when orch_to_sched_ is false)
+    if (!sched_ctx_.is_completed() && (my_thread_idx_ < sched_thread_num_ || orch_to_sched_)) {
+        // Device orchestration: wait for the primary orchestrator to initialize the SM header
+        while (!runtime_init_ready_.load(std::memory_order_acquire)) {
+            SPIN_WAIT_HINT();
         }
-        for (int32_t i = 0; i < args.scalar_count() && (args.tensor_count() + i) < 20; i++) {
-            LOG_INFO_V0(
-                "Thread %d: orch_args[%d] = SCALAR(0x%lx)", my_thread_idx_, args.tensor_count() + i,
-                static_cast<uint64_t>(args.scalar(i))
-            );
+        if (rt == nullptr) {
+            LOG_ERROR("Thread %d: rt is null after orchestrator error, skipping dispatch", my_thread_idx_);
+        } else {
+            sched_ctx_.bind_runtime(rt);
+            int32_t completed = sched_ctx_.resolve_and_dispatch(runtime, my_thread_idx_);
+            if (completed < 0) {
+                LOG_ERROR("Thread %d: Scheduler failed with rc=%d", my_thread_idx_, completed);
+                run_rc = completed;
+            } else {
+                LOG_INFO_V0("Thread %d: Executed %d tasks from runtime", my_thread_idx_, completed);
+            }
         }
+    }
 
-        uint64_t task_window_size = PTO2_TASK_WINDOW_SIZE;
-        uint64_t heap_size = PTO2_HEAP_SIZE;
+    LOG_INFO_V0("Thread %d: Scheduling Completed", my_thread_idx_);
 
-        if (runtime->task_window_size > 0) {
-            task_window_size = runtime->task_window_size;
-        }
-        if (runtime->heap_size > 0) {
-            heap_size = runtime->heap_size;
-        }
-        int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE;
-        if (runtime->dep_pool_size > 0) {
-            dep_pool_capacity = static_cast<int32_t>(runtime->dep_pool_size);
-        }
+    return run_rc;
+}
+
+int32_t AicpuExecutor::runOrchestration(Runtime* runtime)
+{
+    // Only the orchestrator thread runs this
+    if (my_thread_idx_ < sched_thread_num_) return 0;
+
+#if PTO2_PROFILING
+    uint64_t orch_cycle_start = 0;
+    int32_t pto2_submitted_tasks = -1;
+#endif
+
+    // Orchestrator thread: load + run the device orchestration SO. 
+    LOG_INFO_V0("Thread %d: Orchestrator Running", my_thread_idx_);
+
+    // sm_handle / rt are bound to *this* run's memory and must be
+    // (re)created every run, regardless of whether the SO itself was
+    // reused above.
+    const ChipStorageTaskArgs &args = runtime->get_orch_args();
+    int32_t arg_count = args.tensor_count() + args.scalar_count();
+    LOG_INFO_V0("Thread %d: sm_ptr=%p, arg_count=%d", my_thread_idx_, runtime->get_gm_sm_ptr(), arg_count);
+    for (int32_t i = 0; i < args.tensor_count() && i < 20; i++) {
+        const ContinuousTensor &t = args.tensor(i);
         LOG_INFO_V0(
-            "Thread %d: Ring sizes: task_window=%lu, heap=%lu, dep_pool=%d", my_thread_idx_,
-            static_cast<uint64_t>(task_window_size), static_cast<uint64_t>(heap_size), dep_pool_capacity
+            "Thread %d: orch_args[%d] = TENSOR(data=0x%lx, ndims=%u, dtype=%u)", my_thread_idx_, i,
+            static_cast<uint64_t>(t.data), t.ndims, static_cast<unsigned>(t.dtype)
         );
+    }
+    for (int32_t i = 0; i < args.scalar_count() && (args.tensor_count() + i) < 20; i++) {
+        LOG_INFO_V0(
+            "Thread %d: orch_args[%d] = SCALAR(0x%lx)", my_thread_idx_, args.tensor_count() + i,
+            static_cast<uint64_t>(args.scalar(i))
+        );
+    }
 
-        void *sm_ptr = runtime->get_gm_sm_ptr();
-        void *gm_heap = runtime->get_gm_heap_ptr();
+    uint64_t task_window_size = PTO2_TASK_WINDOW_SIZE;
+    uint64_t heap_size = PTO2_HEAP_SIZE;
 
-        uint64_t sm_size = PTO2SharedMemoryHandle::calculate_size(task_window_size);
-        PTO2SharedMemoryHandle *sm_handle =
-            PTO2SharedMemoryHandle::create_from_buffer(sm_ptr, sm_size, task_window_size, heap_size);
-        if (!sm_handle) {
-            LOG_ERROR("Thread %d: Failed to create shared memory handle", my_thread_idx_);
-            // Unblock scheduler threads before returning so they don't spin forever.
-            runtime_init_ready_.store(true, std::memory_order_release);
-            return -1;
-        }
+    if (runtime->task_window_size > 0) {
+        task_window_size = runtime->task_window_size;
+    }
+    if (runtime->heap_size > 0) {
+        heap_size = runtime->heap_size;
+    }
+    int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE;
+    if (runtime->dep_pool_size > 0) {
+        dep_pool_capacity = static_cast<int32_t>(runtime->dep_pool_size);
+    }
+    LOG_INFO_V0(
+        "Thread %d: Ring sizes: task_window=%lu, heap=%lu, dep_pool=%d", my_thread_idx_,
+        static_cast<uint64_t>(task_window_size), static_cast<uint64_t>(heap_size), dep_pool_capacity
+    );
+
+    void *sm_ptr = runtime->get_gm_sm_ptr();
+    void *gm_heap = runtime->get_gm_heap_ptr();
+
+    uint64_t sm_size = PTO2SharedMemoryHandle::calculate_size(task_window_size);
+    PTO2SharedMemoryHandle *sm_handle =
+        PTO2SharedMemoryHandle::create_from_buffer(sm_ptr, sm_size, task_window_size, heap_size);
+    if (!sm_handle) {
+        LOG_ERROR("Thread %d: Failed to create shared memory handle", my_thread_idx_);
+        // Unblock scheduler threads before returning so they don't spin forever.
+        runtime_init_ready_.store(true, std::memory_order_release);
+        return -1;
+    }
 
-        rt = runtime_create_from_sm(PTO2_MODE_EXECUTE, sm_handle, gm_heap, heap_size, dep_pool_capacity);
-        if (!rt) {
-            LOG_ERROR("Thread %d: Failed to create PTO2Runtime", my_thread_idx_);
-            sm_handle->destroy();
-            // Unblock scheduler threads before returning so they don't spin forever.
-            runtime_init_ready_.store(true, std::memory_order_release);
-            return -1;
-        }
+    rt = runtime_create_from_sm(PTO2_MODE_EXECUTE, sm_handle, gm_heap, heap_size, dep_pool_capacity);
+    if (!rt) {
+        LOG_ERROR("Thread %d: Failed to create PTO2Runtime", my_thread_idx_);
+        sm_handle->destroy();
+        // Unblock scheduler threads before returning so they don't spin forever.
+        runtime_init_ready_.store(true, std::memory_order_release);
+        return -1;
+    }
 
 #if PTO2_PROFILING
-        rt->orchestrator.l2_perf_level = get_l2_perf_level();
+    rt->orchestrator.l2_perf_level = get_l2_perf_level();
 #endif
 
-        // Total core counts = aic_count_ / aiv_count_ (set once at runtime init).
-        rt->orchestrator.total_cluster_count = sched_ctx_.aic_count();
-        rt->orchestrator.total_aiv_count = sched_ctx_.aiv_count();
+    // Total core counts = aic_count_ / aiv_count_ (set once at runtime init).
+    rt->orchestrator.total_cluster_count = sched_ctx_.aic_count();
+    rt->orchestrator.total_aiv_count = sched_ctx_.aiv_count();
 
-        // With multi-ring, slot_states are per-ring inside the scheduler.
-        runtime->set_slot_states_ptr(nullptr);
+    // With multi-ring, slot_states are per-ring inside the scheduler.
+    runtime->set_slot_states_ptr(nullptr);
 
-        orch_args_cached_ = &args;
+    orch_args_cached_ = &args;
 
-        // Wire scheduler context to the newly created PTO2Runtime before
-        // releasing scheduler threads from runtime_init_ready_.
-        sched_ctx_.bind_runtime(rt);
+    // Wire scheduler context to the newly created PTO2Runtime before
+    // releasing scheduler threads from runtime_init_ready_.
+    sched_ctx_.bind_runtime(rt);
 
-        runtime_init_ready_.store(true, std::memory_order_release);
+    runtime_init_ready_.store(true, std::memory_order_release);
 
-        // Wait for scheduler's one-time init to complete
-        sched_ctx_.wait_pto2_init_complete();
+    // Wait for scheduler's one-time init to complete
+    sched_ctx_.wait_pto2_init_complete();
 
 #if PTO2_PROFILING
-        if (get_l2_perf_level() >= L2PerfLevel::ORCH_PHASES) {
-            l2_perf_aicpu_set_orch_thread_idx(my_thread_idx_);
-        }
+    if (get_l2_perf_level() >= L2PerfLevel::ORCH_PHASES) {
+        l2_perf_aicpu_set_orch_thread_idx(my_thread_idx_);
+    }
 #endif
 
-        // dep_gen plugs into the orchestrator thread (single-instance subsystem):
-        // set the per-thread queue index and pop the initial buffer before any
-        // submit_task can fire inside orch_func_.
-        if (is_dep_gen_enabled()) {
-            dep_gen_aicpu_set_orch_thread_idx(my_thread_idx_);
-            dep_gen_aicpu_init();
-        }
+    // dep_gen plugs into the orchestrator thread (single-instance subsystem):
+    // set the per-thread queue index and pop the initial buffer before any
+    // submit_task can fire inside orch_func_.
+    if (is_dep_gen_enabled()) {
+        dep_gen_aicpu_set_orch_thread_idx(my_thread_idx_);
+        dep_gen_aicpu_init();
+    }
 
 #if PTO2_PROFILING
-        orch_cycle_start = get_sys_cnt_aicpu();
+    orch_cycle_start = get_sys_cnt_aicpu();
 #endif
-        framework_bind_runtime(rt);
-        if (*p_bind_ != nullptr) {
-            (*p_bind_)(rt);
-        }
-        rt_scope_begin(rt);
-        (*p_func_)(*orch_args_cached_);
-        rt_scope_end(rt);
-
-        // Flush the (potentially partially-filled) DepGenBuffer so the host
-        // collector can pick it up before this orchestrator thread joins.
-        if (is_dep_gen_enabled()) {
-            dep_gen_aicpu_flush();
-        }
+    framework_bind_runtime(rt);
+    if (*p_bind_ != nullptr) {
+        (*p_bind_)(rt);
+    }
+    rt_scope_begin(rt);
+    (*p_func_)(*orch_args_cached_);
+    rt_scope_end(rt);
+
+    // Flush the (potentially partially-filled) DepGenBuffer so the host
+    // collector can pick it up before this orchestrator thread joins.
+    if (is_dep_gen_enabled()) {
+        dep_gen_aicpu_flush();
+    }
 #if PTO2_PROFILING
-        uint64_t orch_cycle_end = get_sys_cnt_aicpu();
-        (void)orch_cycle_end;
+    uint64_t orch_cycle_end = get_sys_cnt_aicpu();
+    (void)orch_cycle_end;
 #endif
 
-        // Print orchestrator profiling data
+    // Print orchestrator profiling data
 #if PTO2_ORCH_PROFILING
-        PTO2OrchProfilingData p = orchestrator_get_profiling();
-        uint64_t total =
-            p.sync_cycle + p.alloc_cycle + p.args_cycle + p.lookup_cycle + p.insert_cycle + p.fanin_cycle;
-        if (total == 0) total = 1;  // avoid div-by-zero
-        LOG_INFO_V9(
-            "Thread %d: === Orchestrator Profiling: %" PRId64 " tasks, total=%.3fus ===", my_thread_idx_,
-            static_cast<int64_t>(p.submit_count), cycles_to_us(total)
-        );
-        LOG_INFO_V9(
-            "Thread %d:   task+heap_alloc: %.3fus (%.1f%%)  work=%.3fus wait=%.3fus  atomics=%" PRIu64 "",
-            my_thread_idx_, cycles_to_us(p.alloc_cycle), p.alloc_cycle * 100.0 / total,
-            cycles_to_us(p.alloc_cycle - p.alloc_wait_cycle), cycles_to_us(p.alloc_wait_cycle),
-            static_cast<uint64_t>(p.alloc_atomic_count)
-        );
-        LOG_INFO_V9(
-            "Thread %d:   sync_tensormap : %.3fus (%.1f%%)", my_thread_idx_, cycles_to_us(p.sync_cycle),
-            p.sync_cycle * 100.0 / total
-        );
-        LOG_INFO_V9(
-            "Thread %d:   lookup+dep     : %.3fus (%.1f%%)", my_thread_idx_, cycles_to_us(p.lookup_cycle),
-            p.lookup_cycle * 100.0 / total
-        );
-        LOG_INFO_V9(
-            "Thread %d:   tensormap_ins  : %.3fus (%.1f%%)", my_thread_idx_, cycles_to_us(p.insert_cycle),
-            p.insert_cycle * 100.0 / total
-        );
-        LOG_INFO_V9(
-            "Thread %d:   param_copy     : %.3fus (%.1f%%)  atomics=%" PRIu64 "", my_thread_idx_,
-            cycles_to_us(p.args_cycle), p.args_cycle * 100.0 / total, static_cast<uint64_t>(p.args_atomic_count)
-        );
-        LOG_INFO_V9(
-            "Thread %d:   fanin+ready    : %.3fus (%.1f%%)  work=%.3fus wait=%.3fus", my_thread_idx_,
-            cycles_to_us(p.fanin_cycle), p.fanin_cycle * 100.0 / total,
-            cycles_to_us(p.fanin_cycle - p.fanin_wait_cycle), cycles_to_us(p.fanin_wait_cycle)
-        );
-        LOG_INFO_V9(
-            "Thread %d:   avg/task       : %.3fus", my_thread_idx_,
-            p.submit_count > 0 ? cycles_to_us(total) / p.submit_count : 0.0
-        );
+    PTO2OrchProfilingData p = orchestrator_get_profiling();
+    uint64_t total =
+        p.sync_cycle + p.alloc_cycle + p.args_cycle + p.lookup_cycle + p.insert_cycle + p.fanin_cycle;
+    if (total == 0) total = 1;  // avoid div-by-zero
+    LOG_INFO_V9(
+        "Thread %d: === Orchestrator Profiling: %" PRId64 " tasks, total=%.3fus ===", my_thread_idx_,
+        static_cast<int64_t>(p.submit_count), cycles_to_us(total)
+    );
+    LOG_INFO_V9(
+        "Thread %d:   task+heap_alloc: %.3fus (%.1f%%)  work=%.3fus wait=%.3fus  atomics=%" PRIu64 "",
+        my_thread_idx_, cycles_to_us(p.alloc_cycle), p.alloc_cycle * 100.0 / total,
+        cycles_to_us(p.alloc_cycle - p.alloc_wait_cycle), cycles_to_us(p.alloc_wait_cycle),
+        static_cast<uint64_t>(p.alloc_atomic_count)
+    );
+    LOG_INFO_V9(
+        "Thread %d:   sync_tensormap : %.3fus (%.1f%%)", my_thread_idx_, cycles_to_us(p.sync_cycle),
+        p.sync_cycle * 100.0 / total
+    );
+    LOG_INFO_V9(
+        "Thread %d:   lookup+dep     : %.3fus (%.1f%%)", my_thread_idx_, cycles_to_us(p.lookup_cycle),
+        p.lookup_cycle * 100.0 / total
+    );
+    LOG_INFO_V9(
+        "Thread %d:   tensormap_ins  : %.3fus (%.1f%%)", my_thread_idx_, cycles_to_us(p.insert_cycle),
+        p.insert_cycle * 100.0 / total
+    );
+    LOG_INFO_V9(
+        "Thread %d:   param_copy     : %.3fus (%.1f%%)  atomics=%" PRIu64 "", my_thread_idx_,
+        cycles_to_us(p.args_cycle), p.args_cycle * 100.0 / total, static_cast<uint64_t>(p.args_atomic_count)
+    );
+    LOG_INFO_V9(
+        "Thread %d:   fanin+ready    : %.3fus (%.1f%%)  work=%.3fus wait=%.3fus", my_thread_idx_,
+        cycles_to_us(p.fanin_cycle), p.fanin_cycle * 100.0 / total,
+        cycles_to_us(p.fanin_cycle - p.fanin_wait_cycle), cycles_to_us(p.fanin_wait_cycle)
+    );
+    LOG_INFO_V9(
+        "Thread %d:   avg/task       : %.3fus", my_thread_idx_,
+        p.submit_count > 0 ? cycles_to_us(total) / p.submit_count : 0.0
+    );
 
 #if PTO2_TENSORMAP_PROFILING
-        PTO2TensorMapProfilingData tp = pto2_tensormap_get_profiling();
-        LOG_INFO_V9("Thread %d: === TensorMap Lookup Stats ===", my_thread_idx_);
-        LOG_INFO_V9(
-            "Thread %d:   lookups        : %" PRIu64 ", inserts: %" PRIu64 "", my_thread_idx_,
-            static_cast<uint64_t>(tp.lookup_count), static_cast<uint64_t>(tp.insert_count)
-        );
-        LOG_INFO_V9(
-            "Thread %d:   chain walked   : total=%" PRIu64 ", avg=%.1f, max=%d", my_thread_idx_,
-            static_cast<uint64_t>(tp.lookup_chain_total),
-            tp.lookup_count > 0 ? static_cast<double>(tp.lookup_chain_total) / tp.lookup_count : 0.0,
-            tp.lookup_chain_max
-        );
-        LOG_INFO_V9(
-            "Thread %d:   overlap checks : %" PRIu64 ", hits=%" PRIu64 " (%.1f%%)", my_thread_idx_,
-            static_cast<uint64_t>(tp.overlap_checks), static_cast<uint64_t>(tp.overlap_hits),
-            tp.overlap_checks > 0 ? tp.overlap_hits * 100.0 / tp.overlap_checks : 0.0
-        );
+    PTO2TensorMapProfilingData tp = pto2_tensormap_get_profiling();
+    LOG_INFO_V9("Thread %d: === TensorMap Lookup Stats ===", my_thread_idx_);
+    LOG_INFO_V9(
+        "Thread %d:   lookups        : %" PRIu64 ", inserts: %" PRIu64 "", my_thread_idx_,
+        static_cast<uint64_t>(tp.lookup_count), static_cast<uint64_t>(tp.insert_count)
+    );
+    LOG_INFO_V9(
+        "Thread %d:   chain walked   : total=%" PRIu64 ", avg=%.1f, max=%d", my_thread_idx_,
+        static_cast<uint64_t>(tp.lookup_chain_total),
+        tp.lookup_count > 0 ? static_cast<double>(tp.lookup_chain_total) / tp.lookup_count : 0.0,
+        tp.lookup_chain_max
+    );
+    LOG_INFO_V9(
+        "Thread %d:   overlap checks : %" PRIu64 ", hits=%" PRIu64 " (%.1f%%)", my_thread_idx_,
+        static_cast<uint64_t>(tp.overlap_checks), static_cast<uint64_t>(tp.overlap_hits),
+        tp.overlap_checks > 0 ? tp.overlap_hits * 100.0 / tp.overlap_checks : 0.0
+    );
 #endif
 #endif  // PTO2_ORCH_PROFILING
 
-        // Latch task count from PTO2 shared memory to hand off to the
-        // scheduler. The orchestrator's run window (start_time / end_time /
-        // submit_count) is no longer published to shared memory — the
-        // device LOG_INFO_V9 "orch_start=… orch_end=… orch_cost=…" line
-        // below carries the same envelope info for debugging, and
-        // host-side swimlane derives per-phase timing from the per-event
-        // AicpuPhaseRecord[] stream that already covers everything inside
-        // submit_task().
-        int32_t total_tasks = 0;
-        if (rt->orchestrator.sm_header) {
-            for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-                total_tasks +=
-                    rt->orchestrator.sm_header->rings[r].fc.current_task_index.load(std::memory_order_acquire);
-            }
+    // Latch task count from PTO2 shared memory to hand off to the
+    // scheduler. The orchestrator's run window (start_time / end_time /
+    // submit_count) is no longer published to shared memory — the
+    // device LOG_INFO_V9 "orch_start=… orch_end=… orch_cost=…" line
+    // below carries the same envelope info for debugging, and
+    // host-side swimlane derives per-phase timing from the per-event
+    // AicpuPhaseRecord[] stream that already covers everything inside
+    // submit_task().
+    int32_t total_tasks = 0;
+    if (rt->orchestrator.sm_header) {
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+            total_tasks +=
+                rt->orchestrator.sm_header->rings[r].fc.current_task_index.load(std::memory_order_acquire);
         }
+    }
 
 #if PTO2_PROFILING
-        pto2_submitted_tasks = total_tasks;
+    pto2_submitted_tasks = total_tasks;
 #endif
 
-        // Signal completion to the orchestrator state machine
-        rt_orchestration_done(rt);
+    // Signal completion to the orchestrator state machine
+    rt_orchestration_done(rt);
 
-        sched_ctx_.on_orchestration_done(runtime, rt, my_thread_idx_, total_tasks);
+    sched_ctx_.on_orchestration_done(runtime, rt, my_thread_idx_, total_tasks);
 #if PTO2_PROFILING
-        uint64_t orch_end_ts = get_sys_cnt_aicpu();
+    uint64_t orch_end_ts = get_sys_cnt_aicpu();
+    LOG_INFO_V9(
+        "Thread %d: orch_start=%" PRIu64 " orch_end=%" PRIu64 " orch_cost=%.3fus", my_thread_idx_,
+        static_cast<uint64_t>(orch_cycle_start), static_cast<uint64_t>(orch_end_ts),
+        cycles_to_us(orch_end_ts - orch_cycle_start)
+    );
+    if (pto2_submitted_tasks >= 0) {
         LOG_INFO_V9(
-            "Thread %d: orch_start=%" PRIu64 " orch_end=%" PRIu64 " orch_cost=%.3fus", my_thread_idx_,
-            static_cast<uint64_t>(orch_cycle_start), static_cast<uint64_t>(orch_end_ts),
-            cycles_to_us(orch_end_ts - orch_cycle_start)
+            "PTO2 total submitted tasks = %d, already executed %d tasks", pto2_submitted_tasks,
+            sched_ctx_.completed_tasks_count()
         );
-        if (pto2_submitted_tasks >= 0) {
-            LOG_INFO_V9(
-                "PTO2 total submitted tasks = %d, already executed %d tasks", pto2_submitted_tasks,
-                sched_ctx_.completed_tasks_count()
-            );
-        }
-#endif
-        LOG_INFO_V0("Thread %d: Orchestrator completed", my_thread_idx_);
-    }
-
-    // Scheduler thread (orchestrator threads skip dispatch when orch_to_sched_ is false)
-    if (!sched_ctx_.is_completed() && (my_thread_idx_ < sched_thread_num_ || orch_to_sched_)) {
-        // Device orchestration: wait for the primary orchestrator to initialize the SM header
-        while (!runtime_init_ready_.load(std::memory_order_acquire)) {
-            SPIN_WAIT_HINT();
-        }
-        if (rt == nullptr) {
-            LOG_ERROR("Thread %d: rt is null after orchestrator error, skipping dispatch", my_thread_idx_);
-        } else {
-            sched_ctx_.bind_runtime(rt);
-            int32_t completed = sched_ctx_.resolve_and_dispatch(runtime, my_thread_idx_);
-            if (completed < 0) {
-                LOG_ERROR("Thread %d: Scheduler failed with rc=%d", my_thread_idx_, completed);
-                run_rc = completed;
-            } else {
-                LOG_INFO_V0("Thread %d: Executed %d tasks from runtime", my_thread_idx_, completed);
-            }
-        }
     }
+#endif
+    LOG_INFO_V0("Thread %d: Orchestrator completed", my_thread_idx_);
 
-    LOG_INFO_V0("Thread %d: Scheduling Completed", my_thread_idx_);
-
-    return run_rc;
+    return 0;
 }
 
 void AicpuExecutor::deinit(Runtime *runtime) {
@@ -792,7 +798,8 @@ int32_t AicpuExecutor::performTimingRuns(Runtime *runtime)
     {
         barrier();
         uint64_t t0_ts = get_sys_cnt_aicpu();
-        rc |= run(runtime);
+        rc |= runOrchestration(runtime);
+        rc |= runScheduling(runtime);
         barrier();
         uint64_t t1_ts = get_sys_cnt_aicpu();
         LOG_INFO_V9("Thread %d: Warmup %d/%d Time: %luns", my_thread_idx_, i, warmupIterationCount, t1_ts - t0_ts);
@@ -810,7 +817,8 @@ int32_t AicpuExecutor::performTimingRuns(Runtime *runtime)
         // Waiting for threads to arrive for before-timing.
         barrier();
         uint64_t t0_ts = get_sys_cnt_aicpu();
-        rc |= run(runtime);
+        rc |= runOrchestration(runtime);
+        rc |= runScheduling(runtime);
         barrier();
         uint64_t t1_ts = get_sys_cnt_aicpu();
         LOG_INFO_V9("Thread %d: Timing %d/%d time: %luns", my_thread_idx_, i, timingIterationCount, t1_ts - t0_ts);
@@ -872,10 +880,18 @@ extern "C" int32_t aicpu_execute(Runtime *runtime) {
         }
     }
 
-    // Perform actual kernel run
-    int32_t rc = g_aicpu_executor.run(runtime);
-    if (rc != 0) {
-        LOG_ERROR("aicpu_execute: Thread execution failed with rc=%d", rc);
+    // Orchestration step (runOrchestration() returns 0 right away on scheduler threads). NOTE(review): the early returns below skip the status check, shutdown and deinit that follow — confirm this is intended.
+    int32_t orch_rc = g_aicpu_executor.runOrchestration(runtime);
+    if (orch_rc != 0) {
+        LOG_ERROR("aicpu_execute: Orchestration execution failed with rc=%d", orch_rc);
+        return orch_rc;
+    }
+
+    // Scheduling step: a non-zero rc is the negative code propagated from the scheduler dispatch loop
+    int32_t sched_rc = g_aicpu_executor.runScheduling(runtime);
+    if (sched_rc != 0) {
+        LOG_ERROR("aicpu_execute: Scheduling execution failed with rc=%d", sched_rc);
+        return sched_rc;
     }
 
     int32_t runtime_rc = read_pto2_runtime_status(runtime);
@@ -898,10 +914,6 @@ extern "C" int32_t aicpu_execute(Runtime *runtime) {
         g_aicpu_executor.deinit(runtime);
     }
 
-    if (rc != 0) {
-        return rc;
-    }
-
     LOG_INFO_V0("%s", "aicpu_execute: Kernel execution completed successfully");
     return 0;
 }

From 20140dd634ddf810ed1874dd8c778874782a4878 Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Thu, 21 May 2026 13:00:24 +0200
Subject: [PATCH 12/15] Adding timing statistics

---
 .../aicpu/aicpu_executor.cpp                  | 91 +++++++++++++++----
 1 file changed, 73 insertions(+), 18 deletions(-)

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
index 9b8b4110c..fc34e216e 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -18,6 +18,7 @@
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
+#include <math.h>
 #ifdef __linux__
 #include <sys/mman.h>
 #endif
@@ -475,7 +476,7 @@ int32_t AicpuExecutor::runScheduling(Runtime *runtime) {
         }
     }
 
-    LOG_INFO_V0("Thread %d: Scheduling Completed", my_thread_idx_);
+    LOG_INFO_V9("Thread %d: Scheduling Completed", my_thread_idx_);
 
     return run_rc;
 }
@@ -711,7 +712,7 @@ int32_t AicpuExecutor::runOrchestration(Runtime* runtime)
         );
     }
 #endif
-    LOG_INFO_V0("Thread %d: Orchestrator completed", my_thread_idx_);
+    LOG_INFO_V9("Thread %d: Orchestrator completed", my_thread_idx_);
 
     return 0;
 }
@@ -797,12 +798,12 @@ int32_t AicpuExecutor::performTimingRuns(Runtime *runtime)
     for (int32_t i = 0; i < warmupIterationCount; i++)
     {
         barrier();
-        uint64_t t0_ts = get_sys_cnt_aicpu();
+        uint64_t t0 = get_sys_cnt_aicpu();
         rc |= runOrchestration(runtime);
         rc |= runScheduling(runtime);
         barrier();
-        uint64_t t1_ts = get_sys_cnt_aicpu();
-        LOG_INFO_V9("Thread %d: Warmup %d/%d Time: %luns", my_thread_idx_, i, warmupIterationCount, t1_ts - t0_ts);
+        uint64_t tf = get_sys_cnt_aicpu();
+        LOG_INFO_V9("Thread %d: Warmup %d/%d Time: %luns", my_thread_idx_, i, warmupIterationCount, tf - t0);
 
         // Resetting execution back to start
         if (my_thread_idx_ == 0) {
@@ -812,28 +813,81 @@ int32_t AicpuExecutor::performTimingRuns(Runtime *runtime)
     } 
 
     // Second, perform timed runs (the ones that count)
+    std::vector<uint64_t> orchTimes;
+    std::vector<uint64_t> schedTimes;
+    std::vector<uint64_t> runTimes;
     for (int32_t i = 0; i < timingIterationCount; i++)
     {
         // Waiting for threads to arrive for before-timing.
         barrier();
-        uint64_t t0_ts = get_sys_cnt_aicpu();
+        uint64_t t0 = get_sys_cnt_aicpu();
         rc |= runOrchestration(runtime);
+        
+        uint64_t t1 = get_sys_cnt_aicpu();
         rc |= runScheduling(runtime);
         barrier();
-        uint64_t t1_ts = get_sys_cnt_aicpu();
-        LOG_INFO_V9("Thread %d: Timing %d/%d time: %luns", my_thread_idx_, i, timingIterationCount, t1_ts - t0_ts);
+        uint64_t tf = get_sys_cnt_aicpu();
 
-        // Resetting execution back to start
+        // Calculating time segments and adding them to the timing vector
+        const uint64_t orchTime  = t1 - t0;
+        const uint64_t schedTime = tf - t1;
+        const uint64_t runTime   = tf - t0;
+        orchTimes.push_back(orchTime);
+        schedTimes.push_back(schedTime);
+        runTimes.push_back(runTime);
+
+        LOG_INFO_V9("Thread %d: Timing %d/%d Total Time: %luns (Orch: %luns + Sched: %luns)", my_thread_idx_, i, timingIterationCount, runTime, orchTime, schedTime);
+
+        // Resetting execution back to start before the next run
         if (my_thread_idx_ == 0) {
             deinit(runtime);
             init(runtime);
         }
     }   
 
-    if (rc != 0) LOG_ERROR("Thread %d - Timed runs failed with rc=%d", my_thread_idx_, rc);
+    if (rc != 0)
+    {
+        LOG_ERROR("Thread %d - Timed runs failed with rc=%d", my_thread_idx_, rc);
+        return rc;
+    } 
+
+    // The orchestrator thread (index == sched_thread_num_) aggregates and reports its own samples; skipped when no timed run was recorded (avoids 0/0 averages)
+    if (my_thread_idx_ == sched_thread_num_ && !runTimes.empty())
+    {
+        uint64_t orchSum = 0; 
+        uint64_t schedSum = 0; 
+        uint64_t runSum = 0; 
+
+        for (const auto t : orchTimes) orchSum += t;
+        for (const auto t : schedTimes) schedSum += t;
+        for (const auto t : runTimes) runSum += t;
+
+        // Means of the raw get_sys_cnt_aicpu() deltas. NOTE(review): the log lines below label these "ns", yet other deltas of this counter in this file go through cycles_to_us() — confirm the unit.
+        const auto runCount = runTimes.size();
+        double orchAvg  = (double)orchSum  / (double)runCount;
+        double schedAvg = (double)schedSum / (double)runCount;
+        double runAvg   = (double)runSum   / (double)runCount;
+
+        // Sums of squared deviations from the mean (inputs to the sample standard deviation computed below)
+        double orchDiff  = 0; 
+        double schedDiff = 0; 
+        double runDiff   = 0; 
 
-    // Return code
-    return rc;
+        for (const auto t : orchTimes)  orchDiff  += ((double)t - orchAvg)  * ((double)t - orchAvg) ;
+        for (const auto t : schedTimes) schedDiff += ((double)t - schedAvg) * ((double)t - schedAvg);
+        for (const auto t : runTimes)   runDiff   += ((double)t - runAvg)   * ((double)t - runAvg)  ;
+
+        double orchStdDev   = runCount == 1 ? 0.0 : std::sqrt(orchDiff   / (double)(runCount-1)); 
+        double schedStdDev  = runCount == 1 ? 0.0 : std::sqrt(schedDiff  / (double)(runCount-1));
+        double runStdDev    = runCount == 1 ? 0.0 : std::sqrt(runDiff    / (double)(runCount-1));
+
+        LOG_INFO_V9("Thread %d: [Timing] Average Runtime over %d timed runs: ", my_thread_idx_, timingIterationCount);
+        LOG_INFO_V9("Thread %d: [Timing]   + Orchestration: %10.0fns +- %6.0fns", my_thread_idx_, orchAvg,  orchStdDev);
+        LOG_INFO_V9("Thread %d: [Timing]   + Scheduling:    %10.0fns +- %6.0fns", my_thread_idx_, schedAvg, schedStdDev);
+        LOG_INFO_V9("Thread %d: [Timing]   + Run Total:     %10.0fns +- %6.0fns", my_thread_idx_, runAvg,   runStdDev);
+    }
+
+    return 0;
 }
 
 // ===== Public Entry Point =====
@@ -894,13 +948,19 @@ extern "C" int32_t aicpu_execute(Runtime *runtime) {
         return sched_rc;
     }
 
+    // Reading PTO2 runtime status
     int32_t runtime_rc = read_pto2_runtime_status(runtime);
-    
     if (runtime_rc != 0) {
         LOG_ERROR("aicpu_execute: PTO2 runtime failed with rc=%d", runtime_rc);
         return runtime_rc;
     }
 
+    // Last thread cleans up
+    if (g_aicpu_executor.finished_.load(std::memory_order_acquire)) {
+        LOG_INFO_V0("aicpu_execute: Last thread finished, cleaning up");
+        g_aicpu_executor.deinit(runtime);
+    }
+
     // Shutting down
     int32_t shutdown_rc = g_aicpu_executor.shutdown(runtime);
     if (shutdown_rc != 0) {
@@ -908,11 +968,6 @@ extern "C" int32_t aicpu_execute(Runtime *runtime) {
         return shutdown_rc;
     }
 
-    // Last thread cleans up
-    if (g_aicpu_executor.finished_.load(std::memory_order_acquire)) {
-        LOG_INFO_V0("aicpu_execute: Last thread finished, cleaning up");
-        g_aicpu_executor.deinit(runtime);
-    }
 
     LOG_INFO_V0("%s", "aicpu_execute: Kernel execution completed successfully");
     return 0;

From 454f8755d98333f47c53958d1d36e4167f59c98b Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Thu, 21 May 2026 13:50:02 +0200
Subject: [PATCH 13/15] Moving initialization routine outside main loop

---
 .../aicpu/aicpu_executor.cpp                  |  4 +-
 .../runtime/scheduler/scheduler_context.h     |  2 +
 .../runtime/scheduler/scheduler_dispatch.cpp  | 56 ++++++++++---------
 3 files changed, 35 insertions(+), 27 deletions(-)

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
index fc34e216e..59a73e465 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -223,6 +223,8 @@ int32_t AicpuExecutor::init(Runtime *runtime) {
 
     finished_count_.store(0, std::memory_order_release);
 
+    sched_ctx_.initializePerfCounters();
+
     // Loading orchestrator
     int32_t load_orch_rc = loadOrchestrator(runtime);
     if (load_orch_rc != 0)
@@ -825,7 +827,7 @@ int32_t AicpuExecutor::performTimingRuns(Runtime *runtime)
         
         uint64_t t1 = get_sys_cnt_aicpu();
         rc |= runScheduling(runtime);
-        barrier();
+
         uint64_t tf = get_sys_cnt_aicpu();
 
         // Calculating time segments and adding them to the timing vector
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
index 943f3ed06..2b810c70f 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
@@ -67,6 +67,8 @@ class SchedulerContext {
     // Called by AicpuExecutor::deinit() during per-run teardown.
     void deinit();
 
+    void initializePerfCounters();
+
     // =========================================================================
     // Per-thread execution entry points (called by AicpuExecutor::run)
     // =========================================================================
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
index e2970dbb3..1537d7292 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
@@ -314,6 +314,36 @@ void SchedulerContext::dispatch_shape(
     }
 }
 
+void SchedulerContext::initializePerfCounters()
+{
+    // One-time perf setup: exchange() hands back the previous flag value, so the caller
+    // that observes false performs the setup and any caller that observes true waits.
+    const bool claimed_elsewhere = pto2_init_done_.exchange(true, std::memory_order_acq_rel);
+    if (claimed_elsewhere)
+    {
+        // Spin until the initializing caller publishes completion
+        while (!pto2_init_complete_.load(std::memory_order_acquire)) {
+            SPIN_WAIT_HINT();
+        }
+        return;
+    }
+
+    LOG_INFO_V0("Initializing scheduler perf counters");
+
+#if PTO2_PROFILING
+    if (is_dump_tensor_enabled()) {
+        dump_tensor_init(orch_to_sched_ ? thread_num_ : sched_thread_num_);
+    }
+    // Initialize PMU: program events, start counters, and pop initial buffers
+    if (is_pmu_enabled()) {
+        pmu_aicpu_init(physical_core_ids_, cores_total_num_);
+        LOG_INFO_V0("PMU profiling started on %d cores", cores_total_num_);
+    }
+#endif
+    LOG_INFO_V0("Initialized scheduler perf counters");
+    pto2_init_complete_.store(true, std::memory_order_release);
+}
+
 // =============================================================================
 // Main scheduler dispatch loop
 // =============================================================================
@@ -340,32 +370,6 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
         static_cast<uint64_t>(header->rings[0].task_window_size)
     );
 
-    // One-time init: assign perf buffers (one thread does it; others wait)
-    if (!pto2_init_done_.exchange(true, std::memory_order_acq_rel)) {
-        LOG_INFO_V0("Thread %d: doing one-time init", thread_idx);
-
-#if PTO2_PROFILING
-        if (is_dump_tensor_enabled()) {
-            dump_tensor_init(orch_to_sched_ ? thread_num_ : sched_thread_num_);
-        }
-#endif
-
-#if PTO2_PROFILING
-        // Initialize PMU: program events, start counters, and pop initial buffers
-        if (is_pmu_enabled()) {
-            pmu_aicpu_init(physical_core_ids_, cores_total_num_);
-            LOG_INFO_V0("PMU profiling started on %d cores", cores_total_num_);
-        }
-#endif
-
-        LOG_INFO_V0("Thread %d: one-time init done", thread_idx);
-        pto2_init_complete_.store(true, std::memory_order_release);
-    } else {
-        while (!pto2_init_complete_.load(std::memory_order_acquire)) {
-            SPIN_WAIT_HINT();
-        }
-    }
-
     LOG_INFO_V0("Thread %d: PTO2 dispatch starting with %d cores", thread_idx, core_trackers_[thread_idx].core_num());
     int32_t cur_thread_completed = 0;
     int32_t idle_iterations = 0;

From 4da588ac243e4d10bea3c8d3ffcd84f73d63cdcf Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Thu, 21 May 2026 15:47:29 +0200
Subject: [PATCH 14/15] Now producing avg+stddev

---
 .../aicpu/aicpu_executor.cpp                  | 51 +++++--------------
 1 file changed, 14 insertions(+), 37 deletions(-)

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
index 59a73e465..08a885f76 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -122,12 +122,11 @@ struct AicpuExecutor {
 
     int32_t thread_num_{0};
 
-    // Barrier counters for synchronization (timing) across threads
+    // ===== Barrier counters for synchronization (timing) across threads =====
     std::atomic<uint64_t> barrier_counter_in_{0};
     std::atomic<uint64_t> barrier_counter_out_{0};
 
     // ===== Task queue state (managed by scheduler ready queues) =====
-
     std::atomic<int32_t> finished_count_{0};
     std::atomic<bool> runtime_init_ready_{false};
 
@@ -186,6 +185,8 @@ struct AicpuExecutor {
 };
 
 static AicpuExecutor g_aicpu_executor;
+
+// Thread-local identifier
 thread_local int32_t my_thread_idx_;
 
 // ===== AicpuExecutor Method Implementations =====
@@ -478,7 +479,7 @@ int32_t AicpuExecutor::runScheduling(Runtime *runtime) {
         }
     }
 
-    LOG_INFO_V9("Thread %d: Scheduling Completed", my_thread_idx_);
+    LOG_INFO_V0("Thread %d: Scheduling Completed", my_thread_idx_);
 
     return run_rc;
 }
@@ -576,7 +577,7 @@ int32_t AicpuExecutor::runOrchestration(Runtime* runtime)
     runtime_init_ready_.store(true, std::memory_order_release);
 
     // Wait for scheduler's one-time init to complete
-    sched_ctx_.wait_pto2_init_complete();
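+    // sched_ctx_.wait_pto2_init_complete();  // NOTE(review): wait disabled; presumably redundant now that AicpuExecutor::init() calls initializePerfCounters() - confirm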
 
 #if PTO2_PROFILING
     if (get_l2_perf_level() >= L2PerfLevel::ORCH_PHASES) {
@@ -714,7 +715,7 @@ int32_t AicpuExecutor::runOrchestration(Runtime* runtime)
         );
     }
 #endif
-    LOG_INFO_V9("Thread %d: Orchestrator completed", my_thread_idx_);
+    LOG_INFO_V0("Thread %d: Orchestrator completed", my_thread_idx_);
 
     return 0;
 }
@@ -805,7 +806,7 @@ int32_t AicpuExecutor::performTimingRuns(Runtime *runtime)
         rc |= runScheduling(runtime);
         barrier();
         uint64_t tf = get_sys_cnt_aicpu();
-        LOG_INFO_V9("Thread %d: Warmup %d/%d Time: %luns", my_thread_idx_, i, warmupIterationCount, tf - t0);
+        LOG_INFO_V0("Thread %d: Warmup %d/%d Time: %luns", my_thread_idx_, i, warmupIterationCount, tf - t0);
 
         // Resetting execution back to start
         if (my_thread_idx_ == 0) {
@@ -815,8 +816,6 @@ int32_t AicpuExecutor::performTimingRuns(Runtime *runtime)
     } 
 
     // Second, perform timed runs (the ones that count)
-    std::vector<uint64_t> orchTimes;
-    std::vector<uint64_t> schedTimes;
     std::vector<uint64_t> runTimes;
     for (int32_t i = 0; i < timingIterationCount; i++)
     {
@@ -824,21 +823,15 @@ int32_t AicpuExecutor::performTimingRuns(Runtime *runtime)
         barrier();
         uint64_t t0 = get_sys_cnt_aicpu();
         rc |= runOrchestration(runtime);
-        
-        uint64_t t1 = get_sys_cnt_aicpu();
         rc |= runScheduling(runtime);
-
+        barrier();
         uint64_t tf = get_sys_cnt_aicpu();
 
         // Calculating time segments and adding them to the timing vector
-        const uint64_t orchTime  = t1 - t0;
-        const uint64_t schedTime = tf - t1;
         const uint64_t runTime   = tf - t0;
-        orchTimes.push_back(orchTime);
-        schedTimes.push_back(schedTime);
         runTimes.push_back(runTime);
 
-        LOG_INFO_V9("Thread %d: Timing %d/%d Total Time: %luns (Orch: %luns + Sched: %luns)", my_thread_idx_, i, timingIterationCount, runTime, orchTime, schedTime);
+        LOG_INFO_V0("Thread %d: Timing %d/%d Total Time: %luns", my_thread_idx_, i, timingIterationCount, runTime);
 
         // Resetting execution back to start before the next run
         if (my_thread_idx_ == 0) {
@@ -856,37 +849,21 @@ int32_t AicpuExecutor::performTimingRuns(Runtime *runtime)
     // The orchestrator thread now calculates and reports timing
     if (my_thread_idx_ == sched_thread_num_)
     {
-        uint64_t orchSum = 0; 
-        uint64_t schedSum = 0; 
+        // Calculating timing sum over all runs
         uint64_t runSum = 0; 
-
-        for (const auto t : orchTimes) orchSum += t;
-        for (const auto t : schedTimes) schedSum += t;
         for (const auto t : runTimes) runSum += t;
 
-        // Calculating averages
+        // Calculating average
         const auto runCount = runTimes.size();
-        double orchAvg  = (double)orchSum  / (double)runCount;
-        double schedAvg = (double)schedSum / (double)runCount;
         double runAvg   = (double)runSum   / (double)runCount;
 
-        // Calculating L2 norms
-        double orchDiff  = 0; 
-        double schedDiff = 0; 
+        // Calculating stddev
         double runDiff   = 0; 
-
-        for (const auto t : orchTimes)  orchDiff  += ((double)t - orchAvg)  * ((double)t - orchAvg) ;
-        for (const auto t : schedTimes) schedDiff += ((double)t - schedAvg) * ((double)t - schedAvg);
         for (const auto t : runTimes)   runDiff   += ((double)t - runAvg)   * ((double)t - runAvg)  ;
-
-        double orchStdDev   = runCount == 1 ? 0.0 : std::sqrt(orchDiff   / (double)(runCount-1)); 
-        double schedStdDev  = runCount == 1 ? 0.0 : std::sqrt(schedDiff  / (double)(runCount-1));
         double runStdDev    = runCount == 1 ? 0.0 : std::sqrt(runDiff    / (double)(runCount-1));
 
-        LOG_INFO_V9("Thread %d: [Timing] Average Runtime over %d timed runs: ", my_thread_idx_, timingIterationCount);
-        LOG_INFO_V9("Thread %d: [Timing]   + Orchestration: %10.0fns +- %6.0fns", my_thread_idx_, orchAvg,  orchStdDev);
-        LOG_INFO_V9("Thread %d: [Timing]   + Scheduling:    %10.0fns +- %6.0fns", my_thread_idx_, schedAvg, schedStdDev);
-        LOG_INFO_V9("Thread %d: [Timing]   + Run Total:     %10.0fns +- %6.0fns", my_thread_idx_, runAvg,   runStdDev);
+        // Printing
+        LOG_INFO_V9("Thread %d: [Timing] Average Runtime over %d timed runs: %10.0fns +- %6.0fns", my_thread_idx_, timingIterationCount, runAvg,   runStdDev);
     }
 
     return 0;

From 726c3bc2894db524ce8b70f89b633c90882d9430 Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Thu, 21 May 2026 16:08:26 +0200
Subject: [PATCH 15/15] Adding missing barrier

---
 .../runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp  | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
index 08a885f76..3d7249fcf 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -838,6 +838,9 @@ int32_t AicpuExecutor::performTimingRuns(Runtime *runtime)
             deinit(runtime);
             init(runtime);
         }
+
+        // Synchronizing so that no thread starts the next run before thread 0 has finished the deinit/init reset
+        barrier();
     }   
 
     if (rc != 0)