From a6cc1ba619a20e34b15ddbefba859c37744966b9 Mon Sep 17 00:00:00 2001 From: Maosu Zhao Date: Thu, 24 Jul 2025 16:58:44 +0800 Subject: [PATCH 1/3] [DevSAN] Cache internal queue to avoid repeated create/destroy (#19540) --- .../layers/sanitizer/asan/asan_buffer.cpp | 16 ++++++----- .../loader/layers/sanitizer/asan/asan_ddi.cpp | 2 +- .../sanitizer/asan/asan_interceptor.cpp | 28 ++++++++++++------- .../sanitizer/asan/asan_interceptor.hpp | 7 +++++ .../sanitizer_common/sanitizer_utils.cpp | 6 ++-- .../sanitizer_common/sanitizer_utils.hpp | 3 +- .../layers/sanitizer/tsan/tsan_buffer.cpp | 7 +++-- .../loader/layers/sanitizer/tsan/tsan_ddi.cpp | 2 +- .../sanitizer/tsan/tsan_interceptor.cpp | 18 +++++++----- .../sanitizer/tsan/tsan_interceptor.hpp | 8 ++++++ 10 files changed, 65 insertions(+), 32 deletions(-) diff --git a/unified-runtime/source/loader/layers/sanitizer/asan/asan_buffer.cpp b/unified-runtime/source/loader/layers/sanitizer/asan/asan_buffer.cpp index a5d6eb6269947..a98cec7453966 100644 --- a/unified-runtime/source/loader/layers/sanitizer/asan/asan_buffer.cpp +++ b/unified-runtime/source/loader/layers/sanitizer/asan/asan_buffer.cpp @@ -90,6 +90,7 @@ ur_result_t MemBuffer::getHandle(ur_device_handle_t Device, char *&Handle) { assert((void *)Device != nullptr && "Device cannot be nullptr"); std::scoped_lock Guard(Mutex); + auto CI = getAsanInterceptor()->getContextInfo(Context); auto &Allocation = Allocations[Device]; ur_result_t URes = UR_RESULT_SUCCESS; if (!Allocation) { @@ -106,9 +107,9 @@ ur_result_t MemBuffer::getHandle(ur_device_handle_t Device, char *&Handle) { } if (HostPtr) { - ManagedQueue Queue(Context, Device); + ur_queue_handle_t InternalQueue = CI->getInternalQueue(Device); URes = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy( - Queue, true, Allocation, HostPtr, Size, 0, nullptr, nullptr); + InternalQueue, true, Allocation, HostPtr, Size, 0, nullptr, nullptr); if (URes != UR_RESULT_SUCCESS) { UR_LOG_L( getContext()->logger, ERR, @@ -147,10 +148,10 @@ ur_result_t MemBuffer::getHandle(ur_device_handle_t Device, char *&Handle) { // Copy data from last synced device to host { - ManagedQueue Queue(Context, LastSyncedDevice.hDevice); + ur_queue_handle_t InternalQueue = CI->getInternalQueue(Device); URes = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy( - Queue, true, HostAllocation, LastSyncedDevice.MemHandle, Size, 0, - nullptr, nullptr); + InternalQueue, true, HostAllocation, LastSyncedDevice.MemHandle, Size, + 0, nullptr, nullptr); if (URes != UR_RESULT_SUCCESS) { UR_LOG_L(getContext()->logger, ERR, "Failed to migrate memory buffer data"); @@ -160,9 +161,10 @@ ur_result_t MemBuffer::getHandle(ur_device_handle_t Device, char *&Handle) { // Sync data back to device { - ManagedQueue Queue(Context, Device); + ur_queue_handle_t InternalQueue = CI->getInternalQueue(Device); URes = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy( - Queue, true, Allocation, HostAllocation, Size, 0, nullptr, nullptr); + InternalQueue, true, Allocation, HostAllocation, Size, 0, nullptr, + nullptr); if (URes != UR_RESULT_SUCCESS) { UR_LOG_L(getContext()->logger, ERR, "Failed to migrate memory buffer data"); diff --git a/unified-runtime/source/loader/layers/sanitizer/asan/asan_ddi.cpp b/unified-runtime/source/loader/layers/sanitizer/asan/asan_ddi.cpp index 0b4a64c38a549..899ff6a850dbe 100644 --- a/unified-runtime/source/loader/layers/sanitizer/asan/asan_ddi.cpp +++ b/unified-runtime/source/loader/layers/sanitizer/asan/asan_ddi.cpp @@ -677,7 +677,7 @@ __urdlllocal ur_result_t UR_APICALL urMemBufferCreate( std::shared_ptr CtxInfo = getAsanInterceptor()->getContextInfo(hContext); for (const auto &hDevice : CtxInfo->DeviceList) { - ManagedQueue InternalQueue(hContext, hDevice); + ur_queue_handle_t InternalQueue = CtxInfo->getInternalQueue(hDevice); char *Handle = nullptr; UR_CALL(pMemBuffer->getHandle(hDevice, Handle)); UR_CALL(getContext()->urDdiTable.Enqueue.pfnUSMMemcpy( diff --git a/unified-runtime/source/loader/layers/sanitizer/asan/asan_interceptor.cpp b/unified-runtime/source/loader/layers/sanitizer/asan/asan_interceptor.cpp index 5e984d9b93e65..6c6eb65c92499 100644 --- a/unified-runtime/source/loader/layers/sanitizer/asan/asan_interceptor.cpp +++ b/unified-runtime/source/loader/layers/sanitizer/asan/asan_interceptor.cpp @@ -270,17 +270,15 @@ ur_result_t AsanInterceptor::preLaunchKernel(ur_kernel_handle_t Kernel, auto ContextInfo = getContextInfo(Context); auto DeviceInfo = getDeviceInfo(Device); - ManagedQueue InternalQueue(Context, Device); - if (!InternalQueue) { - UR_LOG_L(getContext()->logger, ERR, "Failed to create internal queue"); - return UR_RESULT_ERROR_INVALID_QUEUE; - } + ur_queue_handle_t InternalQueue = ContextInfo->getInternalQueue(Device); UR_CALL(prepareLaunch(ContextInfo, DeviceInfo, InternalQueue, Kernel, LaunchInfo)); UR_CALL(updateShadowMemory(ContextInfo, DeviceInfo, InternalQueue)); + UR_CALL(getContext()->urDdiTable.Queue.pfnFinish(InternalQueue)); + return UR_RESULT_SUCCESS; } @@ -467,6 +465,7 @@ ur_result_t AsanInterceptor::unregisterProgram(ur_program_handle_t Program) { ur_result_t AsanInterceptor::registerSpirKernels(ur_program_handle_t Program) { auto Context = GetContext(Program); + auto CI = getContextInfo(Context); std::vector Devices = GetDevices(Program); for (auto Device : Devices) { @@ -484,11 +483,11 @@ ur_result_t AsanInterceptor::registerSpirKernels(ur_program_handle_t Program) { assert((MetadataSize % sizeof(SpirKernelInfo) == 0) && "SpirKernelMetadata size is not correct"); - ManagedQueue Queue(Context, Device); + ur_queue_handle_t InternalQueue = CI->getInternalQueue(Device); std::vector SKInfo(NumOfSpirKernel); Result = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy( - Queue, true, &SKInfo[0], MetadataPtr, + InternalQueue, true, &SKInfo[0], MetadataPtr, sizeof(SpirKernelInfo) * NumOfSpirKernel, 0, nullptr, nullptr); if (Result != UR_RESULT_SUCCESS) { UR_LOG_L(getContext()->logger, ERR, "Can't read the value of <{}>: {}", @@ -504,7 +503,7 @@ ur_result_t AsanInterceptor::registerSpirKernels(ur_program_handle_t Program) { } std::vector KernelNameV(SKI.Size); Result = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy( - Queue, true, KernelNameV.data(), (void *)SKI.KernelName, + InternalQueue, true, KernelNameV.data(), (void *)SKI.KernelName, sizeof(char) * SKI.Size, 0, nullptr, nullptr); if (Result != UR_RESULT_SUCCESS) { UR_LOG_L(getContext()->logger, ERR, "Can't read kernel name: {}", @@ -537,7 +536,7 @@ AsanInterceptor::registerDeviceGlobals(ur_program_handle_t Program) { assert(ProgramInfo != nullptr && "unregistered program!"); for (auto Device : Devices) { - ManagedQueue Queue(Context, Device); + ur_queue_handle_t InternalQueue = ContextInfo->getInternalQueue(Device); size_t MetadataSize; void *MetadataPtr; @@ -554,7 +553,7 @@ AsanInterceptor::registerDeviceGlobals(ur_program_handle_t Program) { "DeviceGlobal metadata size is not correct"); std::vector GVInfos(NumOfDeviceGlobal); Result = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy( - Queue, true, &GVInfos[0], MetadataPtr, + InternalQueue, true, &GVInfos[0], MetadataPtr, sizeof(DeviceGlobalInfo) * NumOfDeviceGlobal, 0, nullptr, nullptr); if (Result != UR_RESULT_SUCCESS) { UR_LOG_L(getContext()->logger, ERR, "Device Global[{}] Read Failed: {}", @@ -932,6 +931,8 @@ bool ProgramInfo::isKernelInstrumented(ur_kernel_handle_t Kernel) const { ContextInfo::~ContextInfo() { Stats.Print(Handle); + InternalQueueMap.clear(); + [[maybe_unused]] ur_result_t URes; if (USMPool) { URes = getContext()->urDdiTable.USM.pfnPoolRelease(USMPool); @@ -971,6 +972,13 @@ ur_usm_pool_handle_t ContextInfo::getUSMPool() { return USMPool; } +ur_queue_handle_t ContextInfo::getInternalQueue(ur_device_handle_t Device) { + std::scoped_lock Guard(InternalQueueMapMutex); + if (!InternalQueueMap[Device]) + InternalQueueMap[Device].emplace(Handle, Device); + return *InternalQueueMap[Device]; +} + AsanRuntimeDataWrapper::~AsanRuntimeDataWrapper() { [[maybe_unused]] ur_result_t Result; if (Host.LocalArgs) { diff --git a/unified-runtime/source/loader/layers/sanitizer/asan/asan_interceptor.hpp b/unified-runtime/source/loader/layers/sanitizer/asan/asan_interceptor.hpp index bbf75d803ba27..8190bd9232877 100644 --- a/unified-runtime/source/loader/layers/sanitizer/asan/asan_interceptor.hpp +++ b/unified-runtime/source/loader/layers/sanitizer/asan/asan_interceptor.hpp @@ -20,6 +20,7 @@ #include "asan_statistics.hpp" #include "sanitizer_common/sanitizer_common.hpp" #include "sanitizer_common/sanitizer_options.hpp" +#include "sanitizer_common/sanitizer_utils.hpp" #include "ur_sanitizer_layer.hpp" #include @@ -143,6 +144,10 @@ struct ContextInfo { std::vector DeviceList; std::unordered_map AllocInfosMap; + ur_shared_mutex InternalQueueMapMutex; + std::unordered_map> + InternalQueueMap; + AsanStatsWrapper Stats; explicit ContextInfo(ur_context_handle_t Context) : Handle(Context) { @@ -163,6 +168,8 @@ struct ContextInfo { } ur_usm_pool_handle_t getUSMPool(); + + ur_queue_handle_t getInternalQueue(ur_device_handle_t); }; struct AsanRuntimeDataWrapper { diff --git a/unified-runtime/source/loader/layers/sanitizer/sanitizer_common/sanitizer_utils.cpp b/unified-runtime/source/loader/layers/sanitizer/sanitizer_common/sanitizer_utils.cpp index 2bd8733394a5c..b81cf55b4d584 100644 --- a/unified-runtime/source/loader/layers/sanitizer/sanitizer_common/sanitizer_utils.cpp +++ b/unified-runtime/source/loader/layers/sanitizer/sanitizer_common/sanitizer_utils.cpp @@ -31,9 +31,11 @@ ur_usm_type_t GetUSMType(ur_context_handle_t Context, const void *MemPtr) { } // namespace ManagedQueue::ManagedQueue(ur_context_handle_t Context, - ur_device_handle_t Device) { + ur_device_handle_t Device, bool IsOutOfOrder) { + ur_queue_properties_t Prop{UR_STRUCTURE_TYPE_QUEUE_PROPERTIES, nullptr, + UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE}; [[maybe_unused]] auto Result = getContext()->urDdiTable.Queue.pfnCreate( - Context, Device, nullptr, &Handle); + Context, Device, IsOutOfOrder ? &Prop : nullptr, &Handle); assert(Result == UR_RESULT_SUCCESS && "Failed to create ManagedQueue"); UR_LOG_L(getContext()->logger, DEBUG, ">>> ManagedQueue {}", (void *)Handle); } diff --git a/unified-runtime/source/loader/layers/sanitizer/sanitizer_common/sanitizer_utils.hpp b/unified-runtime/source/loader/layers/sanitizer/sanitizer_common/sanitizer_utils.hpp index 6024c644a4d00..0199cc6f24b9a 100644 --- a/unified-runtime/source/loader/layers/sanitizer/sanitizer_common/sanitizer_utils.hpp +++ b/unified-runtime/source/loader/layers/sanitizer/sanitizer_common/sanitizer_utils.hpp @@ -23,7 +23,8 @@ namespace ur_sanitizer_layer { struct ManagedQueue { - ManagedQueue(ur_context_handle_t Context, ur_device_handle_t Device); + ManagedQueue(ur_context_handle_t Context, ur_device_handle_t Device, + bool IsOutOfOrder = false); ~ManagedQueue(); // Disable copy semantics diff --git a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_buffer.cpp b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_buffer.cpp index b3ed7386a547c..c42a39d7cccde 100644 --- a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_buffer.cpp +++ b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_buffer.cpp @@ -99,6 +99,7 @@ ur_result_t MemBuffer::getHandle(ur_device_handle_t Device, char *&Handle) { std::scoped_lock Guard(Mutex); auto &Allocation = Allocations[Device]; + auto CI = getTsanInterceptor()->getContextInfo(Context); ur_result_t URes = UR_RESULT_SUCCESS; if (!Allocation) { ur_usm_desc_t USMDesc{}; @@ -114,7 +115,7 @@ ur_result_t MemBuffer::getHandle(ur_device_handle_t Device, char *&Handle) { } if (HostPtr) { - ManagedQueue Queue(Context, Device); + ur_queue_handle_t Queue = CI->getInternalQueue(Device); URes = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy( Queue, true, Allocation, HostPtr, Size, 0, nullptr, nullptr); if (URes != UR_RESULT_SUCCESS) { @@ -155,7 +156,7 @@ ur_result_t MemBuffer::getHandle(ur_device_handle_t Device, char *&Handle) { // Copy data from last synced device to host { - ManagedQueue Queue(Context, LastSyncedDevice.hDevice); + ur_queue_handle_t Queue = CI->getInternalQueue(Device); URes = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy( Queue, true, HostAllocation, LastSyncedDevice.MemHandle, Size, 0, nullptr, nullptr); @@ -168,7 +169,7 @@ ur_result_t MemBuffer::getHandle(ur_device_handle_t Device, char *&Handle) { // Sync data back to device { - ManagedQueue Queue(Context, Device); + ur_queue_handle_t Queue = CI->getInternalQueue(Device); URes = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy( Queue, true, Allocation, HostAllocation, Size, 0, nullptr, nullptr); if (URes != UR_RESULT_SUCCESS) { diff --git a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp index fa772dd7e8c4d..61849ac0b363a 100644 --- a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp +++ b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp @@ -402,7 +402,7 @@ ur_result_t urMemBufferCreate( std::shared_ptr CtxInfo = getTsanInterceptor()->getContextInfo(hContext); for (const auto &hDevice : CtxInfo->DeviceList) { - ManagedQueue InternalQueue(hContext, hDevice); + ur_queue_handle_t InternalQueue = CtxInfo->getInternalQueue(hDevice); char *Handle = nullptr; UR_CALL(pMemBuffer->getHandle(hDevice, Handle)); UR_CALL(getContext()->urDdiTable.Enqueue.pfnUSMMemcpy( diff --git a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.cpp b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.cpp index 6c1ee4a62b3f8..86c12f9e89b5c 100644 --- a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.cpp +++ b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.cpp @@ -13,7 +13,6 @@ */ #include "tsan_interceptor.hpp" -#include "sanitizer_common/sanitizer_utils.hpp" #include "tsan_report.hpp" namespace ur_sanitizer_layer { @@ -107,6 +106,13 @@ void ContextInfo::insertAllocInfo(TsanAllocInfo AI) { AllocInfos.insert(std::move(AI)); } +ur_queue_handle_t ContextInfo::getInternalQueue(ur_device_handle_t Device) { + std::scoped_lock Guard(InternalQueueMapMutex); + if (!InternalQueueMap[Device]) + InternalQueueMap[Device].emplace(Handle, Device, true); + return *InternalQueueMap[Device]; +} + TsanInterceptor::~TsanInterceptor() { // We must release these objects before releasing adapters, since // they may use the adapter in their destructor @@ -190,7 +196,7 @@ TsanInterceptor::registerDeviceGlobals(ur_program_handle_t Program) { auto &ProgramInfo = getProgramInfo(Program); for (auto Device : Devices) { - ManagedQueue Queue(Context, Device); + ur_queue_handle_t Queue = ContextInfo->getInternalQueue(Device); size_t MetadataSize; void *MetadataPtr; @@ -333,16 +339,14 @@ ur_result_t TsanInterceptor::preLaunchKernel(ur_kernel_handle_t Kernel, auto CI = getContextInfo(GetContext(Queue)); auto DI = getDeviceInfo(GetDevice(Queue)); - ManagedQueue InternalQueue(CI->Handle, DI->Handle); - if (!InternalQueue) { - UR_LOG_L(getContext()->logger, ERR, "Failed to create internal queue"); - return UR_RESULT_ERROR_INVALID_QUEUE; - } + ur_queue_handle_t InternalQueue = CI->getInternalQueue(DI->Handle); UR_CALL(prepareLaunch(CI, DI, InternalQueue, Kernel, LaunchInfo)); UR_CALL(updateShadowMemory(CI, DI, Kernel, InternalQueue)); + UR_CALL(getContext()->urDdiTable.Queue.pfnFinish(InternalQueue)); + return UR_RESULT_SUCCESS; } diff --git a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.hpp b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.hpp index f6ada3a06b767..e700e70294332 100644 --- a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.hpp +++ b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.hpp @@ -15,6 +15,7 @@ #include "sanitizer_common/sanitizer_allocator.hpp" #include "sanitizer_common/sanitizer_common.hpp" +#include "sanitizer_common/sanitizer_utils.hpp" #include "tsan_buffer.hpp" #include "tsan_libdevice.hpp" #include "tsan_shadow.hpp" @@ -58,6 +59,10 @@ struct ContextInfo { ur_shared_mutex AllocInfosMutex; std::set AllocInfos; + ur_shared_mutex InternalQueueMapMutex; + std::unordered_map> + InternalQueueMap; + explicit ContextInfo(ur_context_handle_t Context) : Handle(Context) { [[maybe_unused]] auto Result = getContext()->urDdiTable.Context.pfnRetain(Context); @@ -65,6 +70,7 @@ struct ContextInfo { } ~ContextInfo() { + InternalQueueMap.clear(); [[maybe_unused]] auto Result = getContext()->urDdiTable.Context.pfnRelease(Handle); assert(Result == UR_RESULT_SUCCESS); @@ -75,6 +81,8 @@ struct ContextInfo { ContextInfo &operator=(const ContextInfo &) = delete; void insertAllocInfo(TsanAllocInfo AI); + + ur_queue_handle_t getInternalQueue(ur_device_handle_t); }; struct DeviceGlobalInfo { From 224e9fee2b346efaa34a80fd19ec9aeb2b7c8dec Mon Sep 17 00:00:00 2001 From: Maosu Zhao Date: Wed, 6 Aug 2025 21:37:54 +0800 Subject: [PATCH 2/3] [DevTSAN] Move AllocInfo into DeviceInfo to support indirect access (#19634) If we maintain AllocInfo in ContextInfo, this will cause the pointer which is allocated by another context can't be poisoned. --- .../layers/sanitizer/tsan/tsan_buffer.cpp | 6 ++-- .../sanitizer/tsan/tsan_interceptor.cpp | 35 ++++++++++++------- .../sanitizer/tsan/tsan_interceptor.hpp | 13 ++++--- 3 files changed, 33 insertions(+), 21 deletions(-) diff --git a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_buffer.cpp b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_buffer.cpp index c42a39d7cccde..d95d1c6409a66 100644 --- a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_buffer.cpp +++ b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_buffer.cpp @@ -186,8 +186,10 @@ ur_result_t MemBuffer::getHandle(ur_device_handle_t Device, char *&Handle) { } ur_result_t MemBuffer::free() { - for (const auto &[_, Ptr] : Allocations) { - ur_result_t URes = getTsanInterceptor()->releaseMemory(Context, Ptr); + for (const auto &[Device, Ptr] : Allocations) { + ur_result_t URes = Device + ? getTsanInterceptor()->releaseMemory(Context, Ptr) + : getContext()->urDdiTable.USM.pfnFree(Context, Ptr); if (URes != UR_RESULT_SUCCESS) { UR_LOG_L(getContext()->logger, ERR, "Failed to free buffer handle {}", (void *)Ptr); diff --git a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.cpp b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.cpp index 86c12f9e89b5c..3f9248489f52b 100644 --- a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.cpp +++ b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.cpp @@ -101,7 +101,7 @@ ur_result_t DeviceInfo::allocShadowMemory() { return UR_RESULT_SUCCESS; } -void ContextInfo::insertAllocInfo(TsanAllocInfo AI) { +void DeviceInfo::insertAllocInfo(TsanAllocInfo AI) { std::scoped_lock Guard(AllocInfosMutex); AllocInfos.insert(std::move(AI)); } @@ -153,7 +153,15 @@ ur_result_t TsanInterceptor::allocateMemory(ur_context_handle_t Context, auto AI = TsanAllocInfo{reinterpret_cast(Allocated), Size}; // For updating shadow memory - CI->insertAllocInfo(std::move(AI)); + if (Device) { + auto DI = getDeviceInfo(Device); + DI->insertAllocInfo(std::move(AI)); + } else { + for (const auto &Device : CI->DeviceList) { + auto DI = getDeviceInfo(Device); + DI->insertAllocInfo(AI); + } + } *ResultPtr = Allocated; return UR_RESULT_SUCCESS; @@ -163,11 +171,14 @@ ur_result_t TsanInterceptor::releaseMemory(ur_context_handle_t Context, void *Ptr) { auto CI = getContextInfo(Context); auto Addr = reinterpret_cast(Ptr); - { - std::scoped_lock Guard(CI->AllocInfosMutex); - auto It = std::find_if(CI->AllocInfos.begin(), CI->AllocInfos.end(), + + for (const auto &Device : CI->DeviceList) { + auto DI = getDeviceInfo(Device); + std::scoped_lock Guard(DI->AllocInfosMutex); + auto It = std::find_if(DI->AllocInfos.begin(), DI->AllocInfos.end(), [&](auto &P) { return P.AllocBegin == Addr; }); - CI->AllocInfos.erase(It); + if (It != DI->AllocInfos.end()) + DI->AllocInfos.erase(It); } UR_CALL(getContext()->urDdiTable.USM.pfnFree(Context, Ptr)); @@ -343,7 +354,7 @@ ur_result_t TsanInterceptor::preLaunchKernel(ur_kernel_handle_t Kernel, UR_CALL(prepareLaunch(CI, DI, InternalQueue, Kernel, LaunchInfo)); - UR_CALL(updateShadowMemory(CI, DI, Kernel, InternalQueue)); + UR_CALL(updateShadowMemory(DI, Kernel, InternalQueue)); UR_CALL(getContext()->urDdiTable.Queue.pfnFinish(InternalQueue)); @@ -470,12 +481,12 @@ ur_result_t TsanInterceptor::prepareLaunch(std::shared_ptr &, return UR_RESULT_SUCCESS; } -ur_result_t TsanInterceptor::updateShadowMemory( - std::shared_ptr &CI, std::shared_ptr &DI, - ur_kernel_handle_t Kernel, ur_queue_handle_t Queue) { +ur_result_t TsanInterceptor::updateShadowMemory(std::shared_ptr &DI, + ur_kernel_handle_t Kernel, + ur_queue_handle_t Queue) { auto &PI = getProgramInfo(GetProgram(Kernel)); - std::scoped_lock Guard(CI->AllocInfosMutex); - for (auto &AllocInfo : CI->AllocInfos) { + std::scoped_lock Guard(DI->AllocInfosMutex); + for (auto &AllocInfo : DI->AllocInfos) { UR_CALL(DI->Shadow->CleanShadow(Queue, AllocInfo.AllocBegin, AllocInfo.AllocSize)); } diff --git a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.hpp b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.hpp index e700e70294332..eefcba4036c08 100644 --- a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.hpp +++ b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.hpp @@ -44,9 +44,14 @@ struct DeviceInfo { std::shared_ptr Shadow; + ur_shared_mutex AllocInfosMutex; + std::set AllocInfos; + explicit DeviceInfo(ur_device_handle_t Device) : Handle(Device) {} ur_result_t allocShadowMemory(); + + void insertAllocInfo(TsanAllocInfo AI); }; struct ContextInfo { @@ -56,9 +61,6 @@ struct ContextInfo { std::vector DeviceList; - ur_shared_mutex AllocInfosMutex; - std::set AllocInfos; - ur_shared_mutex InternalQueueMapMutex; std::unordered_map> InternalQueueMap; @@ -80,8 +82,6 @@ struct ContextInfo { ContextInfo &operator=(const ContextInfo &) = delete; - void insertAllocInfo(TsanAllocInfo AI); - ur_queue_handle_t getInternalQueue(ur_device_handle_t); }; @@ -297,8 +297,7 @@ class TsanInterceptor { ur_shared_mutex KernelLaunchMutex; private: - ur_result_t updateShadowMemory(std::shared_ptr &CI, - std::shared_ptr &DI, + ur_result_t updateShadowMemory(std::shared_ptr &DI, ur_kernel_handle_t Kernel, ur_queue_handle_t Queue); From 8b2e1f858f5722b2949a3c426b5877e713bdd374 Mon Sep 17 00:00:00 2001 From: Maosu Zhao Date: Wed, 13 Aug 2025 22:16:32 +0800 Subject: [PATCH 3/3] [DevMSAN] Correctly apply stride for dest/src pointer when do async copy (#19766) Currently, we always define stride in elements when writing to destination pointer. However, according to spirv spec, the stride can only be applied to global ptr. --- libdevice/sanitizer/msan_rtl.cpp | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/libdevice/sanitizer/msan_rtl.cpp b/libdevice/sanitizer/msan_rtl.cpp index 8c49d0ac2d151..8e7877636ce55 100644 --- a/libdevice/sanitizer/msan_rtl.cpp +++ b/libdevice/sanitizer/msan_rtl.cpp @@ -303,11 +303,15 @@ inline void ReportError(const uint32_t size, const char __SYCL_CONSTANT__ *file, // This function is only used for shadow propagation template -void GroupAsyncCopy(uptr Dest, uptr Src, size_t NumElements, size_t Stride) { +void GroupAsyncCopy(uptr Dest, uptr Src, size_t NumElements, size_t Stride, + bool StrideOnSrc) { auto DestPtr = (__SYCL_GLOBAL__ T *)Dest; auto SrcPtr = (const __SYCL_GLOBAL__ T *)Src; for (size_t i = 0; i < NumElements; i++) { - DestPtr[i] = SrcPtr[i * Stride]; + if (StrideOnSrc) + DestPtr[i] = SrcPtr[i * Stride]; + else + DestPtr[i * Stride] = SrcPtr[i]; } } @@ -748,16 +752,20 @@ __msan_unpoison_strided_copy(uptr dest, uint32_t dest_as, uptr src, switch (element_size) { case 1: - GroupAsyncCopy(shadow_dest, shadow_src, counts, stride); + GroupAsyncCopy(shadow_dest, shadow_src, counts, stride, + src_as == ADDRESS_SPACE_GLOBAL); break; case 2: - GroupAsyncCopy(shadow_dest, shadow_src, counts, stride); + GroupAsyncCopy(shadow_dest, shadow_src, counts, stride, + src_as == ADDRESS_SPACE_GLOBAL); break; case 4: - GroupAsyncCopy(shadow_dest, shadow_src, counts, stride); + GroupAsyncCopy(shadow_dest, shadow_src, counts, stride, + src_as == ADDRESS_SPACE_GLOBAL); break; case 8: - GroupAsyncCopy(shadow_dest, shadow_src, counts, stride); + GroupAsyncCopy(shadow_dest, shadow_src, counts, stride, + src_as == ADDRESS_SPACE_GLOBAL); break; default: __spirv_ocl_printf(__msan_print_strided_copy_unsupport_type,