From 048d0ebb82e42a4127dbbf12a746bdd71df5fed5 Mon Sep 17 00:00:00 2001 From: Ross Brunton Date: Fri, 29 Aug 2025 14:52:32 +0100 Subject: [PATCH] [UR][Offload] Fixes for enqueue UR CTS tests A small selection of fixes to increase the pass rate of the enqueue CTS unit tests: * Blocking memory reads/writes now properly wait on the queue. * `urKernelSetArgMemObj` added to the function table. * Debug print removed. * Layout of kernel arguments now matches the HIP target if Offload is on an AMD device. * `urEnqueueEventsWaitWithBarrierExt` has been implemented (it just calls to the non-ext version). * `UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP` set to false. --- .../source/adapters/offload/device.cpp | 1 + .../source/adapters/offload/enqueue.cpp | 12 ++++ .../adapters/offload/ur_interface_loader.cpp | 3 +- .../urEnqueueEventsWaitWithBarrier.cpp | 1 - .../test/conformance/source/environment.cpp | 63 +++---------------- .../testing/include/uur/environment.h | 1 - .../testing/include/uur/fixtures.h | 4 +- .../conformance/testing/include/uur/utils.h | 2 + .../test/conformance/testing/source/utils.cpp | 47 ++++++++++++++ 9 files changed, 76 insertions(+), 58 deletions(-) diff --git a/unified-runtime/source/adapters/offload/device.cpp b/unified-runtime/source/adapters/offload/device.cpp index 01624a4e60b27..eed50bb917d16 100644 --- a/unified-runtime/source/adapters/offload/device.cpp +++ b/unified-runtime/source/adapters/offload/device.cpp @@ -194,6 +194,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_IMAGE_SRGB: case UR_DEVICE_INFO_HOST_UNIFIED_MEMORY: case UR_DEVICE_INFO_LINKER_AVAILABLE: + case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: return ReturnValue(false); case UR_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT: case UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT: diff --git a/unified-runtime/source/adapters/offload/enqueue.cpp b/unified-runtime/source/adapters/offload/enqueue.cpp index 0a39aac6f9e34..03a08923f5fdf 100644 --- a/unified-runtime/source/adapters/offload/enqueue.cpp +++ b/unified-runtime/source/adapters/offload/enqueue.cpp @@ -141,6 +141,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( return doWait(hQueue, numEventsInWaitList, phEventWaitList, phEvent); } +// This function only makes sense for level_zero, the flag in properties is +// ignored +UR_APIEXPORT ur_result_t urEnqueueEventsWaitWithBarrierExt( + ur_queue_handle_t hQueue, const ur_exp_enqueue_ext_properties_t *, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList, + phEventWaitList, phEvent); +} + UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, @@ -235,6 +245,8 @@ ur_result_t doMemcpy(ur_command_t Command, ur_queue_handle_t hQueue, OL_RETURN_ON_ERR(waitOnEvents(Queue, phEventWaitList, numEventsInWaitList)); if (blocking) { + // Ensure all work in the queue is complete + OL_RETURN_ON_ERR(olSyncQueue(Queue)); OL_RETURN_ON_ERR( olMemcpy(nullptr, DestPtr, DestDevice, SrcPtr, SrcDevice, size)); if (phEvent) { diff --git a/unified-runtime/source/adapters/offload/ur_interface_loader.cpp b/unified-runtime/source/adapters/offload/ur_interface_loader.cpp index cc3afd36e0281..d0ded68c8e634 100644 --- a/unified-runtime/source/adapters/offload/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/offload/ur_interface_loader.cpp @@ -118,7 +118,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnRelease = urKernelRelease; pDdiTable->pfnRetain = urKernelRetain; pDdiTable->pfnSetArgLocal = nullptr; - pDdiTable->pfnSetArgMemObj = nullptr; + pDdiTable->pfnSetArgMemObj = urKernelSetArgMemObj; pDdiTable->pfnSetArgPointer = urKernelSetArgPointer; pDdiTable->pfnSetArgSampler = nullptr; pDdiTable->pfnSetArgValue = urKernelSetArgValue; @@ -172,6 +172,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable( pDdiTable->pfnDeviceGlobalVariableWrite = urEnqueueDeviceGlobalVariableWrite; pDdiTable->pfnEventsWait = urEnqueueEventsWait; pDdiTable->pfnEventsWaitWithBarrier = urEnqueueEventsWaitWithBarrier; + pDdiTable->pfnEventsWaitWithBarrierExt = urEnqueueEventsWaitWithBarrierExt; pDdiTable->pfnKernelLaunch = urEnqueueKernelLaunch; pDdiTable->pfnMemBufferCopy = urEnqueueMemBufferCopy; pDdiTable->pfnMemBufferCopyRect = nullptr; diff --git a/unified-runtime/test/conformance/enqueue/urEnqueueEventsWaitWithBarrier.cpp b/unified-runtime/test/conformance/enqueue/urEnqueueEventsWaitWithBarrier.cpp index b22594d2a35cc..67225cbef6ddd 100644 --- a/unified-runtime/test/conformance/enqueue/urEnqueueEventsWaitWithBarrier.cpp +++ b/unified-runtime/test/conformance/enqueue/urEnqueueEventsWaitWithBarrier.cpp @@ -88,7 +88,6 @@ struct urEnqueueEventsWaitWithBarrierOrderingTest : uur::urProgramTest { auto entry_points = uur::KernelsEnvironment::instance->GetEntryPointNames(program_name); - std::cout << entry_points[0]; ASSERT_SUCCESS(urKernelCreate(program, "_ZTS3Add", &add_kernel)); ASSERT_SUCCESS(urKernelCreate(program, "_ZTS3Mul", &mul_kernel)); diff --git a/unified-runtime/test/conformance/source/environment.cpp b/unified-runtime/test/conformance/source/environment.cpp index 1612580d35ead..521e42c5aa082 100644 --- a/unified-runtime/test/conformance/source/environment.cpp +++ b/unified-runtime/test/conformance/source/environment.cpp @@ -191,56 +191,6 @@ KernelsEnvironment::parseKernelOptions(int argc, char **argv, return options; } -std::string -KernelsEnvironment::getDefaultTargetName(ur_platform_handle_t platform) { - if (instance->GetDevices().size() == 0) { - error = "no devices available on the platform"; - return {}; - } - - ur_backend_t backend; - if (urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND, sizeof(backend), - &backend, nullptr)) { - error = "failed to get backend from platform."; - return {}; - } - - switch (backend) { - case UR_BACKEND_OPENCL: - case UR_BACKEND_LEVEL_ZERO: - return "spir64"; - case UR_BACKEND_CUDA: - return "nvptx64-nvidia-cuda"; - case UR_BACKEND_HIP: - return "amdgcn-amd-amdhsa"; - case UR_BACKEND_OFFLOAD: { - // All Offload platforms report this backend, use the platform name to select - // the actual underlying backend. - std::vector PlatformName; - size_t PlatformNameSize = 0; - urPlatformGetInfo(platform, UR_PLATFORM_INFO_NAME, 0, nullptr, - &PlatformNameSize); - PlatformName.resize(PlatformNameSize); - urPlatformGetInfo(platform, UR_PLATFORM_INFO_NAME, PlatformNameSize, - PlatformName.data(), nullptr); - if (std::strcmp(PlatformName.data(), "CUDA") == 0) { - return "nvptx64-nvidia-cuda"; - } else if (std::strcmp(PlatformName.data(), "AMDGPU") == 0) { - return "amdgcn-amd-amdhsa"; - } else { - error = "Could not detect target for Offload platform"; - return {}; - } - } - case UR_BACKEND_NATIVE_CPU: - error = "native_cpu doesn't support kernel tests yet"; - return {}; - default: - error = "unknown target."; - return {}; - } -} - std::string KernelsEnvironment::getKernelSourcePath(const std::string &kernel_name, const std::string &target_name) { @@ -256,12 +206,17 @@ void KernelsEnvironment::LoadSource( // We don't have a way to build device code for native cpu yet. UUR_KNOWN_FAILURE_ON_PARAM(platform, uur::NativeCPU{}); - std::string target_name = getDefaultTargetName(platform); - if (target_name.empty()) { - FAIL() << error; + if (instance->GetDevices().size() == 0) { + FAIL() << "no devices available on the platform"; + } + + std::string triple_name; + auto Err = GetPlatformTriple(platform, triple_name); + if (Err) { + FAIL() << "GetPlatformTriple failed with error " << Err << "\n"; } - return LoadSource(kernel_name, target_name, binary_out); + return LoadSource(kernel_name, triple_name, binary_out); } void KernelsEnvironment::LoadSource( diff --git a/unified-runtime/test/conformance/testing/include/uur/environment.h b/unified-runtime/test/conformance/testing/include/uur/environment.h index dfd248d8d2251..4186b34fe5f35 100644 --- a/unified-runtime/test/conformance/testing/include/uur/environment.h +++ b/unified-runtime/test/conformance/testing/include/uur/environment.h @@ -90,7 +90,6 @@ struct KernelsEnvironment : DevicesEnvironment { const std::string &kernels_default_dir); std::string getKernelSourcePath(const std::string &kernel_name, const std::string &target_name); - std::string getDefaultTargetName(ur_platform_handle_t platform); KernelOptions kernel_options; // mapping between kernels (full_path + kernel_name) and their saved source. diff --git a/unified-runtime/test/conformance/testing/include/uur/fixtures.h b/unified-runtime/test/conformance/testing/include/uur/fixtures.h index fff0be4a0107e..424d4df5e4744 100644 --- a/unified-runtime/test/conformance/testing/include/uur/fixtures.h +++ b/unified-runtime/test/conformance/testing/include/uur/fixtures.h @@ -1472,7 +1472,9 @@ struct KernelLaunchHelper { ur_backend_t backend; ASSERT_SUCCESS(urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND, sizeof(backend), &backend, nullptr)); - if (backend == UR_BACKEND_HIP) { + std::string target_name; + ASSERT_SUCCESS(GetPlatformTriple(platform, target_name)); + if (target_name == "amdgcn-amd-amdhsa") { // this emulates the three offset params for buffer accessor on AMD. size_t val = 0; ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 1, diff --git a/unified-runtime/test/conformance/testing/include/uur/utils.h b/unified-runtime/test/conformance/testing/include/uur/utils.h index e6b5ef604ec04..179d8d583efe8 100644 --- a/unified-runtime/test/conformance/testing/include/uur/utils.h +++ b/unified-runtime/test/conformance/testing/include/uur/utils.h @@ -416,6 +416,8 @@ ur_result_t GetTimestampRecordingSupport(ur_device_handle_t device, bool &support); ur_result_t GetUSMContextMemcpyExpSupport(ur_device_handle_t device, bool &support); +ur_result_t GetPlatformTriple(ur_platform_handle_t platform, + std::string &Triple); ur_device_partition_property_t makePartitionByCountsDesc(uint32_t count); diff --git a/unified-runtime/test/conformance/testing/source/utils.cpp b/unified-runtime/test/conformance/testing/source/utils.cpp index 51fc6374f7a48..4ee601a2958c9 100644 --- a/unified-runtime/test/conformance/testing/source/utils.cpp +++ b/unified-runtime/test/conformance/testing/source/utils.cpp @@ -649,6 +649,53 @@ ur_result_t GetUSMContextMemcpyExpSupport(ur_device_handle_t device, device, UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP, support); } +ur_result_t GetPlatformTriple(ur_platform_handle_t platform, + std::string &triple) { + ur_backend_t backend; + if (auto Err = urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND, + sizeof(backend), &backend, nullptr)) { + return Err; + } + + switch (backend) { + case UR_BACKEND_OPENCL: + case UR_BACKEND_LEVEL_ZERO: + triple = "spir64"; + break; + case UR_BACKEND_CUDA: + triple = "nvptx64-nvidia-cuda"; + break; + case UR_BACKEND_HIP: + triple = "amdgcn-amd-amdhsa"; + break; + case UR_BACKEND_OFFLOAD: { + // All Offload platforms report this backend, use the platform name to select + // the actual underlying backend. + std::vector PlatformName; + size_t PlatformNameSize = 0; + urPlatformGetInfo(platform, UR_PLATFORM_INFO_NAME, 0, nullptr, + &PlatformNameSize); + PlatformName.resize(PlatformNameSize); + urPlatformGetInfo(platform, UR_PLATFORM_INFO_NAME, PlatformNameSize, + PlatformName.data(), nullptr); + if (strcmp(PlatformName.data(), "CUDA") == 0) { + triple = "nvptx64-nvidia-cuda"; + } else if (strcmp(PlatformName.data(), "AMDGPU") == 0) { + triple = "amdgcn-amd-amdhsa"; + } else { + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; + } + break; + } + case UR_BACKEND_NATIVE_CPU: + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + + return UR_RESULT_SUCCESS; +} + ur_device_partition_property_t makePartitionByCountsDesc(uint32_t count) { ur_device_partition_property_t desc; desc.type = UR_DEVICE_PARTITION_BY_COUNTS;