diff --git a/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td b/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td index 839db6cfc371..6c789d2640b1 100644 --- a/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td +++ b/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td @@ -98,6 +98,7 @@ def Aspectext_oneapi_clock_sub_group : Aspect<"ext_oneapi_clock_sub_group">; def Aspectext_oneapi_clock_work_group : Aspect<"ext_oneapi_clock_work_group">; def Aspectext_oneapi_clock_device : Aspect<"ext_oneapi_clock_device">; def Aspectext_oneapi_is_integrated_gpu : Aspect<"ext_oneapi_is_integrated_gpu">; +def Aspectext_oneapi_device_wait : Aspect<"ext_oneapi_device_wait">; // Deprecated aspects def AspectInt64_base_atomics : Aspect<"int64_base_atomics">; @@ -176,7 +177,8 @@ def : TargetInfo<"__TestAspectList", Aspectext_oneapi_clock_sub_group, Aspectext_oneapi_clock_work_group, Aspectext_oneapi_clock_device, - Aspectext_oneapi_is_integrated_gpu], + Aspectext_oneapi_is_integrated_gpu, + Aspectext_oneapi_device_wait], []>; // This definition serves the only purpose of testing whether the deprecated aspect list defined in here and in SYCL RT // match. diff --git a/sycl/include/sycl/device.hpp b/sycl/include/sycl/device.hpp index 8d2aa1b75b51..9a1c879669fa 100644 --- a/sycl/include/sycl/device.hpp +++ b/sycl/include/sycl/device.hpp @@ -365,6 +365,21 @@ class __SYCL_STANDALONE_DEBUG __SYCL_EXPORT device return profile.c_str(); } + /// Synchronizes with all queues associated with the device. + void ext_oneapi_wait(); + + /// Dispatches all unconsumed asynchronous exceptions for all queues or + /// contexts associated with the queues. + void ext_oneapi_throw_asynchronous(); + + /// Synchronizes with all queues associated with the device, then dispatches + /// all unconsumed asynchronous exceptions for all queues or contexts + /// associated with the queues. + void ext_oneapi_wait_and_throw() { + ext_oneapi_wait(); + ext_oneapi_throw_asynchronous(); + } + // TODO: Remove this diagnostics when __SYCL_WARN_IMAGE_ASPECT is removed. #if defined(__clang__) #pragma clang diagnostic pop diff --git a/sycl/include/sycl/exception_list.hpp b/sycl/include/sycl/exception_list.hpp index 2a0202fa020a..4e6140d8389c 100644 --- a/sycl/include/sycl/exception_list.hpp +++ b/sycl/include/sycl/exception_list.hpp @@ -24,6 +24,7 @@ inline namespace _V1 { // Forward declaration namespace detail { class queue_impl; +class device_impl; } /// A list of asynchronous exceptions. @@ -46,6 +47,7 @@ class __SYCL_EXPORT exception_list { private: friend class detail::queue_impl; + friend class detail::device_impl; void PushBack(const_reference Value); void PushBack(value_type &&Value); void Clear() noexcept; diff --git a/sycl/include/sycl/info/aspects.def b/sycl/include/sycl/info/aspects.def index 4c5cb0a3ec8b..0f99a2407e6d 100644 --- a/sycl/include/sycl/info/aspects.def +++ b/sycl/include/sycl/info/aspects.def @@ -84,3 +84,4 @@ __SYCL_ASPECT(ext_oneapi_clock_sub_group, 91) __SYCL_ASPECT(ext_oneapi_clock_work_group, 92) __SYCL_ASPECT(ext_oneapi_clock_device, 93) __SYCL_ASPECT(ext_oneapi_is_integrated_gpu, 94) +__SYCL_ASPECT(ext_oneapi_device_wait, 95) diff --git a/sycl/source/detail/device_impl.cpp b/sycl/source/detail/device_impl.cpp index 629aa72f04dd..b198569569f4 100644 --- a/sycl/source/detail/device_impl.cpp +++ b/sycl/source/detail/device_impl.cpp @@ -503,6 +503,34 @@ device_impl::getImmediateProgressGuarantee( return forward_progress_guarantee::weakly_parallel; } +void device_impl::wait() const { + // Firstly, all associated queues should be cleaned through of all + // not-yet-enqueued commands and host_task. + for (const std::weak_ptr &WQueue : MQueues) { + std::shared_ptr Queue = WQueue.lock(); + assert(Queue && "Queue should never be dangling in the list of queues " + "associated with the device!"); + Queue->waitForRuntimeLevelCmdsAndClear(); + } + + // Then we synchronize the entire device. + getAdapter().call(getHandleRef()); +} + +void device_impl::throwAsynchronous() { + std::lock_guard Lock(MAsyncExceptionsMutex); + for (auto &ExceptionsEntryIt : MAsyncExceptions) { + exception_list Exceptions = std::move(ExceptionsEntryIt.second); + std::shared_ptr Queue = ExceptionsEntryIt.first.lock(); + if (Queue && Queue->getAsynchHandler()) { + Queue->getAsynchHandler()(std::move(Exceptions)); + } else { + // If the queue is dead, use the default handler. + defaultAsyncHandler(std::move(Exceptions)); + } + } +} + #ifndef __INTEL_PREVIEW_BREAKING_CHANGES #define EXPORT_GET_INFO(PARAM) \ template <> \ diff --git a/sycl/source/detail/device_impl.hpp b/sycl/source/detail/device_impl.hpp index 13c5f5c16ce3..2daa2af48c73 100644 --- a/sycl/source/detail/device_impl.hpp +++ b/sycl/source/detail/device_impl.hpp @@ -1597,6 +1597,10 @@ class device_impl : public std::enable_shared_from_this { get_info_impl_nocheck().value_or( 0); } + CASE(ext_oneapi_device_wait) { + return get_info_impl_nocheck() + .value_or(0); + } else { return false; // This device aspect has not been implemented yet. } @@ -2267,6 +2271,47 @@ class device_impl : public std::enable_shared_from_this { return {}; } + /// Puts exception to the list of asynchronous ecxeptions. + /// + /// \param QueueWeakPtr is a weak pointer referring to the queue to report + /// the asynchronous exceptions for. + /// \param ExceptionPtr is a pointer to exception to be put. + void reportAsyncException(std::weak_ptr QueueWeakPtr, + const std::exception_ptr &ExceptionPtr) { + std::lock_guard Lock(MAsyncExceptionsMutex); + MAsyncExceptions[QueueWeakPtr].PushBack(ExceptionPtr); + } + + /// Extracts all unconsumed asynchronous exceptions for a given queue. + /// + /// \param QueueWeakPtr is a weak pointer referring to the queue to extract + /// unconsumed asynchronous exceptions for. + exception_list flushAsyncExceptions(std::weak_ptr QueueWeakPtr) { + std::lock_guard Lock(MAsyncExceptionsMutex); + auto ExceptionsEntryIt = MAsyncExceptions.find(QueueWeakPtr); + if (ExceptionsEntryIt == MAsyncExceptions.end()) + return exception_list{}; + exception_list Exceptions = std::move(ExceptionsEntryIt->second); + MAsyncExceptions.erase(ExceptionsEntryIt); + return Exceptions; + } + + /// Synchronizes with all queues on the device. + void wait() const; + + // Dispatch all unconsumed asynchronous exception to the appropriate handlers. + void throwAsynchronous(); + + void registerQueue(const std::weak_ptr &Q) { + std::lock_guard Lock(MQueuesMutex); + MQueues.insert(Q); + } + + void unregisterQueue(const std::weak_ptr &Q) { + std::lock_guard Lock(MQueuesMutex); + MQueues.erase(Q); + } + private: ur_device_handle_t MDevice = 0; // This is used for getAdapter so should be above other properties. @@ -2277,6 +2322,20 @@ class device_impl : public std::enable_shared_from_this { const ur_device_handle_t MRootDevice; + // Devices track a list of active queues on it, to allow for synchronization + // with host_task and not-yet-enqueued commands. + std::mutex MQueuesMutex; + std::set, + std::owner_less>> + MQueues; + + // Asynchronous exceptions are captured at device-level until flushed, either + // by queues, events or a synchronization on the device itself. + std::mutex MAsyncExceptionsMutex; + std::map, exception_list, + std::owner_less>> + MAsyncExceptions; + // Order of caches matters! UR must come before SYCL info descriptors (because // get_info calls get_info_impl but the opposite never happens) and both // should come before aspects. diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index b0c838cdd890..30950824e6bf 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -211,8 +211,9 @@ void event_impl::initHostProfilingInfo() { MHostProfilingInfo->setDevice(&Device); } -void event_impl::setSubmittedQueue(std::weak_ptr SubmittedQueue) { - MSubmittedQueue = std::move(SubmittedQueue); +void event_impl::setSubmittedQueue(queue_impl *SubmittedQueue) { + MSubmittedQueue = SubmittedQueue->weak_from_this(); + MSubmittedDevice = &SubmittedQueue->getDeviceImpl(); } #ifdef XPTI_ENABLE_INSTRUMENTATION @@ -308,8 +309,28 @@ void event_impl::wait(bool *Success) { void event_impl::wait_and_throw() { wait(); - if (std::shared_ptr SubmittedQueue = MSubmittedQueue.lock()) + if (std::shared_ptr SubmittedQueue = MSubmittedQueue.lock()) { SubmittedQueue->throw_asynchronous(); + return; + } + + // If the queue has died, we rely on finding its exceptions through the + // device. + if (MSubmittedDevice == nullptr) + return; + + // If MSubmittedQueue has died, get flush any exceptions associated with it + // still, then user either the context async_handler or the default + // async_handler. + exception_list Exceptions = + MSubmittedDevice->flushAsyncExceptions(MSubmittedQueue); + if (Exceptions.size() == 0) + return; + + if (MContext && MContext->get_async_handler()) + MContext->get_async_handler()(std::move(Exceptions)); + else + defaultAsyncHandler(std::move(Exceptions)); } void event_impl::checkProfilingPreconditions() const { diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index 0ca4aa6d49a9..05c7b87c89c0 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -264,10 +264,10 @@ class event_impl { MWorkerQueue = std::move(WorkerQueue); }; - /// Sets original queue used for submission. + /// Sets original queue and device used for submission. /// /// @return - void setSubmittedQueue(std::weak_ptr SubmittedQueue); + void setSubmittedQueue(queue_impl *SubmittedQueue); /// Indicates if this event is not associated with any command and doesn't /// have native handle. @@ -394,6 +394,7 @@ class event_impl { std::weak_ptr MWorkerQueue; std::weak_ptr MSubmittedQueue; + device_impl *MSubmittedDevice = nullptr; /// Dependency events prepared for waiting by backend. std::vector MPreparedDepsEvents; diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 2b18a4fb6e28..10e0fdfaae30 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -514,7 +514,7 @@ EventImplPtr queue_impl::submit_command_to_graph( std::unique_ptr CommandGroup, sycl::detail::CGType CGType, sycl::ext::oneapi::experimental::node_type UserFacingNodeType) { auto EventImpl = detail::event_impl::create_completed_host_event(); - EventImpl->setSubmittedQueue(weak_from_this()); + EventImpl->setSubmittedQueue(this); ext::oneapi::experimental::detail::node_impl *NodeImpl = nullptr; // GraphImpl is read and written in this scope so we lock this graph @@ -889,32 +889,7 @@ void queue_impl::wait(const detail::code_location &CodeLoc) { LastEvent->wait(); } } else if (!isInOrder()) { - std::vector> WeakEvents; - { - std::lock_guard Lock(MMutex); - WeakEvents.swap(MEventsWeak); - MMissedCleanupRequests.unset( - [&](MissedCleanupRequestsType &MissedCleanupRequests) { - for (auto &UpdatedGraph : MissedCleanupRequests) - doUnenqueuedCommandCleanup(UpdatedGraph); - MissedCleanupRequests.clear(); - }); - } - - // Wait for unenqueued or host task events, starting - // from the latest submitted task in order to minimize total amount of - // calls, then handle the rest with urQueueFinish. - for (auto EventImplWeakPtrIt = WeakEvents.rbegin(); - EventImplWeakPtrIt != WeakEvents.rend(); ++EventImplWeakPtrIt) { - if (std::shared_ptr EventImplSharedPtr = - EventImplWeakPtrIt->lock()) { - // A nullptr UR event indicates that urQueueFinish will not cover it, - // either because it's a host task event or an unenqueued one. - if (nullptr == EventImplSharedPtr->getHandle()) { - EventImplSharedPtr->wait(); - } - } - } + waitForRuntimeLevelCmdsAndClear(); } getAdapter().call(getHandleRef()); @@ -1127,6 +1102,47 @@ void queue_impl::verifyProps(const property_list &Props) const { CheckPropertiesWithData); } +void queue_impl::waitForRuntimeLevelCmdsAndClear() { + if (isInOrder() && !MNoLastEventMode.load(std::memory_order_relaxed)) { + // if MLastEvent is not null and has no associated handle, we need to wait + // for it. We do not clear it however. + EventImplPtr LastEvent; + { + std::lock_guard Lock(MMutex); + LastEvent = MDefaultGraphDeps.LastEventPtr; + } + if (LastEvent && nullptr == LastEvent->getHandle()) + LastEvent->wait(); + } else if (!isInOrder()) { + std::vector> WeakEvents; + { + std::lock_guard Lock(MMutex); + WeakEvents.swap(MEventsWeak); + MMissedCleanupRequests.unset( + [&](MissedCleanupRequestsType &MissedCleanupRequests) { + for (auto &UpdatedGraph : MissedCleanupRequests) + doUnenqueuedCommandCleanup(UpdatedGraph); + MissedCleanupRequests.clear(); + }); + } + + // Wait for unenqueued or host task events, starting + // from the latest submitted task in order to minimize total amount of + // calls, then handle the rest with urQueueFinish. + for (auto EventImplWeakPtrIt = WeakEvents.rbegin(); + EventImplWeakPtrIt != WeakEvents.rend(); ++EventImplWeakPtrIt) { + if (std::shared_ptr EventImplSharedPtr = + EventImplWeakPtrIt->lock()) { + // A nullptr UR event indicates that urQueueFinish will not cover it, + // either because it's a host task event or an unenqueued one. + if (nullptr == EventImplSharedPtr->getHandle()) { + EventImplSharedPtr->wait(); + } + } + } + } +} + } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 49da7aee8c44..ef68f581c492 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -241,8 +241,10 @@ class queue_impl : public std::enable_shared_from_this { // `std::shared_ptr` allocations. template static std::shared_ptr create(Ts &&...args) { - return std::make_shared(std::forward(args)..., - private_tag{}); + auto ImplPtr = + std::make_shared(std::forward(args)..., private_tag{}); + ImplPtr->getDeviceImpl().registerQueue(ImplPtr); + return ImplPtr; } ~queue_impl() { @@ -253,7 +255,7 @@ class queue_impl : public std::enable_shared_from_this { // notification and destroy the trace event for this queue. destructorNotification(); #endif - throw_asynchronous(); + MDevice.unregisterQueue(weak_from_this()); auto status = getAdapter().call_nocheck(MQueue); // If loader is already closed, it'll return a not-initialized status @@ -411,9 +413,6 @@ class queue_impl : public std::enable_shared_from_this { /// @param Loc is the code location of the submit call (default argument) void wait(const detail::code_location &Loc = {}); - /// \return list of asynchronous exceptions occurred during execution. - exception_list getExceptionList() const { return MExceptions; } - /// @param Loc is the code location of the submit call (default argument) void wait_and_throw(const detail::code_location &Loc = {}) { wait(Loc); @@ -426,21 +425,21 @@ class queue_impl : public std::enable_shared_from_this { /// Synchronous errors will be reported through SYCL exceptions. /// Asynchronous errors will be passed to the async_handler passed to the /// queue on construction. If no async_handler was provided then - /// asynchronous exceptions will be lost. + /// asynchronous exceptions will be passed to the async_handler associated + /// with the context if present, or the default async_handler otherwise. void throw_asynchronous() { - if (!MAsyncHandler) + exception_list Exceptions = + getDeviceImpl().flushAsyncExceptions(weak_from_this()); + if (Exceptions.size() == 0) return; - exception_list Exceptions; - { - std::lock_guard Lock(MMutex); - std::swap(Exceptions, MExceptions); - } - // Unlock the mutex before calling user-provided handler to avoid - // potential deadlock if the same queue is somehow referenced in the - // handler. - if (Exceptions.size()) + if (MAsyncHandler) MAsyncHandler(std::move(Exceptions)); + else if (const async_handler &CtxAsyncHandler = + getContextImpl().get_async_handler()) + CtxAsyncHandler(std::move(Exceptions)); + else + defaultAsyncHandler(std::move(Exceptions)); } /// Creates UR properties array. @@ -588,14 +587,6 @@ class queue_impl : public std::enable_shared_from_this { event mem_advise(const void *Ptr, size_t Length, ur_usm_advice_flags_t Advice, const std::vector &DepEvents, bool CallerNeedsEvent); - /// Puts exception to the list of asynchronous ecxeptions. - /// - /// \param ExceptionPtr is a pointer to exception to be put. - void reportAsyncException(const std::exception_ptr &ExceptionPtr) { - std::lock_guard Lock(MMutex); - MExceptions.PushBack(ExceptionPtr); - } - static ThreadPool &getThreadPool() { return GlobalHandler::instance().getHostTaskThreadPool(); } @@ -716,6 +707,17 @@ class queue_impl : public std::enable_shared_from_this { } #endif + /// Returns the async_handler associated with the queue. + const async_handler &getAsynchHandler() const noexcept { + return MAsyncHandler; + } + + /// Waits for all not-yet-enqueued and host_task commands in the queue and + /// clears the events associated with the queue (if out-of-order.) + /// Note: This should only be called if the queue is guaranteed to be + /// synchronized by the caller. + void waitForRuntimeLevelCmdsAndClear(); + protected: template EventImplPtr insertHelperBarrier(const HandlerType &Handler) { @@ -1003,10 +1005,6 @@ class queue_impl : public std::enable_shared_from_this { /// These events are tracked, but not owned, by the queue. std::vector> MEventsWeak; - /// Events without data dependencies (such as USM) need an owner, - /// additionally, USM operations are not added to the scheduler command graph, - /// queue is the only owner on the runtime side. - exception_list MExceptions; const async_handler MAsyncHandler; const property_list MPropList; diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index b38706b9d9d3..8de3a1784513 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -359,12 +359,14 @@ class DispatchHostTask { AdapterWithEvents.first->call(RawEvents.size(), RawEvents.data()); } catch (const sycl::exception &) { - MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException( - std::current_exception()); + auto QueuePtr = MThisCmd->MEvent->getSubmittedQueue(); + QueuePtr->getDeviceImpl().reportAsyncException( + QueuePtr, std::current_exception()); return false; } catch (...) { - MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException( - std::current_exception()); + auto QueuePtr = MThisCmd->MEvent->getSubmittedQueue(); + QueuePtr->getDeviceImpl().reportAsyncException( + QueuePtr, std::current_exception()); return false; } } @@ -407,7 +409,8 @@ class DispatchHostTask { make_error_code(errc::runtime), std::string("Couldn't wait for host-task's dependencies"))); - MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException(EPtr); + auto QueuePtr = MThisCmd->MEvent->getSubmittedQueue(); + QueuePtr->getDeviceImpl().reportAsyncException(QueuePtr, EPtr); // reset host-task's lambda and quit HostTask.MHostTask.reset(); Scheduler::getInstance().NotifyHostTaskCompletion(MThisCmd); @@ -469,8 +472,9 @@ class DispatchHostTask { } } #endif - MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException( - CurrentException); + auto QueuePtr = MThisCmd->MEvent->getSubmittedQueue(); + QueuePtr->getDeviceImpl().reportAsyncException(QueuePtr, + CurrentException); } HostTask.MHostTask.reset(); @@ -487,8 +491,9 @@ class DispatchHostTask { Scheduler::getInstance().NotifyHostTaskCompletion(MThisCmd); } catch (...) { auto CurrentException = std::current_exception(); - MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException( - CurrentException); + auto QueuePtr = MThisCmd->MEvent->getSubmittedQueue(); + QueuePtr->getDeviceImpl().reportAsyncException(QueuePtr, + CurrentException); } } }; @@ -563,7 +568,8 @@ Command::Command( MCommandBuffer(CommandBuffer), MSyncPointDeps(SyncPoints) { MWorkerQueue = MQueue; MEvent->setWorkerQueue(MWorkerQueue); - MEvent->setSubmittedQueue(MWorkerQueue); + if (Queue) + MEvent->setSubmittedQueue(Queue); MEvent->setCommand(this); if (MQueue) MEvent->setContextImpl(MQueue->getContextImpl()); @@ -1958,7 +1964,7 @@ ExecCGCommand::ExecCGCommand( assert(SubmitQueue && "Host task command group must have a valid submit queue"); - MEvent->setSubmittedQueue(SubmitQueue->weak_from_this()); + MEvent->setSubmittedQueue(SubmitQueue); // Initialize host profiling info if the queue has profiling enabled. if (SubmitQueue->MIsProfilingEnabled) MEvent->initHostProfilingInfo(); diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index c6a821cdf1e0..dedefd7a70b8 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -260,7 +260,8 @@ EventImplPtr Scheduler::addCopyBack(Requirement *Req) { auto WorkerQueue = NewCmd->getEvent()->getWorkerQueue(); assert(WorkerQueue && "WorkerQueue for CopyBack command must be not null"); - WorkerQueue->reportAsyncException(std::current_exception()); + WorkerQueue->getDeviceImpl().reportAsyncException( + WorkerQueue, std::current_exception()); } } EventImplPtr NewEvent = NewCmd->getEvent(); diff --git a/sycl/source/detail/ur_device_info_ret_types.inc b/sycl/source/detail/ur_device_info_ret_types.inc index 9c7a12379efd..98a3189ab1ec 100644 --- a/sycl/source/detail/ur_device_info_ret_types.inc +++ b/sycl/source/detail/ur_device_info_ret_types.inc @@ -197,4 +197,5 @@ MAP(UR_DEVICE_INFO_CLOCK_SUB_GROUP_SUPPORT_EXP, ur_bool_t) MAP(UR_DEVICE_INFO_CLOCK_WORK_GROUP_SUPPORT_EXP, ur_bool_t) MAP(UR_DEVICE_INFO_CLOCK_DEVICE_SUPPORT_EXP, ur_bool_t) MAP(UR_DEVICE_INFO_IS_INTEGRATED_GPU, ur_bool_t) +MAP(UR_DEVICE_INFO_DEVICE_WAIT_SUPPORT_EXP, ur_bool_t) // clang-format on diff --git a/sycl/source/device.cpp b/sycl/source/device.cpp index 6796060dda3d..353962363703 100644 --- a/sycl/source/device.cpp +++ b/sycl/source/device.cpp @@ -344,5 +344,15 @@ detail::string device::ext_oneapi_cl_profile_impl() const { return detail::string{profile}; } +void device::ext_oneapi_wait() { + if (!has(aspect::ext_oneapi_device_wait)) + throw sycl::exception( + make_error_code(errc::feature_not_supported), + "Device does not support aspect::ext_oneapi_device_wait."); + impl->wait(); +} + +void device::ext_oneapi_throw_asynchronous() { impl->throwAsynchronous(); } + } // namespace _V1 } // namespace sycl diff --git a/sycl/test-e2e/AsyncHandler/custom_async_handler.cpp b/sycl/test-e2e/AsyncHandler/custom_async_handler.cpp new file mode 100644 index 000000000000..113f67a54d3d --- /dev/null +++ b/sycl/test-e2e/AsyncHandler/custom_async_handler.cpp @@ -0,0 +1,217 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#include + +int main() { + int Failures = 0; + + // Case 1 - Event wait_and_throw with custom handler on queue. + { + bool Caught = false; + auto AHandler = [&](sycl::exception_list EL) { + Caught = true; + if (EL.size() != 1) { + std::cout << "Case 1 - Unexpected number of exceptions." << std::endl; + ++Failures; + } + }; + sycl::queue Q{sycl::default_selector_v, AHandler}; + Q.submit([&](sycl::handler &CGH) { + CGH.host_task([=]() { throw std::runtime_error("Case 1"); }); + }).wait_and_throw(); + if (!Caught) { + std::cout << "Case 1 - Async exception not caught by handler." + << std::endl; + ++Failures; + } + } + + // Case 2 - Event wait_and_throw with custom handler on context. + { + bool Caught = false; + auto AHandler = [&](sycl::exception_list EL) { + Caught = true; + if (EL.size() != 1) { + std::cout << "Case 2 - Unexpected number of exceptions." << std::endl; + ++Failures; + } + }; + sycl::context Ctx{AHandler}; + sycl::queue Q{Ctx, sycl::default_selector_v}; + Q.submit([&](sycl::handler &CGH) { + CGH.host_task([=]() { throw std::runtime_error("Case 2"); }); + }).wait_and_throw(); + if (!Caught) { + std::cout << "Case 2 - Async exception not caught by handler." + << std::endl; + ++Failures; + } + } + + // Case 3 - Event wait_and_throw with custom handler on both queue and + // context. + { + bool Caught = false; + auto CtxAHandler = [&](sycl::exception_list EL) { + std::cout << "Case 3 - Unexpected handler used." << std::endl; + ++Failures; + }; + auto QAHandler = [&](sycl::exception_list EL) { + Caught = true; + if (EL.size() != 1) { + std::cout << "Case 3 - Unexpected number of exceptions." << std::endl; + ++Failures; + } + }; + sycl::context Ctx{CtxAHandler}; + sycl::queue Q{Ctx, sycl::default_selector_v, QAHandler}; + Q.submit([&](sycl::handler &CGH) { + CGH.host_task([=]() { throw std::runtime_error("Case 3"); }); + }).wait_and_throw(); + if (!Caught) { + std::cout << "Case 3 - Async exception not caught by handler." + << std::endl; + ++Failures; + } + } + + // Case 4 - Queue wait_and_throw with custom handler on queue. + { + bool Caught = false; + auto AHandler = [&](sycl::exception_list EL) { + Caught = true; + if (EL.size() != 1) { + std::cout << "Case 1 - Unexpected number of exceptions." << std::endl; + ++Failures; + } + }; + sycl::queue Q{sycl::default_selector_v, AHandler}; + Q.submit([&](sycl::handler &CGH) { + CGH.host_task([=]() { throw std::runtime_error("Case 4"); }); + }); + Q.wait_and_throw(); + if (!Caught) { + std::cout << "Case 4 - Async exception not caught by handler." + << std::endl; + ++Failures; + } + } + + // Case 5 - Queue wait and throw_asynchronous with custom handler on queue. + { + bool Caught = false; + auto AHandler = [&](sycl::exception_list EL) { + Caught = true; + if (EL.size() != 1) { + std::cout << "Case 5 - Unexpected number of exceptions." << std::endl; + ++Failures; + } + }; + sycl::queue Q{sycl::default_selector_v, AHandler}; + Q.submit([&](sycl::handler &CGH) { + CGH.host_task([=]() { throw std::runtime_error("Case 5"); }); + }); + Q.wait(); + Q.throw_asynchronous(); + if (!Caught) { + std::cout << "Case 5 - Async exception not caught by handler." + << std::endl; + ++Failures; + } + } + + // Case 6 - Queue wait_and_throw with custom handler on context. + { + bool Caught = false; + auto AHandler = [&](sycl::exception_list EL) { + Caught = true; + if (EL.size() != 1) { + std::cout << "Case 6 - Unexpected number of exceptions." << std::endl; + ++Failures; + } + }; + sycl::context Ctx{AHandler}; + sycl::queue Q{Ctx, sycl::default_selector_v}; + Q.submit([&](sycl::handler &CGH) { + CGH.host_task([=]() { throw std::runtime_error("Case 6"); }); + }); + Q.wait_and_throw(); + if (!Caught) { + std::cout << "Case 6 - Async exception not caught by handler." + << std::endl; + ++Failures; + } + } + + // Case 7 - Queue wait and throw_asynchronous with custom handler on context. + { + bool Caught = false; + auto AHandler = [&](sycl::exception_list EL) { + Caught = true; + if (EL.size() != 1) { + std::cout << "Case 7 - Unexpected number of exceptions." << std::endl; + ++Failures; + } + }; + sycl::context Ctx{AHandler}; + sycl::queue Q{Ctx, sycl::default_selector_v}; + Q.submit([&](sycl::handler &CGH) { + CGH.host_task([=]() { throw std::runtime_error("Case 7"); }); + }); + Q.wait(); + Q.throw_asynchronous(); + if (!Caught) { + std::cout << "Case 7 - Async exception not caught by handler." + << std::endl; + ++Failures; + } + } + + // Case 8 - Queue wait_and_throw with custom handler on both queue and + // context. + { + bool Caught = false; + auto CtxAHandler = [&](sycl::exception_list EL) { + std::cout << "Case 8 - Unexpected handler used." << std::endl; + ++Failures; + }; + auto QAHandler = [&](sycl::exception_list EL) { + Caught = true; + if (EL.size() != 1) { + std::cout << "Case 8 - Unexpected number of exceptions." << std::endl; + ++Failures; + } + }; + sycl::context Ctx{CtxAHandler}; + sycl::queue Q{Ctx, sycl::default_selector_v, QAHandler}; + Q.submit([&](sycl::handler &CGH) { + CGH.host_task([=]() { throw std::runtime_error("Case 8"); }); + }); + Q.wait_and_throw(); + if (!Caught) { + std::cout << "Case 8 - Async exception not caught by handler." + << std::endl; + ++Failures; + } + } + + // Case 9 - Queue dying without having consumed its asynchronous exceptions. + { + auto CtxAHandler = [&](sycl::exception_list EL) { + std::cout << "Case 9 - Unexpected context handler used." << std::endl; + ++Failures; + }; + auto QAHandler = [&](sycl::exception_list EL) { + std::cout << "Case 9 - Unexpected queue handler used." << std::endl; + ++Failures; + }; + sycl::context Ctx{CtxAHandler}; + sycl::queue Q{Ctx, sycl::default_selector_v, QAHandler}; + Q.submit([&](sycl::handler &CGH) { + CGH.host_task([=]() { throw std::runtime_error("Case 9"); }); + }); + } + + return Failures; +} diff --git a/sycl/test-e2e/AsyncHandler/default_async_handler.cpp b/sycl/test-e2e/AsyncHandler/default_async_handler.cpp index 7e6a333e305d..de713859e31f 100644 --- a/sycl/test-e2e/AsyncHandler/default_async_handler.cpp +++ b/sycl/test-e2e/AsyncHandler/default_async_handler.cpp @@ -1,19 +1,57 @@ // RUN: %{build} -o %t.out -// RUN: %{run} %t.out &> %t.txt ; FileCheck %s --input-file %t.txt +// RUN: %{run} %t.out 0 &> %t_0.txt ; FileCheck %s --input-file %t_0.txt --check-prefix CHECK-0 +// RUN: %{run} %t.out 1 &> %t_1.txt ; FileCheck %s --input-file %t_1.txt --check-prefix CHECK-1 +// RUN: %{run} %t.out 2 &> %t_2.txt ; FileCheck %s --input-file %t_2.txt --check-prefix CHECK-2 +#include #include using namespace sycl; -int main() { +int main(int argc, char *argv[]) { + assert(argc == 2); + int TestRun = std::stoi(argv[1]); + assert(TestRun >= 0 && TestRun <= 3); + queue Q; - Q.submit([&](handler &CGH) { - CGH.host_task([=]() { - throw std::runtime_error("Exception thrown from host_task."); - }); - }).wait_and_throw(); + if (TestRun == 0) { + Q.submit([&](handler &CGH) { + CGH.host_task([=]() { + throw std::runtime_error("Exception thrown from host_task through " + "event::wait_and_throw()."); + }); + }).wait_and_throw(); + } else if (TestRun == 1) { + Q.submit([&](handler &CGH) { + CGH.host_task([=]() { + throw std::runtime_error( + "Exception thrown from host_task through queue::wait_and_throw()."); + }); + }); + Q.wait_and_throw(); + } else if (TestRun == 2) { + Q.submit([&](handler &CGH) { + CGH.host_task([=]() { + throw std::runtime_error( + "Exception thrown from host_task through queue::wait() and " + "queue::throw_asynchronous()."); + }); + }); + Q.wait(); + Q.throw_asynchronous(); + } return 0; } -// CHECK: Default async_handler caught exceptions: -// CHECK-NEXT: Exception thrown from host_task. +// CHECK-0: Default async_handler caught exceptions: +// CHECK-0-NEXT: Exception thrown from host_task through event::wait_and_throw(). + +// CHECK-1: Default async_handler caught exceptions: +// CHECK-1-NEXT: Exception thrown from host_task through queue::wait_and_throw(). + +// CHECK-2: Default async_handler caught exceptions: +// CHECK-2-NEXT: Exception thrown from host_task through queue::wait() and queue::throw_asynchronous(). + +// CHECK-3: Default async_handler caught exceptions: +// CHECK-3-NEXT: Exception thrown from host_task through event::wait_and_throw() after queue death. +// CHECK-3-NOT: Custom queue async handler was called! diff --git a/sycl/test-e2e/DeviceWait/basic.cpp b/sycl/test-e2e/DeviceWait/basic.cpp new file mode 100644 index 000000000000..2928e4cfab86 --- /dev/null +++ b/sycl/test-e2e/DeviceWait/basic.cpp @@ -0,0 +1,50 @@ +// REQUIRES: aspect-ext_oneapi_device_wait + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#include +#include + +#include +#include + +constexpr size_t NContexts = 2; +constexpr size_t NQueues = 6; + +int main() { + sycl::device D; + std::array Contexts{sycl::context{D}, + sycl::context{D}}; + std::array Queues{ + sycl::queue{Contexts[0], D}, + sycl::queue{Contexts[0], D, sycl::property::queue::in_order()}, + sycl::queue{Contexts[0], D}, + sycl::queue{Contexts[1], D, sycl::property::queue::in_order()}, + sycl::queue{Contexts[1], D}, + sycl::queue{Contexts[1], D, sycl::property::queue::in_order()}}; + + std::vector Events; + Events.reserve(NQueues); + for (sycl::queue &Q : Queues) { + sycl::event E = Q.single_task([]() { + volatile int value = 1024 * 1024; + while (--value) + ; + }); + Events.push_back(std::move(E)); + } + + D.ext_oneapi_wait(); + + int Failed = 0; + for (size_t I = 0; I < Events.size(); ++I) { + sycl::info::event_command_status EventStatus = + Events[I].get_info(); + if (EventStatus != sycl::info::event_command_status::complete) { + std::cout << "Unexpected event status for event at " << I << std::endl; + ++Failed; + } + } + return Failed; +} diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index f9a2adb393f4..60f522384168 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3389,12 +3389,14 @@ _ZN4sycl3_V16detail9modf_implENS1_9half_impl4halfEPS3_ _ZN4sycl3_V16detail9modf_implEdPd _ZN4sycl3_V16detail9modf_implEfPf _ZN4sycl3_V16device11get_devicesENS0_4info11device_typeE +_ZN4sycl3_V16device15ext_oneapi_waitEv _ZN4sycl3_V16device20ext_oneapi_can_buildENS0_3ext6oneapi12experimental15source_languageE _ZN4sycl3_V16device22ext_oneapi_can_compileENS0_3ext6oneapi12experimental15source_languageE _ZN4sycl3_V16device26ext_oneapi_architecture_isENS0_3ext6oneapi12experimental12architectureE _ZN4sycl3_V16device26ext_oneapi_architecture_isENS0_3ext6oneapi12experimental13arch_categoryE _ZN4sycl3_V16device26ext_oneapi_can_access_peerERKS1_NS0_3ext6oneapi11peer_accessE _ZN4sycl3_V16device29ext_oneapi_enable_peer_accessERKS1_ +_ZN4sycl3_V16device29ext_oneapi_throw_asynchronousEv _ZN4sycl3_V16device30ext_oneapi_disable_peer_accessERKS1_ _ZN4sycl3_V16device32ext_oneapi_supports_cl_c_featureENS0_6detail11string_viewE _ZN4sycl3_V16deviceC1EP13_cl_device_id diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index 7506f4fec3f2..7e2156eba7c4 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -4022,6 +4022,9 @@ ?ext_oneapi_supports_cl_c_version@device@_V1@sycl@@QEBA_NAEBUcl_version@experimental@oneapi@ext@23@@Z ?ext_oneapi_supports_cl_extension@device@_V1@sycl@@AEBA_NVstring_view@detail@23@PEAUcl_version@experimental@oneapi@ext@23@@Z ?ext_oneapi_supports_cl_extension@device@_V1@sycl@@QEBA_NAEBV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@PEAUcl_version@experimental@oneapi@ext@23@@Z +?ext_oneapi_throw_asynchronous@device@_V1@sycl@@QEAAXXZ +?ext_oneapi_wait_and_throw@device@_V1@sycl@@QEAAXXZ +?ext_oneapi_wait@device@_V1@sycl@@QEAAXXZ ?ext_oneapi_wait_external_semaphore@handler@_V1@sycl@@QEAAXUexternal_semaphore@experimental@oneapi@ext@23@@Z ?ext_oneapi_wait_external_semaphore@handler@_V1@sycl@@QEAAXUexternal_semaphore@experimental@oneapi@ext@23@_K@Z ?ext_oneapi_wait_external_semaphore@queue@_V1@sycl@@QEAA?AVevent@23@Uexternal_semaphore@experimental@oneapi@ext@23@AEBUcode_location@detail@23@@Z diff --git a/unified-runtime/cmake/FetchLevelZero.cmake b/unified-runtime/cmake/FetchLevelZero.cmake index 7997e99c6964..5ea2db0d9960 100644 --- a/unified-runtime/cmake/FetchLevelZero.cmake +++ b/unified-runtime/cmake/FetchLevelZero.cmake @@ -50,7 +50,7 @@ if(NOT LEVEL_ZERO_LIB_NAME AND NOT LEVEL_ZERO_LIBRARY) set(UR_LEVEL_ZERO_LOADER_REPO "https://github.com/oneapi-src/level-zero.git") # Remember to update the pkg_check_modules minimum version above when updating the # clone tag - set(UR_LEVEL_ZERO_LOADER_TAG v1.24.3) + set(UR_LEVEL_ZERO_LOADER_TAG master) # Disable due to a bug https://github.com/oneapi-src/level-zero/issues/104 set(CMAKE_INCLUDE_CURRENT_DIR OFF) diff --git a/unified-runtime/include/ur_api.h b/unified-runtime/include/ur_api.h index 057c89cc9139..4ad894880db0 100644 --- a/unified-runtime/include/ur_api.h +++ b/unified-runtime/include/ur_api.h @@ -483,6 +483,8 @@ typedef enum ur_function_t { UR_FUNCTION_IPC_OPEN_MEM_HANDLE_EXP = 291, /// Enumerator for ::urIPCCloseMemHandleExp UR_FUNCTION_IPC_CLOSE_MEM_HANDLE_EXP = 292, + /// Enumerator for ::urDeviceWaitExp + UR_FUNCTION_DEVICE_WAIT_EXP = 293, /// @cond UR_FUNCTION_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -2457,6 +2459,9 @@ typedef enum ur_device_info_t { /// [::ur_bool_t] Returns true if the device supports the multi device /// compile experimental feature. UR_DEVICE_INFO_MULTI_DEVICE_COMPILE_SUPPORT_EXP = 0x6000, + /// [::ur_bool_t] Returns true if the device supports the device-wide + /// synchronization experimental feature. + UR_DEVICE_INFO_DEVICE_WAIT_SUPPORT_EXP = 0x6002, /// [::ur_bool_t] returns true if the device supports /// ::urUSMContextMemcpyExp UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP = 0x7000, @@ -12355,6 +12360,30 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferGetNativeHandleExp( /// [out] A pointer to the native handle of the command-buffer. ur_native_handle_t *phNativeCommandBuffer); +#if !defined(__GNUC__) +#pragma endregion +#endif +// Intel 'oneAPI' Unified Runtime Experimental APIs for device-wide +// synchronization +#if !defined(__GNUC__) +#pragma region device_wait_(experimental) +#endif +/////////////////////////////////////////////////////////////////////////////// +/// @brief Synchronizes with all queues on the device. +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hDevice` +/// + `hDevice == nullptr` +/// - ::UR_RESULT_ERROR_INVALID_DEVICE +UR_APIEXPORT ur_result_t UR_APICALL urDeviceWaitExp( + /// [in] handle of the device instance. + ur_device_handle_t hDevice); + #if !defined(__GNUC__) #pragma endregion #endif @@ -15895,6 +15924,14 @@ typedef struct ur_device_get_global_timestamps_params_t { uint64_t **ppHostTimestamp; } ur_device_get_global_timestamps_params_t; +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function parameters for urDeviceWaitExp +/// @details Each entry is a pointer to the parameter passed to the function; +/// allowing the callback the ability to modify the parameter's value +typedef struct ur_device_wait_exp_params_t { + ur_device_handle_t *phDevice; +} ur_device_wait_exp_params_t; + #if !defined(__GNUC__) #pragma endregion #endif diff --git a/unified-runtime/include/ur_api_funcs.def b/unified-runtime/include/ur_api_funcs.def index ab189345ce9d..d4c071530b2c 100644 --- a/unified-runtime/include/ur_api_funcs.def +++ b/unified-runtime/include/ur_api_funcs.def @@ -232,6 +232,7 @@ _UR_API(urDeviceSelectBinary) _UR_API(urDeviceGetNativeHandle) _UR_API(urDeviceCreateWithNativeHandle) _UR_API(urDeviceGetGlobalTimestamps) +_UR_API(urDeviceWaitExp) _UR_API(urLoaderConfigCreate) _UR_API(urLoaderConfigEnableLayer) _UR_API(urLoaderConfigGetInfo) diff --git a/unified-runtime/include/ur_ddi.h b/unified-runtime/include/ur_ddi.h index 39cd96274f1d..8b5664149a42 100644 --- a/unified-runtime/include/ur_ddi.h +++ b/unified-runtime/include/ur_ddi.h @@ -2079,6 +2079,36 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetDeviceProcAddrTable( typedef ur_result_t(UR_APICALL *ur_pfnGetDeviceProcAddrTable_t)( ur_api_version_t, ur_device_dditable_t *); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function-pointer for urDeviceWaitExp +typedef ur_result_t(UR_APICALL *ur_pfnDeviceWaitExp_t)(ur_device_handle_t); + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Table of DeviceExp functions pointers +typedef struct ur_device_exp_dditable_t { + ur_pfnDeviceWaitExp_t pfnWaitExp; +} ur_device_exp_dditable_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Exported function for filling application's DeviceExp table +/// with current process' addresses +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// - ::UR_RESULT_ERROR_UNSUPPORTED_VERSION +UR_DLLEXPORT ur_result_t UR_APICALL urGetDeviceExpProcAddrTable( + /// [in] API version requested + ur_api_version_t version, + /// [in,out] pointer to table of DDI function pointers + ur_device_exp_dditable_t *pDdiTable); + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function-pointer for urGetDeviceExpProcAddrTable +typedef ur_result_t(UR_APICALL *ur_pfnGetDeviceExpProcAddrTable_t)( + ur_api_version_t, ur_device_exp_dditable_t *); + /////////////////////////////////////////////////////////////////////////////// /// @brief Container for all DDI tables typedef struct ur_dditable_t { @@ -2104,6 +2134,7 @@ typedef struct ur_dditable_t { ur_usm_p2p_exp_dditable_t UsmP2PExp; ur_virtual_mem_dditable_t VirtualMem; ur_device_dditable_t Device; + ur_device_exp_dditable_t DeviceExp; } ur_dditable_t; #if defined(__cplusplus) diff --git a/unified-runtime/include/ur_print.h b/unified-runtime/include/ur_print.h index 8f734596419d..f14d6f02329c 100644 --- a/unified-runtime/include/ur_print.h +++ b/unified-runtime/include/ur_print.h @@ -3793,6 +3793,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintDeviceGetGlobalTimestampsParams( const struct ur_device_get_global_timestamps_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_device_wait_exp_params_t struct +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintDeviceWaitExpParams( + const struct ur_device_wait_exp_params_t *params, char *buffer, + const size_t buff_size, size_t *out_size); + /////////////////////////////////////////////////////////////////////////////// /// @brief Print function parameters /// @returns diff --git a/unified-runtime/include/ur_print.hpp b/unified-runtime/include/ur_print.hpp index c0e8e3628fe8..0dd7abde00ff 100644 --- a/unified-runtime/include/ur_print.hpp +++ b/unified-runtime/include/ur_print.hpp @@ -1288,6 +1288,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) { case UR_FUNCTION_IPC_CLOSE_MEM_HANDLE_EXP: os << "UR_FUNCTION_IPC_CLOSE_MEM_HANDLE_EXP"; break; + case UR_FUNCTION_DEVICE_WAIT_EXP: + os << "UR_FUNCTION_DEVICE_WAIT_EXP"; + break; default: os << "unknown enumerator"; break; @@ -3154,6 +3157,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_device_info_t value) { case UR_DEVICE_INFO_MULTI_DEVICE_COMPILE_SUPPORT_EXP: os << "UR_DEVICE_INFO_MULTI_DEVICE_COMPILE_SUPPORT_EXP"; break; + case UR_DEVICE_INFO_DEVICE_WAIT_SUPPORT_EXP: + os << "UR_DEVICE_INFO_DEVICE_WAIT_SUPPORT_EXP"; + break; case UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP: os << "UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP"; break; @@ -5377,6 +5383,19 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, os << ")"; } break; + case UR_DEVICE_INFO_DEVICE_WAIT_SUPPORT_EXP: { + const ur_bool_t *tptr = (const ur_bool_t *)ptr; + if (sizeof(ur_bool_t) > size) { + os << "invalid size (is: " << size + << ", expected: >=" << sizeof(ur_bool_t) << ")"; + return UR_RESULT_ERROR_INVALID_SIZE; + } + os << (const void *)(tptr) << " ("; + + os << *tptr; + + os << ")"; + } break; case UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP: { const ur_bool_t *tptr = (const ur_bool_t *)ptr; if (sizeof(ur_bool_t) > size) { @@ -21263,6 +21282,21 @@ inline std::ostream &operator<<( return os; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_device_wait_exp_params_t type +/// @returns +/// std::ostream & +inline std::ostream & +operator<<(std::ostream &os, + [[maybe_unused]] const struct ur_device_wait_exp_params_t *params) { + + os << ".hDevice = "; + + ur::details::printPtr(os, *(params->phDevice)); + + return os; +} + inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const ur_bool_t value) { os << (value ? "true" : "false"); @@ -22042,6 +22076,9 @@ inline ur_result_t UR_APICALL printFunctionParams(std::ostream &os, case UR_FUNCTION_DEVICE_GET_GLOBAL_TIMESTAMPS: { os << (const struct ur_device_get_global_timestamps_params_t *)params; } break; + case UR_FUNCTION_DEVICE_WAIT_EXP: { + os << (const struct ur_device_wait_exp_params_t *)params; + } break; default: return UR_RESULT_ERROR_INVALID_ENUMERATION; } diff --git a/unified-runtime/scripts/core/EXP-DEVICE-WAIT.rst b/unified-runtime/scripts/core/EXP-DEVICE-WAIT.rst new file mode 100644 index 000000000000..6425d248a47f --- /dev/null +++ b/unified-runtime/scripts/core/EXP-DEVICE-WAIT.rst @@ -0,0 +1,64 @@ +<% + OneApi=tags['$OneApi'] + x=tags['$x'] + X=x.upper() +%> + +.. _experimental-device-wait: + +================================================================================ +Device Wait +================================================================================ + +.. warning:: + + Experimental features: + + * May be replaced, updated, or removed at any time. + * Do not require maintaining API/ABI stability of their own additions over + time. + * Do not require conformance testing of their own additions. + + + +Motivation +-------------------------------------------------------------------------------- + +This extension adds the ability to do device-wide synchronization, instead of at +queue or event level. + +API +-------------------------------------------------------------------------------- + +Enums +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +* ${x}_device_info_t + * ${X}_DEVICE_INFO_DEVICE_WAIT_SUPPORT_EXP + +Functions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +* ${x}DeviceWaitExp + +Changelog +-------------------------------------------------------------------------------- + ++-----------+---------------------------------------------+ +| Revision | Changes | ++===========+=============================================+ +| 1.0 | Initial Draft | ++-----------+---------------------------------------------+ + +Support +-------------------------------------------------------------------------------- + +Adapters which support this experimental feature *must* return ``true`` when +queried for ${X}_DEVICE_INFO_DEVICE_WAIT_SUPPORT_EXP via +${x}DeviceGetInfo. Conversely, before using any of the functionality defined +in this experimental feature the user *must* use the device query to determine +if the adapter supports this feature. + +Contributors +-------------------------------------------------------------------------------- + +* Steffen Larsen `steffen.larsen@intel.com `_ \ No newline at end of file diff --git a/unified-runtime/scripts/core/exp-device-wait.yml b/unified-runtime/scripts/core/exp-device-wait.yml new file mode 100644 index 000000000000..a4753c4af130 --- /dev/null +++ b/unified-runtime/scripts/core/exp-device-wait.yml @@ -0,0 +1,38 @@ +# +# Copyright (C) 2023 Intel Corporation +# +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# See YaML.md for syntax definition +# +--- #-------------------------------------------------------------------------- +type: header +desc: "Intel $OneApi Unified Runtime Experimental APIs for device-wide synchronization" +ordinal: "99" +--- #-------------------------------------------------------------------------- +type: enum +extend: true +typed_etors: true +desc: "Extension enums for $x_device_info_t to support device-wide synchronization." +name: $x_device_info_t +etors: + - name: DEVICE_WAIT_SUPPORT_EXP + value: "0x6002" + desc: "[$x_bool_t] Returns true if the device supports the device-wide synchronization experimental feature." +--- #-------------------------------------------------------------------------- +type: function +desc: "Synchronizes with all queues on the device." +class: $xDevice +name: WaitExp +decl: static +ordinal: "1" +params: + - type: $x_device_handle_t + name: hDevice + desc: "[in] handle of the device instance." +returns: + - $X_RESULT_ERROR_INVALID_DEVICE + - $X_RESULT_ERROR_INVALID_NULL_HANDLE: + - "`hDevice == nullptr`" \ No newline at end of file diff --git a/unified-runtime/scripts/core/registry.yml b/unified-runtime/scripts/core/registry.yml index b37d399ef332..9134cbf5e609 100644 --- a/unified-runtime/scripts/core/registry.yml +++ b/unified-runtime/scripts/core/registry.yml @@ -682,7 +682,10 @@ etors: - name: IPC_CLOSE_MEM_HANDLE_EXP desc: Enumerator for $xIPCCloseMemHandleExp value: '292' -max_id: '292' +- name: DEVICE_WAIT_EXP + desc: Enumerator for $xDeviceWaitExp + value: '293' +max_id: '293' --- type: enum desc: Defines structure types diff --git a/unified-runtime/source/adapters/adapter.def.in b/unified-runtime/source/adapters/adapter.def.in index aa3b55765ce3..3427b250394a 100644 --- a/unified-runtime/source/adapters/adapter.def.in +++ b/unified-runtime/source/adapters/adapter.def.in @@ -22,3 +22,4 @@ EXPORTS urGetUsmP2PExpProcAddrTable urGetVirtualMemProcAddrTable urGetDeviceProcAddrTable + urGetDeviceExpProcAddrTable diff --git a/unified-runtime/source/adapters/adapter.map.in b/unified-runtime/source/adapters/adapter.map.in index 4be9b9b1348b..9015676619cd 100644 --- a/unified-runtime/source/adapters/adapter.map.in +++ b/unified-runtime/source/adapters/adapter.map.in @@ -22,6 +22,7 @@ urGetUsmP2PExpProcAddrTable; urGetVirtualMemProcAddrTable; urGetDeviceProcAddrTable; + urGetDeviceExpProcAddrTable; local: *; }; diff --git a/unified-runtime/source/adapters/cuda/device.cpp b/unified-runtime/source/adapters/cuda/device.cpp index ddc4c7a115e2..fed6666621e2 100644 --- a/unified-runtime/source/adapters/cuda/device.cpp +++ b/unified-runtime/source/adapters/cuda/device.cpp @@ -1178,6 +1178,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, return ReturnValue(true); case UR_DEVICE_INFO_MULTI_DEVICE_COMPILE_SUPPORT_EXP: return ReturnValue(false); + case UR_DEVICE_INFO_DEVICE_WAIT_SUPPORT_EXP: + return ReturnValue(true); case UR_DEVICE_INFO_ASYNC_USM_ALLOCATIONS_SUPPORT_EXP: return ReturnValue(true); case UR_DEVICE_INFO_KERNEL_LAUNCH_CAPABILITIES: { @@ -1402,3 +1404,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceSelectBinary( // No image can be loaded for the given device return UR_RESULT_ERROR_INVALID_BINARY; } + +/// Synchronizes with all queues on the device. +UR_APIEXPORT ur_result_t UR_APICALL +urDeviceWaitExp(ur_device_handle_t hDevice) { + ScopedContext Active(hDevice); + UR_CHECK_ERROR(cuCtxSynchronize()); + return UR_RESULT_SUCCESS; +} diff --git a/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp b/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp index a53266ecbe20..0974f93315c4 100644 --- a/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp @@ -270,6 +270,16 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetDeviceProcAddrTable( return UR_RESULT_SUCCESS; } +UR_DLLEXPORT ur_result_t UR_APICALL urGetDeviceExpProcAddrTable( + ur_api_version_t version, ur_device_exp_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnWaitExp = urDeviceWaitExp; + return UR_RESULT_SUCCESS; +} + UR_DLLEXPORT ur_result_t UR_APICALL urGetCommandBufferExpProcAddrTable( /// [in] API version requested ur_api_version_t version, @@ -510,6 +520,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urAllAddrTable(ur_api_version_t version, urGetUsmP2PExpProcAddrTable(version, &pDdiTable->UsmP2PExp); urGetVirtualMemProcAddrTable(version, &pDdiTable->VirtualMem); urGetDeviceProcAddrTable(version, &pDdiTable->Device); + urGetDeviceExpProcAddrTable(version, &pDdiTable->DeviceExp); urGetMemoryExportExpProcAddrTable(version, &pDdiTable->MemoryExportExp); return UR_RESULT_SUCCESS; diff --git a/unified-runtime/source/adapters/hip/device.cpp b/unified-runtime/source/adapters/hip/device.cpp index f71d86c3f4a5..cd1672770aa2 100644 --- a/unified-runtime/source/adapters/hip/device.cpp +++ b/unified-runtime/source/adapters/hip/device.cpp @@ -1043,6 +1043,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, return ReturnValue(true); case UR_DEVICE_INFO_MULTI_DEVICE_COMPILE_SUPPORT_EXP: return ReturnValue(false); + case UR_DEVICE_INFO_DEVICE_WAIT_SUPPORT_EXP: + return ReturnValue(false); case UR_DEVICE_INFO_KERNEL_LAUNCH_CAPABILITIES: return ReturnValue(0); case UR_DEVICE_INFO_MEMORY_EXPORT_EXPORTABLE_DEVICE_MEM_EXP: @@ -1241,3 +1243,7 @@ ur_result_t UR_APICALL urDeviceGetGlobalTimestamps(ur_device_handle_t hDevice, } return UR_RESULT_SUCCESS; } + +ur_result_t UR_APICALL urDeviceWaitExp(ur_device_handle_t) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/unified-runtime/source/adapters/hip/ur_interface_loader.cpp b/unified-runtime/source/adapters/hip/ur_interface_loader.cpp index 978570a96376..c56914e46101 100644 --- a/unified-runtime/source/adapters/hip/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/hip/ur_interface_loader.cpp @@ -270,6 +270,16 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetDeviceProcAddrTable( return UR_RESULT_SUCCESS; } +UR_DLLEXPORT ur_result_t UR_APICALL urGetDeviceExpProcAddrTable( + ur_api_version_t version, ur_device_exp_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnWaitExp = urDeviceWaitExp; + return UR_RESULT_SUCCESS; +} + UR_DLLEXPORT ur_result_t UR_APICALL urGetCommandBufferExpProcAddrTable( ur_api_version_t version, ur_command_buffer_exp_dditable_t *pDdiTable) { auto retVal = validateProcInputs(version, pDdiTable); @@ -503,6 +513,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urAllAddrTable(ur_api_version_t version, urGetUsmP2PExpProcAddrTable(version, &pDdiTable->UsmP2PExp); urGetVirtualMemProcAddrTable(version, &pDdiTable->VirtualMem); urGetDeviceProcAddrTable(version, &pDdiTable->Device); + urGetDeviceExpProcAddrTable(version, &pDdiTable->DeviceExp); urGetMemoryExportExpProcAddrTable(version, &pDdiTable->MemoryExportExp); return UR_RESULT_SUCCESS; diff --git a/unified-runtime/source/adapters/level_zero/device.cpp b/unified-runtime/source/adapters/level_zero/device.cpp index bb168927ca56..d4b4d8a98f0c 100644 --- a/unified-runtime/source/adapters/level_zero/device.cpp +++ b/unified-runtime/source/adapters/level_zero/device.cpp @@ -1320,6 +1320,8 @@ ur_result_t urDeviceGetInfo( return ReturnValue(true); case UR_DEVICE_INFO_MULTI_DEVICE_COMPILE_SUPPORT_EXP: return ReturnValue(true); + case UR_DEVICE_INFO_DEVICE_WAIT_SUPPORT_EXP: + return ReturnValue(true); case UR_DEVICE_INFO_ASYNC_USM_ALLOCATIONS_SUPPORT_EXP: return ReturnValue(true); case UR_DEVICE_INFO_CURRENT_CLOCK_THROTTLE_REASONS: { @@ -1787,6 +1789,11 @@ ur_result_t urDeviceRelease(ur_device_handle_t Device) { return UR_RESULT_SUCCESS; } + +ur_result_t urDeviceWaitExp(ur_device_handle_t Device) { + ZE2UR_CALL(zeDeviceSynchronize, (Device->ZeDevice)); + return UR_RESULT_SUCCESS; +} } // namespace ur::level_zero /** diff --git a/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp b/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp index 269a21f6c56e..ada7e5fe89aa 100644 --- a/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp @@ -562,6 +562,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urGetDeviceProcAddrTable( return result; } +UR_APIEXPORT ur_result_t UR_APICALL urGetDeviceExpProcAddrTable( + ur_api_version_t version, ur_device_exp_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + + pDdiTable->pfnWaitExp = ur::level_zero::urDeviceWaitExp; + + return result; +} + #ifdef UR_STATIC_ADAPTER_LEVEL_ZERO } // namespace ur::level_zero #else @@ -668,6 +680,10 @@ ur_result_t populateDdiTable(ur_dditable_t *ddi) { &ddi->Device); if (result != UR_RESULT_SUCCESS) return result; + result = NAMESPACE_::urGetDeviceExpProcAddrTable(UR_API_VERSION_CURRENT, + &ddi->DeviceExp); + if (result != UR_RESULT_SUCCESS) + return result; #undef NAMESPACE_ diff --git a/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp b/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp index 8fee48da9d69..85545ed58917 100644 --- a/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp +++ b/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp @@ -767,6 +767,7 @@ urCommandBufferGetInfoExp(ur_exp_command_buffer_handle_t hCommandBuffer, ur_result_t urCommandBufferGetNativeHandleExp(ur_exp_command_buffer_handle_t hCommandBuffer, ur_native_handle_t *phNativeCommandBuffer); +ur_result_t urDeviceWaitExp(ur_device_handle_t hDevice); ur_result_t urEnqueueTimestampRecordingExp( ur_queue_handle_t hQueue, bool blocking, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); diff --git a/unified-runtime/source/adapters/mock/ur_mockddi.cpp b/unified-runtime/source/adapters/mock/ur_mockddi.cpp index b0f045adf96e..569570bf8911 100644 --- a/unified-runtime/source/adapters/mock/ur_mockddi.cpp +++ b/unified-runtime/source/adapters/mock/ur_mockddi.cpp @@ -11239,6 +11239,48 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferGetNativeHandleExp( return exceptionToResult(std::current_exception()); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urDeviceWaitExp +__urdlllocal ur_result_t UR_APICALL urDeviceWaitExp( + /// [in] handle of the device instance. + ur_device_handle_t hDevice) try { + ur_result_t result = UR_RESULT_SUCCESS; + + ur_device_wait_exp_params_t params = {&hDevice}; + + auto beforeCallback = reinterpret_cast( + mock::getCallbacks().get_before_callback("urDeviceWaitExp")); + if (beforeCallback) { + result = beforeCallback(¶ms); + if (result != UR_RESULT_SUCCESS) { + return result; + } + } + + auto replaceCallback = reinterpret_cast( + mock::getCallbacks().get_replace_callback("urDeviceWaitExp")); + if (replaceCallback) { + result = replaceCallback(¶ms); + } else { + + result = UR_RESULT_SUCCESS; + } + + if (result != UR_RESULT_SUCCESS) { + return result; + } + + auto afterCallback = reinterpret_cast( + mock::getCallbacks().get_after_callback("urDeviceWaitExp")); + if (afterCallback) { + return afterCallback(¶ms); + } + + return result; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urEnqueueTimestampRecordingExp __urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( @@ -13307,4 +13349,32 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetDeviceProcAddrTable( } catch (...) { return exceptionToResult(std::current_exception()); } + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Exported function for filling application's DeviceExp table +/// with current process' addresses +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// - ::UR_RESULT_ERROR_UNSUPPORTED_VERSION +UR_DLLEXPORT ur_result_t UR_APICALL urGetDeviceExpProcAddrTable( + /// [in] API version requested + ur_api_version_t version, + /// [in,out] pointer to table of DDI function pointers + ur_device_exp_dditable_t *pDdiTable) try { + if (nullptr == pDdiTable) + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + + if (driver::d_context.version < version) + return UR_RESULT_ERROR_UNSUPPORTED_VERSION; + + ur_result_t result = UR_RESULT_SUCCESS; + + pDdiTable->pfnWaitExp = driver::urDeviceWaitExp; + + return result; +} catch (...) { + return exceptionToResult(std::current_exception()); +} } diff --git a/unified-runtime/source/adapters/native_cpu/device.cpp b/unified-runtime/source/adapters/native_cpu/device.cpp index 4fed1565f8cf..8d0e43a50eee 100644 --- a/unified-runtime/source/adapters/native_cpu/device.cpp +++ b/unified-runtime/source/adapters/native_cpu/device.cpp @@ -448,6 +448,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_MULTI_DEVICE_COMPILE_SUPPORT_EXP: return ReturnValue(true); + case UR_DEVICE_INFO_DEVICE_WAIT_SUPPORT_EXP: + return ReturnValue(false); + case UR_DEVICE_INFO_GLOBAL_VARIABLE_SUPPORT: return ReturnValue(false); @@ -531,5 +534,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceSelectBinary( return UR_RESULT_ERROR_INVALID_BINARY; } +UR_APIEXPORT ur_result_t UR_APICALL urDeviceWaitExp(ur_device_handle_t) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + ur_device_handle_t_::ur_device_handle_t_(ur_platform_handle_t ArgPlt) : mem_size(os_memory_bounded_size()), Platform(ArgPlt) {} diff --git a/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp b/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp index 492f005c866d..6510e26e3d2d 100644 --- a/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp @@ -270,6 +270,16 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetDeviceProcAddrTable( return UR_RESULT_SUCCESS; } +UR_DLLEXPORT ur_result_t UR_APICALL urGetDeviceExpProcAddrTable( + ur_api_version_t version, ur_device_exp_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnWaitExp = urDeviceWaitExp; + return UR_RESULT_SUCCESS; +} + UR_DLLEXPORT ur_result_t UR_APICALL urGetCommandBufferExpProcAddrTable( ur_api_version_t version, ur_command_buffer_exp_dditable_t *pDdiTable) { auto retVal = validateProcInputs(version, pDdiTable); @@ -487,6 +497,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urAllAddrTable(ur_api_version_t version, urGetUsmP2PExpProcAddrTable(version, &pDdiTable->UsmP2PExp); urGetVirtualMemProcAddrTable(version, &pDdiTable->VirtualMem); urGetDeviceProcAddrTable(version, &pDdiTable->Device); + urGetDeviceExpProcAddrTable(version, &pDdiTable->DeviceExp); urGetMemoryExportExpProcAddrTable(version, &pDdiTable->MemoryExportExp); return UR_RESULT_SUCCESS; diff --git a/unified-runtime/source/adapters/offload/device.cpp b/unified-runtime/source/adapters/offload/device.cpp index ab335b6d48e2..5f4987aa6018 100644 --- a/unified-runtime/source/adapters/offload/device.cpp +++ b/unified-runtime/source/adapters/offload/device.cpp @@ -280,6 +280,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_LOCAL_MEM_SIZE: { return ReturnValue(size_t{0}); } + case UR_DEVICE_INFO_DEVICE_WAIT_SUPPORT_EXP: { + return ReturnValue(ur_bool_t{false}); + } // The following properties are lifted from the minimum supported // intersection of the HIP and CUDA backends until liboffload adds a specific @@ -495,3 +498,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetGlobalTimestamps(ur_device_handle_t, uint64_t *, uint64_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } + +UR_APIEXPORT ur_result_t UR_APICALL urDeviceWaitExp(ur_device_handle_t) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/unified-runtime/source/adapters/offload/ur_interface_loader.cpp b/unified-runtime/source/adapters/offload/ur_interface_loader.cpp index 0e1cbbf9994e..b5a28ea68c8a 100644 --- a/unified-runtime/source/adapters/offload/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/offload/ur_interface_loader.cpp @@ -266,6 +266,16 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetDeviceProcAddrTable( return UR_RESULT_SUCCESS; } +UR_DLLEXPORT ur_result_t UR_APICALL urGetDeviceExpProcAddrTable( + ur_api_version_t version, ur_device_exp_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnWaitExp = urDeviceWaitExp; + return UR_RESULT_SUCCESS; +} + UR_DLLEXPORT ur_result_t UR_APICALL urGetCommandBufferExpProcAddrTable( ur_api_version_t version, ur_command_buffer_exp_dditable_t *pDdiTable) { auto retVal = validateProcInputs(version, pDdiTable); @@ -440,6 +450,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urAllAddrTable(ur_api_version_t version, urGetUsmP2PExpProcAddrTable(version, &pDdiTable->UsmP2PExp); urGetVirtualMemProcAddrTable(version, &pDdiTable->VirtualMem); urGetDeviceProcAddrTable(version, &pDdiTable->Device); + urGetDeviceExpProcAddrTable(version, &pDdiTable->DeviceExp); return UR_RESULT_SUCCESS; } diff --git a/unified-runtime/source/adapters/opencl/device.cpp b/unified-runtime/source/adapters/opencl/device.cpp index 002011a0d5ee..36471fff34bf 100644 --- a/unified-runtime/source/adapters/opencl/device.cpp +++ b/unified-runtime/source/adapters/opencl/device.cpp @@ -1427,6 +1427,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, return ReturnValue(false); case UR_DEVICE_INFO_MULTI_DEVICE_COMPILE_SUPPORT_EXP: return ReturnValue(true); + case UR_DEVICE_INFO_DEVICE_WAIT_SUPPORT_EXP: + return ReturnValue(false); case UR_DEVICE_INFO_KERNEL_LAUNCH_CAPABILITIES: return ReturnValue(0); case UR_DEVICE_INFO_LUID: { @@ -1862,3 +1864,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceSelectBinary( // No image can be loaded for the given device return UR_RESULT_ERROR_INVALID_BINARY; } + +UR_APIEXPORT ur_result_t UR_APICALL urDeviceWaitExp(ur_device_handle_t) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp b/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp index cf473f652680..96b627aea3f5 100644 --- a/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp @@ -284,6 +284,16 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetDeviceProcAddrTable( return UR_RESULT_SUCCESS; } +UR_DLLEXPORT ur_result_t UR_APICALL urGetDeviceExpProcAddrTable( + ur_api_version_t version, ur_device_exp_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnWaitExp = urDeviceWaitExp; + return UR_RESULT_SUCCESS; +} + UR_DLLEXPORT ur_result_t UR_APICALL urGetCommandBufferExpProcAddrTable( ur_api_version_t version, ur_command_buffer_exp_dditable_t *pDdiTable) { auto retVal = validateProcInputs(version, pDdiTable); @@ -490,6 +500,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urAllAddrTable(ur_api_version_t version, urGetUsmP2PExpProcAddrTable(version, &pDdiTable->UsmP2PExp); urGetVirtualMemProcAddrTable(version, &pDdiTable->VirtualMem); urGetDeviceProcAddrTable(version, &pDdiTable->Device); + urGetDeviceExpProcAddrTable(version, &pDdiTable->DeviceExp); urGetMemoryExportExpProcAddrTable(version, &pDdiTable->MemoryExportExp); return UR_RESULT_SUCCESS; diff --git a/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp b/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp index 732e89bf12d0..5ea65875d431 100644 --- a/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp +++ b/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp @@ -9517,6 +9517,39 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferGetNativeHandleExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urDeviceWaitExp +__urdlllocal ur_result_t UR_APICALL urDeviceWaitExp( + /// [in] handle of the device instance. + ur_device_handle_t hDevice) { + auto pfnWaitExp = getContext()->urDdiTable.DeviceExp.pfnWaitExp; + + if (nullptr == pfnWaitExp) + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + + ur_device_wait_exp_params_t params = {&hDevice}; + uint64_t instance = getContext()->notify_begin(UR_FUNCTION_DEVICE_WAIT_EXP, + "urDeviceWaitExp", ¶ms); + + auto &logger = getContext()->logger; + UR_LOG_L(logger, INFO, " ---> urDeviceWaitExp\n"); + + ur_result_t result = pfnWaitExp(hDevice); + + getContext()->notify_end(UR_FUNCTION_DEVICE_WAIT_EXP, "urDeviceWaitExp", + ¶ms, &result, instance); + + if (logger.getLevel() <= UR_LOGGER_LEVEL_INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_DEVICE_WAIT_EXP, + ¶ms); + UR_LOG_L(logger, INFO, " <--- urDeviceWaitExp({}) -> {};\n", + args_str.str(), result); + } + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urEnqueueTimestampRecordingExp __urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( @@ -11741,6 +11774,37 @@ __urdlllocal ur_result_t UR_APICALL urGetDeviceProcAddrTable( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Exported function for filling application's DeviceExp table +/// with current process' addresses +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// - ::UR_RESULT_ERROR_UNSUPPORTED_VERSION +__urdlllocal ur_result_t UR_APICALL urGetDeviceExpProcAddrTable( + /// [in] API version requested + ur_api_version_t version, + /// [in,out] pointer to table of DDI function pointers + ur_device_exp_dditable_t *pDdiTable) { + auto &dditable = ur_tracing_layer::getContext()->urDdiTable.DeviceExp; + + if (nullptr == pDdiTable) + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + + if (UR_MAJOR_VERSION(ur_tracing_layer::getContext()->version) != + UR_MAJOR_VERSION(version) || + UR_MINOR_VERSION(ur_tracing_layer::getContext()->version) > + UR_MINOR_VERSION(version)) + return UR_RESULT_ERROR_UNSUPPORTED_VERSION; + + ur_result_t result = UR_RESULT_SUCCESS; + + dditable.pfnWaitExp = pDdiTable->pfnWaitExp; + pDdiTable->pfnWaitExp = ur_tracing_layer::urDeviceWaitExp; + + return result; +} ur_result_t context_t::init(ur_dditable_t *dditable, const std::set &enabledLayerNames, @@ -11867,6 +11931,11 @@ ur_result_t context_t::init(ur_dditable_t *dditable, &dditable->Device); } + if (UR_RESULT_SUCCESS == result) { + result = ur_tracing_layer::urGetDeviceExpProcAddrTable( + UR_API_VERSION_CURRENT, &dditable->DeviceExp); + } + return result; } } /* namespace ur_tracing_layer */ diff --git a/unified-runtime/source/loader/layers/validation/ur_valddi.cpp b/unified-runtime/source/loader/layers/validation/ur_valddi.cpp index 8924809ee478..2d05787d3edd 100644 --- a/unified-runtime/source/loader/layers/validation/ur_valddi.cpp +++ b/unified-runtime/source/loader/layers/validation/ur_valddi.cpp @@ -10268,6 +10268,35 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferGetNativeHandleExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urDeviceWaitExp +__urdlllocal ur_result_t UR_APICALL urDeviceWaitExp( + /// [in] handle of the device instance. + ur_device_handle_t hDevice) { + auto pfnWaitExp = getContext()->urDdiTable.DeviceExp.pfnWaitExp; + + if (nullptr == pfnWaitExp) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + if (getContext()->enableParameterValidation) { + if (NULL == hDevice) + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + + if (hDevice == nullptr) + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + } + + if (getContext()->enableLifetimeValidation && + !getContext()->refCountContext->isReferenceValid(hDevice)) { + URLOG_CTX_INVALID_REFERENCE(hDevice); + } + + ur_result_t result = pfnWaitExp(hDevice); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urEnqueueTimestampRecordingExp __urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( @@ -12557,6 +12586,38 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetDeviceProcAddrTable( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Exported function for filling application's DeviceExp table +/// with current process' addresses +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// - ::UR_RESULT_ERROR_UNSUPPORTED_VERSION +UR_DLLEXPORT ur_result_t UR_APICALL urGetDeviceExpProcAddrTable( + /// [in] API version requested + ur_api_version_t version, + /// [in,out] pointer to table of DDI function pointers + ur_device_exp_dditable_t *pDdiTable) { + auto &dditable = ur_validation_layer::getContext()->urDdiTable.DeviceExp; + + if (nullptr == pDdiTable) + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + + if (UR_MAJOR_VERSION(ur_validation_layer::getContext()->version) != + UR_MAJOR_VERSION(version) || + UR_MINOR_VERSION(ur_validation_layer::getContext()->version) > + UR_MINOR_VERSION(version)) + return UR_RESULT_ERROR_UNSUPPORTED_VERSION; + + ur_result_t result = UR_RESULT_SUCCESS; + + dditable.pfnWaitExp = pDdiTable->pfnWaitExp; + pDdiTable->pfnWaitExp = ur_validation_layer::urDeviceWaitExp; + + return result; +} + ur_result_t context_t::init(ur_dditable_t *dditable, const std::set &enabledLayerNames, codeloc_data) { @@ -12699,6 +12760,11 @@ ur_result_t context_t::init(ur_dditable_t *dditable, UR_API_VERSION_CURRENT, &dditable->Device); } + if (UR_RESULT_SUCCESS == result) { + result = ur_validation_layer::urGetDeviceExpProcAddrTable( + UR_API_VERSION_CURRENT, &dditable->DeviceExp); + } + return result; } diff --git a/unified-runtime/source/loader/loader.def.in b/unified-runtime/source/loader/loader.def.in index b59fae08e176..674937d4e9ac 100644 --- a/unified-runtime/source/loader/loader.def.in +++ b/unified-runtime/source/loader/loader.def.in @@ -69,6 +69,7 @@ EXPORTS urDeviceRelease urDeviceRetain urDeviceSelectBinary + urDeviceWaitExp urEnqueueCommandBufferExp urEnqueueDeviceGlobalVariableRead urEnqueueDeviceGlobalVariableWrite @@ -114,6 +115,7 @@ EXPORTS urGetBindlessImagesExpProcAddrTable urGetCommandBufferExpProcAddrTable urGetContextProcAddrTable + urGetDeviceExpProcAddrTable urGetDeviceProcAddrTable urGetEnqueueExpProcAddrTable urGetEnqueueProcAddrTable @@ -286,6 +288,7 @@ EXPORTS urPrintDeviceThrottleReasonsFlags urPrintDeviceType urPrintDeviceUsmAccessCapabilityFlags + urPrintDeviceWaitExpParams urPrintEnqueueCommandBufferExpParams urPrintEnqueueDeviceGlobalVariableReadParams urPrintEnqueueDeviceGlobalVariableWriteParams diff --git a/unified-runtime/source/loader/loader.map.in b/unified-runtime/source/loader/loader.map.in index 29786cd51b52..dcc2db78188e 100644 --- a/unified-runtime/source/loader/loader.map.in +++ b/unified-runtime/source/loader/loader.map.in @@ -69,6 +69,7 @@ urDeviceRelease; urDeviceRetain; urDeviceSelectBinary; + urDeviceWaitExp; urEnqueueCommandBufferExp; urEnqueueDeviceGlobalVariableRead; urEnqueueDeviceGlobalVariableWrite; @@ -114,6 +115,7 @@ urGetBindlessImagesExpProcAddrTable; urGetCommandBufferExpProcAddrTable; urGetContextProcAddrTable; + urGetDeviceExpProcAddrTable; urGetDeviceProcAddrTable; urGetEnqueueExpProcAddrTable; urGetEnqueueProcAddrTable; @@ -286,6 +288,7 @@ urPrintDeviceThrottleReasonsFlags; urPrintDeviceType; urPrintDeviceUsmAccessCapabilityFlags; + urPrintDeviceWaitExpParams; urPrintEnqueueCommandBufferExpParams; urPrintEnqueueDeviceGlobalVariableReadParams; urPrintEnqueueDeviceGlobalVariableWriteParams; diff --git a/unified-runtime/source/loader/ur_ldrddi.cpp b/unified-runtime/source/loader/ur_ldrddi.cpp index 2e0201e8b414..98809dc348cd 100644 --- a/unified-runtime/source/loader/ur_ldrddi.cpp +++ b/unified-runtime/source/loader/ur_ldrddi.cpp @@ -5415,6 +5415,22 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferGetNativeHandleExp( return pfnGetNativeHandleExp(hCommandBuffer, phNativeCommandBuffer); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urDeviceWaitExp +__urdlllocal ur_result_t UR_APICALL urDeviceWaitExp( + /// [in] handle of the device instance. + ur_device_handle_t hDevice) { + + auto *dditable = *reinterpret_cast(hDevice); + + auto *pfnWaitExp = dditable->DeviceExp.pfnWaitExp; + if (nullptr == pfnWaitExp) + return UR_RESULT_ERROR_UNINITIALIZED; + + // forward to device-platform + return pfnWaitExp(hDevice); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urEnqueueTimestampRecordingExp __urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( @@ -7316,4 +7332,57 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetDeviceProcAddrTable( return result; } + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Exported function for filling application's DeviceExp table +/// with current process' addresses +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// - ::UR_RESULT_ERROR_UNSUPPORTED_VERSION +UR_DLLEXPORT ur_result_t UR_APICALL urGetDeviceExpProcAddrTable( + /// [in] API version requested + ur_api_version_t version, + /// [in,out] pointer to table of DDI function pointers + ur_device_exp_dditable_t *pDdiTable) { + if (nullptr == pDdiTable) + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + + if (ur_loader::getContext()->version < version) + return UR_RESULT_ERROR_UNSUPPORTED_VERSION; + + ur_result_t result = UR_RESULT_SUCCESS; + + // Load the device-platform DDI tables + for (auto &platform : ur_loader::getContext()->platforms) { + // statically linked adapter inside of the loader + if (platform.handle == nullptr) + continue; + + if (platform.initStatus != UR_RESULT_SUCCESS) + continue; + auto getTable = reinterpret_cast( + ur_loader::LibLoader::getFunctionPtr(platform.handle.get(), + "urGetDeviceExpProcAddrTable")); + if (!getTable) + continue; + platform.initStatus = getTable(version, &platform.dditable.DeviceExp); + } + + if (UR_RESULT_SUCCESS == result) { + if (ur_loader::getContext()->platforms.size() != 1 || + ur_loader::getContext()->forceIntercept) { + // return pointers to loader's DDIs + pDdiTable->pfnWaitExp = ur_loader::urDeviceWaitExp; + } else { + // return pointers directly to platform's DDIs + *pDdiTable = + ur_loader::getContext()->platforms.front().dditable.DeviceExp; + } + } + + return result; +} } diff --git a/unified-runtime/source/loader/ur_libapi.cpp b/unified-runtime/source/loader/ur_libapi.cpp index c5f248f645ff..70ae105c5a94 100644 --- a/unified-runtime/source/loader/ur_libapi.cpp +++ b/unified-runtime/source/loader/ur_libapi.cpp @@ -9964,6 +9964,30 @@ ur_result_t UR_APICALL urCommandBufferGetNativeHandleExp( return exceptionToResult(std::current_exception()); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Synchronizes with all queues on the device. +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hDevice` +/// + `hDevice == nullptr` +/// - ::UR_RESULT_ERROR_INVALID_DEVICE +ur_result_t UR_APICALL urDeviceWaitExp( + /// [in] handle of the device instance. + ur_device_handle_t hDevice) try { + auto pfnWaitExp = ur_lib::getContext()->urDdiTable.DeviceExp.pfnWaitExp; + if (nullptr == pfnWaitExp) + return UR_RESULT_ERROR_UNINITIALIZED; + + return pfnWaitExp(hDevice); +} catch (...) { + return exceptionToResult(std::current_exception()); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Enqueue a command for recording the device timestamp /// diff --git a/unified-runtime/source/loader/ur_libddi.cpp b/unified-runtime/source/loader/ur_libddi.cpp index 648fc8787218..17014c613d3f 100644 --- a/unified-runtime/source/loader/ur_libddi.cpp +++ b/unified-runtime/source/loader/ur_libddi.cpp @@ -128,6 +128,11 @@ __urdlllocal ur_result_t context_t::ddiInit() { urGetDeviceProcAddrTable(UR_API_VERSION_CURRENT, &urDdiTable.Device); } + if (UR_RESULT_SUCCESS == result) { + result = urGetDeviceExpProcAddrTable(UR_API_VERSION_CURRENT, + &urDdiTable.DeviceExp); + } + return result; } diff --git a/unified-runtime/source/loader/ur_print.cpp b/unified-runtime/source/loader/ur_print.cpp index 6d887d494fc3..7e737bd641c4 100644 --- a/unified-runtime/source/loader/ur_print.cpp +++ b/unified-runtime/source/loader/ur_print.cpp @@ -3055,6 +3055,15 @@ ur_result_t urPrintDeviceGetGlobalTimestampsParams( return str_copy(&ss, buffer, buff_size, out_size); } +ur_result_t +urPrintDeviceWaitExpParams(const struct ur_device_wait_exp_params_t *params, + char *buffer, const size_t buff_size, + size_t *out_size) { + std::stringstream ss; + ss << params; + return str_copy(&ss, buffer, buff_size, out_size); +} + ur_result_t urPrintFunctionParams(enum ur_function_t function, const void *params, char *buffer, const size_t buff_size, size_t *out_size) { diff --git a/unified-runtime/source/ur_api.cpp b/unified-runtime/source/ur_api.cpp index 4ab0bf014958..8a86df1a208e 100644 --- a/unified-runtime/source/ur_api.cpp +++ b/unified-runtime/source/ur_api.cpp @@ -8678,6 +8678,25 @@ ur_result_t UR_APICALL urCommandBufferGetNativeHandleExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Synchronizes with all queues on the device. +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hDevice` +/// + `hDevice == nullptr` +/// - ::UR_RESULT_ERROR_INVALID_DEVICE +ur_result_t UR_APICALL urDeviceWaitExp( + /// [in] handle of the device instance. + ur_device_handle_t hDevice) { + ur_result_t result = UR_RESULT_SUCCESS; + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Enqueue a command for recording the device timestamp /// diff --git a/unified-runtime/tools/urinfo/urinfo.hpp b/unified-runtime/tools/urinfo/urinfo.hpp index 0440269c589b..f9d4fbeca2b6 100644 --- a/unified-runtime/tools/urinfo/urinfo.hpp +++ b/unified-runtime/tools/urinfo/urinfo.hpp @@ -465,6 +465,8 @@ inline void printDeviceInfos(ur_device_handle_t hDevice, printDeviceInfo(hDevice, UR_DEVICE_INFO_MULTI_DEVICE_COMPILE_SUPPORT_EXP); std::cout << prefix; + printDeviceInfo(hDevice, UR_DEVICE_INFO_DEVICE_WAIT_SUPPORT_EXP); + std::cout << prefix; printDeviceInfo(hDevice, UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP); std::cout << prefix;