diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index 3201cf94f4065..7f10dd7f79e85 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -8,7 +8,7 @@ #pragma once -#include // for std::forward +#include #include #include @@ -72,14 +72,20 @@ template struct LaunchConfigAccess { return MLaunchConfig.getProperties(); } }; + +template +void submit_impl(queue &Q, CommandGroupFunc &&CGF, + const sycl::detail::code_location &CodeLoc) { + Q.submit_without_event(std::forward(CGF), CodeLoc); +} } // namespace detail template void submit(queue Q, CommandGroupFunc &&CGF, const sycl::detail::code_location &CodeLoc = sycl::detail::code_location::current()) { - // TODO: Use new submit without Events. - Q.submit(std::forward(CGF), CodeLoc); + sycl::ext::oneapi::experimental::detail::submit_impl( + Q, std::forward(CGF), CodeLoc); } template @@ -205,7 +211,8 @@ template Range, const KernelType &KernelObj, ReductionsT &&...Reductions) { submit(Q, [&](handler &CGH) { - nd_launch(CGH, Range, KernelObj, std::forward(Reductions)...); + nd_launch(CGH, Range, KernelObj, + std::forward(Reductions)...); }); } @@ -228,7 +235,8 @@ template , Properties> Config, const KernelType &KernelObj, ReductionsT &&...Reductions) { submit(Q, [&](handler &CGH) { - nd_launch(CGH, Config, KernelObj, std::forward(Reductions)...); + nd_launch(CGH, Config, KernelObj, + std::forward(Reductions)...); }); } @@ -270,11 +278,9 @@ inline void memcpy(handler &CGH, void *Dest, const void *Src, size_t NumBytes) { CGH.memcpy(Dest, Src, NumBytes); } -inline void memcpy(queue Q, void *Dest, const void *Src, size_t NumBytes, - const sycl::detail::code_location &CodeLoc = - sycl::detail::code_location::current()) { - submit(Q, [&](handler &CGH) { memcpy(CGH, Dest, Src, NumBytes); }, CodeLoc); -} +__SYCL_EXPORT void memcpy(queue Q, void *Dest, const void *Src, size_t NumBytes, + const sycl::detail::code_location &CodeLoc = + sycl::detail::code_location::current()); template void copy(handler &CGH, const T *Src, T *Dest, size_t Count) { @@ -292,11 +298,9 @@ inline void memset(handler &CGH, void *Ptr, int Value, size_t NumBytes) { CGH.memset(Ptr, Value, NumBytes); } -inline void memset(queue Q, void *Ptr, int Value, size_t NumBytes, - const sycl::detail::code_location &CodeLoc = - sycl::detail::code_location::current()) { - submit(Q, [&](handler &CGH) { memset(CGH, Ptr, Value, NumBytes); }, CodeLoc); -} +__SYCL_EXPORT void memset(queue Q, void *Ptr, int Value, size_t NumBytes, + const sycl::detail::code_location &CodeLoc = + sycl::detail::code_location::current()); template void fill(sycl::handler &CGH, T *Ptr, const T &Pattern, size_t Count) { @@ -324,13 +328,9 @@ inline void mem_advise(handler &CGH, void *Ptr, size_t NumBytes, int Advice) { CGH.mem_advise(Ptr, NumBytes, Advice); } -inline void mem_advise(queue Q, void *Ptr, size_t NumBytes, int Advice, - const sycl::detail::code_location &CodeLoc = - sycl::detail::code_location::current()) { - submit( - Q, [&](handler &CGH) { mem_advise(CGH, Ptr, NumBytes, Advice); }, - CodeLoc); -} +__SYCL_EXPORT void mem_advise(queue Q, void *Ptr, size_t NumBytes, int Advice, + const sycl::detail::code_location &CodeLoc = + sycl::detail::code_location::current()); inline void barrier(handler &CGH) { CGH.ext_oneapi_barrier(); } diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 2b313c8834443..0d3e757c175b0 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -465,6 +465,7 @@ class __SYCL_EXPORT handler { /// /// \param Queue is a SYCL queue. /// \param IsHost indicates if this handler is created for SYCL host device. + /// TODO: Unused. Remove with ABI break. handler(std::shared_ptr Queue, bool IsHost); /// Constructs SYCL handler from the associated queue and the submission's @@ -476,10 +477,36 @@ class __SYCL_EXPORT handler { /// \param SecondaryQueue is the secondary SYCL queue of the submission. This /// is null if no secondary queue is associated with the submission. /// \param IsHost indicates if this handler is created for SYCL host device. + /// TODO: Unused. Remove with ABI break. handler(std::shared_ptr Queue, std::shared_ptr PrimaryQueue, std::shared_ptr SecondaryQueue, bool IsHost); + /// Constructs SYCL handler from queue. + /// + /// \param Queue is a SYCL queue. + /// \param IsHost indicates if this handler is created for SYCL host device. + /// \param CallerNeedsEvent indicates if the event resulting from this handler + /// is needed by the caller. + handler(std::shared_ptr Queue, bool IsHost, + bool CallerNeedsEvent); + + /// Constructs SYCL handler from the associated queue and the submission's + /// primary and secondary queue. + /// + /// \param Queue is a SYCL queue. This is equal to either PrimaryQueue or + /// SecondaryQueue. + /// \param PrimaryQueue is the primary SYCL queue of the submission. + /// \param SecondaryQueue is the secondary SYCL queue of the submission. This + /// is null if no secondary queue is associated with the submission. + /// \param IsHost indicates if this handler is created for SYCL host device. + /// \param CallerNeedsEvent indicates if the event resulting from this handler + /// is needed by the caller. + handler(std::shared_ptr Queue, + std::shared_ptr PrimaryQueue, + std::shared_ptr SecondaryQueue, bool IsHost, + bool CallerNeedsEvent); + /// Constructs SYCL handler from Graph. /// /// The hander will add the command-group as a node to the graph rather than @@ -575,6 +602,16 @@ class __SYCL_EXPORT handler { /// \return a SYCL event object representing the command group event finalize(); + /// Constructs CG object of specific type, passes it to Scheduler and + /// returns sycl::event object representing the command group. + /// It's expected that the method is the latest method executed before + /// object destruction. + /// \param CallerNeedsEvent Specifies if the caller needs an event + /// representing the work related to this handler. + /// + /// \return a SYCL event object representing the command group + event finalize(bool CallerNeedsEvent); + /// Saves streams associated with this handler. /// /// Streams are then forwarded to command group and flushed in the scheduler. @@ -1184,6 +1221,8 @@ class __SYCL_EXPORT handler { Size == 32 || Size == 64 || Size == 128; } + bool eventNeeded() const; + template struct TransformUserItemType { using type = std::conditional_t< std::is_convertible_v, LambdaArgType>, nd_item, diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 4d32218ab09d4..04b6969fe2b12 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -95,6 +95,12 @@ namespace ext ::oneapi ::experimental { // returned by info::queue::state enum class queue_state { executing, recording }; struct image_descriptor; + +namespace detail { +template +void submit_impl(queue &Q, CommandGroupFunc &&CGF, + const sycl::detail::code_location &CodeLoc); +} // namespace detail } // namespace ext::oneapi::experimental /// Encapsulates a single SYCL queue which schedules kernels on a SYCL device. @@ -2689,6 +2695,11 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const detail::code_location &); #endif + template + friend void ext::oneapi::experimental::detail::submit_impl( + queue &Q, CommandGroupFunc &&CGF, + const sycl::detail::code_location &CodeLoc); + /// A template-free version of submit. event submit_impl(std::function CGH, const detail::code_location &CodeLoc); @@ -2696,6 +2707,28 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { event submit_impl(std::function CGH, queue secondQueue, const detail::code_location &CodeLoc); + /// A template-free version of submit_without_event. + void submit_without_event_impl(std::function CGH, + const detail::code_location &CodeLoc); + + /// Submits a command group function object to the queue, in order to be + /// scheduled for execution on the device. + /// + /// \param CGF is a function object containing command group. + /// \param CodeLoc is the code location of the submit call (default argument) + template + std::enable_if_t, void> + submit_without_event(T CGF, const detail::code_location &CodeLoc) { + detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); +#if __SYCL_USE_FALLBACK_ASSERT + // If post-processing is needed, fall back to the regular submit. + // TODO: Revisit whether we can avoid this. + submit(CGF, CodeLoc); +#else + submit_without_event_impl(CGF, CodeLoc); +#endif // __SYCL_USE_FALLBACK_ASSERT + } + /// Checks if the event needs to be discarded and if so, discards it and /// returns a discarded event. Otherwise, it returns input event. /// TODO: move to impl class in the next ABI Breaking window diff --git a/sycl/include/sycl/reduction.hpp b/sycl/include/sycl/reduction.hpp index ff955ce8b9eda..82c4448e37836 100644 --- a/sycl/include/sycl/reduction.hpp +++ b/sycl/include/sycl/reduction.hpp @@ -1178,8 +1178,9 @@ namespace reduction { inline void finalizeHandler(handler &CGH) { CGH.finalize(); } template void withAuxHandler(handler &CGH, FunctorTy Func) { event E = CGH.finalize(); - handler AuxHandler(CGH.MQueue, CGH.MIsHost); - AuxHandler.depends_on(E); + handler AuxHandler(CGH.MQueue, CGH.MIsHost, CGH.eventNeeded()); + if (!createSyclObjFromImpl(CGH.MQueue).is_in_order()) + AuxHandler.depends_on(E); AuxHandler.saveCodeLoc(CGH.MCodeLoc); Func(AuxHandler); CGH.MLastEvent = AuxHandler.finalize(); diff --git a/sycl/source/CMakeLists.txt b/sycl/source/CMakeLists.txt index 7ef8ff587f0e2..74497db20c9f1 100644 --- a/sycl/source/CMakeLists.txt +++ b/sycl/source/CMakeLists.txt @@ -248,6 +248,7 @@ set(SYCL_COMMON_SOURCES "context.cpp" "device.cpp" "device_selector.cpp" + "enqueue_functions.cpp" "event.cpp" "exception.cpp" "exception_list.cpp" diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 3268a27fbb827..09ccef30dacd2 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -690,7 +690,8 @@ sycl::detail::pi::PiExtSyncPoint exec_graph_impl::enqueueNode( sycl::detail::EventImplPtr Event = sycl::detail::Scheduler::getInstance().addCG( - Node->getCGCopy(), AllocaQueue, CommandBuffer, Deps); + Node->getCGCopy(), AllocaQueue, /*EventNeeded=*/true, CommandBuffer, + Deps); MCommandMap[Node] = Event->getCommandBufferCommand(); return Event->getSyncPoint(); @@ -928,7 +929,7 @@ exec_graph_impl::enqueue(const std::shared_ptr &Queue, CommandBuffer, nullptr, std::move(CGData)); NewEvent = sycl::detail::Scheduler::getInstance().addCG( - std::move(CommandGroup), Queue); + std::move(CommandGroup), Queue, /*EventNeeded=*/true); } NewEvent->setEventFromSubmittedExecCommandBuffer(true); } else if ((CurrentPartition->MSchedule.size() > 0) && @@ -946,7 +947,7 @@ exec_graph_impl::enqueue(const std::shared_ptr &Queue, .MQueue = Queue; NewEvent = sycl::detail::Scheduler::getInstance().addCG( - NodeImpl->getCGCopy(), Queue); + NodeImpl->getCGCopy(), Queue, /*EventNeeded=*/true); } else { std::vector> ScheduledEvents; for (auto &NodeImpl : CurrentPartition->MSchedule) { @@ -982,7 +983,7 @@ exec_graph_impl::enqueue(const std::shared_ptr &Queue, // dependencies are propagated in findRealDeps sycl::detail::EventImplPtr EventImpl = sycl::detail::Scheduler::getInstance().addCG( - NodeImpl->getCGCopy(), Queue); + NodeImpl->getCGCopy(), Queue, /*EventNeeded=*/true); ScheduledEvents.push_back(EventImpl); } diff --git a/sycl/source/detail/handler_impl.hpp b/sycl/source/detail/handler_impl.hpp index f50c5c94b78d4..f0df55d5e069b 100644 --- a/sycl/source/detail/handler_impl.hpp +++ b/sycl/source/detail/handler_impl.hpp @@ -31,9 +31,11 @@ enum class HandlerSubmissionState : std::uint8_t { class handler_impl { public: handler_impl(std::shared_ptr SubmissionPrimaryQueue, - std::shared_ptr SubmissionSecondaryQueue) + std::shared_ptr SubmissionSecondaryQueue, + bool EventNeeded) : MSubmissionPrimaryQueue(std::move(SubmissionPrimaryQueue)), - MSubmissionSecondaryQueue(std::move(SubmissionSecondaryQueue)){}; + MSubmissionSecondaryQueue(std::move(SubmissionSecondaryQueue)), + MEventNeeded(EventNeeded) {}; handler_impl() = default; @@ -74,6 +76,10 @@ class handler_impl { /// submission is a fallback from a previous submission. std::shared_ptr MSubmissionSecondaryQueue; + /// Bool stores information about whether the event resulting from the + /// corresponding work is required. + bool MEventNeeded = true; + // Stores auxiliary resources used by internal operations. std::vector> MAuxiliaryResources; diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 8865c342646eb..5b873039cd4a1 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -146,7 +146,8 @@ queue_impl::getExtendDependencyList(const std::vector &DepEvents, event queue_impl::memset(const std::shared_ptr &Self, void *Ptr, int Value, size_t Count, - const std::vector &DepEvents) { + const std::vector &DepEvents, + bool CallerNeedsEvent) { #if XPTI_ENABLE_INSTRUMENTATION // We need a code pointer value and we use the object ptr; if code location // information is available, we will have function name and source file @@ -173,7 +174,8 @@ event queue_impl::memset(const std::shared_ptr &Self, #endif return submitMemOpHelper( - Self, DepEvents, [&](handler &CGH) { CGH.memset(Ptr, Value, Count); }, + Self, DepEvents, CallerNeedsEvent, + [&](handler &CGH) { CGH.memset(Ptr, Value, Count); }, [](const auto &...Args) { MemoryManager::fill_usm(Args...); }, Ptr, Self, Count, Value); } @@ -194,7 +196,7 @@ void report(const code_location &CodeLoc) { event queue_impl::memcpy(const std::shared_ptr &Self, void *Dest, const void *Src, size_t Count, const std::vector &DepEvents, - const code_location &CodeLoc) { + bool CallerNeedsEvent, const code_location &CodeLoc) { #if XPTI_ENABLE_INSTRUMENTATION // We need a code pointer value and we duse the object ptr; If code location // is available, we use the source file information along with the object @@ -225,7 +227,8 @@ event queue_impl::memcpy(const std::shared_ptr &Self, PI_ERROR_INVALID_VALUE); } return submitMemOpHelper( - Self, DepEvents, [&](handler &CGH) { CGH.memcpy(Dest, Src, Count); }, + Self, DepEvents, CallerNeedsEvent, + [&](handler &CGH) { CGH.memcpy(Dest, Src, Count); }, [](const auto &...Args) { MemoryManager::copy_usm(Args...); }, Src, Self, Count, Dest); } @@ -233,9 +236,10 @@ event queue_impl::memcpy(const std::shared_ptr &Self, event queue_impl::mem_advise(const std::shared_ptr &Self, const void *Ptr, size_t Length, pi_mem_advice Advice, - const std::vector &DepEvents) { + const std::vector &DepEvents, + bool CallerNeedsEvent) { return submitMemOpHelper( - Self, DepEvents, + Self, DepEvents, CallerNeedsEvent, [&](handler &CGH) { CGH.mem_advise(Ptr, Length, Advice); }, [](const auto &...Args) { MemoryManager::advise_usm(Args...); }, Ptr, Self, Length, Advice); @@ -244,9 +248,9 @@ event queue_impl::mem_advise(const std::shared_ptr &Self, event queue_impl::memcpyToDeviceGlobal( const std::shared_ptr &Self, void *DeviceGlobalPtr, const void *Src, bool IsDeviceImageScope, size_t NumBytes, size_t Offset, - const std::vector &DepEvents) { + const std::vector &DepEvents, bool CallerNeedsEvent) { return submitMemOpHelper( - Self, DepEvents, + Self, DepEvents, CallerNeedsEvent, [&](handler &CGH) { CGH.memcpyToDeviceGlobal(DeviceGlobalPtr, Src, IsDeviceImageScope, NumBytes, Offset); @@ -260,9 +264,9 @@ event queue_impl::memcpyToDeviceGlobal( event queue_impl::memcpyFromDeviceGlobal( const std::shared_ptr &Self, void *Dest, const void *DeviceGlobalPtr, bool IsDeviceImageScope, size_t NumBytes, - size_t Offset, const std::vector &DepEvents) { + size_t Offset, const std::vector &DepEvents, bool CallerNeedsEvent) { return submitMemOpHelper( - Self, DepEvents, + Self, DepEvents, CallerNeedsEvent, [&](handler &CGH) { CGH.memcpyFromDeviceGlobal(Dest, DeviceGlobalPtr, IsDeviceImageScope, NumBytes, Offset); @@ -348,9 +352,11 @@ event queue_impl::submit_impl(const std::function &CGF, const std::shared_ptr &Self, const std::shared_ptr &PrimaryQueue, const std::shared_ptr &SecondaryQueue, + bool CallerNeedsEvent, const detail::code_location &Loc, const SubmitPostProcessF *PostProcess) { - handler Handler(Self, PrimaryQueue, SecondaryQueue, MHostQueue); + handler Handler(Self, PrimaryQueue, SecondaryQueue, MHostQueue, + CallerNeedsEvent); Handler.saveCodeLoc(Loc); { @@ -399,6 +405,7 @@ event queue_impl::submitWithHandler(const std::shared_ptr &Self, template event queue_impl::submitMemOpHelper(const std::shared_ptr &Self, const std::vector &DepEvents, + bool CallerNeedsEvent, HandlerFuncT HandlerFunc, MemOpFuncT MemOpFunc, MemOpArgTs... MemOpArgs) { @@ -415,7 +422,8 @@ event queue_impl::submitMemOpHelper(const std::shared_ptr &Self, // handler rather than by-passing the scheduler. if (MGraph.expired() && Scheduler::areEventsSafeForSchedulerBypass( ExpandedDepEvents, MContext)) { - if (MSupportsDiscardingPiEvents) { + if ((MDiscardEvents || !CallerNeedsEvent) && + supportsDiscardingPiEvents()) { NestedCallsTracker tracker; MemOpFunc(MemOpArgs..., getPIEvents(ExpandedDepEvents), /*PiEvent*/ nullptr, /*EventImplPtr*/ nullptr); @@ -646,10 +654,17 @@ bool queue_impl::ext_oneapi_empty() const { // the status of the last event. if (isInOrder() && !MDiscardEvents) { std::lock_guard Lock(MMutex); - return !MDefaultGraphDeps.LastEventPtr || - MDefaultGraphDeps.LastEventPtr - ->get_info() == - info::event_command_status::complete; + // If there is no last event we know that no work has been submitted, so it + // must be trivially empty. + if (!MDefaultGraphDeps.LastEventPtr) + return true; + // Otherwise, check if the last event is finished. + // Note that we fall back to the backend query if the event was discarded, + // which may happend despite the queue not being a discard event queue. + if (!MDefaultGraphDeps.LastEventPtr->isDiscarded()) + return MDefaultGraphDeps.LastEventPtr + ->get_info() == + info::event_command_status::complete; } // Check the status of the backend queue if this is not a host queue. diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index d87db1d7ef2e5..db586fda5bf3e 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -111,8 +111,6 @@ class queue_impl { MDiscardEvents( has_property()), MIsProfilingEnabled(has_property()), - MSupportsDiscardingPiEvents(MDiscardEvents && - (MHostQueue ? true : MIsInorder)), MQueueID{ MNextAvailableQueueID.fetch_add(1, std::memory_order_relaxed)} { if (has_property()) { @@ -292,8 +290,6 @@ class queue_impl { MDiscardEvents( has_property()), MIsProfilingEnabled(has_property()), - MSupportsDiscardingPiEvents(MDiscardEvents && - (MHostQueue ? true : MIsInorder)), MQueueID{ MNextAvailableQueueID.fetch_add(1, std::memory_order_relaxed)} { queue_impl_interop(PiQueue); @@ -313,9 +309,7 @@ class queue_impl { MIsInorder(has_property()), MDiscardEvents( has_property()), - MIsProfilingEnabled(has_property()), - MSupportsDiscardingPiEvents(MDiscardEvents && - (MHostQueue ? true : MIsInorder)) { + MIsProfilingEnabled(has_property()) { queue_impl_interop(PiQueue); } @@ -370,9 +364,12 @@ class queue_impl { /// \return true if this queue is a SYCL host queue. bool is_host() const { return MHostQueue; } - /// \return true if this queue has discard_events support. + /// \return true if the discard event property was set at time of creation. + bool hasDiscardEventsProperty() const { return MDiscardEvents; } + + /// \return true if this queue allows for discarded events. bool supportsDiscardingPiEvents() const { - return MSupportsDiscardingPiEvents; + return MHostQueue ? true : MIsInorder; } bool isInOrder() const { return MIsInorder; } @@ -424,10 +421,12 @@ class queue_impl { const SubmitPostProcessF *PostProcess = nullptr) { event ResEvent; try { - ResEvent = submit_impl(CGF, Self, Self, SecondQueue, Loc, PostProcess); + ResEvent = submit_impl(CGF, Self, Self, SecondQueue, + /*CallerNeedsEvent=*/true, Loc, PostProcess); } catch (...) { - ResEvent = SecondQueue->submit_impl(CGF, SecondQueue, Self, SecondQueue, - Loc, PostProcess); + ResEvent = + SecondQueue->submit_impl(CGF, SecondQueue, Self, SecondQueue, + /*CallerNeedsEvent=*/true, Loc, PostProcess); } return discard_or_return(ResEvent); } @@ -444,10 +443,19 @@ class queue_impl { const std::shared_ptr &Self, const detail::code_location &Loc, const SubmitPostProcessF *PostProcess = nullptr) { - auto ResEvent = submit_impl(CGF, Self, Self, nullptr, Loc, PostProcess); + auto ResEvent = submit_impl(CGF, Self, Self, nullptr, + /*CallerNeedsEvent=*/true, Loc, PostProcess); return discard_or_return(ResEvent); } + void submit_without_event(const std::function &CGF, + const std::shared_ptr &Self, + const detail::code_location &Loc, + const SubmitPostProcessF *PostProcess = nullptr) { + submit_impl(CGF, Self, Self, nullptr, /*CallerNeedsEvent=*/false, Loc, + PostProcess); + } + /// Performs a blocking wait for the completion of all enqueued tasks in the /// queue. /// @@ -651,9 +659,11 @@ class queue_impl { /// \param Count is a number of bytes to fill. /// \param DepEvents is a vector of events that specifies the kernel /// dependencies. + /// \param CallerNeedsEvent specifies if the caller expects a usable event. /// \return an event representing fill operation. event memset(const std::shared_ptr &Self, void *Ptr, int Value, - size_t Count, const std::vector &DepEvents); + size_t Count, const std::vector &DepEvents, + bool CallerNeedsEvent); /// Copies data from one memory region to another, both pointed by /// USM pointers. /// @@ -663,10 +673,11 @@ class queue_impl { /// \param Count is a number of bytes to copy. /// \param DepEvents is a vector of events that specifies the kernel /// dependencies. + /// \param CallerNeedsEvent specifies if the caller expects a usable event. /// \return an event representing copy operation. event memcpy(const std::shared_ptr &Self, void *Dest, const void *Src, size_t Count, - const std::vector &DepEvents, + const std::vector &DepEvents, bool CallerNeedsEvent, const code_location &CodeLoc); /// Provides additional information to the underlying runtime about how /// different allocations are used. @@ -677,10 +688,11 @@ class queue_impl { /// \param Advice is a device-defined advice for the specified allocation. /// \param DepEvents is a vector of events that specifies the kernel /// dependencies. + /// \param CallerNeedsEvent specifies if the caller expects a usable event. /// \return an event representing advise operation. event mem_advise(const std::shared_ptr &Self, const void *Ptr, size_t Length, pi_mem_advice Advice, - const std::vector &DepEvents); + const std::vector &DepEvents, bool CallerNeedsEvent); /// Puts exception to the list of asynchronous ecxeptions. /// @@ -718,13 +730,14 @@ class queue_impl { event memcpyToDeviceGlobal(const std::shared_ptr &Self, void *DeviceGlobalPtr, const void *Src, bool IsDeviceImageScope, size_t NumBytes, - size_t Offset, - const std::vector &DepEvents); + size_t Offset, const std::vector &DepEvents, + bool CallerNeedsEvent); event memcpyFromDeviceGlobal(const std::shared_ptr &Self, void *Dest, const void *DeviceGlobalPtr, bool IsDeviceImageScope, size_t NumBytes, size_t Offset, - const std::vector &DepEvents); + const std::vector &DepEvents, + bool CallerNeedsEvent); bool isProfilingFallback() { return MFallbackProfiling; } @@ -780,6 +793,14 @@ class queue_impl { // Hook to the scheduler to clean up any fusion command held on destruction. void cleanup_fusion_cmd(); + template + EventImplPtr insertHelperBarrier(const HandlerType &Handler) { + auto ResEvent = std::make_shared(Handler.MQueue); + getPlugin()->call( + Handler.MQueue->getHandleRef(), 0, nullptr, &ResEvent->getHandleRef()); + return ResEvent; + } + // template is needed for proper unit testing template void finalizeHandler(HandlerType &Handler, event &EventRet) { @@ -787,6 +808,10 @@ class queue_impl { // Accessing and changing of an event isn't atomic operation. // Hence, here is the lock for thread-safety. std::lock_guard Lock{MMutex}; + + auto &EventToBuildDeps = MGraph.expired() ? MDefaultGraphDeps.LastEventPtr + : MExtGraphDeps.LastEventPtr; + // This dependency is needed for the following purposes: // - host tasks are handled by the runtime and cannot be implicitly // synchronized by the backend. @@ -794,11 +819,19 @@ class queue_impl { // by a host task. This dependency allows to build the enqueue order in // the RT but will not be passed to the backend. See getPIEvents in // Command. - - auto &EventToBuildDeps = MGraph.expired() ? MDefaultGraphDeps.LastEventPtr - : MExtGraphDeps.LastEventPtr; - if (EventToBuildDeps) - Handler.depends_on(EventToBuildDeps); + if (EventToBuildDeps) { + // In the case where the last event was discarded and we are to run a + // host_task, we insert a barrier into the queue and use the resulting + // event as the dependency for the host_task. + // Note that host_task events can never be discarded, so this will not + // insert barriers between host_task enqueues. + if (EventToBuildDeps->isDiscarded() && + Handler.getType() == CG::CodeplayHostTask) + EventToBuildDeps = insertHelperBarrier(Handler); + + if (!EventToBuildDeps->isDiscarded()) + Handler.depends_on(EventToBuildDeps); + } // If there is an external event set, add it as a dependency and clear it. // We do not need to hold the lock as MLastEventMtx will ensure the last @@ -852,13 +885,15 @@ class queue_impl { /// same as Self. /// \param SecondaryQueue is a pointer to the secondary queue. This may be the /// same as Self. + /// \param CallerNeedsEvent is a boolean indicating whether the event is + /// required by the user after the call. /// \param Loc is the code location of the submit call (default argument) /// \return a SYCL event representing submitted command group. event submit_impl(const std::function &CGF, const std::shared_ptr &Self, const std::shared_ptr &PrimaryQueue, const std::shared_ptr &SecondaryQueue, - const detail::code_location &Loc, + bool CallerNeedsEvent, const detail::code_location &Loc, const SubmitPostProcessF *PostProcess); /// Helper function for submitting a memory operation with a handler. @@ -876,6 +911,8 @@ class queue_impl { /// /// \param Self is a shared_ptr to this queue. /// \param DepEvents is a vector of dependencies of the operation. + /// \param CallerNeedsEvent specifies if the caller needs an event from this + /// memory operation. /// \param HandlerFunc is a function that submits the operation with a /// handler. /// \param MemMngrFunc is a function that forwards its arguments to the @@ -888,8 +925,8 @@ class queue_impl { typename... MemMngrArgTs> event submitMemOpHelper(const std::shared_ptr &Self, const std::vector &DepEvents, - HandlerFuncT HandlerFunc, MemMngrFuncT MemMngrFunc, - MemMngrArgTs... MemOpArgs); + bool CallerNeedsEvent, HandlerFuncT HandlerFunc, + MemMngrFuncT MemMngrFunc, MemMngrArgTs... MemOpArgs); // When instrumentation is enabled emits trace event for wait begin and // returns the telemetry event generated for the wait @@ -987,12 +1024,6 @@ class queue_impl { const bool MIsProfilingEnabled; protected: - // Indicates whether the queue supports discarding PI events for tasks - // submitted to it. This condition is necessary but not sufficient, PI events - // should be discarded only if they also don't represent potential implicit - // dependencies for future tasks in other queues. - const bool MSupportsDiscardingPiEvents; - // Command graph which is associated with this queue for the purposes of // recording commands to it. std::weak_ptr MGraph{}; diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 78c52327ff289..6ea0fc569bced 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -1926,11 +1926,11 @@ static std::string_view cgTypeToString(detail::CG::CGTYPE Type) { ExecCGCommand::ExecCGCommand( std::unique_ptr CommandGroup, QueueImplPtr Queue, - sycl::detail::pi::PiExtCommandBuffer CommandBuffer, + bool EventNeeded, sycl::detail::pi::PiExtCommandBuffer CommandBuffer, const std::vector &Dependencies) : Command(CommandType::RUN_CG, std::move(Queue), CommandBuffer, Dependencies), - MCommandGroup(std::move(CommandGroup)) { + MEventNeeded(EventNeeded), MCommandGroup(std::move(CommandGroup)) { if (MCommandGroup->getType() == detail::CG::CodeplayHostTask) { MEvent->setSubmittedQueue( static_cast(MCommandGroup.get())->MQueue); @@ -2758,11 +2758,15 @@ pi_int32 ExecCGCommand::enqueueImpCommandBuffer() { Plugin->call(RawEvents.size(), &RawEvents[0]); } + // We can omit creating a PI event and create a "discarded" event if either + // the queue has the discard property or the command has been explicitly + // marked as not needing an event, e.g. if the user did not ask for one, and + // if the queue supports discarded PI event and there are no requirements. + bool DiscardPiEvent = (MQueue->MDiscardEvents || !MEventNeeded) && + MQueue->supportsDiscardingPiEvents() && + MCommandGroup->getRequirements().size() == 0; sycl::detail::pi::PiEvent *Event = - (MQueue->supportsDiscardingPiEvents() && - MCommandGroup->getRequirements().size() == 0) - ? nullptr - : &MEvent->getHandleRef(); + DiscardPiEvent ? nullptr : &MEvent->getHandleRef(); sycl::detail::pi::PiExtSyncPoint OutSyncPoint; sycl::detail::pi::PiExtCommandBufferCommand OutCommand = nullptr; switch (MCommandGroup->getType()) { @@ -2909,8 +2913,13 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { auto RawEvents = getPiEvents(EventImpls); flushCrossQueueDeps(EventImpls, getWorkerQueue()); - bool DiscardPiEvent = (MQueue->supportsDiscardingPiEvents() && - MCommandGroup->getRequirements().size() == 0); + // We can omit creating a PI event and create a "discarded" event if either + // the queue has the discard property or the command has been explicitly + // marked as not needing an event, e.g. if the user did not ask for one, and + // if the queue supports discarded PI event and there are no requirements. + bool DiscardPiEvent = (MQueue->MDiscardEvents || !MEventNeeded) && + MQueue->supportsDiscardingPiEvents() && + MCommandGroup->getRequirements().size() == 0; sycl::detail::pi::PiEvent *Event = DiscardPiEvent ? nullptr : &MEvent->getHandleRef(); detail::EventImplPtr EventImpl = DiscardPiEvent ? nullptr : MEvent; diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index 8ba0cceee9e6a..ea1a5b5111149 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -648,6 +648,7 @@ class ExecCGCommand : public Command { public: ExecCGCommand( std::unique_ptr CommandGroup, QueueImplPtr Queue, + bool EventNeeded, sycl::detail::pi::PiExtCommandBuffer CommandBuffer = nullptr, const std::vector &Dependencies = {}); @@ -672,6 +673,11 @@ class ExecCGCommand : public Command { // necessary. KernelFusionCommand *MFusionCmd = nullptr; + // MEventNeeded is true if the command needs to produce a valid event. The + // implementation may elect to not produce events (native or SYCL) if this + // is false. + bool MEventNeeded = true; + bool producesPiEvent() const final; bool supportsPostEnqueueCleanup() const final; diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index f8c2bb27855e3..a17c45720733c 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -943,14 +943,15 @@ static void combineAccessModesOfReqs(std::vector &Reqs) { Scheduler::GraphBuildResult Scheduler::GraphBuilder::addCG( std::unique_ptr CommandGroup, const QueueImplPtr &Queue, - std::vector &ToEnqueue, + std::vector &ToEnqueue, bool EventNeeded, sycl::detail::pi::PiExtCommandBuffer CommandBuffer, const std::vector &Dependencies) { std::vector &Reqs = CommandGroup->getRequirements(); std::vector &Events = CommandGroup->getEvents(); - auto NewCmd = std::make_unique( - std::move(CommandGroup), Queue, CommandBuffer, std::move(Dependencies)); + auto NewCmd = std::make_unique(std::move(CommandGroup), Queue, + EventNeeded, CommandBuffer, + std::move(Dependencies)); if (!NewCmd) throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY); @@ -1350,7 +1351,8 @@ Command *Scheduler::GraphBuilder::connectDepEvent( CG::CodeplayHostTask, /* Payload */ {})); ConnectCmd = new ExecCGCommand( - std::move(ConnectCG), Scheduler::getInstance().getDefaultHostQueue()); + std::move(ConnectCG), Scheduler::getInstance().getDefaultHostQueue(), + /*EventNeeded=*/true); } catch (const std::bad_alloc &) { throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY); } @@ -1624,8 +1626,8 @@ Scheduler::GraphBuilder::completeFusion(QueueImplPtr Queue, }), FusedEventDeps.end()); - auto FusedKernelCmd = - std::make_unique(std::move(FusedCG), Queue); + auto FusedKernelCmd = std::make_unique( + std::move(FusedCG), Queue, /*EventNeeded=*/true); // Inherit auxiliary resources from fused command groups Scheduler::getInstance().takeAuxiliaryResources(FusedKernelCmd->getEvent(), diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index 99975edb7d649..78fd300460526 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -94,7 +94,7 @@ void Scheduler::waitForRecordToFinish(MemObjRecord *Record, EventImplPtr Scheduler::addCG( std::unique_ptr CommandGroup, const QueueImplPtr &Queue, - sycl::detail::pi::PiExtCommandBuffer CommandBuffer, + bool EventNeeded, sycl::detail::pi::PiExtCommandBuffer CommandBuffer, const std::vector &Dependencies) { EventImplPtr NewEvent = nullptr; const CG::CGTYPE Type = CommandGroup->getType(); @@ -130,17 +130,18 @@ EventImplPtr Scheduler::addCG( NewEvent = NewCmd->getEvent(); break; case CG::CodeplayHostTask: { - auto Result = MGraphBuilder.addCG(std::move(CommandGroup), - DefaultHostQueue, AuxiliaryCmds); + auto Result = + MGraphBuilder.addCG(std::move(CommandGroup), DefaultHostQueue, + AuxiliaryCmds, EventNeeded); NewCmd = Result.NewCmd; NewEvent = Result.NewEvent; ShouldEnqueue = Result.ShouldEnqueue; break; } default: - auto Result = MGraphBuilder.addCG(std::move(CommandGroup), - std::move(Queue), AuxiliaryCmds, - CommandBuffer, std::move(Dependencies)); + auto Result = MGraphBuilder.addCG( + std::move(CommandGroup), std::move(Queue), AuxiliaryCmds, EventNeeded, + CommandBuffer, std::move(Dependencies)); NewCmd = Result.NewCmd; NewEvent = Result.NewEvent; diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index 9ce3d7d2a5f94..124fc1181116c 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -372,6 +372,7 @@ class Scheduler { /// /// \param CommandGroup is a unique_ptr to a command group to be added. /// \param Queue Queue that is registering the command-group. + /// \param EventNeeded Specifies whether an event is explicitly required. /// \param CommandBuffer Optional command buffer to enqueue to instead of /// directly to the queue. /// \param Dependencies Optional list of dependency @@ -379,6 +380,7 @@ class Scheduler { /// \return an event object to wait on for command group completion. EventImplPtr addCG(std::unique_ptr CommandGroup, const QueueImplPtr &Queue, + bool EventNeeded, sycl::detail::pi::PiExtCommandBuffer CommandBuffer = nullptr, const std::vector &Dependencies = {}); @@ -602,7 +604,7 @@ class Scheduler { /// processor right away or not. GraphBuildResult addCG( std::unique_ptr CommandGroup, const QueueImplPtr &Queue, - std::vector &ToEnqueue, + std::vector &ToEnqueue, bool EventNeeded, sycl::detail::pi::PiExtCommandBuffer CommandBuffer = nullptr, const std::vector &Dependencies = {}); diff --git a/sycl/source/enqueue_functions.cpp b/sycl/source/enqueue_functions.cpp new file mode 100644 index 0000000000000..b2e4f3f712f4b --- /dev/null +++ b/sycl/source/enqueue_functions.cpp @@ -0,0 +1,42 @@ +//==------ enqueue_functions.hpp ------- SYCL enqueue free functions -------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include + +namespace sycl { +inline namespace _V1 { +namespace ext::oneapi::experimental { + +__SYCL_EXPORT void memcpy(queue Q, void *Dest, const void *Src, size_t NumBytes, + const sycl::detail::code_location &CodeLoc) { + sycl::detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + auto QueueImplPtr = sycl::detail::getSyclObjImpl(Q); + QueueImplPtr->memcpy(QueueImplPtr, Dest, Src, NumBytes, {}, + /*CallerNeedsEvent=*/false, CodeLoc); +} + +__SYCL_EXPORT void memset(queue Q, void *Ptr, int Value, size_t NumBytes, + const sycl::detail::code_location &CodeLoc) { + sycl::detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + auto QueueImplPtr = sycl::detail::getSyclObjImpl(Q); + QueueImplPtr->memset(QueueImplPtr, Ptr, Value, NumBytes, {}, + /*CallerNeedsEvent=*/false); +} + +__SYCL_EXPORT void mem_advise(queue Q, void *Ptr, size_t NumBytes, int Advice, + const sycl::detail::code_location &CodeLoc) { + sycl::detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + auto QueueImplPtr = sycl::detail::getSyclObjImpl(Q); + QueueImplPtr->mem_advise(QueueImplPtr, Ptr, NumBytes, pi_mem_advice(Advice), + {}, /*CallerNeedsEvent=*/false); +} + +} // namespace ext::oneapi::experimental +} // namespace _V1 +} // namespace sycl diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index b16441e4ff146..10ce364310912 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -80,15 +80,29 @@ void *getValueFromDynamicParameter( } // namespace detail +/// TODO: Unused. Remove with ABI break. handler::handler(std::shared_ptr Queue, bool IsHost) - : handler(Queue, Queue, nullptr, IsHost) {} + : handler(Queue, IsHost, /*CallerNeedsEvent=*/true) {} +/// TODO: Unused. Remove with ABI break. handler::handler(std::shared_ptr Queue, std::shared_ptr PrimaryQueue, std::shared_ptr SecondaryQueue, bool IsHost) + : handler(Queue, PrimaryQueue, SecondaryQueue, IsHost, + /*CallerNeedsEvent=*/true) {} + +handler::handler(std::shared_ptr Queue, bool IsHost, + bool CallerNeedsEvent) + : handler(Queue, Queue, nullptr, IsHost, CallerNeedsEvent) {} + +handler::handler(std::shared_ptr Queue, + std::shared_ptr PrimaryQueue, + std::shared_ptr SecondaryQueue, + bool IsHost, bool CallerNeedsEvent) : MImpl(std::make_shared(std::move(PrimaryQueue), - std::move(SecondaryQueue))), + std::move(SecondaryQueue), + CallerNeedsEvent)), MQueue(std::move(Queue)), MIsHost(IsHost) {} handler::handler( @@ -297,8 +311,9 @@ event handler::finalize() { return Result; }; - bool DiscardEvent = false; - if (MQueue->supportsDiscardingPiEvents()) { + bool DiscardEvent = (MQueue->MDiscardEvents || !MImpl->MEventNeeded) && + MQueue->supportsDiscardingPiEvents(); + if (DiscardEvent) { // Kernel only uses assert if it's non interop one bool KernelUsesAssert = !(MKernel && MKernel->isInterop()) && @@ -311,6 +326,9 @@ event handler::finalize() { if (PI_SUCCESS != EnqueueKernel()) throw runtime_error("Enqueue process failed.", PI_ERROR_INVALID_OPERATION); + auto EventImpl = std::make_shared( + detail::event_impl::HES_Discarded); + MLastEvent = detail::createSyclObjFromImpl(EventImpl); } else { NewEvent = std::make_shared(MQueue); NewEvent->setWorkerQueue(MQueue); @@ -576,7 +594,7 @@ event handler::finalize() { } detail::EventImplPtr Event = detail::Scheduler::getInstance().addCG( - std::move(CommandGroup), std::move(MQueue)); + std::move(CommandGroup), std::move(MQueue), MImpl->MEventNeeded); MLastEvent = detail::createSyclObjFromImpl(Event); return MLastEvent; @@ -1781,5 +1799,7 @@ void handler::registerDynamicParameter( } MImpl->MDynamicParameters.emplace_back(ParamImpl.get(), ArgIndex); } + +bool handler::eventNeeded() const { return MImpl->MEventNeeded; } } // namespace _V1 } // namespace sycl diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index db3ce2f5cb1b3..9648431a5a429 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -106,39 +106,44 @@ void queue::throw_asynchronous() { impl->throw_asynchronous(); } event queue::memset(void *Ptr, int Value, size_t Count, const detail::code_location &CodeLoc) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - return impl->memset(impl, Ptr, Value, Count, {}); + return impl->memset(impl, Ptr, Value, Count, {}, /*CallerNeedsEvent=*/true); } event queue::memset(void *Ptr, int Value, size_t Count, event DepEvent, const detail::code_location &CodeLoc) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - return impl->memset(impl, Ptr, Value, Count, {DepEvent}); + return impl->memset(impl, Ptr, Value, Count, {DepEvent}, + /*CallerNeedsEvent=*/true); } event queue::memset(void *Ptr, int Value, size_t Count, const std::vector &DepEvents, const detail::code_location &CodeLoc) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - return impl->memset(impl, Ptr, Value, Count, DepEvents); + return impl->memset(impl, Ptr, Value, Count, DepEvents, + /*CallerNeedsEvent=*/true); } event queue::memcpy(void *Dest, const void *Src, size_t Count, const detail::code_location &CodeLoc) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - return impl->memcpy(impl, Dest, Src, Count, {}, CodeLoc); + return impl->memcpy(impl, Dest, Src, Count, {}, /*CallerNeedsEvent=*/true, + CodeLoc); } event queue::memcpy(void *Dest, const void *Src, size_t Count, event DepEvent, const detail::code_location &CodeLoc) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - return impl->memcpy(impl, Dest, Src, Count, {DepEvent}, CodeLoc); + return impl->memcpy(impl, Dest, Src, Count, {DepEvent}, + /*CallerNeedsEvent=*/true, CodeLoc); } event queue::memcpy(void *Dest, const void *Src, size_t Count, const std::vector &DepEvents, const detail::code_location &CodeLoc) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - return impl->memcpy(impl, Dest, Src, Count, DepEvents, CodeLoc); + return impl->memcpy(impl, Dest, Src, Count, DepEvents, + /*CallerNeedsEvent=*/true, CodeLoc); } event queue::mem_advise(const void *Ptr, size_t Length, pi_mem_advice Advice, @@ -150,20 +155,23 @@ event queue::mem_advise(const void *Ptr, size_t Length, pi_mem_advice Advice, event queue::mem_advise(const void *Ptr, size_t Length, int Advice, const detail::code_location &CodeLoc) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - return impl->mem_advise(impl, Ptr, Length, pi_mem_advice(Advice), {}); + return impl->mem_advise(impl, Ptr, Length, pi_mem_advice(Advice), {}, + /*CallerNeedsEvent=*/true); } event queue::mem_advise(const void *Ptr, size_t Length, int Advice, event DepEvent, const detail::code_location &CodeLoc) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - return impl->mem_advise(impl, Ptr, Length, pi_mem_advice(Advice), {DepEvent}); + return impl->mem_advise(impl, Ptr, Length, pi_mem_advice(Advice), {DepEvent}, + /*CallerNeedsEvent=*/true); } event queue::mem_advise(const void *Ptr, size_t Length, int Advice, const std::vector &DepEvents, const detail::code_location &CodeLoc) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - return impl->mem_advise(impl, Ptr, Length, pi_mem_advice(Advice), DepEvents); + return impl->mem_advise(impl, Ptr, Length, pi_mem_advice(Advice), DepEvents, + /*CallerNeedsEvent=*/true); } event queue::discard_or_return(const event &Event) { @@ -184,6 +192,11 @@ event queue::submit_impl(std::function CGH, queue SecondQueue, return impl->submit(CGH, impl, SecondQueue.impl, CodeLoc); } +void queue::submit_without_event_impl(std::function CGH, + const detail::code_location &CodeLoc) { + return impl->submit_without_event(CGH, impl, CodeLoc); +} + event queue::submit_impl_and_postprocess( std::function CGH, const detail::code_location &CodeLoc, const SubmitPostProcessF &PostProcess) { @@ -225,8 +238,13 @@ getBarrierEventForInorderQueueHelper(const detail::QueueImplPtr QueueImpl) { /// \return a SYCL event object, which corresponds to the queue the command /// group is being enqueued on. event queue::ext_oneapi_submit_barrier(const detail::code_location &CodeLoc) { - if (is_in_order() && !impl->getCommandGraph() && !impl->MIsProfilingEnabled) - return getBarrierEventForInorderQueueHelper(impl); + if (is_in_order() && !impl->getCommandGraph() && !impl->MDiscardEvents && + !impl->MIsProfilingEnabled) { + event InOrderLastEvent = getBarrierEventForInorderQueueHelper(impl); + // If the last event was discarded, fall back to enqueuing a barrier. + if (!detail::getSyclObjImpl(InOrderLastEvent)->isDiscarded()) + return InOrderLastEvent; + } return submit([=](handler &CGH) { CGH.ext_oneapi_barrier(); }, CodeLoc); } @@ -247,9 +265,13 @@ event queue::ext_oneapi_submit_barrier(const std::vector &WaitList, auto EventImpl = detail::getSyclObjImpl(Event); return !EventImpl->isContextInitialized() || EventImpl->isNOP(); }); - if (is_in_order() && !impl->getCommandGraph() && !impl->MIsProfilingEnabled && - AllEventsEmptyOrNop) - return getBarrierEventForInorderQueueHelper(impl); + if (is_in_order() && !impl->getCommandGraph() && !impl->MDiscardEvents && + !impl->MIsProfilingEnabled && AllEventsEmptyOrNop) { + event InOrderLastEvent = getBarrierEventForInorderQueueHelper(impl); + // If the last event was discarded, fall back to enqueuing a barrier. + if (!detail::getSyclObjImpl(InOrderLastEvent)->isDiscarded()) + return InOrderLastEvent; + } return submit([=](handler &CGH) { CGH.ext_oneapi_barrier(WaitList); }, CodeLoc); @@ -321,7 +343,7 @@ event queue::memcpyToDeviceGlobal(void *DeviceGlobalPtr, const void *Src, const std::vector &DepEvents) { return impl->memcpyToDeviceGlobal(impl, DeviceGlobalPtr, Src, IsDeviceImageScope, NumBytes, Offset, - DepEvents); + DepEvents, /*CallerNeedsEvent=*/true); } event queue::memcpyFromDeviceGlobal(void *Dest, const void *DeviceGlobalPtr, @@ -330,7 +352,7 @@ event queue::memcpyFromDeviceGlobal(void *Dest, const void *DeviceGlobalPtr, const std::vector &DepEvents) { return impl->memcpyFromDeviceGlobal(impl, Dest, DeviceGlobalPtr, IsDeviceImageScope, NumBytes, Offset, - DepEvents); + DepEvents, /*CallerNeedsEvent=*/true); } bool queue::device_has(aspect Aspect) const { diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 6391a69978a56..2c97a01f87da7 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -2988,6 +2988,7 @@ _ZN4sycl3_V13ext5intel12experimental15online_compilerILNS3_15source_languageE0EE _ZN4sycl3_V13ext5intel12experimental15online_compilerILNS3_15source_languageE1EE7compileIJSt6vectorINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEESaISE_EEEEES8_IhSaIhEERKSE_DpRKT_ _ZN4sycl3_V13ext5intel12experimental9pipe_base13get_pipe_nameB5cxx11EPKv _ZN4sycl3_V13ext5intel12experimental9pipe_base17wait_non_blockingERKNS0_5eventE +_ZN4sycl3_V13ext6oneapi12experimental10mem_adviseENS0_5queueEPvmiRKNS0_6detail13code_locationE _ZN4sycl3_V13ext6oneapi10level_zero6detail11make_deviceERKNS0_8platformEm _ZN4sycl3_V13ext6oneapi12experimental12create_imageENS3_16image_mem_handleERKNS3_16image_descriptorERKNS0_5queueE _ZN4sycl3_V13ext6oneapi12experimental12create_imageENS3_16image_mem_handleERKNS3_16image_descriptorERKNS0_6deviceERKNS0_7contextE @@ -3088,6 +3089,8 @@ _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graphC1ERKNS0_5 _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graphC1ERKNS0_7contextERKNS0_6deviceERKNS0_13property_listE _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graphC2ERKNS0_5queueERKNS0_13property_listE _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graphC2ERKNS0_7contextERKNS0_6deviceERKNS0_13property_listE +_ZN4sycl3_V13ext6oneapi12experimental6memcpyENS0_5queueEPvPKvmRKNS0_6detail13code_locationE +_ZN4sycl3_V13ext6oneapi12experimental6memsetENS0_5queueEPvimRKNS0_6detail13code_locationE _ZN4sycl3_V13ext6oneapi12experimental9image_memC1ERKNS3_16image_descriptorERKNS0_5queueE _ZN4sycl3_V13ext6oneapi12experimental9image_memC1ERKNS3_16image_descriptorERKNS0_6deviceERKNS0_7contextE _ZN4sycl3_V13ext6oneapi12experimental9image_memC2ERKNS3_16image_descriptorERKNS0_5queueE @@ -3127,6 +3130,7 @@ _ZN4sycl3_V15queue20wait_and_throw_proxyERKNS0_6detail13code_locationE _ZN4sycl3_V15queue22memcpyFromDeviceGlobalEPvPKvbmmRKSt6vectorINS0_5eventESaIS6_EE _ZN4sycl3_V15queue25ext_oneapi_submit_barrierERKNS0_6detail13code_locationE _ZN4sycl3_V15queue25ext_oneapi_submit_barrierERKSt6vectorINS0_5eventESaIS3_EERKNS0_6detail13code_locationE +_ZN4sycl3_V15queue25submit_without_event_implESt8functionIFvRNS0_7handlerEEERKNS0_6detail13code_locationE _ZN4sycl3_V15queue27submit_impl_and_postprocessESt8functionIFvRNS0_7handlerEEERKNS0_6detail13code_locationERKS2_IFvbbRNS0_5eventEEE _ZN4sycl3_V15queue27submit_impl_and_postprocessESt8functionIFvRNS0_7handlerEEES1_RKNS0_6detail13code_locationERKS2_IFvbbRNS0_5eventEEE _ZN4sycl3_V15queue29ext_oneapi_set_external_eventERKNS0_5eventE @@ -3526,10 +3530,14 @@ _ZN4sycl3_V17handler8finalizeEv _ZN4sycl3_V17handler8prefetchEPKvm _ZN4sycl3_V17handlerC1ESt10shared_ptrINS0_3ext6oneapi12experimental6detail10graph_implEE _ZN4sycl3_V17handlerC1ESt10shared_ptrINS0_6detail10queue_implEES5_S5_b +_ZN4sycl3_V17handlerC1ESt10shared_ptrINS0_6detail10queue_implEES5_S5_bb _ZN4sycl3_V17handlerC1ESt10shared_ptrINS0_6detail10queue_implEEb +_ZN4sycl3_V17handlerC1ESt10shared_ptrINS0_6detail10queue_implEEbb _ZN4sycl3_V17handlerC2ESt10shared_ptrINS0_3ext6oneapi12experimental6detail10graph_implEE _ZN4sycl3_V17handlerC2ESt10shared_ptrINS0_6detail10queue_implEES5_S5_b +_ZN4sycl3_V17handlerC2ESt10shared_ptrINS0_6detail10queue_implEES5_S5_bb _ZN4sycl3_V17handlerC2ESt10shared_ptrINS0_6detail10queue_implEEb +_ZN4sycl3_V17handlerC2ESt10shared_ptrINS0_6detail10queue_implEEbb _ZN4sycl3_V17samplerC1ENS0_29coordinate_normalization_modeENS0_15addressing_modeENS0_14filtering_modeERKNS0_13property_listE _ZN4sycl3_V17samplerC1EP11_cl_samplerRKNS0_7contextE _ZN4sycl3_V17samplerC2ENS0_29coordinate_normalization_modeENS0_15addressing_modeENS0_14filtering_modeERKNS0_13property_listE @@ -4108,6 +4116,7 @@ _ZNK4sycl3_V17context8get_infoINS0_4info7context32atomic_memory_scope_capabiliti _ZNK4sycl3_V17context8get_infoINS0_4info7context7devicesEEENS0_6detail20is_context_info_descIT_E11return_typeEv _ZNK4sycl3_V17context8get_infoINS0_4info7context8platformEEENS0_6detail20is_context_info_descIT_E11return_typeEv _ZNK4sycl3_V17context9getNativeEv +_ZNK4sycl3_V17handler11eventNeededEv _ZNK4sycl3_V17handler15getCommandGraphEv _ZNK4sycl3_V17handler17getContextImplPtrEv _ZNK4sycl3_V17handler27isStateExplicitKernelBundleEv diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index bcfdab110778d..f27a5bbab639c 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -570,7 +570,9 @@ ??0half@host_half_impl@detail@_V1@sycl@@QEAA@G@Z ??0handler@_V1@sycl@@AEAA@V?$shared_ptr@Vgraph_impl@detail@experimental@oneapi@ext@_V1@sycl@@@std@@@Z ??0handler@_V1@sycl@@AEAA@V?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@00_N@Z +??0handler@_V1@sycl@@AEAA@V?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@00_N1@Z ??0handler@_V1@sycl@@AEAA@V?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_N@Z +??0handler@_V1@sycl@@AEAA@V?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_N1@Z ??0host_selector@_V1@sycl@@QEAA@$$QEAV012@@Z ??0host_selector@_V1@sycl@@QEAA@AEBV012@@Z ??0host_selector@_V1@sycl@@QEAA@XZ @@ -3968,6 +3970,7 @@ ?end_recording@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEAAXAEAVqueue@67@@Z ?end_recording@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEAAXAEBV?$vector@Vqueue@_V1@sycl@@V?$allocator@Vqueue@_V1@sycl@@@std@@@std@@@Z ?end_recording@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEAAXXZ +?eventNeeded@handler@_V1@sycl@@AEBA_NXZ ?ext_codeplay_supports_fusion@queue@_V1@sycl@@QEBA_NXZ ?ext_intel_read_host_pipe@handler@_V1@sycl@@AEAAXAEBV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@PEAX_K_N@Z ?ext_intel_read_host_pipe@handler@_V1@sycl@@AEAAXVstring_view@detail@23@PEAX_K_N@Z @@ -4329,11 +4332,13 @@ ?map_external_memory_array@experimental@oneapi@ext@_V1@sycl@@YA?AUimage_mem_handle@12345@Uinterop_mem_handle@12345@AEBUimage_descriptor@12345@AEBVdevice@45@AEBVcontext@45@@Z ?map_external_memory_array@experimental@oneapi@ext@_V1@sycl@@YA?AUimage_mem_handle@12345@Uinterop_mem_handle@12345@AEBUimage_descriptor@12345@AEBVqueue@45@@Z ?markBufferAsInternal@detail@_V1@sycl@@YAXAEBV?$shared_ptr@Vbuffer_impl@detail@_V1@sycl@@@std@@@Z +?mem_advise@experimental@oneapi@ext@_V1@sycl@@YAXVqueue@45@PEAX_KHAEBUcode_location@detail@45@@Z ?mem_advise@handler@_V1@sycl@@QEAAXPEBX_KH@Z ?mem_advise@queue@_V1@sycl@@QEAA?AVevent@23@PEBX_KHAEBUcode_location@detail@23@@Z ?mem_advise@queue@_V1@sycl@@QEAA?AVevent@23@PEBX_KHAEBV?$vector@Vevent@_V1@sycl@@V?$allocator@Vevent@_V1@sycl@@@std@@@std@@AEBUcode_location@detail@23@@Z ?mem_advise@queue@_V1@sycl@@QEAA?AVevent@23@PEBX_KHV423@AEBUcode_location@detail@23@@Z ?mem_advise@queue@_V1@sycl@@QEAA?AVevent@23@PEBX_KW4_pi_mem_advice@@AEBUcode_location@detail@23@@Z +?memcpy@experimental@oneapi@ext@_V1@sycl@@YAXVqueue@45@PEAXPEBX_KAEBUcode_location@detail@45@@Z ?memcpy@handler@_V1@sycl@@QEAAXPEAXPEBX_K@Z ?memcpy@queue@_V1@sycl@@QEAA?AVevent@23@PEAXPEBX_KAEBUcode_location@detail@23@@Z ?memcpy@queue@_V1@sycl@@QEAA?AVevent@23@PEAXPEBX_KAEBV?$vector@Vevent@_V1@sycl@@V?$allocator@Vevent@_V1@sycl@@@std@@@std@@AEBUcode_location@detail@23@@Z @@ -4344,6 +4349,7 @@ ?memcpyToDeviceGlobal@handler@_V1@sycl@@AEAAXPEBX0_N_K2@Z ?memcpyToDeviceGlobal@queue@_V1@sycl@@AEAA?AVevent@23@PEAXPEBX_N_K3AEBV?$vector@Vevent@_V1@sycl@@V?$allocator@Vevent@_V1@sycl@@@std@@@std@@@Z ?memcpyToHostOnlyDeviceGlobal@handler@_V1@sycl@@AEAAXPEBX0_K_N11@Z +?memset@experimental@oneapi@ext@_V1@sycl@@YAXVqueue@45@PEAXH_KAEBUcode_location@detail@45@@Z ?memset@handler@_V1@sycl@@QEAAXPEAXH_K@Z ?memset@queue@_V1@sycl@@QEAA?AVevent@23@PEAXH_KAEBUcode_location@detail@23@@Z ?memset@queue@_V1@sycl@@QEAA?AVevent@23@PEAXH_KAEBV?$vector@Vevent@_V1@sycl@@V?$allocator@Vevent@_V1@sycl@@@std@@@std@@AEBUcode_location@detail@23@@Z @@ -4429,6 +4435,7 @@ ?submit_impl@queue@_V1@sycl@@AEAA?AVevent@23@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@V123@AEBUcode_location@detail@23@@Z ?submit_impl_and_postprocess@queue@_V1@sycl@@AEAA?AVevent@23@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@AEBUcode_location@detail@23@AEBV?$function@$$A6AX_N0AEAVevent@_V1@sycl@@@Z@6@@Z ?submit_impl_and_postprocess@queue@_V1@sycl@@AEAA?AVevent@23@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@V123@AEBUcode_location@detail@23@AEBV?$function@$$A6AX_N0AEAVevent@_V1@sycl@@@Z@6@@Z +?submit_without_event_impl@queue@_V1@sycl@@AEAAXV?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@AEBUcode_location@detail@23@@Z ?supportsUSMFill2D@handler@_V1@sycl@@AEAA_NXZ ?supportsUSMMemcpy2D@handler@_V1@sycl@@AEAA_NXZ ?supportsUSMMemset2D@handler@_V1@sycl@@AEAA_NXZ diff --git a/sycl/unittests/Extensions/CMakeLists.txt b/sycl/unittests/Extensions/CMakeLists.txt index 491fa49225a81..ee39a80625d85 100644 --- a/sycl/unittests/Extensions/CMakeLists.txt +++ b/sycl/unittests/Extensions/CMakeLists.txt @@ -10,6 +10,8 @@ add_sycl_unittest(ExtensionsTests OBJECT USMP2P.cpp CompositeDevice.cpp OneAPIProd.cpp + EnqueueFunctionsEvents.cpp + DiscardEvent.cpp ) add_subdirectory(CommandGraph) diff --git a/sycl/unittests/Extensions/DiscardEvent.cpp b/sycl/unittests/Extensions/DiscardEvent.cpp new file mode 100644 index 0000000000000..dc729c74084e0 --- /dev/null +++ b/sycl/unittests/Extensions/DiscardEvent.cpp @@ -0,0 +1,83 @@ +//==------------------------- DiscardEvent.cpp -----------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include + +#include + +#include +#include +#include + +using namespace sycl; + +namespace oneapiext = ext::oneapi::experimental; + +namespace { + +thread_local size_t counter_piEnqueueKernelLaunch = 0; +inline pi_result redefined_piEnqueueKernelLaunch(pi_queue, pi_kernel, pi_uint32, + const size_t *, const size_t *, + const size_t *, pi_uint32, + const pi_event *, + pi_event *event) { + ++counter_piEnqueueKernelLaunch; + EXPECT_EQ(event, nullptr); + return PI_SUCCESS; +} + +thread_local size_t counter_piEnqueueEventsWaitWithBarrier = 0; +thread_local std::chrono::time_point + timestamp_piEnqueueEventsWaitWithBarrier; +inline pi_result after_piEnqueueEventsWaitWithBarrier(pi_queue, pi_uint32, + const pi_event *, + pi_event *) { + ++counter_piEnqueueEventsWaitWithBarrier; + timestamp_piEnqueueEventsWaitWithBarrier = std::chrono::steady_clock::now(); + return PI_SUCCESS; +} + +class DiscardEventTests : public ::testing::Test { +public: + DiscardEventTests() + : Mock{}, Q{context(Mock.getPlatform()), default_selector_v, + property::queue::in_order{}} {} + +protected: + void SetUp() override { + counter_piEnqueueKernelLaunch = 0; + counter_piEnqueueEventsWaitWithBarrier = 0; + } + + unittest::PiMock Mock; + queue Q; +}; + +TEST_F(DiscardEventTests, BarrierBeforeHostTask) { + // Special test for case where host_task need an event after, so a barrier is + // enqueued to create a usable event. + Mock.redefine( + redefined_piEnqueueKernelLaunch); + Mock.redefineAfter( + after_piEnqueueEventsWaitWithBarrier); + + oneapiext::single_task>(Q, []() {}); + + std::chrono::time_point HostTaskTimestamp; + Q.submit([&](handler &CGH) { + CGH.host_task( + [&]() { HostTaskTimestamp = std::chrono::steady_clock::now(); }); + }).wait(); + + ASSERT_EQ(counter_piEnqueueKernelLaunch, size_t{1}); + ASSERT_EQ(counter_piEnqueueEventsWaitWithBarrier, size_t{1}); + ASSERT_TRUE(HostTaskTimestamp > timestamp_piEnqueueEventsWaitWithBarrier); +} + +} // namespace diff --git a/sycl/unittests/Extensions/EnqueueFunctionsEvents.cpp b/sycl/unittests/Extensions/EnqueueFunctionsEvents.cpp new file mode 100644 index 0000000000000..842e3cf271216 --- /dev/null +++ b/sycl/unittests/Extensions/EnqueueFunctionsEvents.cpp @@ -0,0 +1,474 @@ +//==-------------------- EnqueueFunctionsEvents.cpp ------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Tests the behavior of enqueue free functions when events can be discarded. + +#include +#include + +#include + +#include +#include +#include +#include + +using namespace sycl; + +namespace oneapiext = ext::oneapi::experimental; + +namespace { + +inline pi_result after_piKernelGetInfo(pi_kernel kernel, + pi_kernel_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) { + constexpr char MockKernel[] = "TestKernel"; + if (param_name == PI_KERNEL_INFO_FUNCTION_NAME) { + if (param_value) { + assert(param_value_size == sizeof(MockKernel)); + std::memcpy(param_value, MockKernel, sizeof(MockKernel)); + } + if (param_value_size_ret) + *param_value_size_ret = sizeof(MockKernel); + } + return PI_SUCCESS; +} + +thread_local size_t counter_piEnqueueKernelLaunch = 0; +inline pi_result redefined_piEnqueueKernelLaunch(pi_queue, pi_kernel, pi_uint32, + const size_t *, const size_t *, + const size_t *, pi_uint32, + const pi_event *, + pi_event *event) { + ++counter_piEnqueueKernelLaunch; + EXPECT_EQ(event, nullptr); + return PI_SUCCESS; +} + +thread_local size_t counter_piextUSMEnqueueMemcpy = 0; +inline pi_result redefined_piextUSMEnqueueMemcpy(pi_queue, pi_bool, void *, + const void *, size_t, + pi_uint32, const pi_event *, + pi_event *event) { + ++counter_piextUSMEnqueueMemcpy; + EXPECT_EQ(event, nullptr); + return PI_SUCCESS; +} + +thread_local size_t counter_piextUSMEnqueueMemset = 0; +inline pi_result redefined_piextUSMEnqueueMemset(pi_queue, void *, pi_int32, + size_t, pi_uint32, + const pi_event *, + pi_event *event) { + ++counter_piextUSMEnqueueMemset; + EXPECT_EQ(event, nullptr); + return PI_SUCCESS; +} + +thread_local size_t counter_piextUSMEnqueuePrefetch = 0; +inline pi_result redefined_piextUSMEnqueuePrefetch(pi_queue, const void *, + size_t, + pi_usm_migration_flags, + pi_uint32, const pi_event *, + pi_event *event) { + ++counter_piextUSMEnqueuePrefetch; + EXPECT_EQ(event, nullptr); + return PI_SUCCESS; +} + +thread_local size_t counter_piextUSMEnqueueMemAdvise = 0; +inline pi_result redefined_piextUSMEnqueueMemAdvise(pi_queue, const void *, + size_t, pi_mem_advice, + pi_event *event) { + ++counter_piextUSMEnqueueMemAdvise; + EXPECT_EQ(event, nullptr); + return PI_SUCCESS; +} + +thread_local size_t counter_piEnqueueEventsWaitWithBarrier = 0; +thread_local std::chrono::time_point + timestamp_piEnqueueEventsWaitWithBarrier; +inline pi_result after_piEnqueueEventsWaitWithBarrier(pi_queue, pi_uint32, + const pi_event *, + pi_event *) { + ++counter_piEnqueueEventsWaitWithBarrier; + timestamp_piEnqueueEventsWaitWithBarrier = std::chrono::steady_clock::now(); + return PI_SUCCESS; +} + +class EnqueueFunctionsEventsTests : public ::testing::Test { +public: + EnqueueFunctionsEventsTests() + : Mock{}, Q{context(Mock.getPlatform()), default_selector_v, + property::queue::in_order{}} {} + +protected: + void SetUp() override { + counter_piEnqueueKernelLaunch = 0; + counter_piextUSMEnqueueMemcpy = 0; + counter_piextUSMEnqueueMemset = 0; + counter_piextUSMEnqueuePrefetch = 0; + counter_piextUSMEnqueueMemAdvise = 0; + counter_piEnqueueEventsWaitWithBarrier = 0; + } + + unittest::PiMock Mock; + queue Q; +}; + +TEST_F(EnqueueFunctionsEventsTests, SubmitSingleTaskNoEvent) { + Mock.redefine( + redefined_piEnqueueKernelLaunch); + + oneapiext::submit(Q, [&](handler &CGH) { + oneapiext::single_task>(CGH, []() {}); + }); + + ASSERT_EQ(counter_piEnqueueKernelLaunch, size_t{1}); +} + +TEST_F(EnqueueFunctionsEventsTests, SingleTaskShortcutNoEvent) { + Mock.redefine( + redefined_piEnqueueKernelLaunch); + + oneapiext::single_task>(Q, []() {}); + + ASSERT_EQ(counter_piEnqueueKernelLaunch, size_t{1}); +} + +TEST_F(EnqueueFunctionsEventsTests, SubmitSingleTaskKernelNoEvent) { + Mock.redefine( + redefined_piEnqueueKernelLaunch); + Mock.redefineAfter(after_piKernelGetInfo); + + auto KID = get_kernel_id>(); + auto KB = get_kernel_bundle( + Q.get_context(), std::vector{KID}); + + ASSERT_TRUE(KB.has_kernel(KID)); + + auto Kernel = KB.get_kernel(KID); + oneapiext::submit(Q, + [&](handler &CGH) { oneapiext::single_task(CGH, Kernel); }); + + ASSERT_EQ(counter_piEnqueueKernelLaunch, size_t{1}); +} + +TEST_F(EnqueueFunctionsEventsTests, SingleTaskShortcutKernelNoEvent) { + Mock.redefine( + redefined_piEnqueueKernelLaunch); + Mock.redefineAfter(after_piKernelGetInfo); + + auto KID = get_kernel_id>(); + auto KB = get_kernel_bundle( + Q.get_context(), std::vector{KID}); + + ASSERT_TRUE(KB.has_kernel(KID)); + + auto Kernel = KB.get_kernel(KID); + + oneapiext::single_task(Q, Kernel); + + ASSERT_EQ(counter_piEnqueueKernelLaunch, size_t{1}); +} + +TEST_F(EnqueueFunctionsEventsTests, SubmitRangeParallelForNoEvent) { + Mock.redefine( + redefined_piEnqueueKernelLaunch); + + oneapiext::submit(Q, [&](handler &CGH) { + oneapiext::parallel_for>(CGH, range<1>{32}, [](item<1>) {}); + }); + + ASSERT_EQ(counter_piEnqueueKernelLaunch, size_t{1}); +} + +TEST_F(EnqueueFunctionsEventsTests, RangeParallelForShortcutNoEvent) { + Mock.redefine( + redefined_piEnqueueKernelLaunch); + + oneapiext::parallel_for>(Q, range<1>{32}, [](item<1>) {}); + + ASSERT_EQ(counter_piEnqueueKernelLaunch, size_t{1}); +} + +TEST_F(EnqueueFunctionsEventsTests, SubmitRangeParallelForKernelNoEvent) { + Mock.redefine( + redefined_piEnqueueKernelLaunch); + Mock.redefineAfter(after_piKernelGetInfo); + + auto KID = get_kernel_id>(); + auto KB = get_kernel_bundle( + Q.get_context(), std::vector{KID}); + + ASSERT_TRUE(KB.has_kernel(KID)); + + auto Kernel = KB.get_kernel(KID); + oneapiext::submit(Q, [&](handler &CGH) { + oneapiext::parallel_for(CGH, range<1>{32}, Kernel); + }); + + ASSERT_EQ(counter_piEnqueueKernelLaunch, size_t{1}); +} + +TEST_F(EnqueueFunctionsEventsTests, RangeParallelForShortcutKernelNoEvent) { + Mock.redefine( + redefined_piEnqueueKernelLaunch); + Mock.redefineAfter(after_piKernelGetInfo); + + auto KID = get_kernel_id>(); + auto KB = get_kernel_bundle( + Q.get_context(), std::vector{KID}); + + ASSERT_TRUE(KB.has_kernel(KID)); + + auto Kernel = KB.get_kernel(KID); + + oneapiext::parallel_for(Q, range<1>{32}, Kernel); + + ASSERT_EQ(counter_piEnqueueKernelLaunch, size_t{1}); +} + +TEST_F(EnqueueFunctionsEventsTests, SubmitNDLaunchNoEvent) { + Mock.redefine( + redefined_piEnqueueKernelLaunch); + + oneapiext::submit(Q, [&](handler &CGH) { + oneapiext::nd_launch>( + CGH, nd_range<1>{range<1>{32}, range<1>{32}}, [](nd_item<1>) {}); + }); + + ASSERT_EQ(counter_piEnqueueKernelLaunch, size_t{1}); +} + +TEST_F(EnqueueFunctionsEventsTests, NDLaunchShortcutNoEvent) { + Mock.redefine( + redefined_piEnqueueKernelLaunch); + + oneapiext::nd_launch>(Q, nd_range<1>{range<1>{32}, range<1>{32}}, + [](nd_item<1>) {}); + + ASSERT_EQ(counter_piEnqueueKernelLaunch, size_t{1}); +} + +TEST_F(EnqueueFunctionsEventsTests, SubmitNDLaunchKernelNoEvent) { + Mock.redefine( + redefined_piEnqueueKernelLaunch); + Mock.redefineAfter(after_piKernelGetInfo); + + auto KID = get_kernel_id>(); + auto KB = get_kernel_bundle( + Q.get_context(), std::vector{KID}); + + ASSERT_TRUE(KB.has_kernel(KID)); + + auto Kernel = KB.get_kernel(KID); + oneapiext::submit(Q, [&](handler &CGH) { + oneapiext::nd_launch(CGH, nd_range<1>{range<1>{32}, range<1>{32}}, Kernel); + }); + + ASSERT_EQ(counter_piEnqueueKernelLaunch, size_t{1}); +} + +TEST_F(EnqueueFunctionsEventsTests, NDLaunchShortcutKernelNoEvent) { + Mock.redefine( + redefined_piEnqueueKernelLaunch); + Mock.redefineAfter(after_piKernelGetInfo); + + auto KID = get_kernel_id>(); + auto KB = get_kernel_bundle( + Q.get_context(), std::vector{KID}); + + ASSERT_TRUE(KB.has_kernel(KID)); + + auto Kernel = KB.get_kernel(KID); + + oneapiext::nd_launch(Q, nd_range<1>{range<1>{32}, range<1>{32}}, Kernel); + + ASSERT_EQ(counter_piEnqueueKernelLaunch, size_t{1}); +} + +TEST_F(EnqueueFunctionsEventsTests, SubmitMemcpyNoEvent) { + Mock.redefine( + redefined_piextUSMEnqueueMemcpy); + + constexpr size_t N = 1024; + int *Src = malloc_shared(N, Q); + int *Dst = malloc_shared(N, Q); + + oneapiext::submit(Q, [&](handler &CGH) { + oneapiext::memcpy(CGH, Src, Dst, sizeof(int) * N); + }); + + ASSERT_EQ(counter_piextUSMEnqueueMemcpy, size_t{1}); + + free(Src, Q); + free(Dst, Q); +} + +TEST_F(EnqueueFunctionsEventsTests, MemcpyShortcutNoEvent) { + Mock.redefine( + redefined_piextUSMEnqueueMemcpy); + + constexpr size_t N = 1024; + int *Src = malloc_shared(N, Q); + int *Dst = malloc_shared(N, Q); + + oneapiext::memcpy(Q, Src, Dst, sizeof(int) * N); + + ASSERT_EQ(counter_piextUSMEnqueueMemcpy, size_t{1}); + + free(Src, Q); + free(Dst, Q); +} + +TEST_F(EnqueueFunctionsEventsTests, SubmitCopyNoEvent) { + Mock.redefine( + redefined_piextUSMEnqueueMemcpy); + + constexpr size_t N = 1024; + int *Src = malloc_shared(N, Q); + int *Dst = malloc_shared(N, Q); + + oneapiext::submit(Q, + [&](handler &CGH) { oneapiext::copy(CGH, Dst, Src, N); }); + + ASSERT_EQ(counter_piextUSMEnqueueMemcpy, size_t{1}); + + free(Src, Q); + free(Dst, Q); +} + +TEST_F(EnqueueFunctionsEventsTests, CopyShortcutNoEvent) { + Mock.redefine( + redefined_piextUSMEnqueueMemcpy); + + constexpr size_t N = 1024; + int *Src = malloc_shared(N, Q); + int *Dst = malloc_shared(N, Q); + + oneapiext::memcpy(Q, Dst, Src, N); + + ASSERT_EQ(counter_piextUSMEnqueueMemcpy, size_t{1}); + + free(Src, Q); + free(Dst, Q); +} + +TEST_F(EnqueueFunctionsEventsTests, SubmitMemsetNoEvent) { + Mock.redefine( + redefined_piextUSMEnqueueMemset); + + constexpr size_t N = 1024; + int *Dst = malloc_shared(N, Q); + + oneapiext::submit(Q, [&](handler &CGH) { + oneapiext::memset(CGH, Dst, int{1}, sizeof(int) * N); + }); + + ASSERT_EQ(counter_piextUSMEnqueueMemset, size_t{1}); + + free(Dst, Q); +} + +TEST_F(EnqueueFunctionsEventsTests, MemsetShortcutNoEvent) { + Mock.redefine( + redefined_piextUSMEnqueueMemset); + + constexpr size_t N = 1024; + int *Dst = malloc_shared(N, Q); + + oneapiext::memset(Q, Dst, 1, sizeof(int) * N); + + ASSERT_EQ(counter_piextUSMEnqueueMemset, size_t{1}); + + free(Dst, Q); +} + +TEST_F(EnqueueFunctionsEventsTests, SubmitPrefetchNoEvent) { + Mock.redefine( + redefined_piextUSMEnqueuePrefetch); + + constexpr size_t N = 1024; + int *Dst = malloc_shared(N, Q); + + oneapiext::submit( + Q, [&](handler &CGH) { oneapiext::prefetch(CGH, Dst, sizeof(int) * N); }); + + ASSERT_EQ(counter_piextUSMEnqueuePrefetch, size_t{1}); + + free(Dst, Q); +} + +TEST_F(EnqueueFunctionsEventsTests, PrefetchShortcutNoEvent) { + Mock.redefine( + redefined_piextUSMEnqueuePrefetch); + + constexpr size_t N = 1024; + int *Dst = malloc_shared(N, Q); + + oneapiext::prefetch(Q, Dst, sizeof(int) * N); + + ASSERT_EQ(counter_piextUSMEnqueuePrefetch, size_t{1}); + + free(Dst, Q); +} + +TEST_F(EnqueueFunctionsEventsTests, SubmitMemAdviseNoEvent) { + Mock.redefine( + redefined_piextUSMEnqueueMemAdvise); + + constexpr size_t N = 1024; + int *Dst = malloc_shared(N, Q); + + oneapiext::submit(Q, [&](handler &CGH) { + oneapiext::mem_advise(CGH, Dst, sizeof(int) * N, 1); + }); + + ASSERT_EQ(counter_piextUSMEnqueueMemAdvise, size_t{1}); + + free(Dst, Q); +} + +TEST_F(EnqueueFunctionsEventsTests, MemAdviseShortcutNoEvent) { + Mock.redefine( + redefined_piextUSMEnqueueMemAdvise); + + constexpr size_t N = 1024; + int *Dst = malloc_shared(N, Q); + + oneapiext::mem_advise(Q, Dst, sizeof(int) * N, 1); + + ASSERT_EQ(counter_piextUSMEnqueueMemAdvise, size_t{1}); + + free(Dst, Q); +} + +TEST_F(EnqueueFunctionsEventsTests, BarrierBeforeHostTask) { + // Special test for case where host_task need an event after, so a barrier is + // enqueued to create a usable event. + Mock.redefine( + redefined_piEnqueueKernelLaunch); + Mock.redefineAfter( + after_piEnqueueEventsWaitWithBarrier); + + oneapiext::single_task>(Q, []() {}); + + std::chrono::time_point HostTaskTimestamp; + Q.submit([&](handler &CGH) { + CGH.host_task( + [&]() { HostTaskTimestamp = std::chrono::steady_clock::now(); }); + }).wait(); + + ASSERT_EQ(counter_piEnqueueKernelLaunch, size_t{1}); + ASSERT_EQ(counter_piEnqueueEventsWaitWithBarrier, size_t{1}); + ASSERT_TRUE(HostTaskTimestamp > timestamp_piEnqueueEventsWaitWithBarrier); +} + +} // namespace diff --git a/sycl/unittests/helpers/PiMockPlugin.hpp b/sycl/unittests/helpers/PiMockPlugin.hpp index 56803e7eab5bb..ca29b9bd6aa1e 100644 --- a/sycl/unittests/helpers/PiMockPlugin.hpp +++ b/sycl/unittests/helpers/PiMockPlugin.hpp @@ -1023,7 +1023,8 @@ inline pi_result mock_piEnqueueKernelLaunch( const size_t *global_work_offset, const size_t *global_work_size, const size_t *local_work_size, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event) { - *event = createDummyHandle(); + if (event) + *event = createDummyHandle(); return PI_SUCCESS; } @@ -1032,7 +1033,8 @@ inline pi_result mock_piextEnqueueCooperativeKernelLaunch( const size_t *global_work_offset, const size_t *global_work_size, const size_t *local_work_size, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event) { - *event = createDummyHandle(); + if (event) + *event = createDummyHandle(); return PI_SUCCESS; } @@ -1040,14 +1042,16 @@ inline pi_result mock_piEnqueueEventsWait(pi_queue command_queue, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event) { - *event = createDummyHandle(); + if (event) + *event = createDummyHandle(); return PI_SUCCESS; } inline pi_result mock_piEnqueueEventsWaitWithBarrier( pi_queue command_queue, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event) { - *event = createDummyHandle(); + if (event) + *event = createDummyHandle(); return PI_SUCCESS; } @@ -1056,7 +1060,8 @@ mock_piEnqueueMemBufferRead(pi_queue queue, pi_mem buffer, pi_bool blocking_read, size_t offset, size_t size, void *ptr, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event) { - *event = createDummyHandle(); + if (event) + *event = createDummyHandle(); return PI_SUCCESS; } @@ -1067,7 +1072,8 @@ inline pi_result mock_piEnqueueMemBufferReadRect( size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, void *ptr, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event) { - *event = createDummyHandle(); + if (event) + *event = createDummyHandle(); return PI_SUCCESS; } @@ -1076,7 +1082,8 @@ mock_piEnqueueMemBufferWrite(pi_queue command_queue, pi_mem buffer, pi_bool blocking_write, size_t offset, size_t size, const void *ptr, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event) { - *event = createDummyHandle(); + if (event) + *event = createDummyHandle(); return PI_SUCCESS; } @@ -1087,7 +1094,8 @@ inline pi_result mock_piEnqueueMemBufferWriteRect( size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, const void *ptr, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event) { - *event = createDummyHandle(); + if (event) + *event = createDummyHandle(); return PI_SUCCESS; } @@ -1097,7 +1105,8 @@ mock_piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer, size_t dst_offset, size_t size, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event) { - *event = createDummyHandle(); + if (event) + *event = createDummyHandle(); return PI_SUCCESS; } @@ -1108,7 +1117,8 @@ inline pi_result mock_piEnqueueMemBufferCopyRect( size_t dst_row_pitch, size_t dst_slice_pitch, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event) { - *event = createDummyHandle(); + if (event) + *event = createDummyHandle(); return PI_SUCCESS; } @@ -1119,7 +1129,8 @@ inline pi_result mock_piEnqueueMemBufferFill(pi_queue command_queue, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event) { - *event = createDummyHandle(); + if (event) + *event = createDummyHandle(); return PI_SUCCESS; } @@ -1128,7 +1139,8 @@ inline pi_result mock_piEnqueueMemImageRead( pi_image_offset origin, pi_image_region region, size_t row_pitch, size_t slice_pitch, void *ptr, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event) { - *event = createDummyHandle(); + if (event) + *event = createDummyHandle(); return PI_SUCCESS; } @@ -1139,7 +1151,8 @@ mock_piEnqueueMemImageWrite(pi_queue command_queue, pi_mem image, size_t input_slice_pitch, const void *ptr, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event) { - *event = createDummyHandle(); + if (event) + *event = createDummyHandle(); return PI_SUCCESS; } @@ -1149,7 +1162,8 @@ mock_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image, pi_image_offset dst_origin, pi_image_region region, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event) { - *event = createDummyHandle(); + if (event) + *event = createDummyHandle(); return PI_SUCCESS; } @@ -1159,7 +1173,8 @@ mock_piEnqueueMemImageFill(pi_queue command_queue, pi_mem image, const size_t *region, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event) { - *event = createDummyHandle(); + if (event) + *event = createDummyHandle(); return PI_SUCCESS; } @@ -1170,7 +1185,8 @@ inline pi_result mock_piEnqueueMemBufferMap(pi_queue command_queue, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event, void **ret_map) { - *event = createDummyHandle(); + if (event) + *event = createDummyHandle(); auto parentDummyHandle = reinterpret_cast(buffer); *ret_map = (void *)(parentDummyHandle->MData); @@ -1182,7 +1198,8 @@ inline pi_result mock_piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event) { - *event = createDummyHandle(); + if (event) + *event = createDummyHandle(); return PI_SUCCESS; } @@ -1247,7 +1264,8 @@ inline pi_result mock_piextUSMEnqueueMemset(pi_queue queue, void *ptr, pi_uint32 num_events_in_waitlist, const pi_event *events_waitlist, pi_event *event) { - *event = createDummyHandle(); + if (event) + *event = createDummyHandle(); return PI_SUCCESS; } @@ -1257,7 +1275,8 @@ inline pi_result mock_piextUSMEnqueueMemcpy(pi_queue queue, pi_bool blocking, pi_uint32 num_events_in_waitlist, const pi_event *events_waitlist, pi_event *event) { - *event = createDummyHandle(); + if (event) + *event = createDummyHandle(); return PI_SUCCESS; } @@ -1267,7 +1286,8 @@ inline pi_result mock_piextUSMEnqueuePrefetch(pi_queue queue, const void *ptr, pi_uint32 num_events_in_waitlist, const pi_event *events_waitlist, pi_event *event) { - *event = createDummyHandle(); + if (event) + *event = createDummyHandle(); return PI_SUCCESS; } @@ -1275,7 +1295,8 @@ inline pi_result mock_piextUSMEnqueueMemAdvise(pi_queue queue, const void *ptr, size_t length, pi_mem_advice advice, pi_event *event) { - *event = createDummyHandle(); + if (event) + *event = createDummyHandle(); return PI_SUCCESS; } @@ -1318,7 +1339,8 @@ inline pi_result mock_piextEnqueueDeviceGlobalVariableWrite( pi_bool blocking_write, size_t count, size_t offset, const void *src, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event) { - *event = createDummyHandle(); + if (event) + *event = createDummyHandle(); return PI_SUCCESS; } @@ -1326,7 +1348,8 @@ inline pi_result mock_piextEnqueueDeviceGlobalVariableRead( pi_queue queue, pi_program program, const char *name, pi_bool blocking_read, size_t count, size_t offset, void *dst, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event) { - *event = createDummyHandle(); + if (event) + *event = createDummyHandle(); return PI_SUCCESS; } @@ -1526,7 +1549,8 @@ inline pi_result mock_piextEnqueueReadHostPipe( pi_queue queue, pi_program program, const char *pipe_symbol, pi_bool blocking, void *ptr, size_t size, pi_uint32 num_events_in_waitlist, const pi_event *events_waitlist, pi_event *event) { - *event = createDummyHandle(); + if (event) + *event = createDummyHandle(); return PI_SUCCESS; } @@ -1534,7 +1558,8 @@ inline pi_result mock_piextEnqueueWriteHostPipe( pi_queue queue, pi_program program, const char *pipe_symbol, pi_bool blocking, void *ptr, size_t size, pi_uint32 num_events_in_waitlist, const pi_event *events_waitlist, pi_event *event) { - *event = createDummyHandle(); + if (event) + *event = createDummyHandle(); return PI_SUCCESS; } diff --git a/sycl/unittests/program_manager/arg_mask/EliminatedArgMask.cpp b/sycl/unittests/program_manager/arg_mask/EliminatedArgMask.cpp index 35e353780d450..b4d4e7cdb7535 100644 --- a/sycl/unittests/program_manager/arg_mask/EliminatedArgMask.cpp +++ b/sycl/unittests/program_manager/arg_mask/EliminatedArgMask.cpp @@ -111,7 +111,7 @@ class MockHandler : public sycl::handler { public: MockHandler(std::shared_ptr Queue) - : sycl::handler(Queue, /* IsHost */ false) {} + : sycl::handler(Queue, /* IsHost */ false, /*CallerNeedsEvent*/ true) {} std::unique_ptr finalize() { auto CGH = static_cast(this); diff --git a/sycl/unittests/scheduler/AccessorDefaultCtor.cpp b/sycl/unittests/scheduler/AccessorDefaultCtor.cpp index 473ddcefe75fa..bf200ec660fb9 100644 --- a/sycl/unittests/scheduler/AccessorDefaultCtor.cpp +++ b/sycl/unittests/scheduler/AccessorDefaultCtor.cpp @@ -21,7 +21,8 @@ TEST_F(SchedulerTest, AccDefaultCtorDoesntAffectDepGraph) { std::vector ToEnqueue; - MockHandlerCustomFinalize MockCGH(QueueDevImpl, false); + MockHandlerCustomFinalize MockCGH(QueueDevImpl, false, + /*CallerNeedsEvent=*/true); sycl::accessor B; @@ -33,8 +34,8 @@ TEST_F(SchedulerTest, AccDefaultCtorDoesntAffectDepGraph) { std::unique_ptr CmdGroup = MockCGH.finalize(); - detail::Command *NewCmd = - MS.addCG(std::move(CmdGroup), QueueDevImpl, ToEnqueue); + detail::Command *NewCmd = MS.addCG(std::move(CmdGroup), QueueDevImpl, + ToEnqueue, /*EventNeeded=*/true); // if MDeps is empty, accessor built from default ctor does not affect // dependency graph in accordance with SYCL 2020 diff --git a/sycl/unittests/scheduler/Commands.cpp b/sycl/unittests/scheduler/Commands.cpp index a995800643421..bd0df10d1309a 100644 --- a/sycl/unittests/scheduler/Commands.cpp +++ b/sycl/unittests/scheduler/Commands.cpp @@ -81,6 +81,7 @@ TEST_F(SchedulerTest, WaitEmptyEventWithBarrier) { std::unique_ptr CommandGroup(new detail::CGBarrier( std::move(Arg), detail::CG::StorageInitHelper({}, {}, {}, {}, {}), detail::CG::CGTYPE::BarrierWaitlist, {})); - MS.Scheduler::addCG(std::move(CommandGroup), QueueImpl); + MS.Scheduler::addCG(std::move(CommandGroup), QueueImpl, + /*EventNeeded=*/true); } } diff --git a/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp b/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp index 414f58c6f177c..2e54057e434d6 100644 --- a/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp +++ b/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp @@ -60,7 +60,8 @@ class DependsOnTests : public ::testing::Test { std::vector ToEnqueue; // Emulating processing of command group function - MockHandlerCustomFinalize MockCGH(QueueDevImpl, false); + MockHandlerCustomFinalize MockCGH(QueueDevImpl, false, + /*CallerNeedsEvent=*/true); for (auto EventImpl : Events) MockCGH.depends_on(detail::createSyclObjFromImpl(EventImpl)); @@ -84,7 +85,7 @@ class DependsOnTests : public ::testing::Test { detail::Command *NewCmd = MS.addCG( std::move(CmdGroup), Type == TestCGType::HOST_TASK ? MS.getDefaultHostQueue() : QueueDevImpl, - ToEnqueue); + ToEnqueue, /*EventNeeded=*/true); EXPECT_EQ(ToEnqueue.size(), 0u); return NewCmd; } diff --git a/sycl/unittests/scheduler/GraphCleanup.cpp b/sycl/unittests/scheduler/GraphCleanup.cpp index 9bf4e37eea0db..8206728b2b221 100644 --- a/sycl/unittests/scheduler/GraphCleanup.cpp +++ b/sycl/unittests/scheduler/GraphCleanup.cpp @@ -106,7 +106,8 @@ static void checkCleanupOnEnqueue(MockScheduler &MS, /*SharedPtrStorage*/ {}, /*Requirements*/ {&MockReq}, /*Events*/ {}))}; - detail::EventImplPtr Event = MS.addCG(std::move(CG), QueueImpl); + detail::EventImplPtr Event = + MS.addCG(std::move(CG), QueueImpl, /*EventNeeded=*/true); auto *Cmd = static_cast(Event->getCommand()); verifyCleanup(Record, AllocaCmd, MockCmd, CommandDeleted); @@ -330,7 +331,8 @@ TEST_F(SchedulerTest, StreamBufferDeallocation) { AttachSchedulerWrapper AttachScheduler{MSPtr}; detail::EventImplPtr EventImplPtr; { - MockHandlerCustomFinalize MockCGH(QueueImplPtr, false); + MockHandlerCustomFinalize MockCGH(QueueImplPtr, false, + /*CallerNeedsEvent=*/true); kernel_bundle KernelBundle = sycl::get_kernel_bundle( QueueImplPtr->get_context()); @@ -341,7 +343,8 @@ TEST_F(SchedulerTest, StreamBufferDeallocation) { MockCGH.single_task>([] {}); std::unique_ptr CG = MockCGH.finalize(); - EventImplPtr = MSPtr->addCG(std::move(CG), QueueImplPtr); + EventImplPtr = + MSPtr->addCG(std::move(CG), QueueImplPtr, /*EventNeeded=*/true); } // The buffers should have been released with graph cleanup once the work is @@ -391,7 +394,8 @@ TEST_F(SchedulerTest, AuxiliaryResourcesDeallocation) { detail::EventImplPtr EventImplPtr; bool MockAuxResourceDeleted = false; { - MockHandlerCustomFinalize MockCGH(QueueImplPtr, false); + MockHandlerCustomFinalize MockCGH(QueueImplPtr, false, + /*CallerNeedsEvent=*/true); kernel_bundle KernelBundle = sycl::get_kernel_bundle( QueueImplPtr->get_context()); @@ -410,7 +414,8 @@ TEST_F(SchedulerTest, AuxiliaryResourcesDeallocation) { MockCGH.single_task>([] {}); std::unique_ptr CG = MockCGH.finalize(); - EventImplPtr = MSPtr->addCG(std::move(CG), QueueImplPtr); + EventImplPtr = + MSPtr->addCG(std::move(CG), QueueImplPtr, /*EventNeeded=*/true); } EventCompleted = false; diff --git a/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp b/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp index ab37a667fc90a..3f97ffb003adc 100644 --- a/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp +++ b/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp @@ -19,10 +19,21 @@ using namespace sycl; using ::testing::An; +class MockQueueImpl : public sycl::detail::queue_impl { +public: + MockQueueImpl(const sycl::detail::DeviceImplPtr &Device, + const sycl::async_handler &AsyncHandler, + const sycl::property_list &PropList) + : sycl::detail::queue_impl(Device, AsyncHandler, PropList) {} + using sycl::detail::queue_impl::finalizeHandler; +}; + // Define type with the only methods called by finalizeHandler class LimitedHandler { public: - LimitedHandler(sycl::detail::CG::CGTYPE CGType) : MCGType(CGType) {} + LimitedHandler(sycl::detail::CG::CGTYPE CGType, + std::shared_ptr Queue) + : MCGType(CGType), MQueue(Queue) {} virtual ~LimitedHandler() {} virtual void depends_on(const sycl::detail::EventImplPtr &) {} @@ -38,14 +49,16 @@ class LimitedHandler { sycl::detail::CG::CGTYPE getType() { return MCGType; } sycl::detail::CG::CGTYPE MCGType; + std::shared_ptr MQueue; }; // Needed to use EXPECT_CALL to verify depends_on that originally appends lst // event as dependency to the new CG class LimitedHandlerSimulation : public LimitedHandler { public: - LimitedHandlerSimulation(sycl::detail::CG::CGTYPE CGType) - : LimitedHandler(CGType) {} + LimitedHandlerSimulation(sycl::detail::CG::CGTYPE CGType, + std::shared_ptr Queue) + : LimitedHandler(CGType, Queue) {} MOCK_METHOD1(depends_on, void(const sycl::detail::EventImplPtr &)); MOCK_METHOD1(depends_on, void(event Event)); @@ -53,15 +66,6 @@ class LimitedHandlerSimulation : public LimitedHandler { void(const std::vector &Events)); }; -class MockQueueImpl : public sycl::detail::queue_impl { -public: - MockQueueImpl(const sycl::detail::DeviceImplPtr &Device, - const sycl::async_handler &AsyncHandler, - const sycl::property_list &PropList) - : sycl::detail::queue_impl(Device, AsyncHandler, PropList) {} - using sycl::detail::queue_impl::finalizeHandler; -}; - // Only check events dependency in queue_impl::finalizeHandler TEST_F(SchedulerTest, InOrderQueueSyncCheck) { sycl::unittest::PiMock Mock; @@ -76,13 +80,15 @@ TEST_F(SchedulerTest, InOrderQueueSyncCheck) { // previous task, this is needed to properly sync blocking & blocked tasks. sycl::event Event; { - LimitedHandlerSimulation MockCGH{detail::CG::CGTYPE::CodeplayHostTask}; + LimitedHandlerSimulation MockCGH{detail::CG::CGTYPE::CodeplayHostTask, + Queue}; EXPECT_CALL(MockCGH, depends_on(An())) .Times(0); Queue->finalizeHandler(MockCGH, Event); } { - LimitedHandlerSimulation MockCGH{detail::CG::CGTYPE::CodeplayHostTask}; + LimitedHandlerSimulation MockCGH{detail::CG::CGTYPE::CodeplayHostTask, + Queue}; EXPECT_CALL(MockCGH, depends_on(An())) .Times(1); Queue->finalizeHandler(MockCGH, Event); diff --git a/sycl/unittests/scheduler/KernelFusion.cpp b/sycl/unittests/scheduler/KernelFusion.cpp index 8b45c03e37f1f..1db16cbda1493 100644 --- a/sycl/unittests/scheduler/KernelFusion.cpp +++ b/sycl/unittests/scheduler/KernelFusion.cpp @@ -22,7 +22,8 @@ template detail::Command *CreateTaskCommand(MockScheduler &MS, detail::QueueImplPtr DevQueue, buffer &buf) { - MockHandlerCustomFinalize MockCGH(DevQueue, false); + MockHandlerCustomFinalize MockCGH(DevQueue, false, + /*CallerNeedsEvent=*/true); auto acc = buf.get_access(static_cast(MockCGH)); @@ -36,7 +37,8 @@ detail::Command *CreateTaskCommand(MockScheduler &MS, auto CmdGrp = MockCGH.finalize(); std::vector ToEnqueue; - detail::Command *NewCmd = MS.addCG(std::move(CmdGrp), DevQueue, ToEnqueue); + detail::Command *NewCmd = + MS.addCG(std::move(CmdGrp), DevQueue, ToEnqueue, /*EventNeeded=*/true); EXPECT_EQ(ToEnqueue.size(), 0u); return NewCmd; } diff --git a/sycl/unittests/scheduler/QueueFlushing.cpp b/sycl/unittests/scheduler/QueueFlushing.cpp index c97428b9d55c6..c27e4d672e0fa 100644 --- a/sycl/unittests/scheduler/QueueFlushing.cpp +++ b/sycl/unittests/scheduler/QueueFlushing.cpp @@ -147,7 +147,8 @@ TEST_F(SchedulerTest, QueueFlushing) { /*SharedPtrStorage*/ {}, /*Requirements*/ {}, /*Events*/ {}))}; - detail::ExecCGCommand ExecCGCmd{std::move(CG), QueueImplA}; + detail::ExecCGCommand ExecCGCmd{std::move(CG), QueueImplA, + /*EventNeeded=*/true}; MockReq.MDims = 1; (void)ExecCGCmd.addDep(detail::DepDesc(&AllocaCmd, &MockReq, &AllocaCmd), ToCleanUp); diff --git a/sycl/unittests/scheduler/SchedulerTestUtils.hpp b/sycl/unittests/scheduler/SchedulerTestUtils.hpp index 1d7fa2075d0da..b1c667c5c40ca 100644 --- a/sycl/unittests/scheduler/SchedulerTestUtils.hpp +++ b/sycl/unittests/scheduler/SchedulerTestUtils.hpp @@ -194,11 +194,12 @@ class MockScheduler : public sycl::detail::Scheduler { return MGraphBuilder.addEmptyCmd(Cmd, Reqs, Queue, Reason, ToEnqueue); } - sycl::detail::Command * - addCG(std::unique_ptr CommandGroup, - sycl::detail::QueueImplPtr Queue, - std::vector &ToEnqueue) { - return MGraphBuilder.addCG(std::move(CommandGroup), Queue, ToEnqueue) + sycl::detail::Command *addCG(std::unique_ptr CommandGroup, + sycl::detail::QueueImplPtr Queue, + std::vector &ToEnqueue, + bool EventNeeded) { + return MGraphBuilder + .addCG(std::move(CommandGroup), Queue, ToEnqueue, EventNeeded) .NewCmd; } @@ -225,8 +226,9 @@ sycl::detail::Requirement getMockRequirement(const MemObjT &MemObj) { class MockHandler : public sycl::handler { public: - MockHandler(std::shared_ptr Queue, bool IsHost) - : sycl::handler(Queue, IsHost) {} + MockHandler(std::shared_ptr Queue, bool IsHost, + bool CallerNeedsEvent) + : sycl::handler(Queue, IsHost, CallerNeedsEvent) {} // Methods using sycl::handler::addReduction; using sycl::handler::getType; @@ -292,8 +294,8 @@ class MockHandler : public sycl::handler { class MockHandlerCustomFinalize : public MockHandler { public: MockHandlerCustomFinalize(std::shared_ptr Queue, - bool IsHost) - : MockHandler(Queue, IsHost) {} + bool IsHost, bool CallerNeedsEvent) + : MockHandler(Queue, IsHost, CallerNeedsEvent) {} std::unique_ptr finalize() { std::unique_ptr CommandGroup; diff --git a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp index 4564028110341..7e76027c05431 100644 --- a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp +++ b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp @@ -20,8 +20,9 @@ inline constexpr auto DisableCleanupName = class MockHandlerStreamInit : public MockHandler { public: - MockHandlerStreamInit(std::shared_ptr Queue, bool IsHost) - : MockHandler(Queue, IsHost) {} + MockHandlerStreamInit(std::shared_ptr Queue, bool IsHost, + bool CallerNeedsEvent) + : MockHandler(Queue, IsHost, CallerNeedsEvent) {} std::unique_ptr finalize() { std::unique_ptr CommandGroup; switch (getType()) { @@ -85,7 +86,7 @@ TEST_F(SchedulerTest, StreamInitDependencyOnHost) { /*PropList=*/{})); // Emulating processing of command group function - MockHandlerStreamInit MockCGH(HQueueImpl, true); + MockHandlerStreamInit MockCGH(HQueueImpl, true, /*CallerNeedsEvent=*/true); MockCGH.setType(detail::CG::Kernel); auto EmptyKernel = [](sycl::nd_item<1>) {}; @@ -118,7 +119,8 @@ TEST_F(SchedulerTest, StreamInitDependencyOnHost) { MockScheduler MS; std::vector AuxCmds; - detail::Command *NewCmd = MS.addCG(std::move(MainCG), HQueueImpl, AuxCmds); + detail::Command *NewCmd = + MS.addCG(std::move(MainCG), HQueueImpl, AuxCmds, /*EventNeeded=*/true); ASSERT_TRUE(!!NewCmd) << "Failed to add command group into scheduler"; ASSERT_GT(NewCmd->MDeps.size(), 0u) << "No deps appeared in the new exec kernel command";