diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md
index 809814cbf71ef..7b81850917bc6 100644
--- a/sycl/doc/EnvironmentVariables.md
+++ b/sycl/doc/EnvironmentVariables.md
@@ -92,6 +92,7 @@ variables in production code.
 | `SYCL_DEVICELIB_NO_FALLBACK` | Any(\*) | Disable loading and linking of device library images |
 | `SYCL_PRINT_EXECUTION_GRAPH` | Described [below](#sycl_print_execution_graph-options) | Print execution graph to DOT text file. |
 | `SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP` | Any(\*) | Disable cleanup of finished command nodes at host-device synchronization points. |
+| `SYCL_DISABLE_POST_ENQUEUE_CLEANUP` | Any(\*) | Disable cleanup of enqueued command nodes during submission. |
 | `SYCL_THROW_ON_BLOCK` | Any(\*) | Throw an exception on attempt to wait for a blocked command. |
 | `SYCL_DEVICELIB_INHIBIT_NATIVE` | String of device library extensions (separated by a whitespace) | Do not rely on device native support for devicelib extensions listed in this option. |
 | `SYCL_PROGRAM_COMPILE_OPTIONS` | String of valid OpenCL compile options | Override compile options for all programs. |
diff --git a/sycl/include/CL/sycl/detail/cg.hpp b/sycl/include/CL/sycl/detail/cg.hpp
index 170c0f39906c3..f4938aa43120d 100644
--- a/sycl/include/CL/sycl/detail/cg.hpp
+++ b/sycl/include/CL/sycl/detail/cg.hpp
@@ -290,6 +290,7 @@ class CGExecKernel : public CG {
   }
 
   void clearStreams() { MStreams.clear(); }
+  bool hasStreams() { return !MStreams.empty(); }
 };
 
 /// "Copy memory" command group class.
diff --git a/sycl/source/detail/config.def b/sycl/source/detail/config.def
index fcfe345b52975..95750092bfb2e 100644
--- a/sycl/source/detail/config.def
+++ b/sycl/source/detail/config.def
@@ -12,6 +12,7 @@
 CONFIG(SYCL_PRINT_EXECUTION_GRAPH, 32, __SYCL_PRINT_EXECUTION_GRAPH)
 CONFIG(SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP, 1, __SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP)
+CONFIG(SYCL_DISABLE_POST_ENQUEUE_CLEANUP, 1, __SYCL_DISABLE_POST_ENQUEUE_CLEANUP)
 CONFIG(SYCL_DEVICE_ALLOWLIST, 1024, __SYCL_DEVICE_ALLOWLIST)
 CONFIG(SYCL_BE, 16, __SYCL_BE)
 CONFIG(SYCL_PI_TRACE, 16, __SYCL_PI_TRACE)
diff --git a/sycl/source/detail/device_image_impl.hpp b/sycl/source/detail/device_image_impl.hpp
index 503c71af62823..24b2809955fb0 100644
--- a/sycl/source/detail/device_image_impl.hpp
+++ b/sycl/source/detail/device_image_impl.hpp
@@ -186,9 +186,14 @@ class device_image_impl {
     std::lock_guard<std::mutex> Lock{MSpecConstAccessMtx};
     if (nullptr == MSpecConstsBuffer && !MSpecConstsBlob.empty()) {
       const detail::plugin &Plugin = getSyclObjImpl(MContext)->getPlugin();
+      // Uses PI_MEM_FLAGS_HOST_PTR_COPY instead of PI_MEM_FLAGS_HOST_PTR_USE
+      // since post-enqueue cleanup might trigger destruction of
+      // device_image_impl and, as a result, destruction of MSpecConstsBlob
+      // while MSpecConstsBuffer is still in use.
+      // TODO consider changing the lifetime of device_image_impl instead
       memBufferCreateHelper(Plugin,
                             detail::getSyclObjImpl(MContext)->getHandleRef(),
-                            PI_MEM_FLAGS_ACCESS_RW | PI_MEM_FLAGS_HOST_PTR_USE,
+                            PI_MEM_FLAGS_ACCESS_RW | PI_MEM_FLAGS_HOST_PTR_COPY,
                             MSpecConstsBlob.size(), MSpecConstsBlob.data(),
                             &MSpecConstsBuffer, nullptr);
     }
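The flag swap above is easier to see with a standalone model. The following is a hedged sketch, not PI code: it only assumes that `PI_MEM_FLAGS_HOST_PTR_USE` lets the runtime keep referencing the caller's memory while `PI_MEM_FLAGS_HOST_PTR_COPY` snapshots it at creation time (mirroring OpenCL's `CL_MEM_USE_HOST_PTR`/`CL_MEM_COPY_HOST_PTR`); the `FakeBuffer` type is invented for illustration.

```cpp
#include <vector>

// Toy model of the two host-pointer modes (illustrative, not the PI API).
struct FakeBuffer {
  const unsigned char *Borrowed = nullptr; // *_HOST_PTR_USE: refers to user memory
  std::vector<unsigned char> Owned;        // *_HOST_PTR_COPY: owns a snapshot
};

FakeBuffer createUse(const std::vector<unsigned char> &Blob) {
  return {Blob.data(), {}}; // dangles if Blob dies first
}

FakeBuffer createCopy(const std::vector<unsigned char> &Blob) {
  return {nullptr, Blob}; // copies at creation time
}

int main() {
  FakeBuffer B;
  {
    std::vector<unsigned char> Blob(16, 42);
    B = createCopy(Blob);
    // B = createUse(Blob); // would dangle once Blob is destroyed below
  }
  return B.Owned[0] == 42 ? 0 : 1; // the snapshot survives the blob
}
```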
diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp
index 4bafe6a55f38b..f615f37214208 100644
--- a/sycl/source/detail/event_impl.cpp
+++ b/sycl/source/detail/event_impl.cpp
@@ -364,7 +364,6 @@ std::vector<EventImplPtr> event_impl::getWaitList() {
 }
 
 void event_impl::flushIfNeeded(const QueueImplPtr &UserQueue) {
-  assert(MEvent != nullptr);
   if (MIsFlushed)
     return;
 
@@ -379,6 +378,7 @@ void event_impl::flushIfNeeded(const QueueImplPtr &UserQueue) {
     return;
 
   // Check if the task for this event has already been submitted.
+  assert(MEvent != nullptr);
   pi_event_status Status = PI_EVENT_QUEUED;
   getPlugin().call<PiApiKind::piEventGetInfo>(
       MEvent, PI_EVENT_INFO_COMMAND_EXECUTION_STATUS, sizeof(pi_int32), &Status,
diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp
index fb8214ddf6b8d..0e7b896a596f7 100644
--- a/sycl/source/detail/event_impl.hpp
+++ b/sycl/source/detail/event_impl.hpp
@@ -201,6 +201,11 @@ class event_impl {
   /// \return true if this event is discarded.
   bool isDiscarded() const { return MState == HES_Discarded; }
 
+  void setNeedsCleanupAfterWait(bool NeedsCleanupAfterWait) {
+    MNeedsCleanupAfterWait = NeedsCleanupAfterWait;
+  }
+  bool needsCleanupAfterWait() { return MNeedsCleanupAfterWait; }
+
 private:
   // When instrumentation is enabled emits trace event for event wait begin and
   // returns the telemetry event generated for the wait
@@ -231,6 +236,12 @@ class event_impl {
   // HostEventState enum.
   std::atomic<HostEventState> MState;
 
+  // A temporary workaround for the current limitations of post enqueue graph
+  // cleanup. Indicates that the command associated with this event isn't
+  // handled by post enqueue cleanup yet and has to be deleted by cleanup after
+  // wait.
+  bool MNeedsCleanupAfterWait = false;
+
   std::mutex MMutex;
 };
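The new flag forms a simple handshake between command construction and `queue::wait`. A hedged toy model of that handshake (the types below are stand-ins, not the real `event_impl`/`Command`; the mark-then-check flow mirrors the `ExecCGCommand` and `queue_impl::wait` changes elsewhere in this patch):

```cpp
#include <memory>

// Toy model: the command marks its event at construction; the queue's wait
// checks the mark after the backend queue has drained and deletes the
// command only then.
struct ToyEvent {
  bool NeedsCleanupAfterWait = false;
};

struct ToyCommand {
  std::shared_ptr<ToyEvent> Event = std::make_shared<ToyEvent>();
  explicit ToyCommand(bool IsHostTaskOrStreamKernel) {
    Event->NeedsCleanupAfterWait = IsHostTaskOrStreamKernel;
  }
};

int main() {
  ToyCommand C(/*IsHostTaskOrStreamKernel=*/true);
  // ... after the equivalent of piQueueFinish:
  if (C.Event->NeedsCleanupAfterWait) {
    /* the command would be deleted here */
  }
}
```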
diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp
index c67645ec36c57..d5427ffc52020 100644
--- a/sycl/source/detail/queue_impl.cpp
+++ b/sycl/source/detail/queue_impl.cpp
@@ -74,10 +74,7 @@ event queue_impl::memset(const std::shared_ptr<detail::queue_impl> &Self,
   event ResEvent = prepareUSMEvent(Self, NativeEvent);
   // Track only if we won't be able to handle it with piQueueFinish.
-  // FIXME these events are stored for level zero until as a workaround, remove
-  // once piEventRelease no longer calls wait on the event in the plugin.
-  if (!MSupportOOO ||
-      getPlugin().getBackend() == backend::ext_oneapi_level_zero)
+  if (!MSupportOOO)
     addSharedEvent(ResEvent);
   return MDiscardEvents ? createDiscardedEvent() : ResEvent;
 }
@@ -99,10 +96,7 @@ event queue_impl::memcpy(const std::shared_ptr<detail::queue_impl> &Self,
   event ResEvent = prepareUSMEvent(Self, NativeEvent);
   // Track only if we won't be able to handle it with piQueueFinish.
-  // FIXME these events are stored for level zero until as a workaround, remove
-  // once piEventRelease no longer calls wait on the event in the plugin.
-  if (!MSupportOOO ||
-      getPlugin().getBackend() == backend::ext_oneapi_level_zero)
+  if (!MSupportOOO)
     addSharedEvent(ResEvent);
   return MDiscardEvents ? createDiscardedEvent() : ResEvent;
 }
@@ -125,29 +119,30 @@ event queue_impl::mem_advise(const std::shared_ptr<detail::queue_impl> &Self,
   event ResEvent = prepareUSMEvent(Self, NativeEvent);
   // Track only if we won't be able to handle it with piQueueFinish.
-  // FIXME these events are stored for level zero until as a workaround, remove
-  // once piEventRelease no longer calls wait on the event in the plugin.
-  if (!MSupportOOO ||
-      getPlugin().getBackend() == backend::ext_oneapi_level_zero)
+  if (!MSupportOOO)
     addSharedEvent(ResEvent);
   return MDiscardEvents ? createDiscardedEvent() : ResEvent;
 }
 
 void queue_impl::addEvent(const event &Event) {
-  EventImplPtr Eimpl = getSyclObjImpl(Event);
-  Command *Cmd = (Command *)(Eimpl->getCommand());
+  EventImplPtr EImpl = getSyclObjImpl(Event);
+  auto *Cmd = static_cast<Command *>(EImpl->getCommand());
   if (!Cmd) {
     // if there is no command on the event, we cannot track it with MEventsWeak
     // as that will leave it with no owner. Track in MEventsShared only if we're
     // unable to call piQueueFinish during wait.
-    // FIXME these events are stored for level zero until as a workaround,
-    // remove once piEventRelease no longer calls wait on the event in the
-    // plugin.
-    if (is_host() || !MSupportOOO ||
-        getPlugin().getBackend() == backend::ext_oneapi_level_zero)
+    if (is_host() || !MSupportOOO)
       addSharedEvent(Event);
-  } else {
-    std::weak_ptr<event_impl> EventWeakPtr{Eimpl};
+  }
+  // As long as the queue supports piQueueFinish we only need to store events
+  // with command nodes in the following cases:
+  // 1. Unenqueued commands, since they aren't covered by piQueueFinish.
+  // 2. Kernels with streams, since they are not supported by post enqueue
+  // cleanup.
+  // 3. Host tasks, for both reasons.
+  else if (is_host() || !MSupportOOO || EImpl->getHandleRef() == nullptr ||
+           EImpl->needsCleanupAfterWait()) {
+    std::weak_ptr<event_impl> EventWeakPtr{EImpl};
     std::lock_guard<std::mutex> Lock{MMutex};
     MEventsWeak.push_back(std::move(EventWeakPtr));
   }
@@ -157,10 +152,7 @@ void queue_impl::addEvent(const event &Event) {
 /// but some events have no other owner. In this case,
 /// addSharedEvent will have the queue track the events via a shared pointer.
 void queue_impl::addSharedEvent(const event &Event) {
-  // FIXME The assertion should be corrected once the Level Zero workaround is
-  // removed.
-  assert(is_host() || !MSupportOOO ||
-         getPlugin().getBackend() == backend::ext_oneapi_level_zero);
+  assert(is_host() || !MSupportOOO);
   std::lock_guard<std::mutex> Lock(MMutex);
   // Events stored in MEventsShared are not released anywhere else aside from
   // calls to queue::wait/wait_and_throw, which a user application might not
@@ -293,50 +285,31 @@ void queue_impl::wait(const detail::code_location &CodeLoc) {
   // directly. Otherwise, only wait for unenqueued or host task events, starting
   // from the latest submitted task in order to minimize total amount of calls,
   // then handle the rest with piQueueFinish.
-  // TODO the new workflow has worse performance with Level Zero, keep the old
-  // behavior until this is addressed
-  if (!is_host() &&
-      getPlugin().getBackend() == backend::ext_oneapi_level_zero) {
+  const bool SupportsPiFinish = !is_host() && MSupportOOO;
+  for (auto EventImplWeakPtrIt = WeakEvents.rbegin();
+       EventImplWeakPtrIt != WeakEvents.rend(); ++EventImplWeakPtrIt) {
+    if (std::shared_ptr<event_impl> EventImplSharedPtr =
+            EventImplWeakPtrIt->lock()) {
+      // A nullptr PI event indicates that piQueueFinish will not cover it,
+      // either because it's a host task event or an unenqueued one.
+      if (!SupportsPiFinish || nullptr == EventImplSharedPtr->getHandleRef()) {
+        EventImplSharedPtr->wait(EventImplSharedPtr);
+      }
+    }
+  }
+  if (SupportsPiFinish) {
+    const detail::plugin &Plugin = getPlugin();
+    Plugin.call<detail::PiApiKind::piQueueFinish>(getHandleRef());
     for (std::weak_ptr<event_impl> &EventImplWeakPtr : WeakEvents)
       if (std::shared_ptr<event_impl> EventImplSharedPtr =
               EventImplWeakPtr.lock())
-        EventImplSharedPtr->wait(EventImplSharedPtr);
+        if (EventImplSharedPtr->needsCleanupAfterWait())
+          EventImplSharedPtr->cleanupCommand(EventImplSharedPtr);
+    assert(SharedEvents.empty() && "Queues that support calling piQueueFinish "
+                                   "shouldn't have shared events");
+  } else {
     for (event &Event : SharedEvents)
       Event.wait();
-  } else {
-    bool SupportsPiFinish = !is_host() && MSupportOOO;
-    for (auto EventImplWeakPtrIt = WeakEvents.rbegin();
-         EventImplWeakPtrIt != WeakEvents.rend(); ++EventImplWeakPtrIt) {
-      if (std::shared_ptr<event_impl> EventImplSharedPtr =
-              EventImplWeakPtrIt->lock()) {
-        // A nullptr PI event indicates that piQueueFinish will not cover it,
-        // either because it's a host task event or an unenqueued one.
-        if (!SupportsPiFinish ||
-            nullptr == EventImplSharedPtr->getHandleRef()) {
-          EventImplSharedPtr->wait(EventImplSharedPtr);
-        }
-      }
-    }
-    if (SupportsPiFinish) {
-      const detail::plugin &Plugin = getPlugin();
-      Plugin.call<detail::PiApiKind::piQueueFinish>(getHandleRef());
-      for (std::weak_ptr<event_impl> &EventImplWeakPtr : WeakEvents)
-        if (std::shared_ptr<event_impl> EventImplSharedPtr =
-                EventImplWeakPtr.lock())
-          EventImplSharedPtr->cleanupCommand(EventImplSharedPtr);
-      // FIXME these events are stored for level zero until as a workaround,
-      // remove once piEventRelease no longer calls wait on the event in the
-      // plugin.
-      if (Plugin.getBackend() == backend::ext_oneapi_level_zero) {
-        SharedEvents.clear();
-      }
-      assert(SharedEvents.empty() &&
-             "Queues that support calling piQueueFinish "
-             "shouldn't have shared events");
-    } else {
-      for (event &Event : SharedEvents)
-        Event.wait();
-    }
   }
 #ifdef XPTI_ENABLE_INSTRUMENTATION
   instrumentationEpilog(TelemetryEvent, Name, StreamID, IId);
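For a concrete picture of the two event classes `queue_impl::wait` now distinguishes, here is a hedged user-level sketch (the kernel name is illustrative, and `host_task` is assumed to be available in this implementation): the host task never acquires a PI event, so `piQueueFinish` alone would not cover it and its event must be waited on individually.

```cpp
#include <CL/sycl.hpp>

int main() {
  cl::sycl::queue Q;
  // Covered by piQueueFinish: an ordinary device kernel.
  Q.single_task<class IllustrativeKernel>([] {});
  // Not covered: a host task has no PI event in the backend queue.
  Q.submit([&](cl::sycl::handler &CGH) {
    CGH.host_task([] { /* runs on a host thread */ });
  });
  Q.wait(); // waits on the host-task event directly, then calls piQueueFinish
}
```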
diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp
index c5ff9eb30dfaa..b7ab6c4d69f84 100644
--- a/sycl/source/detail/scheduler/commands.cpp
+++ b/sycl/source/detail/scheduler/commands.cpp
@@ -267,8 +267,9 @@ class DispatchHostTask {
     // of empty command.
     // Also, it's possible to have record deallocated prior to enqueue process.
     // Thus we employ read-lock of graph.
+    std::vector<Command *> ToCleanUp;
+    Scheduler &Sched = Scheduler::getInstance();
     {
-      Scheduler &Sched = Scheduler::getInstance();
       Scheduler::ReadLockT Lock(Sched.MGraphLock);
 
       std::vector<DepDesc> Deps = MThisCmd->MDeps;
@@ -279,8 +280,9 @@ class DispatchHostTask {
       EmptyCmd->MEnqueueStatus = EnqueueResultT::SyclEnqueueReady;
 
       for (const DepDesc &Dep : Deps)
-        Scheduler::enqueueLeavesOfReqUnlocked(Dep.MDepRequirement);
+        Scheduler::enqueueLeavesOfReqUnlocked(Dep.MDepRequirement, ToCleanUp);
     }
+    Sched.cleanupCommands(ToCleanUp);
   }
 };
@@ -523,7 +525,8 @@ void Command::makeTraceEventEpilog() {
 #endif
 }
 
-Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep) {
+Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep,
+                                  std::vector<Command *> &ToCleanUp) {
   const QueueImplPtr &WorkerQueue = getWorkerQueue();
   const ContextImplPtr &WorkerContext = WorkerQueue->getContextImplPtr();
@@ -555,7 +558,7 @@ Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep,
   // If contexts don't match we'll connect them using host task
   if (DepEventContext != WorkerContext && !WorkerContext->is_host()) {
     Scheduler::GraphBuilder &GB = Scheduler::getInstance().MGraphBuilder;
-    ConnectionCmd = GB.connectDepEvent(this, DepEvent, Dep);
+    ConnectionCmd = GB.connectDepEvent(this, DepEvent, Dep, ToCleanUp);
   } else
     MPreparedDepsEvents.push_back(std::move(DepEvent));
@@ -570,11 +573,17 @@ const QueueImplPtr &Command::getWorkerQueue() const { return MQueue; }
 
 bool Command::producesPiEvent() const { return true; }
 
-Command *Command::addDep(DepDesc NewDep) {
+bool Command::supportsPostEnqueueCleanup() const {
+  // Isolated commands are cleaned up separately
+  return !MUsers.empty() || !MDeps.empty();
+}
+
+Command *Command::addDep(DepDesc NewDep, std::vector<Command *> &ToCleanUp) {
   Command *ConnectionCmd = nullptr;
   if (NewDep.MDepCommand) {
-    ConnectionCmd = processDepEvent(NewDep.MDepCommand->getEvent(), NewDep);
+    ConnectionCmd =
+        processDepEvent(NewDep.MDepCommand->getEvent(), NewDep, ToCleanUp);
   }
   MDeps.push_back(NewDep);
 #ifdef XPTI_ENABLE_INSTRUMENTATION
@@ -586,7 +595,8 @@ Command *Command::addDep(DepDesc NewDep, std::vector<Command *> &ToCleanUp) {
   return ConnectionCmd;
 }
 
-Command *Command::addDep(EventImplPtr Event) {
+Command *Command::addDep(EventImplPtr Event,
+                         std::vector<Command *> &ToCleanUp) {
 #ifdef XPTI_ENABLE_INSTRUMENTATION
   // We need this for just the instrumentation, so guarding it will prevent
   // unused variable warnings when instrumentation is turned off
@@ -596,7 +606,8 @@ Command *Command::addDep(EventImplPtr Event,
   emitEdgeEventForEventDependence(Cmd, PiEventAddr);
 #endif
 
-  return processDepEvent(std::move(Event), DepDesc{nullptr, nullptr, nullptr});
+  return processDepEvent(std::move(Event), DepDesc{nullptr, nullptr, nullptr},
+                         ToCleanUp);
 }
 
 void Command::emitEnqueuedEventSignal(RT::PiEvent &PiEventAddr) {
@@ -622,7 +633,8 @@ void Command::emitInstrumentation(uint16_t Type, const char *Txt) {
 #endif
 }
 
-bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking) {
+bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking,
+                      std::vector<Command *> &ToCleanUp) {
   // Exit if already enqueued
   if (MEnqueueStatus == EnqueueResultT::SyclEnqueueSuccess)
     return true;
@@ -691,6 +703,12 @@ bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking,
       // Consider the command is successfully enqueued if return code is
       // CL_SUCCESS
       MEnqueueStatus = EnqueueResultT::SyclEnqueueSuccess;
+      if (MLeafCounter == 0 && supportsPostEnqueueCleanup() &&
+          !SYCLConfig<SYCL_DISABLE_POST_ENQUEUE_CLEANUP>::get()) {
+        assert(!MPostEnqueueCleanup);
+        MPostEnqueueCleanup = true;
+        ToCleanUp.push_back(this);
+      }
     }
 
     // Emit this correlation signal before the task end
@@ -783,6 +801,8 @@ void AllocaCommandBase::emitInstrumentationData() {
 
 bool AllocaCommandBase::producesPiEvent() const { return false; }
 
+bool AllocaCommandBase::supportsPostEnqueueCleanup() const { return false; }
+
 AllocaCommand::AllocaCommand(QueueImplPtr Queue, Requirement Req,
                              bool InitFromUserData,
                              AllocaCommandBase *LinkedAllocaCmd)
@@ -793,8 +813,11 @@ AllocaCommand::AllocaCommand(QueueImplPtr Queue, Requirement Req,
   // so this call must be before the addDep() call.
   emitInstrumentationDataProxy();
   // "Nothing to depend on"
-  Command *ConnectionCmd = addDep(DepDesc(nullptr, getRequirement(), this));
+  std::vector<Command *> ToCleanUp;
+  Command *ConnectionCmd =
+      addDep(DepDesc(nullptr, getRequirement(), this), ToCleanUp);
   assert(ConnectionCmd == nullptr);
+  assert(ToCleanUp.empty());
   (void)ConnectionCmd;
 }
@@ -859,7 +882,8 @@ void AllocaCommand::printDot(std::ostream &Stream) const {
 
 AllocaSubBufCommand::AllocaSubBufCommand(QueueImplPtr Queue, Requirement Req,
                                          AllocaCommandBase *ParentAlloca,
-                                         std::vector<Command *> &ToEnqueue)
+                                         std::vector<Command *> &ToEnqueue,
+                                         std::vector<Command *> &ToCleanUp)
     : AllocaCommandBase(CommandType::ALLOCA_SUB_BUF, std::move(Queue),
                         std::move(Req),
                         /*LinkedAllocaCmd*/ nullptr),
@@ -868,8 +892,8 @@ AllocaSubBufCommand::AllocaSubBufCommand(QueueImplPtr Queue, Requirement Req,
   // is added to this node, so this call must be before
   // the addDep() call.
   emitInstrumentationDataProxy();
-  Command *ConnectionCmd =
-      addDep(DepDesc(MParentAlloca, getRequirement(), MParentAlloca));
+  Command *ConnectionCmd = addDep(
+      DepDesc(MParentAlloca, getRequirement(), MParentAlloca), ToCleanUp);
   if (ConnectionCmd)
     ToEnqueue.push_back(ConnectionCmd);
 }
@@ -1049,6 +1073,8 @@ void ReleaseCommand::printDot(std::ostream &Stream) const {
 
 bool ReleaseCommand::producesPiEvent() const { return false; }
 
+bool ReleaseCommand::supportsPostEnqueueCleanup() const { return false; }
+
 MapMemObject::MapMemObject(AllocaCommandBase *SrcAllocaCmd, Requirement Req,
                            void **DstPtr, QueueImplPtr Queue,
                            access::mode MapMode)
@@ -1441,8 +1467,11 @@ void EmptyCommand::addRequirement(Command *DepCmd, AllocaCommandBase *AllocaCmd,
   const Requirement *const StoredReq = &MRequirements.back();
 
   // EmptyCommand is always host one, so we believe that result of addDep is nil
-  Command *Cmd = addDep(DepDesc{DepCmd, StoredReq, AllocaCmd});
+  std::vector<Command *> ToCleanUp;
+  Command *Cmd = addDep(DepDesc{DepCmd, StoredReq, AllocaCmd}, ToCleanUp);
   assert(Cmd == nullptr && "Conection command should be null for EmptyCommand");
+  assert(ToCleanUp.empty() && "addDep should add a command for cleanup only if "
+                              "there's a connection command");
   (void)Cmd;
 }
@@ -1577,9 +1606,14 @@ ExecCGCommand::ExecCGCommand(std::unique_ptr<detail::CG> CommandGroup,
                              QueueImplPtr Queue)
     : Command(CommandType::RUN_CG, std::move(Queue)),
       MCommandGroup(std::move(CommandGroup)) {
-  if (MCommandGroup->getType() == detail::CG::CodeplayHostTask)
+  if (MCommandGroup->getType() == detail::CG::CodeplayHostTask) {
     MSubmittedQueue =
         static_cast<detail::CGHostTask *>(MCommandGroup.get())->MQueue;
+    MEvent->setNeedsCleanupAfterWait(true);
+  } else if (MCommandGroup->getType() == CG::CGTYPE::Kernel &&
+             (static_cast<CGExecKernel *>(MCommandGroup.get()))->hasStreams())
+    MEvent->setNeedsCleanupAfterWait(true);
+
   emitInstrumentationDataProxy();
 }
@@ -2355,6 +2389,15 @@ bool ExecCGCommand::producesPiEvent() const {
   return MCommandGroup->getType() != CG::CGTYPE::CodeplayHostTask;
 }
 
+bool ExecCGCommand::supportsPostEnqueueCleanup() const {
+  // TODO enable cleaning up host task commands and kernels with streams after
+  // enqueue
+  return Command::supportsPostEnqueueCleanup() &&
+         (MCommandGroup->getType() != CG::CGTYPE::CodeplayHostTask) &&
+         (MCommandGroup->getType() != CG::CGTYPE::Kernel ||
+          !(static_cast<CGExecKernel *>(MCommandGroup.get()))->hasStreams());
+}
+
 } // namespace detail
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
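The host-task and stream exclusions in `supportsPostEnqueueCleanup` have a user-visible counterpart. A hedged sketch (kernel name and stream sizes are illustrative): the command for the kernel below must outlive its enqueue because the stream's contents are flushed only when the command completes, so it is tagged for cleanup-after-wait instead of being freed at submission.

```cpp
#include <CL/sycl.hpp>

int main() {
  cl::sycl::queue Q;
  Q.submit([&](cl::sycl::handler &CGH) {
    // 1024-byte stream buffer, 256-byte max statement size (illustrative).
    cl::sycl::stream Out(1024, 256, CGH);
    CGH.single_task<class StreamKernel>([=] { Out << "hi\n"; });
  });
  Q.wait(); // flushes the stream; only now can the command be cleaned up
}
```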
diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp
index e90ff2422d3bc..0f21a0e639491 100644
--- a/sycl/source/detail/scheduler/commands.hpp
+++ b/sycl/source/detail/scheduler/commands.hpp
@@ -107,11 +107,17 @@ class Command {
   Command(CommandType Type, QueueImplPtr Queue);
 
+  /// \param NewDep dependency to be added
+  /// \param ToCleanUp container for commands that can be cleaned up.
   /// \return an optional connection cmd to enqueue
-  [[nodiscard]] Command *addDep(DepDesc NewDep);
+  [[nodiscard]] Command *addDep(DepDesc NewDep,
+                                std::vector<Command *> &ToCleanUp);
 
+  /// \param Event event to be added as a dependency
+  /// \param ToCleanUp container for commands that can be cleaned up.
   /// \return an optional connection cmd to enqueue
-  [[nodiscard]] Command *addDep(EventImplPtr Event);
+  [[nodiscard]] Command *addDep(EventImplPtr Event,
+                                std::vector<Command *> &ToCleanUp);
 
   void addUser(Command *NewUser) { MUsers.insert(NewUser); }
@@ -123,8 +129,10 @@ class Command {
   /// \param EnqueueResult is set to the specific status if enqueue failed.
   /// \param Blocking if this argument is true, function will wait for the
   /// command to be unblocked before calling enqueueImp.
+  /// \param ToCleanUp container for commands that can be cleaned up.
   /// \return true if the command is enqueued.
-  virtual bool enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking);
+  virtual bool enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking,
+                       std::vector<Command *> &ToCleanUp);
 
   bool isFinished();
@@ -194,6 +202,9 @@ class Command {
   /// Returns true iff the command produces a PI event on non-host devices.
   virtual bool producesPiEvent() const;
 
+  /// Returns true iff this command can be freed by post enqueue cleanup.
+  virtual bool supportsPostEnqueueCleanup() const;
+
 protected:
   QueueImplPtr MQueue;
   QueueImplPtr MSubmittedQueue;
@@ -212,6 +223,7 @@ class Command {
   /// Perform glueing of events from different contexts
   /// \param DepEvent event this commands should depend on
   /// \param Dep optional DepDesc to perform connection of events properly
+  /// \param ToCleanUp container for commands that can be cleaned up.
   /// \return returns an optional connection command to enqueue
   ///
   /// Glueing (i.e. connecting) will be performed if and only if DepEvent is
@@ -220,7 +232,8 @@ class Command {
   ///
   /// Optionality of Dep is set by Dep.MDepCommand not equal to nullptr.
   [[nodiscard]] Command *processDepEvent(EventImplPtr DepEvent,
-                                         const DepDesc &Dep);
+                                         const DepDesc &Dep,
+                                         std::vector<Command *> &ToCleanUp);
 
   /// Private interface. Derived classes should implement this method.
   virtual cl_int enqueueImp() = 0;
@@ -297,6 +310,10 @@ class Command {
   // By default the flag is set to true due to most of host operations are
   // synchronous. The only asynchronous operation currently is host-task.
   bool MShouldCompleteEventIfPossible = true;
+
+  /// Indicates that the node will be freed by cleanup after enqueue. Such
+  /// nodes should be ignored by other cleanup mechanisms.
+  bool MPostEnqueueCleanup = false;
 };
 
 /// The empty command does nothing during enqueue. The task can be used to
@@ -332,6 +349,7 @@ class ReleaseCommand : public Command {
   void printDot(std::ostream &Stream) const final;
   void emitInstrumentationData() override;
   bool producesPiEvent() const final;
+  bool supportsPostEnqueueCleanup() const final;
 
 private:
   cl_int enqueueImp() final;
@@ -358,6 +376,8 @@ class AllocaCommandBase : public Command {
 
   bool producesPiEvent() const final;
 
+  bool supportsPostEnqueueCleanup() const final;
+
   void *MMemAllocation = nullptr;
 
   /// Alloca command linked with current command.
@@ -403,7 +423,8 @@ class AllocaSubBufCommand : public AllocaCommandBase {
 public:
   AllocaSubBufCommand(QueueImplPtr Queue, Requirement Req,
                       AllocaCommandBase *ParentAlloca,
-                      std::vector<Command *> &ToEnqueue);
+                      std::vector<Command *> &ToEnqueue,
+                      std::vector<Command *> &ToCleanUp);
 
   void *getMemAllocation() const final;
   void printDot(std::ostream &Stream) const final;
@@ -541,6 +562,8 @@ class ExecCGCommand : public Command {
 
   bool producesPiEvent() const final;
 
+  bool supportsPostEnqueueCleanup() const final;
+
 private:
   cl_int enqueueImp() final;
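The `ToCleanUp` out-parameter threaded through these interfaces establishes a collect-then-free convention: candidates are gathered while the graph is being mutated and freed only after the traversal finishes. A standalone toy showing why the deferral matters (the `Cmd` type below is a simplified stand-in, not the real class):

```cpp
#include <set>
#include <vector>

// Toy command: one leaf reference, freed once it drops to zero.
struct Cmd {
  int LeafCounter = 1;
};

// Removing commands from a leaf set must not delete them mid-iteration;
// candidates are collected and freed by the caller afterwards.
void removeFromLeaves(std::set<Cmd *> &Leaves, std::vector<Cmd *> &ToCleanUp) {
  for (Cmd *C : Leaves)
    if (--C->LeafCounter == 0)
      ToCleanUp.push_back(C);
  Leaves.clear();
}

int main() {
  std::set<Cmd *> Leaves{new Cmd, new Cmd};
  std::vector<Cmd *> ToCleanUp;
  removeFromLeaves(Leaves, ToCleanUp);
  for (Cmd *C : ToCleanUp) // single cleanup pass once iteration is done
    delete C;
}
```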
diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp
index 61f10b41845cd..ec5f9713657c6 100644
--- a/sycl/source/detail/scheduler/graph_builder.cpp
+++ b/sycl/source/detail/scheduler/graph_builder.cpp
@@ -190,10 +190,17 @@ MemObjRecord *Scheduler::GraphBuilder::getOrInsertMemObjRecord(
     // of the requirements for the current record
     DepDesc Dep = findDepForRecord(Dependant, Record);
     Dep.MDepCommand = Dependency;
-    if (Command *ConnectionCmd = Dependant->addDep(Dep))
+    std::vector<Command *> ToCleanUp;
+    if (Command *ConnectionCmd = Dependant->addDep(Dep, ToCleanUp))
       ToEnqueue.push_back(ConnectionCmd);
     Dependency->addUser(Dependant);
     --(Dependency->MLeafCounter);
+    if (Dependency->MLeafCounter == 0 &&
+        Dependency->isSuccessfullyEnqueued() &&
+        Dependency->supportsPostEnqueueCleanup())
+      ToCleanUp.push_back(Dependency);
+    for (Command *Cmd : ToCleanUp)
+      cleanupCommand(Cmd);
   };
 
   const ContextImplPtr &InteropCtxPtr = Req->MSYCLMemObj->getInteropContext();
@@ -227,15 +234,21 @@ MemObjRecord *Scheduler::GraphBuilder::getOrInsertMemObjRecord(
 
 void Scheduler::GraphBuilder::updateLeaves(const std::set<Command *> &Cmds,
                                            MemObjRecord *Record,
-                                           access::mode AccessMode) {
+                                           access::mode AccessMode,
+                                           std::vector<Command *> &ToCleanUp) {
 
   const bool ReadOnlyReq = AccessMode == access::mode::read;
   if (ReadOnlyReq)
     return;
 
   for (Command *Cmd : Cmds) {
+    bool WasLeaf = Cmd->MLeafCounter > 0;
     Cmd->MLeafCounter -= Record->MReadLeaves.remove(Cmd);
     Cmd->MLeafCounter -= Record->MWriteLeaves.remove(Cmd);
+    if (WasLeaf && Cmd->MLeafCounter == 0 && Cmd->isSuccessfullyEnqueued() &&
+        Cmd->supportsPostEnqueueCleanup()) {
+      ToCleanUp.push_back(Cmd);
+    }
   }
 }
@@ -263,15 +276,18 @@ UpdateHostRequirementCommand *Scheduler::GraphBuilder::insertUpdateHostReqCmd(
   std::set<Command *> Deps =
       findDepsForReq(Record, Req, Queue->getContextImplPtr());
+  std::vector<Command *> ToCleanUp;
   for (Command *Dep : Deps) {
     Command *ConnCmd =
-        UpdateCommand->addDep(DepDesc{Dep, StoredReq, AllocaCmd});
+        UpdateCommand->addDep(DepDesc{Dep, StoredReq, AllocaCmd}, ToCleanUp);
     if (ConnCmd)
       ToEnqueue.push_back(ConnCmd);
     Dep->addUser(UpdateCommand);
   }
-  updateLeaves(Deps, Record, Req->MAccessMode);
+  updateLeaves(Deps, Record, Req->MAccessMode, ToCleanUp);
   addNodeToLeaves(Record, UpdateCommand, Req->MAccessMode, ToEnqueue);
+  for (Command *Cmd : ToCleanUp)
+    cleanupCommand(Cmd);
   return UpdateCommand;
 }
@@ -376,16 +392,18 @@ Command *Scheduler::GraphBuilder::insertMemoryMove(
                                AllocaCmdSrc->getQueue(),
                                AllocaCmdDst->getQueue());
     }
   }
-
+  std::vector<Command *> ToCleanUp;
   for (Command *Dep : Deps) {
-    Command *ConnCmd =
-        NewCmd->addDep(DepDesc{Dep, NewCmd->getRequirement(), AllocaCmdDst});
+    Command *ConnCmd = NewCmd->addDep(
+        DepDesc{Dep, NewCmd->getRequirement(), AllocaCmdDst}, ToCleanUp);
     if (ConnCmd)
       ToEnqueue.push_back(ConnCmd);
     Dep->addUser(NewCmd);
   }
-  updateLeaves(Deps, Record, access::mode::read_write);
+  updateLeaves(Deps, Record, access::mode::read_write, ToCleanUp);
   addNodeToLeaves(Record, NewCmd, access::mode::read_write, ToEnqueue);
+  for (Command *Cmd : ToCleanUp)
+    cleanupCommand(Cmd);
   Record->MCurContext = Queue->getContextImplPtr();
   return NewCmd;
 }
@@ -414,22 +432,25 @@ Command *Scheduler::GraphBuilder::remapMemoryObject(
       LinkedAllocaCmd, *LinkedAllocaCmd->getRequirement(),
       &HostAllocaCmd->MMemAllocation, LinkedAllocaCmd->getQueue(), MapMode);
 
+  std::vector<Command *> ToCleanUp;
   for (Command *Dep : Deps) {
     Command *ConnCmd = UnMapCmd->addDep(
-        DepDesc{Dep, UnMapCmd->getRequirement(), LinkedAllocaCmd});
+        DepDesc{Dep, UnMapCmd->getRequirement(), LinkedAllocaCmd}, ToCleanUp);
     if (ConnCmd)
       ToEnqueue.push_back(ConnCmd);
     Dep->addUser(UnMapCmd);
   }
 
   Command *ConnCmd = MapCmd->addDep(
-      DepDesc{UnMapCmd, MapCmd->getRequirement(), HostAllocaCmd});
+      DepDesc{UnMapCmd, MapCmd->getRequirement(), HostAllocaCmd}, ToCleanUp);
   if (ConnCmd)
     ToEnqueue.push_back(ConnCmd);
   UnMapCmd->addUser(MapCmd);
 
-  updateLeaves(Deps, Record, access::mode::read_write);
+  updateLeaves(Deps, Record, access::mode::read_write, ToCleanUp);
   addNodeToLeaves(Record, MapCmd, access::mode::read_write, ToEnqueue);
+  for (Command *Cmd : ToCleanUp)
+    cleanupCommand(Cmd);
   Record->MHostAccess = MapMode;
   return MapCmd;
 }
@@ -462,16 +483,20 @@ Scheduler::GraphBuilder::addCopyBack(Requirement *Req,
     throw runtime_error("Out of host memory", PI_OUT_OF_HOST_MEMORY);
 
   MemCpyCommandHost *MemCpyCmd = MemCpyCmdUniquePtr.release();
+
+  std::vector<Command *> ToCleanUp;
   for (Command *Dep : Deps) {
     Command *ConnCmd = MemCpyCmd->addDep(
-        DepDesc{Dep, MemCpyCmd->getRequirement(), SrcAllocaCmd});
+        DepDesc{Dep, MemCpyCmd->getRequirement(), SrcAllocaCmd}, ToCleanUp);
     if (ConnCmd)
       ToEnqueue.push_back(ConnCmd);
     Dep->addUser(MemCpyCmd);
   }
-  updateLeaves(Deps, Record, Req->MAccessMode);
+  updateLeaves(Deps, Record, Req->MAccessMode, ToCleanUp);
   addNodeToLeaves(Record, MemCpyCmd, Req->MAccessMode, ToEnqueue);
+  for (Command *Cmd : ToCleanUp)
+    cleanupCommand(Cmd);
   if (MPrintOptionsArray[AfterAddCopyBack])
     printGraphAsDot("after_addCopyBack");
   return MemCpyCmd;
@@ -651,6 +676,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
       findAllocaForReq(Record, Req, Queue->getContextImplPtr());
 
   if (!AllocaCmd) {
+    std::vector<Command *> ToCleanUp;
     if (IsSuitableSubReq(Req)) {
       // Get parent requirement. It's hard to get right parents' range
       // so full parent requirement has range represented in bytes
@@ -662,7 +688,8 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
       auto *ParentAlloca =
           getOrCreateAllocaForReq(Record, &ParentRequirement, Queue, ToEnqueue);
-      AllocaCmd = new AllocaSubBufCommand(Queue, *Req, ParentAlloca, ToEnqueue);
+      AllocaCmd = new AllocaSubBufCommand(Queue, *Req, ParentAlloca, ToEnqueue,
+                                          ToCleanUp);
     } else {
 
       const Requirement FullReq(/*Offset*/ {0, 0, 0}, Req->MMemoryRange,
@@ -748,8 +775,10 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
 
       // Update linked command
       if (LinkedAllocaCmd) {
-        Command *ConnCmd = AllocaCmd->addDep(DepDesc{
-            LinkedAllocaCmd, AllocaCmd->getRequirement(), LinkedAllocaCmd});
+        Command *ConnCmd = AllocaCmd->addDep(
+            DepDesc{LinkedAllocaCmd, AllocaCmd->getRequirement(),
+                    LinkedAllocaCmd},
+            ToCleanUp);
         if (ConnCmd)
           ToEnqueue.push_back(ConnCmd);
         LinkedAllocaCmd->addUser(AllocaCmd);
@@ -758,7 +787,8 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
         // To ensure that the leader allocation is removed first
         ConnCmd = AllocaCmd->getReleaseCmd()->addDep(
             DepDesc(LinkedAllocaCmd->getReleaseCmd(),
-                    AllocaCmd->getRequirement(), LinkedAllocaCmd));
+                    AllocaCmd->getRequirement(), LinkedAllocaCmd),
+            ToCleanUp);
         if (ConnCmd)
           ToEnqueue.push_back(ConnCmd);
@@ -775,13 +805,13 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
           std::set<Command *> Deps =
               findDepsForReq(Record, Req, Queue->getContextImplPtr());
           for (Command *Dep : Deps) {
-            Command *ConnCmd =
-                AllocaCmd->addDep(DepDesc{Dep, Req, LinkedAllocaCmd});
+            Command *ConnCmd = AllocaCmd->addDep(
+                DepDesc{Dep, Req, LinkedAllocaCmd}, ToCleanUp);
             if (ConnCmd)
               ToEnqueue.push_back(ConnCmd);
             Dep->addUser(AllocaCmd);
           }
-          updateLeaves(Deps, Record, Req->MAccessMode);
+          updateLeaves(Deps, Record, Req->MAccessMode, ToCleanUp);
           addNodeToLeaves(Record, AllocaCmd, Req->MAccessMode, ToEnqueue);
         }
       }
@@ -790,6 +820,8 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
     Record->MAllocaCommands.push_back(AllocaCmd);
     Record->MWriteLeaves.push_back(AllocaCmd, ToEnqueue);
     ++(AllocaCmd->MLeafCounter);
+    for (Command *Cmd : ToCleanUp)
+      cleanupCommand(Cmd);
   }
   return AllocaCmd;
 }
@@ -838,13 +870,16 @@ Scheduler::GraphBuilder::addEmptyCmd(Command *Cmd,
                                      const std::vector<Requirement *> &Reqs,
   Cmd->addUser(EmptyCmd);
 
   const std::vector<DepDesc> &Deps = Cmd->MDeps;
+  std::vector<Command *> ToCleanUp;
   for (const DepDesc &Dep : Deps) {
     const Requirement *Req = Dep.MDepRequirement;
     MemObjRecord *Record = getMemObjRecord(Req->MSYCLMemObj);
 
-    updateLeaves({Cmd}, Record, Req->MAccessMode);
+    updateLeaves({Cmd}, Record, Req->MAccessMode, ToCleanUp);
     addNodeToLeaves(Record, EmptyCmd, Req->MAccessMode, ToEnqueue);
   }
+  for (Command *Cmd : ToCleanUp)
+    cleanupCommand(Cmd);
 
   return EmptyCmd;
 }
@@ -898,6 +933,7 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr<detail::CG> CommandGroup,
   // AllocaCommand creation will be dependent on the access mode of the first
   // requirement. Combine these access modes to take all of them into account.
   combineAccessModesOfReqs(Reqs);
+  std::vector<Command *> ToCleanUp;
   for (Requirement *Req : Reqs) {
     MemObjRecord *Record = nullptr;
     AllocaCommandBase *AllocaCmd = nullptr;
@@ -955,7 +991,8 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr<detail::CG> CommandGroup,
         findDepsForReq(Record, Req, Queue->getContextImplPtr());
 
     for (Command *Dep : Deps)
-      if (Command *ConnCmd = NewCmd->addDep(DepDesc{Dep, Req, AllocaCmd}))
+      if (Command *ConnCmd =
+              NewCmd->addDep(DepDesc{Dep, Req, AllocaCmd}, ToCleanUp))
         ToEnqueue.push_back(ConnCmd);
   }
@@ -968,13 +1005,13 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr<detail::CG> CommandGroup,
     Dep.MDepCommand->addUser(NewCmd.get());
     const Requirement *Req = Dep.MDepRequirement;
     MemObjRecord *Record = getMemObjRecord(Req->MSYCLMemObj);
-    updateLeaves({Dep.MDepCommand}, Record, Req->MAccessMode);
+    updateLeaves({Dep.MDepCommand}, Record, Req->MAccessMode, ToCleanUp);
     addNodeToLeaves(Record, NewCmd.get(), Req->MAccessMode, ToEnqueue);
   }
 
   // Register all the events as dependencies
   for (detail::EventImplPtr e : Events) {
-    if (Command *ConnCmd = NewCmd->addDep(e))
+    if (Command *ConnCmd = NewCmd->addDep(e, ToCleanUp))
       ToEnqueue.push_back(ConnCmd);
   }
@@ -986,6 +1023,8 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr<detail::CG> CommandGroup,
   if (MPrintOptionsArray[AfterAddCG])
     printGraphAsDot("after_addCG");
 
+  for (Command *Cmd : ToCleanUp)
+    cleanupCommand(Cmd);
   return NewCmd.release();
 }
@@ -993,9 +1032,15 @@ void Scheduler::GraphBuilder::decrementLeafCountersForRecord(
     MemObjRecord *Record) {
   for (Command *Cmd : Record->MReadLeaves) {
     --(Cmd->MLeafCounter);
+    if (Cmd->MLeafCounter == 0 && Cmd->isSuccessfullyEnqueued() &&
+        Cmd->supportsPostEnqueueCleanup())
+      cleanupCommand(Cmd);
   }
   for (Command *Cmd : Record->MWriteLeaves) {
     --(Cmd->MLeafCounter);
+    if (Cmd->MLeafCounter == 0 && Cmd->isSuccessfullyEnqueued() &&
+        Cmd->supportsPostEnqueueCleanup())
+      cleanupCommand(Cmd);
   }
 }
@@ -1088,14 +1133,64 @@ void Scheduler::GraphBuilder::cleanupCommandsForRecord(
     // If all dependencies have been removed this way, mark the command for
     // deletion
     if (Cmd->MDeps.empty()) {
-      Cmd->MMarks.MToBeDeleted = true;
       Cmd->MUsers.clear();
+      // Do not delete the node if it's scheduled for post-enqueue cleanup to
+      // avoid double free.
+      if (!Cmd->MPostEnqueueCleanup)
+        Cmd->MMarks.MToBeDeleted = true;
     }
   }
 
   handleVisitedNodes(MVisitedCmds);
 }
 
+void Scheduler::GraphBuilder::cleanupCommand(Command *Cmd) {
+  if (SYCLConfig<SYCL_DISABLE_POST_ENQUEUE_CLEANUP>::get())
+    return;
+  assert(Cmd->MLeafCounter == 0 && Cmd->isSuccessfullyEnqueued());
+  Command::CommandType CmdT = Cmd->getType();
+
+  assert(CmdT != Command::ALLOCA && CmdT != Command::ALLOCA_SUB_BUF);
+  assert(CmdT != Command::RELEASE);
+  assert(CmdT != Command::RUN_CG ||
+         (static_cast<ExecCGCommand *>(Cmd))->getCG().getType() !=
+             CG::CGTYPE::CodeplayHostTask);
+#ifndef NDEBUG
+  if (CmdT == Command::RUN_CG) {
+    auto *ExecCGCmd = static_cast<ExecCGCommand *>(Cmd);
+    if (ExecCGCmd->getCG().getType() == CG::CGTYPE::Kernel) {
+      auto *ExecKernelCG = static_cast<CGExecKernel *>(&ExecCGCmd->getCG());
+      assert(!ExecKernelCG->hasStreams());
+    }
+  }
+#endif
+  (void)CmdT;
+
+  for (Command *UserCmd : Cmd->MUsers) {
+    for (DepDesc &Dep : UserCmd->MDeps) {
+      // Link the users of the command to the alloca command(s) instead
+      if (Dep.MDepCommand == Cmd) {
+        // ... unless the user is the alloca itself.
+        if (Dep.MAllocaCmd == UserCmd) {
+          Dep.MDepCommand = nullptr;
+        } else {
+          Dep.MDepCommand = Dep.MAllocaCmd;
+          Dep.MDepCommand->MUsers.insert(UserCmd);
+        }
+      }
+    }
+  }
+  // Update dependency users
+  for (DepDesc &Dep : Cmd->MDeps) {
+    Command *DepCmd = Dep.MDepCommand;
+    DepCmd->MUsers.erase(Cmd);
+  }
+
+  Cmd->getEvent()->setCommand(nullptr);
+  Cmd->getEvent()->cleanupDependencyEvents();
+  delete Cmd;
+}
+
 void Scheduler::GraphBuilder::cleanupFinishedCommands(
     Command *FinishedCmd,
     std::vector<std::shared_ptr<stream_impl>> &StreamsToDeallocate) {
@@ -1148,7 +1243,14 @@ void Scheduler::GraphBuilder::cleanupFinishedCommands(
       DepCmd->MUsers.erase(Cmd);
     }
 
-    Cmd->MMarks.MToBeDeleted = true;
+    // Isolate the node instead of deleting it if it's scheduled for
+    // post-enqueue cleanup to avoid double free.
+    if (Cmd->MPostEnqueueCleanup) {
+      Cmd->MDeps.clear();
+      Cmd->MUsers.clear();
+    } else {
+      Cmd->MMarks.MToBeDeleted = true;
+    }
   }
   handleVisitedNodes(MVisitedCmds);
 }
@@ -1175,9 +1277,9 @@ void Scheduler::GraphBuilder::removeRecordForMemObj(SYCLMemObjI *MemObject) {
 // requirement in Dep we make ConnectCmd depend on DepEvent's command with this
 // requirement.
 // Optionality of Dep is set by Dep.MDepCommand equal to nullptr.
-Command *Scheduler::GraphBuilder::connectDepEvent(Command *const Cmd,
-                                                  EventImplPtr DepEvent,
-                                                  const DepDesc &Dep) {
+Command *Scheduler::GraphBuilder::connectDepEvent(
    Command *const Cmd, EventImplPtr DepEvent, const DepDesc &Dep,
+    std::vector<Command *> &ToCleanUp) {
   assert(Cmd->getWorkerContext() != DepEvent->getContextImpl());
 
   // construct Host Task type command manually and make it depend on DepEvent
@@ -1206,14 +1308,15 @@ Command *Scheduler::GraphBuilder::connectDepEvent(
       // make ConnectCmd depend on requirement
       // Dismiss the result here as it's not a connection now,
       // 'cause ConnectCmd is host one
-      (void)ConnectCmd->addDep(Dep);
+      (void)ConnectCmd->addDep(Dep, ToCleanUp);
       assert(reinterpret_cast<Command *>(DepEvent->getCommand()) ==
              Dep.MDepCommand);
       // add user to Dep.MDepCommand is already performed beyond this if branch
 
       MemObjRecord *Record = getMemObjRecord(Dep.MDepRequirement->MSYCLMemObj);
+      updateLeaves({Dep.MDepCommand}, Record, Dep.MDepRequirement->MAccessMode,
+                   ToCleanUp);
 
-      updateLeaves({Dep.MDepCommand}, Record, Dep.MDepRequirement->MAccessMode);
       std::vector<Command *> ToEnqueue;
       addNodeToLeaves(Record, ConnectCmd, Dep.MDepRequirement->MAccessMode,
                       ToEnqueue);
@@ -1233,7 +1336,7 @@ Command *Scheduler::GraphBuilder::connectDepEvent(
 
       // Dismiss the result here as it's not a connection now,
       // 'cause EmptyCmd is host one
-      (void)Cmd->addDep(CmdDep);
+      (void)Cmd->addDep(CmdDep, ToCleanUp);
     }
   } else {
     std::vector<Command *> ToEnqueue;
@@ -1246,13 +1349,13 @@ Command *Scheduler::GraphBuilder::connectDepEvent(
     // ConnectCmd via its event.
     // Dismiss the result here as it's not a connection now,
    // 'cause ConnectCmd is host one.
-    (void)EmptyCmd->addDep(ConnectCmd->getEvent());
-    (void)ConnectCmd->addDep(DepEvent);
+    (void)EmptyCmd->addDep(ConnectCmd->getEvent(), ToCleanUp);
+    (void)ConnectCmd->addDep(DepEvent, ToCleanUp);
 
     // Depend Cmd on empty command
     // Dismiss the result here as it's not a connection now,
     // 'cause EmptyCmd is host one
-    (void)Cmd->addDep(EmptyCmd->getEvent());
+    (void)Cmd->addDep(EmptyCmd->getEvent(), ToCleanUp);
   }
 
   EmptyCmd->addUser(Cmd);
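The user-relinking loop in `cleanupCommand` above can be modeled on a toy graph: deleting the middle node of `Alloca <- Cmd <- User` re-points `User` at the alloca so later dependency walks still reach the allocation that owns the memory. A self-contained toy version (the `ToyCmd` type and `eraseKeepingAlloca` helper are invented for illustration, with simplified semantics):

```cpp
#include <algorithm>
#include <set>
#include <vector>

struct ToyCmd {
  std::vector<ToyCmd *> Deps; // commands this one depends on
  std::set<ToyCmd *> Users;   // commands that depend on this one
};

// Erase Cmd but keep the graph connected through its alloca command.
void eraseKeepingAlloca(ToyCmd *Cmd, ToyCmd *Alloca) {
  for (ToyCmd *User : Cmd->Users) {
    std::replace(User->Deps.begin(), User->Deps.end(), Cmd, Alloca);
    Alloca->Users.insert(User);
  }
  for (ToyCmd *Dep : Cmd->Deps)
    Dep->Users.erase(Cmd);
  delete Cmd;
}

int main() {
  auto *Alloca = new ToyCmd;
  auto *Mid = new ToyCmd{{Alloca}, {}};
  auto *User = new ToyCmd{{Mid}, {}};
  Alloca->Users.insert(Mid);
  Mid->Users.insert(User);
  eraseKeepingAlloca(Mid, Alloca); // now: Alloca <- User
  delete User;
  delete Alloca;
}
```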
diff --git a/sycl/source/detail/scheduler/graph_processor.cpp b/sycl/source/detail/scheduler/graph_processor.cpp
index 69dbb626f9dc2..6e533df30a09c 100644
--- a/sycl/source/detail/scheduler/graph_processor.cpp
+++ b/sycl/source/detail/scheduler/graph_processor.cpp
@@ -23,6 +23,7 @@ static Command *getCommand(const EventImplPtr &Event) {
 
 void Scheduler::GraphProcessor::waitForEvent(EventImplPtr Event,
                                              ReadLockT &GraphReadLock,
+                                             std::vector<Command *> &ToCleanUp,
                                              bool LockTheLock) {
   Command *Cmd = getCommand(Event);
   // Command can be nullptr if user creates cl::sycl::event explicitly or the
@@ -31,7 +32,7 @@ void Scheduler::GraphProcessor::waitForEvent(EventImplPtr Event,
     return;
 
   EnqueueResultT Res;
-  bool Enqueued = enqueueCommand(Cmd, Res, BLOCKING);
+  bool Enqueued = enqueueCommand(Cmd, Res, ToCleanUp, BLOCKING);
   if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult)
     // TODO: Reschedule commands.
     throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION);
@@ -45,9 +46,9 @@ void Scheduler::GraphProcessor::waitForEvent(EventImplPtr Event,
     GraphReadLock.lock();
 }
 
-bool Scheduler::GraphProcessor::enqueueCommand(Command *Cmd,
-                                               EnqueueResultT &EnqueueResult,
-                                               BlockingT Blocking) {
+bool Scheduler::GraphProcessor::enqueueCommand(
+    Command *Cmd, EnqueueResultT &EnqueueResult,
+    std::vector<Command *> &ToCleanUp, BlockingT Blocking) {
   if (!Cmd || Cmd->isSuccessfullyEnqueued())
     return true;
 
@@ -60,7 +61,7 @@ bool Scheduler::GraphProcessor::enqueueCommand(
   // Recursively enqueue all the dependencies first and
   // exit immediately if any of the commands cannot be enqueued.
   for (DepDesc &Dep : Cmd->MDeps) {
-    if (!enqueueCommand(Dep.MDepCommand, EnqueueResult, Blocking))
+    if (!enqueueCommand(Dep.MDepCommand, EnqueueResult, ToCleanUp, Blocking))
       return false;
   }
 
@@ -76,7 +77,7 @@ bool Scheduler::GraphProcessor::enqueueCommand(
   // implemented.
   for (const EventImplPtr &Event : Cmd->getPreparedHostDepsEvents()) {
     if (Command *DepCmd = static_cast<Command *>(Event->getCommand()))
-      if (!enqueueCommand(DepCmd, EnqueueResult, Blocking))
+      if (!enqueueCommand(DepCmd, EnqueueResult, ToCleanUp, Blocking))
         return false;
   }
 
@@ -93,7 +94,7 @@ bool Scheduler::GraphProcessor::enqueueCommand(
   // on completion of C and starts cleanup process. This thread is still in the
   // middle of enqueue of B. The other thread modifies dependency list of A by
   // removing C out of it. Iterators become invalid.
-  return Cmd->enqueue(EnqueueResult, Blocking);
+  return Cmd->enqueue(EnqueueResult, Blocking, ToCleanUp);
 }
 
 } // namespace detail
diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp
index c17beb8a3621d..232ee0a5d6e47 100644
--- a/sycl/source/detail/scheduler/scheduler.cpp
+++ b/sycl/source/detail/scheduler/scheduler.cpp
@@ -32,31 +32,32 @@ void Scheduler::waitForRecordToFinish(MemObjRecord *Record,
   // Will contain the list of dependencies for the Release Command
   std::set<Command *> DepCommands;
 #endif
+  std::vector<Command *> ToCleanUp;
   for (Command *Cmd : Record->MReadLeaves) {
     EnqueueResultT Res;
-    bool Enqueued = GraphProcessor::enqueueCommand(Cmd, Res);
+    bool Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, ToCleanUp);
     if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult)
       throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION);
 #ifdef XPTI_ENABLE_INSTRUMENTATION
     // Capture the dependencies
     DepCommands.insert(Cmd);
 #endif
-    GraphProcessor::waitForEvent(Cmd->getEvent(), GraphReadLock);
+    GraphProcessor::waitForEvent(Cmd->getEvent(), GraphReadLock, ToCleanUp);
   }
   for (Command *Cmd : Record->MWriteLeaves) {
     EnqueueResultT Res;
-    bool Enqueued = GraphProcessor::enqueueCommand(Cmd, Res);
+    bool Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, ToCleanUp);
     if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult)
       throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION);
 #ifdef XPTI_ENABLE_INSTRUMENTATION
     DepCommands.insert(Cmd);
 #endif
-    GraphProcessor::waitForEvent(Cmd->getEvent(), GraphReadLock);
+    GraphProcessor::waitForEvent(Cmd->getEvent(), GraphReadLock, ToCleanUp);
   }
   for (AllocaCommandBase *AllocaCmd : Record->MAllocaCommands) {
     Command *ReleaseCmd = AllocaCmd->getReleaseCmd();
     EnqueueResultT Res;
-    bool Enqueued = GraphProcessor::enqueueCommand(ReleaseCmd, Res);
+    bool Enqueued = GraphProcessor::enqueueCommand(ReleaseCmd, Res, ToCleanUp);
     if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult)
       throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION);
 #ifdef XPTI_ENABLE_INSTRUMENTATION
@@ -64,7 +65,8 @@ void Scheduler::waitForRecordToFinish(MemObjRecord *Record,
     // reported as edges
     ReleaseCmd->resolveReleaseDependencies(DepCommands);
 #endif
-    GraphProcessor::waitForEvent(ReleaseCmd->getEvent(), GraphReadLock);
+    GraphProcessor::waitForEvent(ReleaseCmd->getEvent(), GraphReadLock,
+                                 ToCleanUp);
   }
 }
@@ -108,6 +110,7 @@ EventImplPtr Scheduler::addCG(std::unique_ptr<detail::CG> CommandGroup,
     NewEvent = NewCmd->getEvent();
   }
 
+  std::vector<Command *> ToCleanUp;
   {
     ReadLockT Lock(MGraphLock);
@@ -127,7 +130,7 @@ EventImplPtr Scheduler::addCG(std::unique_ptr<detail::CG> CommandGroup,
     };
 
     for (Command *Cmd : AuxiliaryCmds) {
-      Enqueued = GraphProcessor::enqueueCommand(Cmd, Res);
+      Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, ToCleanUp);
       try {
         if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult)
           throw runtime_error("Auxiliary enqueue process failed.",
@@ -144,7 +147,7 @@ EventImplPtr Scheduler::addCG(std::unique_ptr<detail::CG> CommandGroup,
       // TODO: Check if lazy mode.
       EnqueueResultT Res;
      try {
-        bool Enqueued = GraphProcessor::enqueueCommand(NewCmd, Res);
+        bool Enqueued = GraphProcessor::enqueueCommand(NewCmd, Res, ToCleanUp);
        if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult)
          throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION);
      } catch (...) {
        CleanUp();
      }
    }
  }
+  cleanupCommands(ToCleanUp);
 
   for (auto StreamImplPtr : Streams) {
     StreamImplPtr->flush();
@@ -182,24 +186,27 @@ EventImplPtr Scheduler::addCopyBack(Requirement *Req) {
     return nullptr;
   }
 
+  std::vector<Command *> ToCleanUp;
   try {
     ReadLockT Lock(MGraphLock);
     EnqueueResultT Res;
     bool Enqueued;
 
     for (Command *Cmd : AuxiliaryCmds) {
-      Enqueued = GraphProcessor::enqueueCommand(Cmd, Res);
+      Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, ToCleanUp);
       if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult)
         throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION);
     }
 
-    Enqueued = GraphProcessor::enqueueCommand(NewCmd, Res);
+    Enqueued = GraphProcessor::enqueueCommand(NewCmd, Res, ToCleanUp);
     if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult)
       throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION);
   } catch (...) {
     NewCmd->getQueue()->reportAsyncException(std::current_exception());
   }
-  return NewCmd->getEvent();
+  EventImplPtr NewEvent = NewCmd->getEvent();
+  cleanupCommands(ToCleanUp);
+  return NewEvent;
 }
 
 Scheduler &Scheduler::getInstance() {
@@ -210,7 +217,10 @@ void Scheduler::waitForEvent(EventImplPtr Event) {
   ReadLockT Lock(MGraphLock);
   // It's fine to leave the lock unlocked upon return from waitForEvent as
   // there's no more actions to do here with graph
-  GraphProcessor::waitForEvent(std::move(Event), Lock, /*LockTheLock=*/false);
+  std::vector<Command *> ToCleanUp;
+  GraphProcessor::waitForEvent(std::move(Event), Lock, ToCleanUp,
+                               /*LockTheLock=*/false);
+  cleanupCommands(ToCleanUp);
 }
 
 static void deallocateStreams(
@@ -293,44 +303,51 @@ EventImplPtr Scheduler::addHostAccessor(Requirement *Req) {
   if (!NewCmd)
     return nullptr;
 
+  std::vector<Command *> ToCleanUp;
   {
     ReadLockT ReadLock(MGraphLock);
     EnqueueResultT Res;
     bool Enqueued;
 
     for (Command *Cmd : AuxiliaryCmds) {
-      Enqueued = GraphProcessor::enqueueCommand(Cmd, Res);
+      Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, ToCleanUp);
       if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult)
         throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION);
     }
 
-    Enqueued = GraphProcessor::enqueueCommand(NewCmd, Res);
+    Enqueued = GraphProcessor::enqueueCommand(NewCmd, Res, ToCleanUp);
     if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult)
       throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION);
   }
 
-  return NewCmd->getEvent();
+  EventImplPtr NewEvent = NewCmd->getEvent();
+  cleanupCommands(ToCleanUp);
+  return NewEvent;
 }
 
 void Scheduler::releaseHostAccessor(Requirement *Req) {
   Command *const BlockedCmd = Req->MBlockedCmd;
 
-  ReadLockT Lock(MGraphLock);
+  std::vector<Command *> ToCleanUp;
+  {
+    ReadLockT Lock(MGraphLock);
 
-  assert(BlockedCmd && "Can't find appropriate command to unblock");
+    assert(BlockedCmd && "Can't find appropriate command to unblock");
 
-  BlockedCmd->MEnqueueStatus = EnqueueResultT::SyclEnqueueReady;
+    BlockedCmd->MEnqueueStatus = EnqueueResultT::SyclEnqueueReady;
 
-  enqueueLeavesOfReqUnlocked(Req);
+    enqueueLeavesOfReqUnlocked(Req, ToCleanUp);
+  }
+  cleanupCommands(ToCleanUp);
 }
 
-// static
-void Scheduler::enqueueLeavesOfReqUnlocked(const Requirement *const Req) {
+void Scheduler::enqueueLeavesOfReqUnlocked(const Requirement *const Req,
+                                           std::vector<Command *> &ToCleanUp) {
   MemObjRecord *Record = Req->MSYCLMemObj->MRecord.get();
-  auto EnqueueLeaves = [](LeavesCollection &Leaves) {
+  auto EnqueueLeaves = [&ToCleanUp](LeavesCollection &Leaves) {
     for (Command *Cmd : Leaves) {
       EnqueueResultT Res;
-      bool Enqueued = GraphProcessor::enqueueCommand(Cmd, Res);
+      bool Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, ToCleanUp);
       if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult)
         throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION);
     }
@@ -379,6 +396,10 @@ Scheduler::~Scheduler() {
         "not all resources were released. Please be sure that all kernels "
         "have synchronization points.\n\n");
   }
+  // There might be some commands scheduled for post enqueue cleanup that
+  // haven't been freed because of the graph mutex being locked at the time;
+  // clean them up now.
+  cleanupCommands({});
 }
 
 void Scheduler::acquireWriteLock(WriteLockT &Lock) {
@@ -406,6 +427,32 @@ MemObjRecord *Scheduler::getMemObjRecord(const Requirement *const Req) {
   return Req->MSYCLMemObj->MRecord.get();
 }
 
+void Scheduler::cleanupCommands(const std::vector<Command *> &Cmds) {
+  if (Cmds.empty()) {
+    // Even without new commands, drain any previously deferred ones; the
+    // destructor relies on this to free leftovers.
+    std::lock_guard<std::mutex> Lock{MDeferredCleanupMutex};
+    if (MDeferredCleanupCommands.empty())
+      return;
+  }
+
+  WriteLockT Lock(MGraphLock, std::try_to_lock);
+  // In order to avoid deadlocks related to blocked commands, defer cleanup if
+  // the lock wasn't acquired.
+  if (Lock.owns_lock()) {
+    for (Command *Cmd : Cmds) {
+      MGraphBuilder.cleanupCommand(Cmd);
+    }
+    std::vector<Command *> DeferredCleanupCommands;
+    {
+      std::lock_guard<std::mutex> Lock{MDeferredCleanupMutex};
+      std::swap(DeferredCleanupCommands, MDeferredCleanupCommands);
+    }
+    for (Command *Cmd : DeferredCleanupCommands) {
+      MGraphBuilder.cleanupCommand(Cmd);
+    }
+  } else {
+    std::lock_guard<std::mutex> Lock{MDeferredCleanupMutex};
+    MDeferredCleanupCommands.insert(MDeferredCleanupCommands.end(),
+                                    Cmds.begin(), Cmds.end());
+  }
+}
+
 } // namespace detail
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
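The `std::try_to_lock` in `cleanupCommands` above is the deadlock-avoidance core: a thread that fails to take the graph lock (possibly because it already holds it further up its own call stack) defers the deletions instead of blocking. A standalone sketch of the same pattern under simplified types (`Node` and `GraphLock` are stand-ins, not the real classes):

```cpp
#include <mutex>
#include <shared_mutex>
#include <vector>

struct Node {}; // stand-in for Command

std::shared_timed_mutex GraphLock; // stand-in for the scheduler's RW lock
std::mutex DeferredMutex;
std::vector<Node *> Deferred;

void cleanupNodes(const std::vector<Node *> &Nodes) {
  std::unique_lock<std::shared_timed_mutex> Lock(GraphLock, std::try_to_lock);
  if (!Lock.owns_lock()) {
    // Graph busy, possibly locked further up this call stack: defer instead
    // of deadlocking.
    std::lock_guard<std::mutex> Guard(DeferredMutex);
    Deferred.insert(Deferred.end(), Nodes.begin(), Nodes.end());
    return;
  }
  for (Node *N : Nodes)
    delete N;
  std::vector<Node *> Drained;
  {
    std::lock_guard<std::mutex> Guard(DeferredMutex);
    std::swap(Drained, Deferred);
  }
  for (Node *N : Drained) // drain anything deferred earlier, too
    delete N;
}

int main() { cleanupNodes({new Node, new Node}); }
```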
diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp
index 66e17d7862301..63e2cccaabfd2 100644
--- a/sycl/source/detail/scheduler/scheduler.hpp
+++ b/sycl/source/detail/scheduler/scheduler.hpp
@@ -459,7 +459,10 @@ class Scheduler {
   /// \param Lock is an instance of WriteLockT, created with \c std::defer_lock
   void acquireWriteLock(WriteLockT &Lock);
 
-  static void enqueueLeavesOfReqUnlocked(const Requirement *const Req);
+  void cleanupCommands(const std::vector<Command *> &Cmds);
+
+  static void enqueueLeavesOfReqUnlocked(const Requirement *const Req,
+                                         std::vector<Command *> &ToCleanUp);
 
   /// Graph builder class.
   ///
@@ -505,6 +508,8 @@ class Scheduler {
     /// with Event passed and its dependencies.
     void optimize(EventImplPtr Event);
 
+    void cleanupCommand(Command *Cmd);
+
     /// Removes finished non-leaf non-alloca commands from the subgraph
     /// (assuming that all its commands have been waited for).
     void cleanupFinishedCommands(
@@ -547,17 +552,21 @@ class Scheduler {
     /// Removes commands from leaves.
     void updateLeaves(const std::set<Command *> &Cmds, MemObjRecord *Record,
-                      access::mode AccessMode);
+                      access::mode AccessMode,
+                      std::vector<Command *> &ToCleanUp);
 
     /// Perform connection of events in multiple contexts
     /// \param Cmd dependant command
     /// \param DepEvent event to depend on
     /// \param Dep optional DepDesc to perform connection properly
+    /// \param ToCleanUp container for commands that can be cleaned up due to
+    /// their removal from leaves
     /// \returns the connecting command which is to be enqueued
     ///
     /// Optionality of Dep is set by Dep.MDepCommand equal to nullptr.
     Command *connectDepEvent(Command *const Cmd, EventImplPtr DepEvent,
-                             const DepDesc &Dep);
+                             const DepDesc &Dep,
+                             std::vector<Command *> &ToCleanUp);
 
     std::vector<SYCLMemObjI *> MMemObjs;
@@ -720,21 +729,25 @@ class Scheduler {
   public:
     /// Waits for the command, associated with Event passed, is completed.
     /// \param GraphReadLock read-lock which is already acquired for reading
+    /// \param ToCleanUp container for commands that can be cleaned up.
    /// \param LockTheLock selects if graph lock should be locked upon return
     ///
     /// The function may unlock and lock GraphReadLock as needed. Upon return
     /// the lock is left in locked state if and only if LockTheLock is true.
     static void waitForEvent(EventImplPtr Event, ReadLockT &GraphReadLock,
+                             std::vector<Command *> &ToCleanUp,
                              bool LockTheLock = true);
 
     /// Enqueues the command and all its dependencies.
     ///
     /// \param EnqueueResult is set to specific status if enqueue failed.
+    /// \param ToCleanUp container for commands that can be cleaned up.
     /// \return true if the command is successfully enqueued.
     ///
     /// The function may unlock and lock GraphReadLock as needed. Upon return
     /// the lock is left in locked state.
     static bool enqueueCommand(Command *Cmd, EnqueueResultT &EnqueueResult,
+                               std::vector<Command *> &ToCleanUp,
                                BlockingT Blocking = NON_BLOCKING);
   };
@@ -751,6 +764,9 @@ class Scheduler {
   GraphBuilder MGraphBuilder;
   RWLockT MGraphLock;
 
+  std::vector<Command *> MDeferredCleanupCommands;
+  std::mutex MDeferredCleanupMutex;
+
   QueueImplPtr DefaultHostQueue;
 
   friend class Command;
diff --git a/sycl/unittests/scheduler/BlockedCommands.cpp b/sycl/unittests/scheduler/BlockedCommands.cpp
index 967b3ee75531c..c45b785980c8b 100644
--- a/sycl/unittests/scheduler/BlockedCommands.cpp
+++ b/sycl/unittests/scheduler/BlockedCommands.cpp
@@ -81,10 +81,10 @@ TEST_F(SchedulerTest, DontEnqueueDepsIfOneOfThemIsBlocked) {
   //
   // If C is blocked, we should not try to enqueue D.
 
-  EXPECT_CALL(A, enqueue(_, _)).Times(0);
-  EXPECT_CALL(B, enqueue(_, _)).Times(1);
-  EXPECT_CALL(C, enqueue(_, _)).Times(0);
-  EXPECT_CALL(D, enqueue(_, _)).Times(0);
+  EXPECT_CALL(A, enqueue).Times(0);
+  EXPECT_CALL(B, enqueue).Times(1);
+  EXPECT_CALL(C, enqueue).Times(0);
+  EXPECT_CALL(D, enqueue).Times(0);
 
   MockScheduler MS;
   auto Lock = MS.acquireGraphReadLock();
@@ -113,8 +113,8 @@ TEST_F(SchedulerTest, EnqueueBlockedCommandEarlyExit) {
   //
   // If A is blocked, we should not try to enqueue B.
 
-  EXPECT_CALL(A, enqueue(_, _)).Times(0);
-  EXPECT_CALL(B, enqueue(_, _)).Times(0);
+  EXPECT_CALL(A, enqueue).Times(0);
+  EXPECT_CALL(B, enqueue).Times(0);
 
   MockScheduler MS;
   auto Lock = MS.acquireGraphReadLock();
@@ -127,8 +127,8 @@ TEST_F(SchedulerTest, EnqueueBlockedCommandEarlyExit) {
 
   // But if the enqueue type is blocking we should not exit early.
 
-  EXPECT_CALL(A, enqueue(_, _)).Times(0);
-  EXPECT_CALL(B, enqueue(_, _)).Times(1);
+  EXPECT_CALL(A, enqueue).Times(0);
+  EXPECT_CALL(B, enqueue).Times(1);
 
   Enqueued = MockScheduler::enqueueCommand(&A, Res, detail::BLOCKING);
   ASSERT_FALSE(Enqueued) << "Blocked command should not be enqueued\n";
@@ -154,7 +154,8 @@ TEST_F(SchedulerTest, EnqueueHostDependency) {
       new cl::sycl::detail::event_impl(detail::getSyclObjImpl(MQueue))};
   DepEvent->setCommand(&B);
 
-  (void)A.addDep(DepEvent);
+  std::vector<detail::Command *> ToCleanUp;
+  (void)A.addDep(DepEvent, ToCleanUp);
 
   // We have such a "graph":
   //
@@ -166,8 +167,8 @@ TEST_F(SchedulerTest, EnqueueHostDependency) {
   // "Graph" is quoted as we don't have this dependency in MDeps. Instead, we
   // have this dependecy as result of handler::depends_on() call.
-  EXPECT_CALL(A, enqueue(_, _)).Times(1);
-  EXPECT_CALL(B, enqueue(_, _)).Times(1);
+  EXPECT_CALL(A, enqueue).Times(1);
+  EXPECT_CALL(B, enqueue).Times(1);
 
   MockScheduler MS;
   auto Lock = MS.acquireGraphReadLock();
diff --git a/sycl/unittests/scheduler/CMakeLists.txt b/sycl/unittests/scheduler/CMakeLists.txt
index 1ec8c7b4d894a..ceb7c8990c3f2 100644
--- a/sycl/unittests/scheduler/CMakeLists.txt
+++ b/sycl/unittests/scheduler/CMakeLists.txt
@@ -16,5 +16,6 @@ add_sycl_unittest(SchedulerTests OBJECT
   AllocaLinking.cpp
   RequiredWGSize.cpp
   QueueFlushing.cpp
+  PostEnqueueCleanup.cpp
   utils.cpp
 )
diff --git a/sycl/unittests/scheduler/FailedCommands.cpp b/sycl/unittests/scheduler/FailedCommands.cpp
index 37a7a71a4afdc..4d294b8eff741 100644
--- a/sycl/unittests/scheduler/FailedCommands.cpp
+++ b/sycl/unittests/scheduler/FailedCommands.cpp
@@ -16,7 +16,8 @@ TEST_F(SchedulerTest, FailedDependency) {
   MockCommand MDep(detail::getSyclObjImpl(MQueue));
   MockCommand MUser(detail::getSyclObjImpl(MQueue));
   MDep.addUser(&MUser);
-  (void)MUser.addDep(detail::DepDesc{&MDep, &MockReq, nullptr});
+  std::vector<detail::Command *> ToCleanUp;
+  (void)MUser.addDep(detail::DepDesc{&MDep, &MockReq, nullptr}, ToCleanUp);
   MUser.MEnqueueStatus = detail::EnqueueResultT::SyclEnqueueReady;
   MDep.MEnqueueStatus = detail::EnqueueResultT::SyclEnqueueFailed;
diff --git a/sycl/unittests/scheduler/LeafLimit.cpp b/sycl/unittests/scheduler/LeafLimit.cpp
index ffed74ba0e1ec..333a06dca5bd3 100644
--- a/sycl/unittests/scheduler/LeafLimit.cpp
+++ b/sycl/unittests/scheduler/LeafLimit.cpp
@@ -9,6 +9,9 @@
 #include "SchedulerTest.hpp"
 #include "SchedulerTestUtils.hpp"
 
+#include <detail/config.hpp>
+#include <helpers/ScopedEnvVar.hpp>
+
 #include <algorithm>
 #include <memory>
 #include <vector>
@@ -16,10 +19,19 @@
 
 using namespace cl::sycl;
 
+inline constexpr auto DisablePostEnqueueCleanupName =
+    "SYCL_DISABLE_POST_ENQUEUE_CLEANUP";
+
 // Checks that scheduler's (or graph-builder's) addNodeToLeaves method works
 // correctly with dependency tracking when leaf-limit for generic commands is
 // overflowed.
 TEST_F(SchedulerTest, LeafLimit) {
+  // All of the mock commands are owned on the test side, prevent post enqueue
+  // cleanup from deleting some of them.
+  unittest::ScopedEnvVar DisabledCleanup{
+      DisablePostEnqueueCleanupName, "1",
+      detail::SYCLConfig<detail::SYCL_DISABLE_POST_ENQUEUE_CLEANUP>::reset};
+  cl::sycl::queue HQueue(host_selector{});
   MockScheduler MS;
   std::vector<std::unique_ptr<MockCommand>> LeavesToAdd;
   std::unique_ptr<MockCommand> MockDepCmd;
@@ -40,10 +52,12 @@ TEST_F(SchedulerTest, LeafLimit) {
         std::make_unique<MockCommand>(detail::getSyclObjImpl(MQueue), MockReq));
   }
   // Create edges: all soon-to-be leaves are direct users of MockDep
+  std::vector<detail::Command *> ToCleanUp;
   for (auto &Leaf : LeavesToAdd) {
     MockDepCmd->addUser(Leaf.get());
     (void)Leaf->addDep(
-        detail::DepDesc{MockDepCmd.get(), Leaf->getRequirement(), nullptr});
+        detail::DepDesc{MockDepCmd.get(), Leaf->getRequirement(), nullptr},
+        ToCleanUp);
   }
   std::vector<detail::Command *> ToEnqueue;
   // Add edges as leaves and exceed the leaf limit
diff --git a/sycl/unittests/scheduler/PostEnqueueCleanup.cpp b/sycl/unittests/scheduler/PostEnqueueCleanup.cpp
new file mode 100644
index 0000000000000..8386af6394628
--- /dev/null
+++ b/sycl/unittests/scheduler/PostEnqueueCleanup.cpp
@@ -0,0 +1,280 @@
+//==--------- PostEnqueueCleanup.cpp --- Scheduler unit tests -------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
diff --git a/sycl/unittests/scheduler/PostEnqueueCleanup.cpp b/sycl/unittests/scheduler/PostEnqueueCleanup.cpp
new file mode 100644
index 0000000000000..8386af6394628
--- /dev/null
+++ b/sycl/unittests/scheduler/PostEnqueueCleanup.cpp
@@ -0,0 +1,280 @@
+//==--------- PostEnqueueCleanup.cpp --- Scheduler unit tests --------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SchedulerTest.hpp"
+#include "SchedulerTestUtils.hpp"
+
+#include <detail/config.hpp>
+#include <helpers/PiMock.hpp>
+#include <helpers/ScopedEnvVar.hpp>
+
+#include <algorithm>
+#include <functional>
+#include <iostream>
+#include <memory>
+
+using namespace sycl;
+
+inline constexpr auto HostUnifiedMemoryName = "SYCL_HOST_UNIFIED_MEMORY";
+
+int val;
+static pi_result redefinedEnqueueMemBufferMap(
+    pi_queue command_queue, pi_mem buffer, pi_bool blocking_map,
+    pi_map_flags map_flags, size_t offset, size_t size,
+    pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list,
+    pi_event *event, void **ret_map) {
+  *event = reinterpret_cast<pi_event>(new int{});
+  *ret_map = &val;
+  return PI_SUCCESS;
+}
+
+static pi_result redefinedEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj,
+                                          void *mapped_ptr,
+                                          pi_uint32 num_events_in_wait_list,
+                                          const pi_event *event_wait_list,
+                                          pi_event *event) {
+  *event = reinterpret_cast<pi_event>(new int{});
+  return PI_SUCCESS;
+}
+
+static pi_result redefinedEnqueueMemBufferFill(
+    pi_queue command_queue, pi_mem buffer, const void *pattern,
+    size_t pattern_size, size_t offset, size_t size,
+    pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list,
+    pi_event *event) {
+  *event = reinterpret_cast<pi_event>(new int{});
+  return PI_SUCCESS;
+}
+
+static void verifyCleanup(detail::MemObjRecord *Record,
+                          detail::AllocaCommandBase *AllocaCmd,
+                          detail::Command *DeletedCmd, bool &CmdDeletedFlag) {
+  EXPECT_TRUE(CmdDeletedFlag);
+  CmdDeletedFlag = false;
+  EXPECT_EQ(
+      std::find(AllocaCmd->MUsers.begin(), AllocaCmd->MUsers.end(), DeletedCmd),
+      AllocaCmd->MUsers.end());
+  detail::Command *Leaf = *Record->MWriteLeaves.begin();
+  EXPECT_FALSE(std::any_of(Leaf->MDeps.begin(), Leaf->MDeps.end(),
+                           [&](const detail::DepDesc &Dep) {
+                             return Dep.MDepCommand == DeletedCmd;
+                           }));
+}
+
+// Check that any non-leaf commands enqueued as part of high level scheduler
+// calls are cleaned up.
+static void checkCleanupOnEnqueue(MockScheduler &MS,
+                                  detail::QueueImplPtr &QueueImpl,
+                                  buffer<int> &Buf,
+                                  detail::Requirement &MockReq) {
+  bool CommandDeleted = false;
+  std::vector<detail::Command *> AuxCmds;
+  std::vector<detail::Command *> ToCleanUp;
+  std::vector<detail::Command *> ToEnqueue;
+  detail::MemObjRecord *Record =
+      MS.getOrInsertMemObjRecord(QueueImpl, &MockReq, AuxCmds);
+  detail::AllocaCommandBase *AllocaCmd =
+      MS.getOrCreateAllocaForReq(Record, &MockReq, QueueImpl, AuxCmds);
+  std::function<void()> Callback = [&CommandDeleted]() {
+    CommandDeleted = true;
+  };
+
+  // Check addCG.
+  MockCommand *MockCmd =
+      new MockCommandWithCallback(QueueImpl, MockReq, Callback);
+  (void)MockCmd->addDep(detail::DepDesc(AllocaCmd, &MockReq, nullptr),
+                        ToCleanUp);
+  EXPECT_TRUE(ToCleanUp.empty());
+  MS.addNodeToLeaves(Record, MockCmd, access::mode::read_write, ToEnqueue);
+  MS.updateLeaves({AllocaCmd}, Record, access::mode::read_write, ToCleanUp);
+
+  EXPECT_TRUE(ToCleanUp.empty());
+  std::unique_ptr<detail::CG> CG{new detail::CGFill(/*Pattern*/ {}, &MockReq,
+                                                    /*ArgsStorage*/ {},
+                                                    /*AccStorage*/ {},
+                                                    /*SharedPtrStorage*/ {},
+                                                    /*Requirements*/ {&MockReq},
+                                                    /*Events*/ {})};
+  detail::EventImplPtr Event = MS.addCG(std::move(CG), QueueImpl);
+  auto *Cmd = static_cast<detail::Command *>(Event->getCommand());
+  verifyCleanup(Record, AllocaCmd, MockCmd, CommandDeleted);
+
+  // Check add/releaseHostAccessor.
+  CommandDeleted = false;
+  MockCmd = new MockCommandWithCallback(QueueImpl, MockReq, Callback);
+  addEdge(MockCmd, Cmd, AllocaCmd);
+  MS.addNodeToLeaves(Record, MockCmd, access::mode::read_write, ToEnqueue);
+  MS.updateLeaves({Cmd}, Record, access::mode::read_write, ToCleanUp);
+  MS.addHostAccessor(&MockReq);
+  verifyCleanup(Record, AllocaCmd, MockCmd, CommandDeleted);
+
+  CommandDeleted = false;
+  MockCmd = new MockCommandWithCallback(QueueImpl, MockReq, Callback);
+  addEdge(MockCmd, AllocaCmd, AllocaCmd);
+  MockCommand *LeafMockCmd =
+      new MockCommandWithCallback(QueueImpl, MockReq, Callback);
+  addEdge(LeafMockCmd, MockCmd, AllocaCmd);
+  MS.addNodeToLeaves(Record, LeafMockCmd, access::mode::read_write, ToEnqueue);
+  MS.releaseHostAccessor(&MockReq);
+  MockReq.MBlockedCmd = nullptr;
+  verifyCleanup(Record, AllocaCmd, MockCmd, CommandDeleted);
+
+  auto addNewMockCmds = [&]() -> MockCommand * {
+    CommandDeleted = false;
+    MockCmd = LeafMockCmd;
+    LeafMockCmd = new MockCommandWithCallback(QueueImpl, MockReq, Callback);
+    addEdge(LeafMockCmd, MockCmd, AllocaCmd);
+    MS.addNodeToLeaves(Record, LeafMockCmd, access::mode::read_write,
+                       ToEnqueue);
+    // Since this mock command has already been enqueued, it's expected to be
+    // cleaned up during removal from leaves.
+    ToCleanUp.clear();
+    MS.updateLeaves({MockCmd}, Record, access::mode::read_write, ToCleanUp);
+    EXPECT_EQ(ToCleanUp.size(), 1U);
+    EXPECT_EQ(ToCleanUp[0], MockCmd);
+    MS.cleanupCommands({MockCmd});
+    verifyCleanup(Record, AllocaCmd, MockCmd, CommandDeleted);
+    CommandDeleted = false;
+    MockCmd = LeafMockCmd;
+    LeafMockCmd = new MockCommandWithCallback(QueueImpl, MockReq, Callback);
+    addEdge(LeafMockCmd, MockCmd, AllocaCmd);
+    MS.addNodeToLeaves(Record, LeafMockCmd, access::mode::read_write,
+                       ToEnqueue);
+    MS.updateLeaves({MockCmd}, Record, access::mode::read_write, ToCleanUp);
+    return MockCmd;
+  };
+
+  // Check waitForEvent
+  MockCmd = addNewMockCmds();
+  MS.waitForEvent(LeafMockCmd->getEvent());
+  verifyCleanup(Record, AllocaCmd, MockCmd, CommandDeleted);
+
+  // Check addCopyBack
+  MockCmd = addNewMockCmds();
+  LeafMockCmd->getEvent()->getHandleRef() =
+      reinterpret_cast<pi_event>(new int{});
+  MS.addCopyBack(&MockReq);
+  verifyCleanup(Record, AllocaCmd, MockCmd, CommandDeleted);
+
+  MS.removeRecordForMemObj(detail::getSyclObjImpl(Buf).get());
+}
+
+static void checkCleanupOnLeafUpdate(
+    MockScheduler &MS, detail::QueueImplPtr &QueueImpl, buffer<int> &Buf,
+    detail::Requirement &MockReq,
+    std::function<void(detail::MemObjRecord *)> SchedulerCall) {
+  bool CommandDeleted = false;
+  std::vector<detail::Command *> AuxCmds;
+  std::vector<detail::Command *> ToCleanUp;
+  std::vector<detail::Command *> ToEnqueue;
+  detail::MemObjRecord *Record =
+      MS.getOrInsertMemObjRecord(QueueImpl, &MockReq, AuxCmds);
+  detail::AllocaCommandBase *AllocaCmd =
+      MS.getOrCreateAllocaForReq(Record, &MockReq, QueueImpl, AuxCmds);
+  std::function<void()> Callback = [&CommandDeleted]() {
+    CommandDeleted = true;
+  };
+
+  // Add a mock command as a leaf and enqueue it.
+  MockCommand *MockCmd =
+      new MockCommandWithCallback(QueueImpl, MockReq, Callback);
+  (void)MockCmd->addDep(detail::DepDesc(AllocaCmd, &MockReq, nullptr),
+                        ToCleanUp);
+  EXPECT_TRUE(ToCleanUp.empty());
+  MS.addNodeToLeaves(Record, MockCmd, access::mode::read_write, ToEnqueue);
+  MS.updateLeaves({AllocaCmd}, Record, access::mode::read_write, ToCleanUp);
+  detail::EnqueueResultT Res;
+  MockScheduler::enqueueCommand(MockCmd, Res, detail::BLOCKING);
+
+  EXPECT_FALSE(CommandDeleted);
+  SchedulerCall(Record);
+  EXPECT_TRUE(CommandDeleted);
+  MS.removeRecordForMemObj(detail::getSyclObjImpl(Buf).get());
+}
+
+TEST_F(SchedulerTest, PostEnqueueCleanup) {
+  default_selector Selector;
+  platform Plt{default_selector()};
+  if (Plt.is_host()) {
+    std::cout << "Not run due to host-only environment\n";
+    return;
+  }
+
+  // Enforce creation of linked commands to test all sites of calling cleanup.
+  unittest::ScopedEnvVar HostUnifiedMemoryVar{
+      HostUnifiedMemoryName, "1",
+      detail::SYCLConfig<detail::SYCL_HOST_UNIFIED_MEMORY>::reset};
+  unittest::PiMock Mock{Plt};
+  setupDefaultMockAPIs(Mock);
+  Mock.redefine<detail::PiApiKind::piEnqueueMemBufferMap>(
+      redefinedEnqueueMemBufferMap);
+  Mock.redefine<detail::PiApiKind::piEnqueueMemUnmap>(redefinedEnqueueMemUnmap);
+  Mock.redefine<detail::PiApiKind::piEnqueueMemBufferFill>(
+      redefinedEnqueueMemBufferFill);
+
+  context Ctx{Plt};
+  queue Queue{Ctx, Selector};
+  detail::QueueImplPtr QueueImpl = detail::getSyclObjImpl(Queue);
+  MockScheduler MS;
+
+  buffer<int> Buf{range<1>(1)};
+  std::shared_ptr<detail::buffer_impl> BufImpl = detail::getSyclObjImpl(Buf);
+  detail::Requirement MockReq = getMockRequirement(Buf);
+  MockReq.MDims = 1;
+  MockReq.MSYCLMemObj = BufImpl.get();
+
+  checkCleanupOnEnqueue(MS, QueueImpl, Buf, MockReq);
+  std::vector<detail::Command *> ToEnqueue;
+  checkCleanupOnLeafUpdate(MS, QueueImpl, Buf, MockReq,
+                           [&](detail::MemObjRecord *Record) {
+                             MS.decrementLeafCountersForRecord(Record);
+                           });
+  checkCleanupOnLeafUpdate(
+      MS, QueueImpl, Buf, MockReq, [&](detail::MemObjRecord *Record) {
+        MS.insertMemoryMove(Record, &MockReq, QueueImpl, ToEnqueue);
+      });
+  checkCleanupOnLeafUpdate(MS, QueueImpl, Buf, MockReq,
+                           [&](detail::MemObjRecord *Record) {
+                             Record->MMemModified = true;
+                             MS.addCopyBack(&MockReq, ToEnqueue);
+                           });
+  checkCleanupOnLeafUpdate(
+      MS, QueueImpl, Buf, MockReq, [&](detail::MemObjRecord *Record) {
+        detail::Command *Leaf = *Record->MWriteLeaves.begin();
+        MS.addEmptyCmd(Leaf, {&MockReq}, QueueImpl,
+                       detail::Command::BlockReason::HostTask, ToEnqueue);
+      });
+  device HostDevice;
+  detail::QueueImplPtr DefaultHostQueue{
+      new detail::queue_impl(detail::getSyclObjImpl(HostDevice), {}, {})};
+  checkCleanupOnLeafUpdate(
+      MS, DefaultHostQueue, Buf, MockReq, [&](detail::MemObjRecord *Record) {
+        MS.getOrCreateAllocaForReq(Record, &MockReq, QueueImpl, ToEnqueue);
+      });
+  // Check cleanup on exceeding leaf limit.
+  checkCleanupOnLeafUpdate(
+      MS, QueueImpl, Buf, MockReq, [&](detail::MemObjRecord *Record) {
+        std::vector<std::unique_ptr<MockCommand>> Leaves;
+        for (std::size_t I = 0;
+             I < Record->MWriteLeaves.genericCommandsCapacity(); ++I)
+          Leaves.push_back(std::make_unique<MockCommand>(QueueImpl, MockReq));
+
+        detail::AllocaCommandBase *AllocaCmd = Record->MAllocaCommands[0];
+        std::vector<detail::Command *> ToCleanUp;
+        for (std::unique_ptr<MockCommand> &MockCmd : Leaves) {
+          (void)MockCmd->addDep(detail::DepDesc(AllocaCmd, &MockReq, AllocaCmd),
+                                ToCleanUp);
+          MS.addNodeToLeaves(Record, MockCmd.get(), access::mode::read_write,
+                             ToEnqueue);
+        }
+        for (std::unique_ptr<MockCommand> &MockCmd : Leaves)
+          MS.updateLeaves({MockCmd.get()}, Record, access::mode::read_write,
+                          ToCleanUp);
+        EXPECT_TRUE(ToCleanUp.empty());
+      });
+}
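What the `MockCommandWithCallback` instances in the test above provide is a deletion probe: the scheduler owns and frees these commands, and the only observable side effect of that is the callback firing. The class itself is defined in SchedulerTestUtils.hpp; the sketch below shows only the assumed mechanism (callback invoked from the destructor), not the real class:

```c++
#include <cassert>
#include <functional>
#include <utility>

// Minimal deletion probe: runs the callback when the object is destroyed,
// letting a test assert that some owner really freed it.
struct DeletionProbe {
  std::function<void()> OnDelete;
  explicit DeletionProbe(std::function<void()> Cb) : OnDelete{std::move(Cb)} {}
  ~DeletionProbe() {
    if (OnDelete)
      OnDelete();
  }
};

// Usage mirroring the tests above (hypothetical driver code):
//   bool CommandDeleted = false;
//   auto *Probe = new DeletionProbe([&] { CommandDeleted = true; });
//   delete Probe;            // stands in for scheduler-side cleanup
//   assert(CommandDeleted);  // the deletion was observed
```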
diff --git a/sycl/unittests/scheduler/QueueFlushing.cpp b/sycl/unittests/scheduler/QueueFlushing.cpp
index 14e622dcdeb43..194d7c14fa59a 100644
--- a/sycl/unittests/scheduler/QueueFlushing.cpp
+++ b/sycl/unittests/scheduler/QueueFlushing.cpp
@@ -98,8 +98,9 @@ static void addDepAndEnqueue(detail::Command *Cmd,
                              detail::QueueImplPtr &DepQueue,
                              detail::Requirement &MockReq) {
   MockCommand DepCmd(DepQueue);
+  std::vector<detail::Command *> ToCleanUp;
   DepCmd.getEvent()->getHandleRef() = reinterpret_cast<pi_event>(new int{});
-  (void)Cmd->addDep(detail::DepDesc{&DepCmd, &MockReq, nullptr});
+  (void)Cmd->addDep(detail::DepDesc{&DepCmd, &MockReq, nullptr}, ToCleanUp);
   detail::EnqueueResultT Res;
   MockScheduler::enqueueCommand(Cmd, Res, detail::NON_BLOCKING);
@@ -159,6 +160,7 @@ TEST_F(SchedulerTest, QueueFlushing) {
   detail::AllocaCommand AllocaCmd = detail::AllocaCommand(QueueImplA, MockReq);
   void *MockHostPtr;
   detail::EnqueueResultT Res;
+  std::vector<detail::Command *> ToCleanUp;

   // Check that each of the non-blocking commands flush the dependency queue
   {
@@ -194,7 +196,8 @@ TEST_F(SchedulerTest, QueueFlushing) {
                                            /*Events*/ {})};
     detail::ExecCGCommand ExecCGCmd{std::move(CG), QueueImplA};
     MockReq.MDims = 1;
-    (void)ExecCGCmd.addDep(detail::DepDesc(&AllocaCmd, &MockReq, &AllocaCmd));
+    (void)ExecCGCmd.addDep(detail::DepDesc(&AllocaCmd, &MockReq, &AllocaCmd),
+                           ToCleanUp);
     testCommandEnqueue(&ExecCGCmd, QueueImplB, MockReq);
   }

@@ -206,7 +209,7 @@ TEST_F(SchedulerTest, QueueFlushing) {
     detail::EventImplPtr DepEvent{new detail::event_impl(QueueImplB)};
     DepEvent->setContextImpl(QueueImplB->getContextImplPtr());
     DepEvent->getHandleRef() = reinterpret_cast<pi_event>(new int{});
-    (void)Cmd.addDep(DepEvent);
+    (void)Cmd.addDep(DepEvent, ToCleanUp);
     MockScheduler::enqueueCommand(&Cmd, Res, detail::NON_BLOCKING);
     EXPECT_TRUE(QueueFlushed);
   }
@@ -224,7 +227,7 @@ TEST_F(SchedulerTest, QueueFlushing) {
       DepEvent->setContextImpl(TempQueueImpl->getContextImplPtr());
       DepEvent->getHandleRef() = reinterpret_cast<pi_event>(new int{});
     }
-    (void)Cmd.addDep(DepEvent);
+    (void)Cmd.addDep(DepEvent, ToCleanUp);
     MockScheduler::enqueueCommand(&Cmd, Res, detail::NON_BLOCKING);
     EXPECT_FALSE(EventStatusQueried);
     EXPECT_FALSE(QueueFlushed);
@@ -244,10 +247,10 @@ TEST_F(SchedulerTest, QueueFlushing) {
                                 access::mode::read_write};
     MockCommand DepCmdA(QueueImplB);
     DepCmdA.getEvent()->getHandleRef() = reinterpret_cast<pi_event>(new int{});
-    (void)Cmd.addDep(detail::DepDesc{&DepCmdA, &MockReq, nullptr});
+    (void)Cmd.addDep(detail::DepDesc{&DepCmdA, &MockReq, nullptr}, ToCleanUp);
     MockCommand DepCmdB(QueueImplB);
     DepCmdB.getEvent()->getHandleRef() = reinterpret_cast<pi_event>(new int{});
-    (void)Cmd.addDep(detail::DepDesc{&DepCmdB, &MockReq, nullptr});
+    (void)Cmd.addDep(detail::DepDesc{&DepCmdB, &MockReq, nullptr}, ToCleanUp);
     // The check is performed in redefinedQueueFlush
     MockScheduler::enqueueCommand(&Cmd, Res, detail::NON_BLOCKING);
   }
@@ -259,13 +262,13 @@ TEST_F(SchedulerTest, QueueFlushing) {
                                 access::mode::read_write};
     MockCommand DepCmd(QueueImplB);
     DepCmd.getEvent()->getHandleRef() = reinterpret_cast<pi_event>(new int{});
-    (void)CmdA.addDep(detail::DepDesc{&DepCmd, &MockReq, nullptr});
+    (void)CmdA.addDep(detail::DepDesc{&DepCmd, &MockReq, nullptr}, ToCleanUp);
     MockScheduler::enqueueCommand(&CmdA, Res, detail::NON_BLOCKING);
     EventStatusQueried = false;

     detail::MapMemObject CmdB{&AllocaCmd, MockReq, &MockHostPtr, QueueImplA,
                               access::mode::read_write};
-    (void)CmdB.addDep(detail::DepDesc{&DepCmd, &MockReq, nullptr});
+    (void)CmdB.addDep(detail::DepDesc{&DepCmd, &MockReq, nullptr}, ToCleanUp);
     MockScheduler::enqueueCommand(&CmdB, Res, detail::NON_BLOCKING);
     EXPECT_FALSE(EventStatusQueried);
   }
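A side note on the gmock changes running through these test files: `EXPECT_CALL(X, enqueue(_, _))` has to be rewritten every time `enqueue()` changes arity, while the matcher-free form `EXPECT_CALL(X, enqueue)` matches a call with any arguments. The mock definition that follows makes exactly this switch (MOCK_METHOD2 to MOCK_METHOD3, `(_, _)` dropped). A self-contained illustration with toy types, assuming a gmock version recent enough to support the matcher-free form:

```c++
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include <vector>

// Toy interface whose enqueue() recently grew a third parameter, like the
// scheduler's Command::enqueue in this patch.
struct ToyCommand {
  virtual ~ToyCommand() = default;
  virtual bool enqueue(int &Result, bool Blocking,
                       std::vector<int> &ToCleanUp) = 0;
};

struct MockToyCommand : ToyCommand {
  MOCK_METHOD3(enqueue, bool(int &, bool, std::vector<int> &));
};

TEST(MatcherFreeForm, SurvivesSignatureChanges) {
  MockToyCommand Cmd;
  // No (_, _, _) argument list: this expectation would not have needed an
  // update when the third parameter was added.
  EXPECT_CALL(Cmd, enqueue).Times(1);

  int Result = 0;
  std::vector<int> ToCleanUp;
  Cmd.enqueue(Result, /*Blocking=*/false, ToCleanUp);
}
```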
diff --git a/sycl/unittests/scheduler/SchedulerTestUtils.hpp b/sycl/unittests/scheduler/SchedulerTestUtils.hpp
index bd80f24820f8f..60f5cbf05e04a 100644
--- a/sycl/unittests/scheduler/SchedulerTestUtils.hpp
+++ b/sycl/unittests/scheduler/SchedulerTestUtils.hpp
@@ -10,6 +10,7 @@
 #include <CL/sycl.hpp>
 #include <detail/scheduler/commands.hpp>
 #include <detail/scheduler/scheduler.hpp>
+#include <detail/stream_impl.hpp>
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
@@ -38,9 +39,9 @@ class MockCommand : public cl::sycl::detail::Command {
                   cl::sycl::detail::Command::RUN_CG)
       : Command{Type, Queue}, MRequirement{std::move(Req)} {
     using namespace testing;
-    ON_CALL(*this, enqueue(_, _))
-        .WillByDefault(Invoke(this, &MockCommand::enqueueOrigin));
-    EXPECT_CALL(*this, enqueue(_, _)).Times(AnyNumber());
+    ON_CALL(*this, enqueue)
+        .WillByDefault(Invoke(this, &MockCommand::enqueueOrigin));
+    EXPECT_CALL(*this, enqueue).Times(AnyNumber());
   }

   MockCommand(cl::sycl::detail::QueueImplPtr Queue,
@@ -48,9 +49,9 @@ class MockCommand : public cl::sycl::detail::Command {
                   cl::sycl::detail::Command::RUN_CG)
       : Command{Type, Queue}, MRequirement{std::move(getMockRequirement())} {
     using namespace testing;
-    ON_CALL(*this, enqueue(_, _))
-        .WillByDefault(Invoke(this, &MockCommand::enqueueOrigin));
-    EXPECT_CALL(*this, enqueue(_, _)).Times(AnyNumber());
+    ON_CALL(*this, enqueue)
+        .WillByDefault(Invoke(this, &MockCommand::enqueueOrigin));
+    EXPECT_CALL(*this, enqueue).Times(AnyNumber());
   }

   void printDot(std::ostream &) const override {}
@@ -62,11 +63,14 @@ class MockCommand : public cl::sycl::detail::Command {

   cl_int enqueueImp() override { return MRetVal; }

-  MOCK_METHOD2(enqueue, bool(cl::sycl::detail::EnqueueResultT &,
-                             cl::sycl::detail::BlockingT));
+  MOCK_METHOD3(enqueue, bool(cl::sycl::detail::EnqueueResultT &,
+                             cl::sycl::detail::BlockingT,
+                             std::vector<cl::sycl::detail::Command *> &));
   bool enqueueOrigin(cl::sycl::detail::EnqueueResultT &EnqueueResult,
-                     cl::sycl::detail::BlockingT Blocking) {
-    return cl::sycl::detail::Command::enqueue(EnqueueResult, Blocking);
+                     cl::sycl::detail::BlockingT Blocking,
+                     std::vector<cl::sycl::detail::Command *> &ToCleanUp) {
+    return cl::sycl::detail::Command::enqueue(EnqueueResult, Blocking,
+                                              ToCleanUp);
   }

   cl_int MRetVal = CL_SUCCESS;
@@ -99,6 +103,10 @@ class MockCommandWithCallback : public MockCommand {

 class MockScheduler : public cl::sycl::detail::Scheduler {
 public:
+  using cl::sycl::detail::Scheduler::addCG;
+  using cl::sycl::detail::Scheduler::addCopyBack;
+  using cl::sycl::detail::Scheduler::cleanupCommands;
+
   cl::sycl::detail::MemObjRecord *
   getOrInsertMemObjRecord(const cl::sycl::detail::QueueImplPtr &Queue,
                           cl::sycl::detail::Requirement *Req,
@@ -106,6 +114,10 @@ class MockScheduler : public cl::sycl::detail::Scheduler {
     return MGraphBuilder.getOrInsertMemObjRecord(Queue, Req, ToEnqueue);
   }

+  void decrementLeafCountersForRecord(cl::sycl::detail::MemObjRecord *Rec) {
+    MGraphBuilder.decrementLeafCountersForRecord(Rec);
+  }
+
   void
   removeRecordForMemObj(cl::sycl::detail::SYCLMemObjI *MemObj) {
     MGraphBuilder.removeRecordForMemObj(MemObj);
   }
@@ -123,10 +135,19 @@ class MockScheduler : public cl::sycl::detail::Scheduler {
     return MGraphBuilder.addNodeToLeaves(Rec, Cmd, Mode, ToEnqueue);
   }

+  void updateLeaves(const std::set<cl::sycl::detail::Command *> &Cmds,
+                    cl::sycl::detail::MemObjRecord *Record,
+                    cl::sycl::access::mode AccessMode,
+                    std::vector<cl::sycl::detail::Command *> &ToCleanUp) {
+    return MGraphBuilder.updateLeaves(Cmds, Record, AccessMode, ToCleanUp);
+  }
+
   static bool enqueueCommand(cl::sycl::detail::Command *Cmd,
                              cl::sycl::detail::EnqueueResultT &EnqueueResult,
                              cl::sycl::detail::BlockingT Blocking) {
-    return GraphProcessor::enqueueCommand(Cmd, EnqueueResult, Blocking);
+    std::vector<cl::sycl::detail::Command *> ToCleanUp;
+    return GraphProcessor::enqueueCommand(Cmd, EnqueueResult, ToCleanUp,
+                                          Blocking);
   }

   cl::sycl::detail::AllocaCommandBase *
@@ -147,6 +168,29 @@ class MockScheduler : public cl::sycl::detail::Scheduler {
     return MGraphBuilder.insertMemoryMove(Record, Req, Queue, ToEnqueue);
   }

+  cl::sycl::detail::Command *
+  addCopyBack(cl::sycl::detail::Requirement *Req,
+              std::vector<cl::sycl::detail::Command *> &ToEnqueue) {
+    return MGraphBuilder.addCopyBack(Req, ToEnqueue);
+  }
+
+  cl::sycl::detail::UpdateHostRequirementCommand *
+  insertUpdateHostReqCmd(cl::sycl::detail::MemObjRecord *Record,
+                         cl::sycl::detail::Requirement *Req,
+                         const cl::sycl::detail::QueueImplPtr &Queue,
+                         std::vector<cl::sycl::detail::Command *> &ToEnqueue) {
+    return MGraphBuilder.insertUpdateHostReqCmd(Record, Req, Queue, ToEnqueue);
+  }
+
+  cl::sycl::detail::EmptyCommand *
+  addEmptyCmd(cl::sycl::detail::Command *Cmd,
+              const std::vector<cl::sycl::detail::Requirement *> &Reqs,
+              const cl::sycl::detail::QueueImplPtr &Queue,
+              cl::sycl::detail::Command::BlockReason Reason,
+              std::vector<cl::sycl::detail::Command *> &ToEnqueue) {
+    return MGraphBuilder.addEmptyCmd(Cmd, Reqs, Queue, Reason, ToEnqueue);
+  }
+
   cl::sycl::detail::Command *
   addCG(std::unique_ptr<cl::sycl::detail::CG> CommandGroup,
         cl::sycl::detail::QueueImplPtr Queue,
diff --git a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp
index c992919bab32b..478465603199a 100644
--- a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp
+++ b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp
@@ -9,10 +9,15 @@
 #include "SchedulerTest.hpp"
 #include "SchedulerTestUtils.hpp"

+#include <detail/config.hpp>
 #include <detail/stream_impl.hpp>
+#include <helpers/ScopedEnvVar.hpp>

 using namespace cl::sycl;

+inline constexpr auto DisablePostEnqueueCleanupName =
+    "SYCL_DISABLE_POST_ENQUEUE_CLEANUP";
+
 class MockHandler : public sycl::handler {
 public:
   MockHandler(std::shared_ptr<detail::queue_impl> Queue, bool IsHost)
@@ -91,6 +96,11 @@ static bool ValidateDepCommandsTree(const detail::Command *Cmd,
 }

 TEST_F(SchedulerTest, StreamInitDependencyOnHost) {
+  // Disable post enqueue cleanup so that it doesn't interfere with dependency
+  // checks.
+  unittest::ScopedEnvVar DisabledCleanup{
+      DisablePostEnqueueCleanupName, "1",
+      detail::SYCLConfig<detail::SYCL_DISABLE_POST_ENQUEUE_CLEANUP>::reset};
   cl::sycl::queue HQueue(host_selector{});
   detail::QueueImplPtr HQueueImpl = detail::getSyclObjImpl(HQueue);
diff --git a/sycl/unittests/scheduler/utils.cpp b/sycl/unittests/scheduler/utils.cpp
index b6bb23b4325d8..373b4572ecfb4 100644
--- a/sycl/unittests/scheduler/utils.cpp
+++ b/sycl/unittests/scheduler/utils.cpp
@@ -10,8 +10,10 @@
 void addEdge(cl::sycl::detail::Command *User, cl::sycl::detail::Command *Dep,
              cl::sycl::detail::AllocaCommandBase *Alloca) {
+  std::vector<cl::sycl::detail::Command *> ToCleanUp;
   (void)User->addDep(
-      cl::sycl::detail::DepDesc{Dep, User->getRequirement(), Alloca});
+      cl::sycl::detail::DepDesc{Dep, User->getRequirement(), Alloca},
+      ToCleanUp);
   Dep->addUser(User);
 }
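Taken together, the helper changes encode one contract worth spelling out: graph maintenance functions never delete commands inline, they only report candidates through `ToCleanUp` (the `addEdge` helper above simply discards them, which is safe for mocks owned by the test). A toy version of that contract, modeling only the behavior the tests assert: an already-enqueued command removed from the leaf list is handed back for cleanup, while a never-enqueued one is not (the real scheduler checks more conditions than this single flag):

```c++
#include <algorithm>
#include <vector>

struct ToyCmd {
  bool Enqueued = false;
};

// Removing a command from the leaf list must not free it: ownership is
// handed back through ToCleanUp so the caller can delete it once no graph
// lock is held. Commands that were never enqueued are not returned, which
// is why the leaf-limit check above expects ToCleanUp to stay empty.
void updateLeavesSketch(std::vector<ToyCmd *> &Leaves, ToyCmd *Cmd,
                        std::vector<ToyCmd *> &ToCleanUp) {
  auto It = std::find(Leaves.begin(), Leaves.end(), Cmd);
  if (It == Leaves.end())
    return;
  Leaves.erase(It);
  if (Cmd->Enqueued)
    ToCleanUp.push_back(Cmd);
}
```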