From de3532338981b86e2cc05ad339fc453cb1ddff53 Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Wed, 1 Dec 2021 18:33:28 +0300 Subject: [PATCH 01/23] [DO NOT MERGE] Graph cleanup experiments A very rough implementation of cleaning up command nodes after they're enqueued and stop being leaves, alloca commands excluded. Handles only a subset of cases. --- sycl/source/detail/event_impl.cpp | 2 - sycl/source/detail/scheduler/commands.cpp | 11 ++- sycl/source/detail/scheduler/commands.hpp | 2 +- .../source/detail/scheduler/graph_builder.cpp | 77 +++++++++++++++++- .../detail/scheduler/graph_processor.cpp | 12 +-- sycl/source/detail/scheduler/scheduler.cpp | 78 ++++++++++++++----- sycl/source/detail/scheduler/scheduler.hpp | 17 +++- sycl/unittests/scheduler/BlockedCommands.cpp | 20 ++--- .../scheduler/SchedulerTestUtils.hpp | 22 +++--- 9 files changed, 182 insertions(+), 59 deletions(-) diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index dbcf4284953c8..2ff167b223a08 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -235,8 +235,6 @@ void event_impl::wait_and_throw( void event_impl::cleanupCommand( std::shared_ptr Self) const { - if (MCommand && !SYCLConfig::get()) - detail::Scheduler::getInstance().cleanupFinishedCommands(std::move(Self)); } template <> diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index f74b85e229942..0de731da92e2c 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -260,8 +260,9 @@ class DispatchHostTask { // of empty command. // Also, it's possible to have record deallocated prior to enqueue process. // Thus we employ read-lock of graph. 
+ std::vector EnqueuedCmds; + Scheduler &Sched = Scheduler::getInstance(); { - Scheduler &Sched = Scheduler::getInstance(); Scheduler::ReadLockT Lock(Sched.MGraphLock); std::vector Deps = MThisCmd->MDeps; @@ -272,8 +273,10 @@ class DispatchHostTask { EmptyCmd->MEnqueueStatus = EnqueueResultT::SyclEnqueueReady; for (const DepDesc &Dep : Deps) - Scheduler::enqueueLeavesOfReqUnlocked(Dep.MDepRequirement); + Scheduler::enqueueLeavesOfReqUnlocked(Dep.MDepRequirement, + EnqueuedCmds); } + Sched.cleanupCommands(EnqueuedCmds); } }; @@ -614,7 +617,7 @@ void Command::emitInstrumentation(uint16_t Type, const char *Txt) { #endif } -bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking) { +bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking, std::vector &EnqueuedCommands) { // Exit if already enqueued if (MEnqueueStatus == EnqueueResultT::SyclEnqueueSuccess) return true; @@ -683,6 +686,8 @@ bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking) { // Consider the command is successfully enqueued if return code is // CL_SUCCESS MEnqueueStatus = EnqueueResultT::SyclEnqueueSuccess; + if (MLeafCounter == 0 && (!MDeps.empty() || !MUsers.empty())) + EnqueuedCommands.push_back(this); } // Emit this correlation signal before the task end diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index 4a556f8a5567e..8155b83d0743d 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -124,7 +124,7 @@ class Command { /// \param Blocking if this argument is true, function will wait for the /// command to be unblocked before calling enqueueImp. /// \return true if the command is enqueued. 
- virtual bool enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking); + virtual bool enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking, std::vector &EnqueuedCommands); bool isFinished(); diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 61f10b41845cd..d34eb805b6a16 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -194,6 +194,8 @@ MemObjRecord *Scheduler::GraphBuilder::getOrInsertMemObjRecord( ToEnqueue.push_back(ConnectionCmd); Dependency->addUser(Dependant); --(Dependency->MLeafCounter); + if (Dependency->MLeafCounter == 0 && Dependency->isSuccessfullyEnqueued()) + cleanupCommand(Dependency); }; const ContextImplPtr &InteropCtxPtr = Req->MSYCLMemObj->getInteropContext(); @@ -225,17 +227,25 @@ MemObjRecord *Scheduler::GraphBuilder::getOrInsertMemObjRecord( return MemObject->MRecord.get(); } -void Scheduler::GraphBuilder::updateLeaves(const std::set &Cmds, - MemObjRecord *Record, - access::mode AccessMode) { +void Scheduler::GraphBuilder::updateLeaves( + const std::set &Cmds, MemObjRecord *Record, + access::mode AccessMode, std::vector *CommandsToCleanUp) { const bool ReadOnlyReq = AccessMode == access::mode::read; if (ReadOnlyReq) return; for (Command *Cmd : Cmds) { + bool WasLeaf = Cmd->MLeafCounter > 0; Cmd->MLeafCounter -= Record->MReadLeaves.remove(Cmd); Cmd->MLeafCounter -= Record->MWriteLeaves.remove(Cmd); + if (Cmd->MLeafCounter == 0 && Cmd->isSuccessfullyEnqueued()) { + if (CommandsToCleanUp) { + if (WasLeaf) + CommandsToCleanUp->push_back(Cmd); + } else + cleanupCommand(Cmd); + } } } @@ -963,14 +973,23 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr CommandGroup, // Node dependencies can be modified further when adding the node to leaves, // iterate over their copy. 
// FIXME employ a reference here to eliminate copying of a vector + // Updating leaves might also clean up some of the dep commands, so update + // their users first. + // FIXME there's probably a better way of handling cleanup & leaf/dep update + // here considering that some of the updated might be destroyed by cleanup + // immediately after. std::vector Deps = NewCmd->MDeps; + std::vector CommandsToCleanUp; for (DepDesc &Dep : Deps) { Dep.MDepCommand->addUser(NewCmd.get()); const Requirement *Req = Dep.MDepRequirement; MemObjRecord *Record = getMemObjRecord(Req->MSYCLMemObj); - updateLeaves({Dep.MDepCommand}, Record, Req->MAccessMode); + updateLeaves({Dep.MDepCommand}, Record, Req->MAccessMode, + &CommandsToCleanUp); addNodeToLeaves(Record, NewCmd.get(), Req->MAccessMode, ToEnqueue); } + for (Command *Cmd : CommandsToCleanUp) + cleanupCommand(Cmd); // Register all the events as dependencies for (detail::EventImplPtr e : Events) { @@ -993,9 +1012,13 @@ void Scheduler::GraphBuilder::decrementLeafCountersForRecord( MemObjRecord *Record) { for (Command *Cmd : Record->MReadLeaves) { --(Cmd->MLeafCounter); + if (Cmd->MLeafCounter == 0 && Cmd->isSuccessfullyEnqueued()) + cleanupCommand(Cmd); } for (Command *Cmd : Record->MWriteLeaves) { --(Cmd->MLeafCounter); + if (Cmd->MLeafCounter == 0 && Cmd->isSuccessfullyEnqueued()) + cleanupCommand(Cmd); } } @@ -1096,6 +1119,52 @@ void Scheduler::GraphBuilder::cleanupCommandsForRecord( handleVisitedNodes(MVisitedCmds); } + +void Scheduler::GraphBuilder::cleanupCommand(Command *Cmd) { + if (SYCLConfig::get()) + return; + assert(Cmd->MLeafCounter == 0 && Cmd->isSuccessfullyEnqueued()); + // Isolated command nodes are cleaned up by scheduler instead. + assert(Cmd->MDeps.size() != 0 || Cmd->MUsers.size() != 0); + Command::CommandType CmdT = Cmd->getType(); + // Allocas have to be kept alive until memory objects are released. 
+ if (CmdT == Command::ALLOCA || CmdT == Command::ALLOCA_SUB_BUF) + return; + + // FIXME handle host tasks + if (CmdT == Command::RUN_CG) { + auto *ExecCGCmd = static_cast(Cmd); + if (ExecCGCmd->getCG().getType() == CG::CGTYPE::CodeplayHostTask) { + return; + } + } + assert(CmdT != Command::ALLOCA && CmdT != Command::ALLOCA_SUB_BUF); + + for (Command *UserCmd : Cmd->MUsers) { + for (DepDesc &Dep : UserCmd->MDeps) { + // Link the users of the command to the alloca command(s) instead + if (Dep.MDepCommand == Cmd) { + // ... unless the user is the alloca itself. + if (Dep.MAllocaCmd == UserCmd) { + Dep.MDepCommand = nullptr; + } + else { + Dep.MDepCommand = Dep.MAllocaCmd; + Dep.MDepCommand->MUsers.insert(UserCmd); + } + } + } + } + // Update dependency users + for (DepDesc &Dep : Cmd->MDeps) { + Command *DepCmd = Dep.MDepCommand; + DepCmd->MUsers.erase(Cmd); + } + Cmd->getEvent()->setCommand(nullptr); + Cmd->getEvent()->cleanupDependencyEvents(); + delete Cmd; +} + void Scheduler::GraphBuilder::cleanupFinishedCommands( Command *FinishedCmd, std::vector> &StreamsToDeallocate) { diff --git a/sycl/source/detail/scheduler/graph_processor.cpp b/sycl/source/detail/scheduler/graph_processor.cpp index 69dbb626f9dc2..ca7ea37cd27a5 100644 --- a/sycl/source/detail/scheduler/graph_processor.cpp +++ b/sycl/source/detail/scheduler/graph_processor.cpp @@ -23,7 +23,7 @@ static Command *getCommand(const EventImplPtr &Event) { void Scheduler::GraphProcessor::waitForEvent(EventImplPtr Event, ReadLockT &GraphReadLock, - bool LockTheLock) { + std::vector &EnqueuedCmds, bool LockTheLock) { Command *Cmd = getCommand(Event); // Command can be nullptr if user creates cl::sycl::event explicitly or the // event has been waited on by another thread @@ -31,7 +31,7 @@ void Scheduler::GraphProcessor::waitForEvent(EventImplPtr Event, return; EnqueueResultT Res; - bool Enqueued = enqueueCommand(Cmd, Res, BLOCKING); + bool Enqueued = enqueueCommand(Cmd, Res, EnqueuedCmds, BLOCKING); if (!Enqueued && 
EnqueueResultT::SyclEnqueueFailed == Res.MResult) // TODO: Reschedule commands. throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION); @@ -47,7 +47,7 @@ void Scheduler::GraphProcessor::waitForEvent(EventImplPtr Event, bool Scheduler::GraphProcessor::enqueueCommand(Command *Cmd, EnqueueResultT &EnqueueResult, - BlockingT Blocking) { + std::vector &EnqueuedCommands, BlockingT Blocking) { if (!Cmd || Cmd->isSuccessfullyEnqueued()) return true; @@ -60,7 +60,7 @@ bool Scheduler::GraphProcessor::enqueueCommand(Command *Cmd, // Recursively enqueue all the dependencies first and // exit immediately if any of the commands cannot be enqueued. for (DepDesc &Dep : Cmd->MDeps) { - if (!enqueueCommand(Dep.MDepCommand, EnqueueResult, Blocking)) + if (!enqueueCommand(Dep.MDepCommand, EnqueueResult, EnqueuedCommands, Blocking)) return false; } @@ -76,7 +76,7 @@ bool Scheduler::GraphProcessor::enqueueCommand(Command *Cmd, // implemented. for (const EventImplPtr &Event : Cmd->getPreparedHostDepsEvents()) { if (Command *DepCmd = static_cast(Event->getCommand())) - if (!enqueueCommand(DepCmd, EnqueueResult, Blocking)) + if (!enqueueCommand(DepCmd, EnqueueResult, EnqueuedCommands, Blocking)) return false; } @@ -93,7 +93,7 @@ bool Scheduler::GraphProcessor::enqueueCommand(Command *Cmd, // on completion of C and starts cleanup process. This thread is still in the // middle of enqueue of B. The other thread modifies dependency list of A by // removing C out of it. Iterators become invalid. 
- return Cmd->enqueue(EnqueueResult, Blocking); + return Cmd->enqueue(EnqueueResult, Blocking, EnqueuedCommands); } } // namespace detail diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index c17beb8a3621d..606cc1b48ac55 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -32,31 +32,32 @@ void Scheduler::waitForRecordToFinish(MemObjRecord *Record, // Will contain the list of dependencies for the Release Command std::set DepCommands; #endif + std::vector EnqueuedCmds; for (Command *Cmd : Record->MReadLeaves) { EnqueueResultT Res; - bool Enqueued = GraphProcessor::enqueueCommand(Cmd, Res); + bool Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, EnqueuedCmds); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION); #ifdef XPTI_ENABLE_INSTRUMENTATION // Capture the dependencies DepCommands.insert(Cmd); #endif - GraphProcessor::waitForEvent(Cmd->getEvent(), GraphReadLock); + GraphProcessor::waitForEvent(Cmd->getEvent(), GraphReadLock, EnqueuedCmds); } for (Command *Cmd : Record->MWriteLeaves) { EnqueueResultT Res; - bool Enqueued = GraphProcessor::enqueueCommand(Cmd, Res); + bool Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, EnqueuedCmds); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION); #ifdef XPTI_ENABLE_INSTRUMENTATION DepCommands.insert(Cmd); #endif - GraphProcessor::waitForEvent(Cmd->getEvent(), GraphReadLock); + GraphProcessor::waitForEvent(Cmd->getEvent(), GraphReadLock, EnqueuedCmds); } for (AllocaCommandBase *AllocaCmd : Record->MAllocaCommands) { Command *ReleaseCmd = AllocaCmd->getReleaseCmd(); EnqueueResultT Res; - bool Enqueued = GraphProcessor::enqueueCommand(ReleaseCmd, Res); + bool Enqueued = GraphProcessor::enqueueCommand(ReleaseCmd, Res, EnqueuedCmds); if (!Enqueued && 
EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION); #ifdef XPTI_ENABLE_INSTRUMENTATION @@ -64,7 +65,7 @@ void Scheduler::waitForRecordToFinish(MemObjRecord *Record, // reported as edges ReleaseCmd->resolveReleaseDependencies(DepCommands); #endif - GraphProcessor::waitForEvent(ReleaseCmd->getEvent(), GraphReadLock); + GraphProcessor::waitForEvent(ReleaseCmd->getEvent(), GraphReadLock, EnqueuedCmds); } } @@ -108,6 +109,7 @@ EventImplPtr Scheduler::addCG(std::unique_ptr CommandGroup, NewEvent = NewCmd->getEvent(); } + std::vector EnqueuedCmds; { ReadLockT Lock(MGraphLock); @@ -127,7 +129,7 @@ EventImplPtr Scheduler::addCG(std::unique_ptr CommandGroup, }; for (Command *Cmd : AuxiliaryCmds) { - Enqueued = GraphProcessor::enqueueCommand(Cmd, Res); + Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, EnqueuedCmds); try { if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Auxiliary enqueue process failed.", @@ -144,7 +146,7 @@ EventImplPtr Scheduler::addCG(std::unique_ptr CommandGroup, // TODO: Check if lazy mode. EnqueueResultT Res; try { - bool Enqueued = GraphProcessor::enqueueCommand(NewCmd, Res); + bool Enqueued = GraphProcessor::enqueueCommand(NewCmd, Res, EnqueuedCmds); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION); } catch (...) 
{ @@ -161,6 +163,7 @@ EventImplPtr Scheduler::addCG(std::unique_ptr CommandGroup, CleanUp(); } } + cleanupCommands(EnqueuedCmds); for (auto StreamImplPtr : Streams) { StreamImplPtr->flush(); @@ -182,23 +185,25 @@ EventImplPtr Scheduler::addCopyBack(Requirement *Req) { return nullptr; } + std::vector EnqueuedCmds; try { ReadLockT Lock(MGraphLock); EnqueueResultT Res; bool Enqueued; for (Command *Cmd : AuxiliaryCmds) { - Enqueued = GraphProcessor::enqueueCommand(Cmd, Res); + Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, EnqueuedCmds); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION); } - Enqueued = GraphProcessor::enqueueCommand(NewCmd, Res); + Enqueued = GraphProcessor::enqueueCommand(NewCmd, Res, EnqueuedCmds); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION); } catch (...) { NewCmd->getQueue()->reportAsyncException(std::current_exception()); } + cleanupCommands(EnqueuedCmds); return NewCmd->getEvent(); } @@ -210,7 +215,9 @@ void Scheduler::waitForEvent(EventImplPtr Event) { ReadLockT Lock(MGraphLock); // It's fine to leave the lock unlocked upon return from waitForEvent as // there's no more actions to do here with graph - GraphProcessor::waitForEvent(std::move(Event), Lock, /*LockTheLock=*/false); + std::vector EnqueuedCmds; + GraphProcessor::waitForEvent(std::move(Event), Lock, EnqueuedCmds, /*LockTheLock=*/false); + cleanupCommands(EnqueuedCmds); } static void deallocateStreams( @@ -293,44 +300,52 @@ EventImplPtr Scheduler::addHostAccessor(Requirement *Req) { if (!NewCmd) return nullptr; + std::vector EnqueuedCmds; { ReadLockT ReadLock(MGraphLock); EnqueueResultT Res; bool Enqueued; for (Command *Cmd : AuxiliaryCmds) { - Enqueued = GraphProcessor::enqueueCommand(Cmd, Res); + Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, EnqueuedCmds); if (!Enqueued && 
EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION); } - Enqueued = GraphProcessor::enqueueCommand(NewCmd, Res); + Enqueued = GraphProcessor::enqueueCommand(NewCmd, Res, EnqueuedCmds); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION); } + cleanupCommands(EnqueuedCmds); return NewCmd->getEvent(); } void Scheduler::releaseHostAccessor(Requirement *Req) { Command *const BlockedCmd = Req->MBlockedCmd; - ReadLockT Lock(MGraphLock); + std::vector EnqueuedCmds; + { + ReadLockT Lock(MGraphLock); - assert(BlockedCmd && "Can't find appropriate command to unblock"); + assert(BlockedCmd && "Can't find appropriate command to unblock"); - BlockedCmd->MEnqueueStatus = EnqueueResultT::SyclEnqueueReady; + BlockedCmd->MEnqueueStatus = EnqueueResultT::SyclEnqueueReady; - enqueueLeavesOfReqUnlocked(Req); + enqueueLeavesOfReqUnlocked(Req, EnqueuedCmds); + } + cleanupCommands(EnqueuedCmds); } // static -void Scheduler::enqueueLeavesOfReqUnlocked(const Requirement *const Req) { +void Scheduler::enqueueLeavesOfReqUnlocked( + const Requirement *const Req, std::vector &EnqueuedCmds) { + //FIXME handle this as well MemObjRecord *Record = Req->MSYCLMemObj->MRecord.get(); - auto EnqueueLeaves = [](LeavesCollection &Leaves) { + auto EnqueueLeaves = [&EnqueuedCmds](LeavesCollection &Leaves) { for (Command *Cmd : Leaves) { EnqueueResultT Res; - bool Enqueued = GraphProcessor::enqueueCommand(Cmd, Res); + bool Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, EnqueuedCmds); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION); } @@ -406,6 +421,29 @@ MemObjRecord *Scheduler::getMemObjRecord(const Requirement *const Req) { return Req->MSYCLMemObj->MRecord.get(); } +void Scheduler::cleanupCommands(std::vector &Cmds) { + WriteLockT Lock(MGraphLock, std::try_to_lock); 
+ // In order to avoid deadlocks related to block commands, defer cleanup if the + // lock wasn't acquired. + if (Lock.owns_lock()) { + for (Command *Cmd : Cmds) { + MGraphBuilder.cleanupCommand(Cmd); + } + std::vector DeferredCleanupCommands; + { + std::lock_guard Lock{MDeferredCleanupMutex}; + std::swap(DeferredCleanupCommands, MDeferredCleanupCommands); + } + for (Command *Cmd : DeferredCleanupCommands) { + MGraphBuilder.cleanupCommand(Cmd); + } + } else { + std::lock_guard Lock{MDeferredCleanupMutex}; + MDeferredCleanupCommands.insert(MDeferredCleanupCommands.end(), + Cmds.begin(), Cmds.end()); + } +} + } // namespace detail } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index 66e17d7862301..3ddbefc856d87 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -459,7 +459,10 @@ class Scheduler { /// \param Lock is an instance of WriteLockT, created with \c std::defer_lock void acquireWriteLock(WriteLockT &Lock); - static void enqueueLeavesOfReqUnlocked(const Requirement *const Req); + void cleanupCommands(std::vector &Cmds); + + static void enqueueLeavesOfReqUnlocked(const Requirement *const Req, + std::vector &EnqueuedCmds); /// Graph builder class. /// @@ -505,6 +508,8 @@ class Scheduler { /// with Event passed and its dependencies. void optimize(EventImplPtr Event); + void cleanupCommand(Command *Cmd); + /// Removes finished non-leaf non-alloca commands from the subgraph /// (assuming that all its commands have been waited for). void cleanupFinishedCommands( @@ -547,7 +552,8 @@ class Scheduler { /// Removes commands from leaves. 
void updateLeaves(const std::set &Cmds, MemObjRecord *Record, - access::mode AccessMode); + access::mode AccessMode, + std::vector *CommandsToCleanUp = nullptr); /// Perform connection of events in multiple contexts /// \param Cmd dependant command @@ -724,7 +730,7 @@ class Scheduler { /// /// The function may unlock and lock GraphReadLock as needed. Upon return /// the lock is left in locked state if and only if LockTheLock is true. - static void waitForEvent(EventImplPtr Event, ReadLockT &GraphReadLock, + static void waitForEvent(EventImplPtr Event, ReadLockT &GraphReadLock, std::vector &EnqueueCommands, bool LockTheLock = true); /// Enqueues the command and all its dependencies. @@ -735,7 +741,7 @@ class Scheduler { /// The function may unlock and lock GraphReadLock as needed. Upon return /// the lock is left in locked state. static bool enqueueCommand(Command *Cmd, EnqueueResultT &EnqueueResult, - BlockingT Blocking = NON_BLOCKING); + std::vector &EnqueuedCommands, BlockingT Blocking = NON_BLOCKING); }; /// This function waits on all of the graph leaves which somehow use the @@ -751,6 +757,9 @@ class Scheduler { GraphBuilder MGraphBuilder; RWLockT MGraphLock; + std::vector MDeferredCleanupCommands; + std::mutex MDeferredCleanupMutex; + QueueImplPtr DefaultHostQueue; friend class Command; diff --git a/sycl/unittests/scheduler/BlockedCommands.cpp b/sycl/unittests/scheduler/BlockedCommands.cpp index 967b3ee75531c..9ff6fce1460ef 100644 --- a/sycl/unittests/scheduler/BlockedCommands.cpp +++ b/sycl/unittests/scheduler/BlockedCommands.cpp @@ -81,10 +81,10 @@ TEST_F(SchedulerTest, DontEnqueueDepsIfOneOfThemIsBlocked) { // // If C is blocked, we should not try to enqueue D. 
- EXPECT_CALL(A, enqueue(_, _)).Times(0); - EXPECT_CALL(B, enqueue(_, _)).Times(1); - EXPECT_CALL(C, enqueue(_, _)).Times(0); - EXPECT_CALL(D, enqueue(_, _)).Times(0); + EXPECT_CALL(A, enqueue).Times(0); + EXPECT_CALL(B, enqueue).Times(1); + EXPECT_CALL(C, enqueue).Times(0); + EXPECT_CALL(D, enqueue).Times(0); MockScheduler MS; auto Lock = MS.acquireGraphReadLock(); @@ -113,8 +113,8 @@ TEST_F(SchedulerTest, EnqueueBlockedCommandEarlyExit) { // // If A is blocked, we should not try to enqueue B. - EXPECT_CALL(A, enqueue(_, _)).Times(0); - EXPECT_CALL(B, enqueue(_, _)).Times(0); + EXPECT_CALL(A, enqueue).Times(0); + EXPECT_CALL(B, enqueue).Times(0); MockScheduler MS; auto Lock = MS.acquireGraphReadLock(); @@ -127,8 +127,8 @@ TEST_F(SchedulerTest, EnqueueBlockedCommandEarlyExit) { // But if the enqueue type is blocking we should not exit early. - EXPECT_CALL(A, enqueue(_, _)).Times(0); - EXPECT_CALL(B, enqueue(_, _)).Times(1); + EXPECT_CALL(A, enqueue).Times(0); + EXPECT_CALL(B, enqueue).Times(1); Enqueued = MockScheduler::enqueueCommand(&A, Res, detail::BLOCKING); ASSERT_FALSE(Enqueued) << "Blocked command should not be enqueued\n"; @@ -166,8 +166,8 @@ TEST_F(SchedulerTest, EnqueueHostDependency) { // "Graph" is quoted as we don't have this dependency in MDeps. Instead, we // have this dependecy as result of handler::depends_on() call. 
- EXPECT_CALL(A, enqueue(_, _)).Times(1); - EXPECT_CALL(B, enqueue(_, _)).Times(1); + EXPECT_CALL(A, enqueue).Times(1); + EXPECT_CALL(B, enqueue).Times(1); MockScheduler MS; auto Lock = MS.acquireGraphReadLock(); diff --git a/sycl/unittests/scheduler/SchedulerTestUtils.hpp b/sycl/unittests/scheduler/SchedulerTestUtils.hpp index e918951108488..1a2fe05e16254 100644 --- a/sycl/unittests/scheduler/SchedulerTestUtils.hpp +++ b/sycl/unittests/scheduler/SchedulerTestUtils.hpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -38,9 +39,9 @@ class MockCommand : public cl::sycl::detail::Command { cl::sycl::detail::Command::RUN_CG) : Command{Type, Queue}, MRequirement{std::move(Req)} { using namespace testing; - ON_CALL(*this, enqueue(_, _)) + ON_CALL(*this, enqueue) .WillByDefault(Invoke(this, &MockCommand::enqueueOrigin)); - EXPECT_CALL(*this, enqueue(_, _)).Times(AnyNumber()); + EXPECT_CALL(*this, enqueue).Times(AnyNumber()); } MockCommand(cl::sycl::detail::QueueImplPtr Queue, @@ -48,9 +49,9 @@ class MockCommand : public cl::sycl::detail::Command { cl::sycl::detail::Command::RUN_CG) : Command{Type, Queue}, MRequirement{std::move(getMockRequirement())} { using namespace testing; - ON_CALL(*this, enqueue(_, _)) + ON_CALL(*this, enqueue) .WillByDefault(Invoke(this, &MockCommand::enqueueOrigin)); - EXPECT_CALL(*this, enqueue(_, _)).Times(AnyNumber()); + EXPECT_CALL(*this, enqueue).Times(AnyNumber()); } void printDot(std::ostream &) const override {} @@ -62,11 +63,13 @@ class MockCommand : public cl::sycl::detail::Command { cl_int enqueueImp() override { return MRetVal; } - MOCK_METHOD2(enqueue, bool(cl::sycl::detail::EnqueueResultT &, - cl::sycl::detail::BlockingT)); + MOCK_METHOD3(enqueue, bool(cl::sycl::detail::EnqueueResultT &, + cl::sycl::detail::BlockingT, + std::vector &)); bool enqueueOrigin(cl::sycl::detail::EnqueueResultT &EnqueueResult, - cl::sycl::detail::BlockingT Blocking) { - return cl::sycl::detail::Command::enqueue(EnqueueResult, 
Blocking); + cl::sycl::detail::BlockingT Blocking, + std::vector &EnqueuedCmds) { + return cl::sycl::detail::Command::enqueue(EnqueueResult, Blocking, EnqueuedCmds); } cl_int MRetVal = CL_SUCCESS; @@ -124,7 +127,8 @@ class MockScheduler : public cl::sycl::detail::Scheduler { static bool enqueueCommand(cl::sycl::detail::Command *Cmd, cl::sycl::detail::EnqueueResultT &EnqueueResult, cl::sycl::detail::BlockingT Blocking) { - return GraphProcessor::enqueueCommand(Cmd, EnqueueResult, Blocking); + std::vector EnqueuedCmds; + return GraphProcessor::enqueueCommand(Cmd, EnqueueResult, EnqueuedCmds, Blocking); } cl::sycl::detail::AllocaCommandBase * From 0651e70c790da3e16f3eb64dfc4e2fe440bcb075 Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Thu, 9 Dec 2021 19:10:38 +0300 Subject: [PATCH 02/23] Fix read after free --- sycl/source/detail/scheduler/scheduler.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index 606cc1b48ac55..f963a87ce0676 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -203,8 +203,9 @@ EventImplPtr Scheduler::addCopyBack(Requirement *Req) { } catch (...) 
{ NewCmd->getQueue()->reportAsyncException(std::current_exception()); } + EventImplPtr NewEvent = NewCmd->getEvent(); cleanupCommands(EnqueuedCmds); - return NewCmd->getEvent(); + return NewEvent; } Scheduler &Scheduler::getInstance() { @@ -317,8 +318,9 @@ EventImplPtr Scheduler::addHostAccessor(Requirement *Req) { throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION); } + EventImplPtr NewEvent = NewCmd->getEvent(); cleanupCommands(EnqueuedCmds); - return NewCmd->getEvent(); + return NewEvent; } void Scheduler::releaseHostAccessor(Requirement *Req) { From ef4b476603725bdbed6834c2c6017d1d3ca5c89c Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Mon, 13 Dec 2021 14:41:10 +0300 Subject: [PATCH 03/23] Fix post-enqueue and graph traversal cleanup conflict --- sycl/source/detail/scheduler/commands.cpp | 5 ++++- sycl/source/detail/scheduler/commands.hpp | 4 ++++ sycl/source/detail/scheduler/graph_builder.cpp | 15 +++++++++------ sycl/source/detail/scheduler/scheduler.cpp | 3 +++ 4 files changed, 20 insertions(+), 7 deletions(-) diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 0de731da92e2c..390f3e10c7288 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -686,8 +686,11 @@ bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking, std::ve // Consider the command is successfully enqueued if return code is // CL_SUCCESS MEnqueueStatus = EnqueueResultT::SyclEnqueueSuccess; - if (MLeafCounter == 0 && (!MDeps.empty() || !MUsers.empty())) + if (MLeafCounter == 0 && (!MDeps.empty() || !MUsers.empty())) { + assert(!MPostEnqueueCleanup); + MPostEnqueueCleanup = true; EnqueuedCommands.push_back(this); + } } // Emit this correlation signal before the task end diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index 8155b83d0743d..875d96414edb5 100644 --- 
a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -297,6 +297,10 @@ class Command { // By default the flag is set to true due to most of host operations are // synchronous. The only asynchronous operation currently is host-task. bool MShouldCompleteEventIfPossible = true; + + /// Indicates that the node will be freed by cleanup after enqueue. Such nodes + /// should be ignored by other cleanup mechanisms. + bool MPostEnqueueCleanup = false; }; /// The empty command does nothing during enqueue. The task can be used to diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index d34eb805b6a16..56174e7dda78c 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -1111,8 +1111,11 @@ void Scheduler::GraphBuilder::cleanupCommandsForRecord( // If all dependencies have been removed this way, mark the command for // deletion if (Cmd->MDeps.empty()) { - Cmd->MMarks.MToBeDeleted = true; Cmd->MUsers.clear(); + // Do not delete the node if it's scheduled for post-enqueue cleanup to + // avoid double free. + if (!Cmd->MPostEnqueueCleanup) + Cmd->MMarks.MToBeDeleted = true; } } @@ -1124,21 +1127,18 @@ void Scheduler::GraphBuilder::cleanupCommand(Command *Cmd) { if (SYCLConfig::get()) return; assert(Cmd->MLeafCounter == 0 && Cmd->isSuccessfullyEnqueued()); - // Isolated command nodes are cleaned up by scheduler instead. - assert(Cmd->MDeps.size() != 0 || Cmd->MUsers.size() != 0); Command::CommandType CmdT = Cmd->getType(); // Allocas have to be kept alive until memory objects are released. if (CmdT == Command::ALLOCA || CmdT == Command::ALLOCA_SUB_BUF) return; - // FIXME handle host tasks + // TODO enable cleaning up host tasks after enqueue. 
if (CmdT == Command::RUN_CG) { auto *ExecCGCmd = static_cast(Cmd); if (ExecCGCmd->getCG().getType() == CG::CGTYPE::CodeplayHostTask) { return; } } - assert(CmdT != Command::ALLOCA && CmdT != Command::ALLOCA_SUB_BUF); for (Command *UserCmd : Cmd->MUsers) { for (DepDesc &Dep : UserCmd->MDeps) { @@ -1217,7 +1217,10 @@ void Scheduler::GraphBuilder::cleanupFinishedCommands( DepCmd->MUsers.erase(Cmd); } - Cmd->MMarks.MToBeDeleted = true; + // Do not delete the node if it's scheduled for post-enqueue cleanup to + // avoid double free. + if (!Cmd->MPostEnqueueCleanup) + Cmd->MMarks.MToBeDeleted = true; } handleVisitedNodes(MVisitedCmds); } diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index f963a87ce0676..0b5bc1f2c4a1f 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -424,6 +424,8 @@ MemObjRecord *Scheduler::getMemObjRecord(const Requirement *const Req) { } void Scheduler::cleanupCommands(std::vector &Cmds) { + if (Cmds.empty()) + return; WriteLockT Lock(MGraphLock, std::try_to_lock); // In order to avoid deadlocks related to block commands, defer cleanup if the // lock wasn't acquired. 
@@ -439,6 +441,7 @@ void Scheduler::cleanupCommands(std::vector &Cmds) { for (Command *Cmd : DeferredCleanupCommands) { MGraphBuilder.cleanupCommand(Cmd); } + } else { std::lock_guard Lock{MDeferredCleanupMutex}; MDeferredCleanupCommands.insert(MDeferredCleanupCommands.end(), From b0ce766c7926d32a3e7efe3ca28e75d786e11c50 Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Mon, 13 Dec 2021 16:52:21 +0300 Subject: [PATCH 04/23] Disable post enqueue cleanup for unsupported commands --- sycl/include/CL/sycl/detail/cg.hpp | 1 + sycl/source/detail/scheduler/commands.cpp | 16 ++++++++++++- sycl/source/detail/scheduler/commands.hpp | 7 ++++++ .../source/detail/scheduler/graph_builder.cpp | 24 +++++++++++++------ 4 files changed, 40 insertions(+), 8 deletions(-) diff --git a/sycl/include/CL/sycl/detail/cg.hpp b/sycl/include/CL/sycl/detail/cg.hpp index 170c0f39906c3..f4938aa43120d 100644 --- a/sycl/include/CL/sycl/detail/cg.hpp +++ b/sycl/include/CL/sycl/detail/cg.hpp @@ -290,6 +290,7 @@ class CGExecKernel : public CG { } void clearStreams() { MStreams.clear(); } + bool hasStreams() { return !MStreams.empty(); } }; /// "Copy memory" command group class. 
diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 390f3e10c7288..27ff150b467c4 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -565,6 +565,11 @@ const QueueImplPtr &Command::getWorkerQueue() const { return MQueue; } bool Command::producesPiEvent() const { return true; } +bool Command::supportsPostEnqueueCleanup() const { + // Isolated commands are cleaned up separately + return !MUsers.empty() || !MDeps.empty(); +} + Command *Command::addDep(DepDesc NewDep) { Command *ConnectionCmd = nullptr; @@ -686,7 +691,7 @@ bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking, std::ve // Consider the command is successfully enqueued if return code is // CL_SUCCESS MEnqueueStatus = EnqueueResultT::SyclEnqueueSuccess; - if (MLeafCounter == 0 && (!MDeps.empty() || !MUsers.empty())) { + if (MLeafCounter == 0 && supportsPostEnqueueCleanup()) { assert(!MPostEnqueueCleanup); MPostEnqueueCleanup = true; EnqueuedCommands.push_back(this); @@ -783,6 +788,8 @@ void AllocaCommandBase::emitInstrumentationData() { bool AllocaCommandBase::producesPiEvent() const { return false; } +bool AllocaCommandBase::supportsPostEnqueueCleanup() const { return false; } + AllocaCommand::AllocaCommand(QueueImplPtr Queue, Requirement Req, bool InitFromUserData, AllocaCommandBase *LinkedAllocaCmd) @@ -1049,6 +1056,8 @@ void ReleaseCommand::printDot(std::ostream &Stream) const { bool ReleaseCommand::producesPiEvent() const { return false; } +bool ReleaseCommand::supportsPostEnqueueCleanup() const { return false; } + MapMemObject::MapMemObject(AllocaCommandBase *SrcAllocaCmd, Requirement Req, void **DstPtr, QueueImplPtr Queue, access::mode MapMode) @@ -2333,6 +2342,11 @@ bool ExecCGCommand::producesPiEvent() const { return MCommandGroup->getType() != CG::CGTYPE::CodeplayHostTask; } +bool ExecCGCommand::supportsPostEnqueueCleanup() const { + // TODO enable cleaning up host task 
commands and kernels with streams after enqueue + return Command::supportsPostEnqueueCleanup() && (MCommandGroup->getType() != CG::CGTYPE::CodeplayHostTask) && (MCommandGroup->getType() != CG::CGTYPE::Kernel || !(static_cast(MCommandGroup.get()))->hasStreams()); +} + } // namespace detail } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index 875d96414edb5..d79891b02b188 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -194,6 +194,9 @@ class Command { /// Returns true iff the command produces a PI event on non-host devices. virtual bool producesPiEvent() const; + /// Returns true iff this command can be freed by post enqueue cleanup. + virtual bool supportsPostEnqueueCleanup() const; + protected: QueueImplPtr MQueue; QueueImplPtr MSubmittedQueue; @@ -336,6 +339,7 @@ class ReleaseCommand : public Command { void printDot(std::ostream &Stream) const final; void emitInstrumentationData() override; bool producesPiEvent() const final; + bool supportsPostEnqueueCleanup() const final; private: cl_int enqueueImp() final; @@ -362,6 +366,8 @@ class AllocaCommandBase : public Command { bool producesPiEvent() const final; + bool supportsPostEnqueueCleanup() const final; + void *MMemAllocation = nullptr; /// Alloca command linked with current command. 
@@ -545,6 +551,7 @@ class ExecCGCommand : public Command { bool producesPiEvent() const final; + bool supportsPostEnqueueCleanup() const final; private: cl_int enqueueImp() final; diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 56174e7dda78c..23c7962cccc41 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -194,7 +194,7 @@ MemObjRecord *Scheduler::GraphBuilder::getOrInsertMemObjRecord( ToEnqueue.push_back(ConnectionCmd); Dependency->addUser(Dependant); --(Dependency->MLeafCounter); - if (Dependency->MLeafCounter == 0 && Dependency->isSuccessfullyEnqueued()) + if (Dependency->MLeafCounter == 0 && Dependency->isSuccessfullyEnqueued() && Dependency->supportsPostEnqueueCleanup()) cleanupCommand(Dependency); }; @@ -239,7 +239,7 @@ void Scheduler::GraphBuilder::updateLeaves( bool WasLeaf = Cmd->MLeafCounter > 0; Cmd->MLeafCounter -= Record->MReadLeaves.remove(Cmd); Cmd->MLeafCounter -= Record->MWriteLeaves.remove(Cmd); - if (Cmd->MLeafCounter == 0 && Cmd->isSuccessfullyEnqueued()) { + if (Cmd->MLeafCounter == 0 && Cmd->isSuccessfullyEnqueued() && Cmd->supportsPostEnqueueCleanup()) { if (CommandsToCleanUp) { if (WasLeaf) CommandsToCleanUp->push_back(Cmd); @@ -1012,12 +1012,12 @@ void Scheduler::GraphBuilder::decrementLeafCountersForRecord( MemObjRecord *Record) { for (Command *Cmd : Record->MReadLeaves) { --(Cmd->MLeafCounter); - if (Cmd->MLeafCounter == 0 && Cmd->isSuccessfullyEnqueued()) + if (Cmd->MLeafCounter == 0 && Cmd->isSuccessfullyEnqueued() && Cmd->supportsPostEnqueueCleanup()) cleanupCommand(Cmd); } for (Command *Cmd : Record->MWriteLeaves) { --(Cmd->MLeafCounter); - if (Cmd->MLeafCounter == 0 && Cmd->isSuccessfullyEnqueued()) + if (Cmd->MLeafCounter == 0 && Cmd->isSuccessfullyEnqueued() && Cmd->supportsPostEnqueueCleanup()) cleanupCommand(Cmd); } } @@ -1128,9 +1128,19 @@ void Scheduler::GraphBuilder::cleanupCommand(Command *Cmd) { 
return; assert(Cmd->MLeafCounter == 0 && Cmd->isSuccessfullyEnqueued()); Command::CommandType CmdT = Cmd->getType(); - // Allocas have to be kept alive until memory objects are released. - if (CmdT == Command::ALLOCA || CmdT == Command::ALLOCA_SUB_BUF) - return; + + assert(CmdT != Command::ALLOCA && CmdT != Command::ALLOCA_SUB_BUF); + assert(CmdT != Command::RELEASE); + assert(CmdT != Command::RUN_CG || (static_cast(Cmd))->getCG().getType() != CG::CGTYPE::CodeplayHostTask); +#ifndef NDEBUG + if (CmdT == Command::RUN_CG) { + auto *ExecCGCmd = static_cast(Cmd); + if (ExecCGCmd->getCG().getType() == CG::CGTYPE::Kernel) { + auto *ExecKernelCG = static_cast(&ExecCGCmd->getCG()); + assert(!ExecKernelCG->hasStreams()); + } + } +#endif // TODO enable cleaning up host tasks after enqueue. if (CmdT == Command::RUN_CG) { From 99686b54655042df728c8a50cfea92fbd9e15c51 Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Mon, 13 Dec 2021 19:47:12 +0300 Subject: [PATCH 05/23] Add a workaround for spec const issue --- sycl/CMakeLists.txt | 1 + sycl/source/detail/device_image_impl.hpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/sycl/CMakeLists.txt b/sycl/CMakeLists.txt index 5720fcfa6944b..02f7412e49875 100644 --- a/sycl/CMakeLists.txt +++ b/sycl/CMakeLists.txt @@ -9,6 +9,7 @@ option(SYCL_ENABLE_WERROR "Treat all warnings as errors in SYCL project" OFF) option(SYCL_DISABLE_STL_ASSERTIONS "Disable assertions in STL containers" OFF) option(SYCL_ADD_DEV_VERSION_POSTFIX "Adds -V postfix to version string" ON) +add_definitions(-g -O0) list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules") include(AddSYCLExecutable) include(SYCLUtils) diff --git a/sycl/source/detail/device_image_impl.hpp b/sycl/source/detail/device_image_impl.hpp index c038031201705..e892babe02dc4 100644 --- a/sycl/source/detail/device_image_impl.hpp +++ b/sycl/source/detail/device_image_impl.hpp @@ -187,7 +187,7 @@ class device_image_impl { const detail::plugin &Plugin = 
getSyclObjImpl(MContext)->getPlugin(); Plugin.call( detail::getSyclObjImpl(MContext)->getHandleRef(), - PI_MEM_FLAGS_ACCESS_RW | PI_MEM_FLAGS_HOST_PTR_USE, + PI_MEM_FLAGS_ACCESS_RW | PI_MEM_FLAGS_HOST_PTR_COPY, MSpecConstsBlob.size(), MSpecConstsBlob.data(), &MSpecConstsBuffer, nullptr); } From b6b371bc40bcbb366d9cf8fb3321389d0124335b Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Mon, 13 Dec 2021 19:47:58 +0300 Subject: [PATCH 06/23] Turn on old cleanup --- sycl/CMakeLists.txt | 1 - sycl/source/detail/config.def | 1 + sycl/source/detail/event_impl.cpp | 2 ++ sycl/source/detail/scheduler/commands.cpp | 2 +- sycl/source/detail/scheduler/graph_builder.cpp | 2 +- 5 files changed, 5 insertions(+), 3 deletions(-) diff --git a/sycl/CMakeLists.txt b/sycl/CMakeLists.txt index 02f7412e49875..5720fcfa6944b 100644 --- a/sycl/CMakeLists.txt +++ b/sycl/CMakeLists.txt @@ -9,7 +9,6 @@ option(SYCL_ENABLE_WERROR "Treat all warnings as errors in SYCL project" OFF) option(SYCL_DISABLE_STL_ASSERTIONS "Disable assertions in STL containers" OFF) option(SYCL_ADD_DEV_VERSION_POSTFIX "Adds -V postfix to version string" ON) -add_definitions(-g -O0) list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules") include(AddSYCLExecutable) include(SYCLUtils) diff --git a/sycl/source/detail/config.def b/sycl/source/detail/config.def index fcfe345b52975..0ed2d45d0b16c 100644 --- a/sycl/source/detail/config.def +++ b/sycl/source/detail/config.def @@ -36,3 +36,4 @@ CONFIG(SYCL_CACHE_MAX_DEVICE_IMAGE_SIZE, 16, __SYCL_CACHE_MAX_DEVICE_IMAGE_SIZE) CONFIG(INTEL_ENABLE_OFFLOAD_ANNOTATIONS, 1, __SYCL_INTEL_ENABLE_OFFLOAD_ANNOTATIONS) CONFIG(SYCL_ENABLE_DEFAULT_CONTEXTS, 1, __SYCL_ENABLE_DEFAULT_CONTEXTS) CONFIG(SYCL_QUEUE_THREAD_POOL_SIZE, 4, __SYCL_QUEUE_THREAD_POOL_SIZE) +CONFIG(SYCL_DISABLE_POST_ENQUEUE_CLEANUP, 1, __SYCL_DISABLE_POST_ENQUEUE_CLEANUP) diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index 2ff167b223a08..dbcf4284953c8 100644 --- 
a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -235,6 +235,8 @@ void event_impl::wait_and_throw( void event_impl::cleanupCommand( std::shared_ptr Self) const { + if (MCommand && !SYCLConfig::get()) + detail::Scheduler::getInstance().cleanupFinishedCommands(std::move(Self)); } template <> diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 27ff150b467c4..ef2132a30d51d 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -691,7 +691,7 @@ bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking, std::ve // Consider the command is successfully enqueued if return code is // CL_SUCCESS MEnqueueStatus = EnqueueResultT::SyclEnqueueSuccess; - if (MLeafCounter == 0 && supportsPostEnqueueCleanup()) { + if (MLeafCounter == 0 && supportsPostEnqueueCleanup() && !SYCLConfig::get()) { assert(!MPostEnqueueCleanup); MPostEnqueueCleanup = true; EnqueuedCommands.push_back(this); diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 23c7962cccc41..5697864bdbea2 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -1124,7 +1124,7 @@ void Scheduler::GraphBuilder::cleanupCommandsForRecord( void Scheduler::GraphBuilder::cleanupCommand(Command *Cmd) { - if (SYCLConfig::get()) + if (SYCLConfig::get()) return; assert(Cmd->MLeafCounter == 0 && Cmd->isSuccessfullyEnqueued()); Command::CommandType CmdT = Cmd->getType(); From ed4a785bf9507e5ff575b9c3c423488085f996b9 Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Tue, 14 Dec 2021 14:52:07 +0300 Subject: [PATCH 07/23] Isolate nodes scheduled for post-enqueue cleanup --- sycl/source/detail/scheduler/commands.cpp | 3 ++- sycl/source/detail/scheduler/graph_builder.cpp | 10 +++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git 
a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index ef2132a30d51d..0ae828263519d 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -691,7 +691,8 @@ bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking, std::ve // Consider the command is successfully enqueued if return code is // CL_SUCCESS MEnqueueStatus = EnqueueResultT::SyclEnqueueSuccess; - if (MLeafCounter == 0 && supportsPostEnqueueCleanup() && !SYCLConfig::get()) { + if (MLeafCounter == 0 && supportsPostEnqueueCleanup() && + !SYCLConfig::get()) { assert(!MPostEnqueueCleanup); MPostEnqueueCleanup = true; EnqueuedCommands.push_back(this); diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 5697864bdbea2..a53064d0be7f1 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -1227,10 +1227,14 @@ void Scheduler::GraphBuilder::cleanupFinishedCommands( DepCmd->MUsers.erase(Cmd); } - // Do not delete the node if it's scheduled for post-enqueue cleanup to - // avoid double free. - if (!Cmd->MPostEnqueueCleanup) + // Isolate the node instead of deleting it if it's scheduled for + // post-enqueue cleanup to avoid double free. 
+ if (Cmd->MPostEnqueueCleanup) { + Cmd->MDeps.clear(); + Cmd->MUsers.clear(); + } else { Cmd->MMarks.MToBeDeleted = true; + } } handleVisitedNodes(MVisitedCmds); } From 307c2ae1ba2516146cfd2b3ba5a4b4cb97079ae7 Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Tue, 14 Dec 2021 14:53:10 +0300 Subject: [PATCH 08/23] Apply clang-format --- sycl/source/detail/scheduler/commands.cpp | 11 ++++++--- sycl/source/detail/scheduler/commands.hpp | 4 +++- .../source/detail/scheduler/graph_builder.cpp | 23 +++++++++++-------- .../detail/scheduler/graph_processor.cpp | 15 ++++++------ sycl/source/detail/scheduler/scheduler.cpp | 14 +++++++---- sycl/source/detail/scheduler/scheduler.hpp | 6 +++-- .../scheduler/SchedulerTestUtils.hpp | 6 +++-- 7 files changed, 50 insertions(+), 29 deletions(-) diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 0ae828263519d..9792160633e10 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -622,7 +622,8 @@ void Command::emitInstrumentation(uint16_t Type, const char *Txt) { #endif } -bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking, std::vector &EnqueuedCommands) { +bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking, + std::vector &EnqueuedCommands) { // Exit if already enqueued if (MEnqueueStatus == EnqueueResultT::SyclEnqueueSuccess) return true; @@ -2344,8 +2345,12 @@ bool ExecCGCommand::producesPiEvent() const { } bool ExecCGCommand::supportsPostEnqueueCleanup() const { - // TODO enable cleaning up host task commands and kernels with streams after enqueue - return Command::supportsPostEnqueueCleanup() && (MCommandGroup->getType() != CG::CGTYPE::CodeplayHostTask) && (MCommandGroup->getType() != CG::CGTYPE::Kernel || !(static_cast(MCommandGroup.get()))->hasStreams()); + // TODO enable cleaning up host task commands and kernels with streams after + // enqueue + return 
Command::supportsPostEnqueueCleanup() && + (MCommandGroup->getType() != CG::CGTYPE::CodeplayHostTask) && + (MCommandGroup->getType() != CG::CGTYPE::Kernel || + !(static_cast(MCommandGroup.get()))->hasStreams()); } } // namespace detail diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index d79891b02b188..5fabd94596edc 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -124,7 +124,8 @@ class Command { /// \param Blocking if this argument is true, function will wait for the /// command to be unblocked before calling enqueueImp. /// \return true if the command is enqueued. - virtual bool enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking, std::vector &EnqueuedCommands); + virtual bool enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking, + std::vector &EnqueuedCommands); bool isFinished(); @@ -552,6 +553,7 @@ class ExecCGCommand : public Command { bool producesPiEvent() const final; bool supportsPostEnqueueCleanup() const final; + private: cl_int enqueueImp() final; diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index a53064d0be7f1..27cc8b6987e85 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -194,7 +194,9 @@ MemObjRecord *Scheduler::GraphBuilder::getOrInsertMemObjRecord( ToEnqueue.push_back(ConnectionCmd); Dependency->addUser(Dependant); --(Dependency->MLeafCounter); - if (Dependency->MLeafCounter == 0 && Dependency->isSuccessfullyEnqueued() && Dependency->supportsPostEnqueueCleanup()) + if (Dependency->MLeafCounter == 0 && + Dependency->isSuccessfullyEnqueued() && + Dependency->supportsPostEnqueueCleanup()) cleanupCommand(Dependency); }; @@ -239,7 +241,8 @@ void Scheduler::GraphBuilder::updateLeaves( bool WasLeaf = Cmd->MLeafCounter > 0; Cmd->MLeafCounter -= Record->MReadLeaves.remove(Cmd); Cmd->MLeafCounter -= 
Record->MWriteLeaves.remove(Cmd); - if (Cmd->MLeafCounter == 0 && Cmd->isSuccessfullyEnqueued() && Cmd->supportsPostEnqueueCleanup()) { + if (Cmd->MLeafCounter == 0 && Cmd->isSuccessfullyEnqueued() && + Cmd->supportsPostEnqueueCleanup()) { if (CommandsToCleanUp) { if (WasLeaf) CommandsToCleanUp->push_back(Cmd); @@ -1012,12 +1015,14 @@ void Scheduler::GraphBuilder::decrementLeafCountersForRecord( MemObjRecord *Record) { for (Command *Cmd : Record->MReadLeaves) { --(Cmd->MLeafCounter); - if (Cmd->MLeafCounter == 0 && Cmd->isSuccessfullyEnqueued() && Cmd->supportsPostEnqueueCleanup()) + if (Cmd->MLeafCounter == 0 && Cmd->isSuccessfullyEnqueued() && + Cmd->supportsPostEnqueueCleanup()) cleanupCommand(Cmd); } for (Command *Cmd : Record->MWriteLeaves) { --(Cmd->MLeafCounter); - if (Cmd->MLeafCounter == 0 && Cmd->isSuccessfullyEnqueued() && Cmd->supportsPostEnqueueCleanup()) + if (Cmd->MLeafCounter == 0 && Cmd->isSuccessfullyEnqueued() && + Cmd->supportsPostEnqueueCleanup()) cleanupCommand(Cmd); } } @@ -1122,16 +1127,17 @@ void Scheduler::GraphBuilder::cleanupCommandsForRecord( handleVisitedNodes(MVisitedCmds); } - void Scheduler::GraphBuilder::cleanupCommand(Command *Cmd) { if (SYCLConfig::get()) return; assert(Cmd->MLeafCounter == 0 && Cmd->isSuccessfullyEnqueued()); Command::CommandType CmdT = Cmd->getType(); - + assert(CmdT != Command::ALLOCA && CmdT != Command::ALLOCA_SUB_BUF); assert(CmdT != Command::RELEASE); - assert(CmdT != Command::RUN_CG || (static_cast(Cmd))->getCG().getType() != CG::CGTYPE::CodeplayHostTask); + assert(CmdT != Command::RUN_CG || + (static_cast(Cmd))->getCG().getType() != + CG::CGTYPE::CodeplayHostTask); #ifndef NDEBUG if (CmdT == Command::RUN_CG) { auto *ExecCGCmd = static_cast(Cmd); @@ -1157,8 +1163,7 @@ void Scheduler::GraphBuilder::cleanupCommand(Command *Cmd) { // ... unless the user is the alloca itself. 
if (Dep.MAllocaCmd == UserCmd) { Dep.MDepCommand = nullptr; - } - else { + } else { Dep.MDepCommand = Dep.MAllocaCmd; Dep.MDepCommand->MUsers.insert(UserCmd); } diff --git a/sycl/source/detail/scheduler/graph_processor.cpp b/sycl/source/detail/scheduler/graph_processor.cpp index ca7ea37cd27a5..fbde85ddb01e3 100644 --- a/sycl/source/detail/scheduler/graph_processor.cpp +++ b/sycl/source/detail/scheduler/graph_processor.cpp @@ -21,9 +21,9 @@ static Command *getCommand(const EventImplPtr &Event) { return (Command *)Event->getCommand(); } -void Scheduler::GraphProcessor::waitForEvent(EventImplPtr Event, - ReadLockT &GraphReadLock, - std::vector &EnqueuedCmds, bool LockTheLock) { +void Scheduler::GraphProcessor::waitForEvent( + EventImplPtr Event, ReadLockT &GraphReadLock, + std::vector &EnqueuedCmds, bool LockTheLock) { Command *Cmd = getCommand(Event); // Command can be nullptr if user creates cl::sycl::event explicitly or the // event has been waited on by another thread @@ -45,9 +45,9 @@ void Scheduler::GraphProcessor::waitForEvent(EventImplPtr Event, GraphReadLock.lock(); } -bool Scheduler::GraphProcessor::enqueueCommand(Command *Cmd, - EnqueueResultT &EnqueueResult, - std::vector &EnqueuedCommands, BlockingT Blocking) { +bool Scheduler::GraphProcessor::enqueueCommand( + Command *Cmd, EnqueueResultT &EnqueueResult, + std::vector &EnqueuedCommands, BlockingT Blocking) { if (!Cmd || Cmd->isSuccessfullyEnqueued()) return true; @@ -60,7 +60,8 @@ bool Scheduler::GraphProcessor::enqueueCommand(Command *Cmd, // Recursively enqueue all the dependencies first and // exit immediately if any of the commands cannot be enqueued. 
for (DepDesc &Dep : Cmd->MDeps) { - if (!enqueueCommand(Dep.MDepCommand, EnqueueResult, EnqueuedCommands, Blocking)) + if (!enqueueCommand(Dep.MDepCommand, EnqueueResult, EnqueuedCommands, + Blocking)) return false; } diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index 0b5bc1f2c4a1f..fceb2a01be9a1 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -57,7 +57,8 @@ void Scheduler::waitForRecordToFinish(MemObjRecord *Record, for (AllocaCommandBase *AllocaCmd : Record->MAllocaCommands) { Command *ReleaseCmd = AllocaCmd->getReleaseCmd(); EnqueueResultT Res; - bool Enqueued = GraphProcessor::enqueueCommand(ReleaseCmd, Res, EnqueuedCmds); + bool Enqueued = + GraphProcessor::enqueueCommand(ReleaseCmd, Res, EnqueuedCmds); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION); #ifdef XPTI_ENABLE_INSTRUMENTATION @@ -65,7 +66,8 @@ void Scheduler::waitForRecordToFinish(MemObjRecord *Record, // reported as edges ReleaseCmd->resolveReleaseDependencies(DepCommands); #endif - GraphProcessor::waitForEvent(ReleaseCmd->getEvent(), GraphReadLock, EnqueuedCmds); + GraphProcessor::waitForEvent(ReleaseCmd->getEvent(), GraphReadLock, + EnqueuedCmds); } } @@ -146,7 +148,8 @@ EventImplPtr Scheduler::addCG(std::unique_ptr CommandGroup, // TODO: Check if lazy mode. EnqueueResultT Res; try { - bool Enqueued = GraphProcessor::enqueueCommand(NewCmd, Res, EnqueuedCmds); + bool Enqueued = + GraphProcessor::enqueueCommand(NewCmd, Res, EnqueuedCmds); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION); } catch (...) 
{ @@ -217,7 +220,8 @@ void Scheduler::waitForEvent(EventImplPtr Event) { // It's fine to leave the lock unlocked upon return from waitForEvent as // there's no more actions to do here with graph std::vector EnqueuedCmds; - GraphProcessor::waitForEvent(std::move(Event), Lock, EnqueuedCmds, /*LockTheLock=*/false); + GraphProcessor::waitForEvent(std::move(Event), Lock, EnqueuedCmds, + /*LockTheLock=*/false); cleanupCommands(EnqueuedCmds); } @@ -342,7 +346,7 @@ void Scheduler::releaseHostAccessor(Requirement *Req) { // static void Scheduler::enqueueLeavesOfReqUnlocked( const Requirement *const Req, std::vector &EnqueuedCmds) { - //FIXME handle this as well + // FIXME handle this as well MemObjRecord *Record = Req->MSYCLMemObj->MRecord.get(); auto EnqueueLeaves = [&EnqueuedCmds](LeavesCollection &Leaves) { for (Command *Cmd : Leaves) { diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index 3ddbefc856d87..03727009c934c 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -730,7 +730,8 @@ class Scheduler { /// /// The function may unlock and lock GraphReadLock as needed. Upon return /// the lock is left in locked state if and only if LockTheLock is true. - static void waitForEvent(EventImplPtr Event, ReadLockT &GraphReadLock, std::vector &EnqueueCommands, + static void waitForEvent(EventImplPtr Event, ReadLockT &GraphReadLock, + std::vector &EnqueueCommands, bool LockTheLock = true); /// Enqueues the command and all its dependencies. @@ -741,7 +742,8 @@ class Scheduler { /// The function may unlock and lock GraphReadLock as needed. Upon return /// the lock is left in locked state. 
static bool enqueueCommand(Command *Cmd, EnqueueResultT &EnqueueResult, - std::vector &EnqueuedCommands, BlockingT Blocking = NON_BLOCKING); + std::vector &EnqueuedCommands, + BlockingT Blocking = NON_BLOCKING); }; /// This function waits on all of the graph leaves which somehow use the diff --git a/sycl/unittests/scheduler/SchedulerTestUtils.hpp b/sycl/unittests/scheduler/SchedulerTestUtils.hpp index 1a2fe05e16254..51b7ffa2d595d 100644 --- a/sycl/unittests/scheduler/SchedulerTestUtils.hpp +++ b/sycl/unittests/scheduler/SchedulerTestUtils.hpp @@ -69,7 +69,8 @@ class MockCommand : public cl::sycl::detail::Command { bool enqueueOrigin(cl::sycl::detail::EnqueueResultT &EnqueueResult, cl::sycl::detail::BlockingT Blocking, std::vector &EnqueuedCmds) { - return cl::sycl::detail::Command::enqueue(EnqueueResult, Blocking, EnqueuedCmds); + return cl::sycl::detail::Command::enqueue(EnqueueResult, Blocking, + EnqueuedCmds); } cl_int MRetVal = CL_SUCCESS; @@ -128,7 +129,8 @@ class MockScheduler : public cl::sycl::detail::Scheduler { cl::sycl::detail::EnqueueResultT &EnqueueResult, cl::sycl::detail::BlockingT Blocking) { std::vector EnqueuedCmds; - return GraphProcessor::enqueueCommand(Cmd, EnqueueResult, EnqueuedCmds, Blocking); + return GraphProcessor::enqueueCommand(Cmd, EnqueueResult, EnqueuedCmds, + Blocking); } cl::sycl::detail::AllocaCommandBase * From 8e3d9bdab89c1b04de3d093b5ed16c6a71c0a954 Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Tue, 14 Dec 2021 16:49:26 +0300 Subject: [PATCH 09/23] Add deferred cleanup on scheduler destruction --- sycl/source/detail/scheduler/scheduler.cpp | 6 +++++- sycl/source/detail/scheduler/scheduler.hpp | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index fceb2a01be9a1..c36d084cb5aa5 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -400,6 +400,10 @@ 
Scheduler::~Scheduler() { "not all resources were released. Please be sure that all kernels " "have synchronization points.\n\n"); } + // There might be some commands scheduled for post enqueue cleanup that + // haven't been freed because of the graph mutex being locked at the time, + // clean them up now. + cleanupCommands({}); } void Scheduler::acquireWriteLock(WriteLockT &Lock) { @@ -427,7 +431,7 @@ MemObjRecord *Scheduler::getMemObjRecord(const Requirement *const Req) { return Req->MSYCLMemObj->MRecord.get(); } -void Scheduler::cleanupCommands(std::vector &Cmds) { +void Scheduler::cleanupCommands(const std::vector &Cmds) { if (Cmds.empty()) return; WriteLockT Lock(MGraphLock, std::try_to_lock); diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index 03727009c934c..827c9d4f29a92 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -459,7 +459,7 @@ class Scheduler { /// \param Lock is an instance of WriteLockT, created with \c std::defer_lock void acquireWriteLock(WriteLockT &Lock); - void cleanupCommands(std::vector &Cmds); + void cleanupCommands(const std::vector &Cmds); static void enqueueLeavesOfReqUnlocked(const Requirement *const Req, std::vector &EnqueuedCmds); From c9e5e36eca12a392526d593038d83731149c2779 Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Tue, 14 Dec 2021 18:14:24 +0300 Subject: [PATCH 10/23] Disable cleanup for some of the unit tests --- sycl/unittests/scheduler/LeafLimit.cpp | 12 ++++++++++++ .../scheduler/StreamInitDependencyOnHost.cpp | 10 ++++++++++ 2 files changed, 22 insertions(+) diff --git a/sycl/unittests/scheduler/LeafLimit.cpp b/sycl/unittests/scheduler/LeafLimit.cpp index ffed74ba0e1ec..d13e901d12314 100644 --- a/sycl/unittests/scheduler/LeafLimit.cpp +++ b/sycl/unittests/scheduler/LeafLimit.cpp @@ -9,6 +9,9 @@ #include "SchedulerTest.hpp" #include "SchedulerTestUtils.hpp" +#include +#include + #include #include #include 
@@ -16,10 +19,19 @@ using namespace cl::sycl; +inline constexpr auto DisablePostEnqueueCleanupName = + "SYCL_DISABLE_POST_ENQUEUE_CLEANUP"; + // Checks that scheduler's (or graph-builder's) addNodeToLeaves method works // correctly with dependency tracking when leaf-limit for generic commands is // overflowed. TEST_F(SchedulerTest, LeafLimit) { + // All of the mock commands are owned on the test side, prevent post enqueue + // cleanup from deleting some of them. + unittest::ScopedEnvVar DisabledCleanup{ + DisablePostEnqueueCleanupName, "1", + detail::SYCLConfig::reset}; + cl::sycl::queue HQueue(host_selector{}); MockScheduler MS; std::vector> LeavesToAdd; std::unique_ptr MockDepCmd; diff --git a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp index e1e87d8464f57..8f265018cc787 100644 --- a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp +++ b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp @@ -9,10 +9,15 @@ #include "SchedulerTest.hpp" #include "SchedulerTestUtils.hpp" +#include #include +#include using namespace cl::sycl; +inline constexpr auto DisablePostEnqueueCleanupName = + "SYCL_DISABLE_POST_ENQUEUE_CLEANUP"; + class MockHandler : public sycl::handler { public: MockHandler(std::shared_ptr Queue, bool IsHost) @@ -91,6 +96,11 @@ static bool ValidateDepCommandsTree(const detail::Command *Cmd, } TEST_F(SchedulerTest, StreamInitDependencyOnHost) { + // Disable post enqueue cleanup so that it doesn't interfere with dependency + // checks. 
+ unittest::ScopedEnvVar DisabledCleanup{ + DisablePostEnqueueCleanupName, "1", + detail::SYCLConfig::reset}; cl::sycl::queue HQueue(host_selector{}); detail::QueueImplPtr HQueueImpl = detail::getSyclObjImpl(HQueue); From 54fd4b4ccd68d0f21f9203b4d7bda0097736aa92 Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Tue, 14 Dec 2021 19:33:14 +0300 Subject: [PATCH 11/23] Minor stylistic changes --- sycl/source/detail/device_image_impl.hpp | 3 + sycl/source/detail/scheduler/commands.cpp | 10 ++-- sycl/source/detail/scheduler/commands.hpp | 2 +- .../detail/scheduler/graph_processor.cpp | 12 ++-- sycl/source/detail/scheduler/scheduler.cpp | 56 +++++++++---------- sycl/source/detail/scheduler/scheduler.hpp | 4 +- .../scheduler/SchedulerTestUtils.hpp | 8 +-- 7 files changed, 49 insertions(+), 46 deletions(-) diff --git a/sycl/source/detail/device_image_impl.hpp b/sycl/source/detail/device_image_impl.hpp index e892babe02dc4..58da062ccb5c2 100644 --- a/sycl/source/detail/device_image_impl.hpp +++ b/sycl/source/detail/device_image_impl.hpp @@ -185,6 +185,9 @@ class device_image_impl { std::lock_guard Lock{MSpecConstAccessMtx}; if (nullptr == MSpecConstsBuffer && !MSpecConstsBlob.empty()) { const detail::plugin &Plugin = getSyclObjImpl(MContext)->getPlugin(); + // Uses PI_MEM_FLAGS_HOST_PTR_COPY since post-enqueue cleanup might + // destroy MSpecConstsBuffer. + // TODO consider changing the lifetime of device_image_impl instead Plugin.call( detail::getSyclObjImpl(MContext)->getHandleRef(), PI_MEM_FLAGS_ACCESS_RW | PI_MEM_FLAGS_HOST_PTR_COPY, diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 9792160633e10..4a17b1579b4ef 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -260,7 +260,7 @@ class DispatchHostTask { // of empty command. // Also, it's possible to have record deallocated prior to enqueue process. // Thus we employ read-lock of graph. 
- std::vector EnqueuedCmds; + std::vector CmdsToCleanUp; Scheduler &Sched = Scheduler::getInstance(); { Scheduler::ReadLockT Lock(Sched.MGraphLock); @@ -274,9 +274,9 @@ class DispatchHostTask { for (const DepDesc &Dep : Deps) Scheduler::enqueueLeavesOfReqUnlocked(Dep.MDepRequirement, - EnqueuedCmds); + CmdsToCleanUp); } - Sched.cleanupCommands(EnqueuedCmds); + Sched.cleanupCommands(CmdsToCleanUp); } }; @@ -623,7 +623,7 @@ void Command::emitInstrumentation(uint16_t Type, const char *Txt) { } bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking, - std::vector &EnqueuedCommands) { + std::vector &CmdsToCleanUp) { // Exit if already enqueued if (MEnqueueStatus == EnqueueResultT::SyclEnqueueSuccess) return true; @@ -696,7 +696,7 @@ bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking, !SYCLConfig::get()) { assert(!MPostEnqueueCleanup); MPostEnqueueCleanup = true; - EnqueuedCommands.push_back(this); + CmdsToCleanUp.push_back(this); } } diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index 5fabd94596edc..5217e58cc90c2 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -125,7 +125,7 @@ class Command { /// command to be unblocked before calling enqueueImp. /// \return true if the command is enqueued. 
virtual bool enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking, - std::vector &EnqueuedCommands); + std::vector &CmdsToCleanUp); bool isFinished(); diff --git a/sycl/source/detail/scheduler/graph_processor.cpp b/sycl/source/detail/scheduler/graph_processor.cpp index fbde85ddb01e3..7c49697defe85 100644 --- a/sycl/source/detail/scheduler/graph_processor.cpp +++ b/sycl/source/detail/scheduler/graph_processor.cpp @@ -23,7 +23,7 @@ static Command *getCommand(const EventImplPtr &Event) { void Scheduler::GraphProcessor::waitForEvent( EventImplPtr Event, ReadLockT &GraphReadLock, - std::vector &EnqueuedCmds, bool LockTheLock) { + std::vector &CmdsToCleanUp, bool LockTheLock) { Command *Cmd = getCommand(Event); // Command can be nullptr if user creates cl::sycl::event explicitly or the // event has been waited on by another thread @@ -31,7 +31,7 @@ void Scheduler::GraphProcessor::waitForEvent( return; EnqueueResultT Res; - bool Enqueued = enqueueCommand(Cmd, Res, EnqueuedCmds, BLOCKING); + bool Enqueued = enqueueCommand(Cmd, Res, CmdsToCleanUp, BLOCKING); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) // TODO: Reschedule commands. throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION); @@ -47,7 +47,7 @@ void Scheduler::GraphProcessor::waitForEvent( bool Scheduler::GraphProcessor::enqueueCommand( Command *Cmd, EnqueueResultT &EnqueueResult, - std::vector &EnqueuedCommands, BlockingT Blocking) { + std::vector &CmdsToCleanUp, BlockingT Blocking) { if (!Cmd || Cmd->isSuccessfullyEnqueued()) return true; @@ -60,7 +60,7 @@ bool Scheduler::GraphProcessor::enqueueCommand( // Recursively enqueue all the dependencies first and // exit immediately if any of the commands cannot be enqueued. 
for (DepDesc &Dep : Cmd->MDeps) { - if (!enqueueCommand(Dep.MDepCommand, EnqueueResult, EnqueuedCommands, + if (!enqueueCommand(Dep.MDepCommand, EnqueueResult, CmdsToCleanUp, Blocking)) return false; } @@ -77,7 +77,7 @@ bool Scheduler::GraphProcessor::enqueueCommand( // implemented. for (const EventImplPtr &Event : Cmd->getPreparedHostDepsEvents()) { if (Command *DepCmd = static_cast(Event->getCommand())) - if (!enqueueCommand(DepCmd, EnqueueResult, EnqueuedCommands, Blocking)) + if (!enqueueCommand(DepCmd, EnqueueResult, CmdsToCleanUp, Blocking)) return false; } @@ -94,7 +94,7 @@ bool Scheduler::GraphProcessor::enqueueCommand( // on completion of C and starts cleanup process. This thread is still in the // middle of enqueue of B. The other thread modifies dependency list of A by // removing C out of it. Iterators become invalid. - return Cmd->enqueue(EnqueueResult, Blocking, EnqueuedCommands); + return Cmd->enqueue(EnqueueResult, Blocking, CmdsToCleanUp); } } // namespace detail diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index c36d084cb5aa5..c4fef67665bc2 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -32,33 +32,33 @@ void Scheduler::waitForRecordToFinish(MemObjRecord *Record, // Will contain the list of dependencies for the Release Command std::set DepCommands; #endif - std::vector EnqueuedCmds; + std::vector CmdsToCleanUp; for (Command *Cmd : Record->MReadLeaves) { EnqueueResultT Res; - bool Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, EnqueuedCmds); + bool Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, CmdsToCleanUp); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION); #ifdef XPTI_ENABLE_INSTRUMENTATION // Capture the dependencies DepCommands.insert(Cmd); #endif - GraphProcessor::waitForEvent(Cmd->getEvent(), GraphReadLock, EnqueuedCmds); + 
GraphProcessor::waitForEvent(Cmd->getEvent(), GraphReadLock, CmdsToCleanUp); } for (Command *Cmd : Record->MWriteLeaves) { EnqueueResultT Res; - bool Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, EnqueuedCmds); + bool Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, CmdsToCleanUp); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION); #ifdef XPTI_ENABLE_INSTRUMENTATION DepCommands.insert(Cmd); #endif - GraphProcessor::waitForEvent(Cmd->getEvent(), GraphReadLock, EnqueuedCmds); + GraphProcessor::waitForEvent(Cmd->getEvent(), GraphReadLock, CmdsToCleanUp); } for (AllocaCommandBase *AllocaCmd : Record->MAllocaCommands) { Command *ReleaseCmd = AllocaCmd->getReleaseCmd(); EnqueueResultT Res; bool Enqueued = - GraphProcessor::enqueueCommand(ReleaseCmd, Res, EnqueuedCmds); + GraphProcessor::enqueueCommand(ReleaseCmd, Res, CmdsToCleanUp); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION); #ifdef XPTI_ENABLE_INSTRUMENTATION @@ -67,7 +67,7 @@ void Scheduler::waitForRecordToFinish(MemObjRecord *Record, ReleaseCmd->resolveReleaseDependencies(DepCommands); #endif GraphProcessor::waitForEvent(ReleaseCmd->getEvent(), GraphReadLock, - EnqueuedCmds); + CmdsToCleanUp); } } @@ -111,7 +111,7 @@ EventImplPtr Scheduler::addCG(std::unique_ptr CommandGroup, NewEvent = NewCmd->getEvent(); } - std::vector EnqueuedCmds; + std::vector CmdsToCleanUp; { ReadLockT Lock(MGraphLock); @@ -131,7 +131,7 @@ EventImplPtr Scheduler::addCG(std::unique_ptr CommandGroup, }; for (Command *Cmd : AuxiliaryCmds) { - Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, EnqueuedCmds); + Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, CmdsToCleanUp); try { if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Auxiliary enqueue process failed.", @@ -149,7 +149,7 @@ EventImplPtr 
Scheduler::addCG(std::unique_ptr CommandGroup, EnqueueResultT Res; try { bool Enqueued = - GraphProcessor::enqueueCommand(NewCmd, Res, EnqueuedCmds); + GraphProcessor::enqueueCommand(NewCmd, Res, CmdsToCleanUp); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION); } catch (...) { @@ -166,7 +166,7 @@ EventImplPtr Scheduler::addCG(std::unique_ptr CommandGroup, CleanUp(); } } - cleanupCommands(EnqueuedCmds); + cleanupCommands(CmdsToCleanUp); for (auto StreamImplPtr : Streams) { StreamImplPtr->flush(); @@ -188,26 +188,26 @@ EventImplPtr Scheduler::addCopyBack(Requirement *Req) { return nullptr; } - std::vector EnqueuedCmds; + std::vector CmdsToCleanUp; try { ReadLockT Lock(MGraphLock); EnqueueResultT Res; bool Enqueued; for (Command *Cmd : AuxiliaryCmds) { - Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, EnqueuedCmds); + Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, CmdsToCleanUp); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION); } - Enqueued = GraphProcessor::enqueueCommand(NewCmd, Res, EnqueuedCmds); + Enqueued = GraphProcessor::enqueueCommand(NewCmd, Res, CmdsToCleanUp); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION); } catch (...) 
{ NewCmd->getQueue()->reportAsyncException(std::current_exception()); } EventImplPtr NewEvent = NewCmd->getEvent(); - cleanupCommands(EnqueuedCmds); + cleanupCommands(CmdsToCleanUp); return NewEvent; } @@ -219,10 +219,10 @@ void Scheduler::waitForEvent(EventImplPtr Event) { ReadLockT Lock(MGraphLock); // It's fine to leave the lock unlocked upon return from waitForEvent as // there's no more actions to do here with graph - std::vector EnqueuedCmds; - GraphProcessor::waitForEvent(std::move(Event), Lock, EnqueuedCmds, + std::vector CmdsToCleanUp; + GraphProcessor::waitForEvent(std::move(Event), Lock, CmdsToCleanUp, /*LockTheLock=*/false); - cleanupCommands(EnqueuedCmds); + cleanupCommands(CmdsToCleanUp); } static void deallocateStreams( @@ -305,32 +305,32 @@ EventImplPtr Scheduler::addHostAccessor(Requirement *Req) { if (!NewCmd) return nullptr; - std::vector EnqueuedCmds; + std::vector CmdsToCleanUp; { ReadLockT ReadLock(MGraphLock); EnqueueResultT Res; bool Enqueued; for (Command *Cmd : AuxiliaryCmds) { - Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, EnqueuedCmds); + Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, CmdsToCleanUp); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION); } - Enqueued = GraphProcessor::enqueueCommand(NewCmd, Res, EnqueuedCmds); + Enqueued = GraphProcessor::enqueueCommand(NewCmd, Res, CmdsToCleanUp); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION); } EventImplPtr NewEvent = NewCmd->getEvent(); - cleanupCommands(EnqueuedCmds); + cleanupCommands(CmdsToCleanUp); return NewEvent; } void Scheduler::releaseHostAccessor(Requirement *Req) { Command *const BlockedCmd = Req->MBlockedCmd; - std::vector EnqueuedCmds; + std::vector CmdsToCleanUp; { ReadLockT Lock(MGraphLock); @@ -338,20 +338,20 @@ void Scheduler::releaseHostAccessor(Requirement *Req) { 
BlockedCmd->MEnqueueStatus = EnqueueResultT::SyclEnqueueReady; - enqueueLeavesOfReqUnlocked(Req, EnqueuedCmds); + enqueueLeavesOfReqUnlocked(Req, CmdsToCleanUp); } - cleanupCommands(EnqueuedCmds); + cleanupCommands(CmdsToCleanUp); } // static void Scheduler::enqueueLeavesOfReqUnlocked( - const Requirement *const Req, std::vector &EnqueuedCmds) { + const Requirement *const Req, std::vector &CmdsToCleanUp) { // FIXME handle this as well MemObjRecord *Record = Req->MSYCLMemObj->MRecord.get(); - auto EnqueueLeaves = [&EnqueuedCmds](LeavesCollection &Leaves) { + auto EnqueueLeaves = [&CmdsToCleanUp](LeavesCollection &Leaves) { for (Command *Cmd : Leaves) { EnqueueResultT Res; - bool Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, EnqueuedCmds); + bool Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, CmdsToCleanUp); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION); } diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index 827c9d4f29a92..e7c11e6e8a814 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -462,7 +462,7 @@ class Scheduler { void cleanupCommands(const std::vector &Cmds); static void enqueueLeavesOfReqUnlocked(const Requirement *const Req, - std::vector &EnqueuedCmds); + std::vector &CmdsToCleanUp); /// Graph builder class. /// @@ -742,7 +742,7 @@ class Scheduler { /// The function may unlock and lock GraphReadLock as needed. Upon return /// the lock is left in locked state. 
static bool enqueueCommand(Command *Cmd, EnqueueResultT &EnqueueResult, - std::vector &EnqueuedCommands, + std::vector &CmdsToCleanUp, BlockingT Blocking = NON_BLOCKING); }; diff --git a/sycl/unittests/scheduler/SchedulerTestUtils.hpp b/sycl/unittests/scheduler/SchedulerTestUtils.hpp index 51b7ffa2d595d..8b2b4811744b8 100644 --- a/sycl/unittests/scheduler/SchedulerTestUtils.hpp +++ b/sycl/unittests/scheduler/SchedulerTestUtils.hpp @@ -68,9 +68,9 @@ class MockCommand : public cl::sycl::detail::Command { std::vector &)); bool enqueueOrigin(cl::sycl::detail::EnqueueResultT &EnqueueResult, cl::sycl::detail::BlockingT Blocking, - std::vector &EnqueuedCmds) { + std::vector &CmdsToCleanUp) { return cl::sycl::detail::Command::enqueue(EnqueueResult, Blocking, - EnqueuedCmds); + CmdsToCleanUp); } cl_int MRetVal = CL_SUCCESS; @@ -128,8 +128,8 @@ class MockScheduler : public cl::sycl::detail::Scheduler { static bool enqueueCommand(cl::sycl::detail::Command *Cmd, cl::sycl::detail::EnqueueResultT &EnqueueResult, cl::sycl::detail::BlockingT Blocking) { - std::vector EnqueuedCmds; - return GraphProcessor::enqueueCommand(Cmd, EnqueueResult, EnqueuedCmds, + std::vector CmdsToCleanUp; + return GraphProcessor::enqueueCommand(Cmd, EnqueueResult, CmdsToCleanUp, Blocking); } From 63524d1820453467c9a12ab8edb8a86cc0c9a9fd Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Wed, 15 Dec 2021 17:21:15 +0300 Subject: [PATCH 12/23] Fix an issue with cleaning up nodes on their removal from leaves --- sycl/source/detail/scheduler/commands.cpp | 33 +++-- sycl/source/detail/scheduler/commands.hpp | 12 +- .../source/detail/scheduler/graph_builder.cpp | 127 ++++++++++-------- sycl/source/detail/scheduler/scheduler.hpp | 7 +- sycl/unittests/scheduler/BlockedCommands.cpp | 3 +- sycl/unittests/scheduler/FailedCommands.cpp | 3 +- sycl/unittests/scheduler/LeafLimit.cpp | 4 +- sycl/unittests/scheduler/utils.cpp | 4 +- 8 files changed, 115 insertions(+), 78 deletions(-) diff --git 
a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 4a17b1579b4ef..9cd0b0ee49b39 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -518,7 +518,8 @@ void Command::makeTraceEventEpilog() { #endif } -Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep) { +Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep, + std::vector &ToCleanUp) { const QueueImplPtr &WorkerQueue = getWorkerQueue(); const ContextImplPtr &WorkerContext = WorkerQueue->getContextImplPtr(); @@ -550,7 +551,7 @@ Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep) { // If contexts don't match we'll connect them using host task if (DepEventContext != WorkerContext && !WorkerContext->is_host()) { Scheduler::GraphBuilder &GB = Scheduler::getInstance().MGraphBuilder; - ConnectionCmd = GB.connectDepEvent(this, DepEvent, Dep); + ConnectionCmd = GB.connectDepEvent(this, DepEvent, Dep, ToCleanUp); } else MPreparedDepsEvents.push_back(std::move(DepEvent)); @@ -570,11 +571,12 @@ bool Command::supportsPostEnqueueCleanup() const { return !MUsers.empty() || !MDeps.empty(); } -Command *Command::addDep(DepDesc NewDep) { +Command *Command::addDep(DepDesc NewDep, std::vector &ToCleanUp) { Command *ConnectionCmd = nullptr; if (NewDep.MDepCommand) { - ConnectionCmd = processDepEvent(NewDep.MDepCommand->getEvent(), NewDep); + ConnectionCmd = + processDepEvent(NewDep.MDepCommand->getEvent(), NewDep, ToCleanUp); } MDeps.push_back(NewDep); #ifdef XPTI_ENABLE_INSTRUMENTATION @@ -586,7 +588,8 @@ Command *Command::addDep(DepDesc NewDep) { return ConnectionCmd; } -Command *Command::addDep(EventImplPtr Event) { +Command *Command::addDep(EventImplPtr Event, + std::vector &ToCleanUp) { #ifdef XPTI_ENABLE_INSTRUMENTATION // We need this for just the instrumentation, so guarding it will prevent // unused variable warnings when instrumentation is turned off @@ -596,7 
+599,8 @@ Command *Command::addDep(EventImplPtr Event) { emitEdgeEventForEventDependence(Cmd, PiEventAddr); #endif - return processDepEvent(std::move(Event), DepDesc{nullptr, nullptr, nullptr}); + return processDepEvent(std::move(Event), DepDesc{nullptr, nullptr, nullptr}, + ToCleanUp); } void Command::emitEnqueuedEventSignal(RT::PiEvent &PiEventAddr) { @@ -802,8 +806,11 @@ AllocaCommand::AllocaCommand(QueueImplPtr Queue, Requirement Req, // so this call must be before the addDep() call. emitInstrumentationDataProxy(); // "Nothing to depend on" - Command *ConnectionCmd = addDep(DepDesc(nullptr, getRequirement(), this)); + std::vector ToCleanUp; + Command *ConnectionCmd = + addDep(DepDesc(nullptr, getRequirement(), this), ToCleanUp); assert(ConnectionCmd == nullptr); + assert(ToCleanUp.empty()); (void)ConnectionCmd; } @@ -868,7 +875,8 @@ void AllocaCommand::printDot(std::ostream &Stream) const { AllocaSubBufCommand::AllocaSubBufCommand(QueueImplPtr Queue, Requirement Req, AllocaCommandBase *ParentAlloca, - std::vector &ToEnqueue) + std::vector &ToEnqueue, + std::vector &ToCleanUp) : AllocaCommandBase(CommandType::ALLOCA_SUB_BUF, std::move(Queue), std::move(Req), /*LinkedAllocaCmd*/ nullptr), @@ -877,8 +885,8 @@ AllocaSubBufCommand::AllocaSubBufCommand(QueueImplPtr Queue, Requirement Req, // is added to this node, so this call must be before // the addDep() call. 
emitInstrumentationDataProxy(); - Command *ConnectionCmd = - addDep(DepDesc(MParentAlloca, getRequirement(), MParentAlloca)); + Command *ConnectionCmd = addDep( + DepDesc(MParentAlloca, getRequirement(), MParentAlloca), ToCleanUp); if (ConnectionCmd) ToEnqueue.push_back(ConnectionCmd); } @@ -1448,8 +1456,11 @@ void EmptyCommand::addRequirement(Command *DepCmd, AllocaCommandBase *AllocaCmd, const Requirement *const StoredReq = &MRequirements.back(); // EmptyCommand is always host one, so we believe that result of addDep is nil - Command *Cmd = addDep(DepDesc{DepCmd, StoredReq, AllocaCmd}); + std::vector ToCleanUp; + Command *Cmd = addDep(DepDesc{DepCmd, StoredReq, AllocaCmd}, ToCleanUp); assert(Cmd == nullptr && "Conection command should be null for EmptyCommand"); + assert(ToCleanUp.empty() && "addDep should add a command for cleanup only if " + "there's a connection command"); (void)Cmd; } diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index 5217e58cc90c2..2184ab3ad4b91 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -108,10 +108,12 @@ class Command { Command(CommandType Type, QueueImplPtr Queue); /// \return an optional connection cmd to enqueue - [[nodiscard]] Command *addDep(DepDesc NewDep); + [[nodiscard]] Command *addDep(DepDesc NewDep, + std::vector &ToCleanUp); /// \return an optional connection cmd to enqueue - [[nodiscard]] Command *addDep(EventImplPtr Event); + [[nodiscard]] Command *addDep(EventImplPtr Event, + std::vector &ToCleanUp); void addUser(Command *NewUser) { MUsers.insert(NewUser); } @@ -224,7 +226,8 @@ class Command { /// /// Optionality of Dep is set by Dep.MDepCommand not equal to nullptr. [[nodiscard]] Command *processDepEvent(EventImplPtr DepEvent, - const DepDesc &Dep); + const DepDesc &Dep, + std::vector &ToCleanUp); /// Private interface. Derived classes should implement this method. 
virtual cl_int enqueueImp() = 0; @@ -414,7 +417,8 @@ class AllocaSubBufCommand : public AllocaCommandBase { public: AllocaSubBufCommand(QueueImplPtr Queue, Requirement Req, AllocaCommandBase *ParentAlloca, - std::vector &ToEnqueue); + std::vector &ToEnqueue, + std::vector &ToCleanUp); void *getMemAllocation() const final; void printDot(std::ostream &Stream) const final; diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 27cc8b6987e85..321a8e7a8ff82 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -190,14 +190,17 @@ MemObjRecord *Scheduler::GraphBuilder::getOrInsertMemObjRecord( // of the requirements for the current record DepDesc Dep = findDepForRecord(Dependant, Record); Dep.MDepCommand = Dependency; - if (Command *ConnectionCmd = Dependant->addDep(Dep)) + std::vector ToCleanUp; + if (Command *ConnectionCmd = Dependant->addDep(Dep, ToCleanUp)) ToEnqueue.push_back(ConnectionCmd); Dependency->addUser(Dependant); --(Dependency->MLeafCounter); if (Dependency->MLeafCounter == 0 && Dependency->isSuccessfullyEnqueued() && Dependency->supportsPostEnqueueCleanup()) - cleanupCommand(Dependency); + ToCleanUp.push_back(Dependency); + for (Command *Cmd : ToCleanUp) + cleanupCommand(Cmd); }; const ContextImplPtr &InteropCtxPtr = Req->MSYCLMemObj->getInteropContext(); @@ -229,9 +232,10 @@ MemObjRecord *Scheduler::GraphBuilder::getOrInsertMemObjRecord( return MemObject->MRecord.get(); } -void Scheduler::GraphBuilder::updateLeaves( - const std::set &Cmds, MemObjRecord *Record, - access::mode AccessMode, std::vector *CommandsToCleanUp) { +void Scheduler::GraphBuilder::updateLeaves(const std::set &Cmds, + MemObjRecord *Record, + access::mode AccessMode, + std::vector &ToCleanUp) { const bool ReadOnlyReq = AccessMode == access::mode::read; if (ReadOnlyReq) @@ -241,13 +245,9 @@ void Scheduler::GraphBuilder::updateLeaves( bool WasLeaf = Cmd->MLeafCounter > 
0; Cmd->MLeafCounter -= Record->MReadLeaves.remove(Cmd); Cmd->MLeafCounter -= Record->MWriteLeaves.remove(Cmd); - if (Cmd->MLeafCounter == 0 && Cmd->isSuccessfullyEnqueued() && + if (WasLeaf && Cmd->MLeafCounter == 0 && Cmd->isSuccessfullyEnqueued() && Cmd->supportsPostEnqueueCleanup()) { - if (CommandsToCleanUp) { - if (WasLeaf) - CommandsToCleanUp->push_back(Cmd); - } else - cleanupCommand(Cmd); + ToCleanUp.push_back(Cmd); } } } @@ -276,15 +276,18 @@ UpdateHostRequirementCommand *Scheduler::GraphBuilder::insertUpdateHostReqCmd( std::set Deps = findDepsForReq(Record, Req, Queue->getContextImplPtr()); + std::vector ToCleanUp; for (Command *Dep : Deps) { Command *ConnCmd = - UpdateCommand->addDep(DepDesc{Dep, StoredReq, AllocaCmd}); + UpdateCommand->addDep(DepDesc{Dep, StoredReq, AllocaCmd}, ToCleanUp); if (ConnCmd) ToEnqueue.push_back(ConnCmd); Dep->addUser(UpdateCommand); } - updateLeaves(Deps, Record, Req->MAccessMode); + updateLeaves(Deps, Record, Req->MAccessMode, ToCleanUp); addNodeToLeaves(Record, UpdateCommand, Req->MAccessMode, ToEnqueue); + for (Command *Cmd : ToCleanUp) + cleanupCommand(Cmd); return UpdateCommand; } @@ -389,16 +392,18 @@ Command *Scheduler::GraphBuilder::insertMemoryMove( AllocaCmdSrc->getQueue(), AllocaCmdDst->getQueue()); } } - + std::vector ToCleanUp; for (Command *Dep : Deps) { - Command *ConnCmd = - NewCmd->addDep(DepDesc{Dep, NewCmd->getRequirement(), AllocaCmdDst}); + Command *ConnCmd = NewCmd->addDep( + DepDesc{Dep, NewCmd->getRequirement(), AllocaCmdDst}, ToCleanUp); if (ConnCmd) ToEnqueue.push_back(ConnCmd); Dep->addUser(NewCmd); } - updateLeaves(Deps, Record, access::mode::read_write); + updateLeaves(Deps, Record, access::mode::read_write, ToCleanUp); addNodeToLeaves(Record, NewCmd, access::mode::read_write, ToEnqueue); + for (Command *Cmd : ToCleanUp) + cleanupCommand(Cmd); Record->MCurContext = Queue->getContextImplPtr(); return NewCmd; } @@ -427,22 +432,25 @@ Command *Scheduler::GraphBuilder::remapMemoryObject( 
LinkedAllocaCmd, *LinkedAllocaCmd->getRequirement(), &HostAllocaCmd->MMemAllocation, LinkedAllocaCmd->getQueue(), MapMode); + std::vector ToCleanUp; for (Command *Dep : Deps) { Command *ConnCmd = UnMapCmd->addDep( - DepDesc{Dep, UnMapCmd->getRequirement(), LinkedAllocaCmd}); + DepDesc{Dep, UnMapCmd->getRequirement(), LinkedAllocaCmd}, ToCleanUp); if (ConnCmd) ToEnqueue.push_back(ConnCmd); Dep->addUser(UnMapCmd); } Command *ConnCmd = MapCmd->addDep( - DepDesc{UnMapCmd, MapCmd->getRequirement(), HostAllocaCmd}); + DepDesc{UnMapCmd, MapCmd->getRequirement(), HostAllocaCmd}, ToCleanUp); if (ConnCmd) ToEnqueue.push_back(ConnCmd); UnMapCmd->addUser(MapCmd); - updateLeaves(Deps, Record, access::mode::read_write); + updateLeaves(Deps, Record, access::mode::read_write, ToCleanUp); addNodeToLeaves(Record, MapCmd, access::mode::read_write, ToEnqueue); + for (Command *Cmd : ToCleanUp) + cleanupCommand(Cmd); Record->MHostAccess = MapMode; return MapCmd; } @@ -475,16 +483,20 @@ Scheduler::GraphBuilder::addCopyBack(Requirement *Req, throw runtime_error("Out of host memory", PI_OUT_OF_HOST_MEMORY); MemCpyCommandHost *MemCpyCmd = MemCpyCmdUniquePtr.release(); + + std::vector ToCleanUp; for (Command *Dep : Deps) { Command *ConnCmd = MemCpyCmd->addDep( - DepDesc{Dep, MemCpyCmd->getRequirement(), SrcAllocaCmd}); + DepDesc{Dep, MemCpyCmd->getRequirement(), SrcAllocaCmd}, ToCleanUp); if (ConnCmd) ToEnqueue.push_back(ConnCmd); Dep->addUser(MemCpyCmd); } - updateLeaves(Deps, Record, Req->MAccessMode); + updateLeaves(Deps, Record, Req->MAccessMode, ToCleanUp); addNodeToLeaves(Record, MemCpyCmd, Req->MAccessMode, ToEnqueue); + for (Command *Cmd : ToCleanUp) + cleanupCommand(Cmd); if (MPrintOptionsArray[AfterAddCopyBack]) printGraphAsDot("after_addCopyBack"); return MemCpyCmd; @@ -664,6 +676,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( findAllocaForReq(Record, Req, Queue->getContextImplPtr()); if (!AllocaCmd) { + std::vector ToCleanUp; if (IsSuitableSubReq(Req)) 
{ // Get parent requirement. It's hard to get right parents' range // so full parent requirement has range represented in bytes @@ -675,7 +688,8 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( auto *ParentAlloca = getOrCreateAllocaForReq(Record, &ParentRequirement, Queue, ToEnqueue); - AllocaCmd = new AllocaSubBufCommand(Queue, *Req, ParentAlloca, ToEnqueue); + AllocaCmd = new AllocaSubBufCommand(Queue, *Req, ParentAlloca, ToEnqueue, + ToCleanUp); } else { const Requirement FullReq(/*Offset*/ {0, 0, 0}, Req->MMemoryRange, @@ -761,8 +775,10 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( // Update linked command if (LinkedAllocaCmd) { - Command *ConnCmd = AllocaCmd->addDep(DepDesc{ - LinkedAllocaCmd, AllocaCmd->getRequirement(), LinkedAllocaCmd}); + Command *ConnCmd = AllocaCmd->addDep( + DepDesc{LinkedAllocaCmd, AllocaCmd->getRequirement(), + LinkedAllocaCmd}, + ToCleanUp); if (ConnCmd) ToEnqueue.push_back(ConnCmd); LinkedAllocaCmd->addUser(AllocaCmd); @@ -771,7 +787,8 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( // To ensure that the leader allocation is removed first ConnCmd = AllocaCmd->getReleaseCmd()->addDep( DepDesc(LinkedAllocaCmd->getReleaseCmd(), - AllocaCmd->getRequirement(), LinkedAllocaCmd)); + AllocaCmd->getRequirement(), LinkedAllocaCmd), + ToCleanUp); if (ConnCmd) ToEnqueue.push_back(ConnCmd); @@ -788,13 +805,13 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( std::set Deps = findDepsForReq(Record, Req, Queue->getContextImplPtr()); for (Command *Dep : Deps) { - Command *ConnCmd = - AllocaCmd->addDep(DepDesc{Dep, Req, LinkedAllocaCmd}); + Command *ConnCmd = AllocaCmd->addDep( + DepDesc{Dep, Req, LinkedAllocaCmd}, ToCleanUp); if (ConnCmd) ToEnqueue.push_back(ConnCmd); Dep->addUser(AllocaCmd); } - updateLeaves(Deps, Record, Req->MAccessMode); + updateLeaves(Deps, Record, Req->MAccessMode, ToCleanUp); addNodeToLeaves(Record, AllocaCmd, Req->MAccessMode, 
ToEnqueue); } } @@ -803,6 +820,8 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( Record->MAllocaCommands.push_back(AllocaCmd); Record->MWriteLeaves.push_back(AllocaCmd, ToEnqueue); ++(AllocaCmd->MLeafCounter); + for (Command *Cmd : ToCleanUp) + cleanupCommand(Cmd); } return AllocaCmd; } @@ -851,13 +870,16 @@ Scheduler::GraphBuilder::addEmptyCmd(Command *Cmd, const std::vector &Reqs, Cmd->addUser(EmptyCmd); const std::vector &Deps = Cmd->MDeps; + std::vector ToCleanUp; for (const DepDesc &Dep : Deps) { const Requirement *Req = Dep.MDepRequirement; MemObjRecord *Record = getMemObjRecord(Req->MSYCLMemObj); - updateLeaves({Cmd}, Record, Req->MAccessMode); + updateLeaves({Cmd}, Record, Req->MAccessMode, ToCleanUp); addNodeToLeaves(Record, EmptyCmd, Req->MAccessMode, ToEnqueue); } + for (Command *Cmd : ToCleanUp) + cleanupCommand(Cmd); return EmptyCmd; } @@ -911,6 +933,7 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr CommandGroup, // AllocaCommand creation will be dependent on the access mode of the first // requirement. Combine these access modes to take all of them into account. combineAccessModesOfReqs(Reqs); + std::vector ToCleanUp; for (Requirement *Req : Reqs) { MemObjRecord *Record = nullptr; AllocaCommandBase *AllocaCmd = nullptr; @@ -968,7 +991,8 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr CommandGroup, findDepsForReq(Record, Req, Queue->getContextImplPtr()); for (Command *Dep : Deps) - if (Command *ConnCmd = NewCmd->addDep(DepDesc{Dep, Req, AllocaCmd})) + if (Command *ConnCmd = + NewCmd->addDep(DepDesc{Dep, Req, AllocaCmd}, ToCleanUp)) ToEnqueue.push_back(ConnCmd); } @@ -978,25 +1002,18 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr CommandGroup, // FIXME employ a reference here to eliminate copying of a vector // Updating leaves might also clean up some of the dep commands, so update // their users first. 
- // FIXME there's probably a better way of handling cleanup & leaf/dep update - // here considering that some of the updated might be destroyed by cleanup - // immediately after. std::vector Deps = NewCmd->MDeps; - std::vector CommandsToCleanUp; for (DepDesc &Dep : Deps) { Dep.MDepCommand->addUser(NewCmd.get()); const Requirement *Req = Dep.MDepRequirement; MemObjRecord *Record = getMemObjRecord(Req->MSYCLMemObj); - updateLeaves({Dep.MDepCommand}, Record, Req->MAccessMode, - &CommandsToCleanUp); + updateLeaves({Dep.MDepCommand}, Record, Req->MAccessMode, ToCleanUp); addNodeToLeaves(Record, NewCmd.get(), Req->MAccessMode, ToEnqueue); } - for (Command *Cmd : CommandsToCleanUp) - cleanupCommand(Cmd); // Register all the events as dependencies for (detail::EventImplPtr e : Events) { - if (Command *ConnCmd = NewCmd->addDep(e)) + if (Command *ConnCmd = NewCmd->addDep(e, ToCleanUp)) ToEnqueue.push_back(ConnCmd); } @@ -1008,6 +1025,8 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr CommandGroup, if (MPrintOptionsArray[AfterAddCG]) printGraphAsDot("after_addCG"); + for (Command *Cmd : ToCleanUp) + cleanupCommand(Cmd); return NewCmd.release(); } @@ -1148,14 +1167,6 @@ void Scheduler::GraphBuilder::cleanupCommand(Command *Cmd) { } #endif - // TODO enable cleaning up host tasks after enqueue. 
- if (CmdT == Command::RUN_CG) { - auto *ExecCGCmd = static_cast(Cmd); - if (ExecCGCmd->getCG().getType() == CG::CGTYPE::CodeplayHostTask) { - return; - } - } - for (Command *UserCmd : Cmd->MUsers) { for (DepDesc &Dep : UserCmd->MDeps) { // Link the users of the command to the alloca command(s) instead @@ -1175,6 +1186,7 @@ void Scheduler::GraphBuilder::cleanupCommand(Command *Cmd) { Command *DepCmd = Dep.MDepCommand; DepCmd->MUsers.erase(Cmd); } + Cmd->getEvent()->setCommand(nullptr); Cmd->getEvent()->cleanupDependencyEvents(); delete Cmd; @@ -1266,9 +1278,9 @@ void Scheduler::GraphBuilder::removeRecordForMemObj(SYCLMemObjI *MemObject) { // requirement in Dep we make ConnectCmd depend on DepEvent's command with this // requirement. // Optionality of Dep is set by Dep.MDepCommand equal to nullptr. -Command *Scheduler::GraphBuilder::connectDepEvent(Command *const Cmd, - EventImplPtr DepEvent, - const DepDesc &Dep) { +Command *Scheduler::GraphBuilder::connectDepEvent( + Command *const Cmd, EventImplPtr DepEvent, const DepDesc &Dep, + std::vector &ToCleanUp) { assert(Cmd->getWorkerContext() != DepEvent->getContextImpl()); // construct Host Task type command manually and make it depend on DepEvent @@ -1297,14 +1309,15 @@ Command *Scheduler::GraphBuilder::connectDepEvent(Command *const Cmd, // make ConnectCmd depend on requirement // Dismiss the result here as it's not a connection now, // 'cause ConnectCmd is host one - (void)ConnectCmd->addDep(Dep); + (void)ConnectCmd->addDep(Dep, ToCleanUp); assert(reinterpret_cast(DepEvent->getCommand()) == Dep.MDepCommand); // add user to Dep.MDepCommand is already performed beyond this if branch MemObjRecord *Record = getMemObjRecord(Dep.MDepRequirement->MSYCLMemObj); + updateLeaves({Dep.MDepCommand}, Record, Dep.MDepRequirement->MAccessMode, + ToCleanUp); - updateLeaves({Dep.MDepCommand}, Record, Dep.MDepRequirement->MAccessMode); std::vector ToEnqueue; addNodeToLeaves(Record, ConnectCmd, Dep.MDepRequirement->MAccessMode, 
ToEnqueue); @@ -1324,7 +1337,7 @@ Command *Scheduler::GraphBuilder::connectDepEvent(Command *const Cmd, // Dismiss the result here as it's not a connection now, // 'cause EmptyCmd is host one - (void)Cmd->addDep(CmdDep); + (void)Cmd->addDep(CmdDep, ToCleanUp); } } else { std::vector ToEnqueue; @@ -1337,13 +1350,13 @@ Command *Scheduler::GraphBuilder::connectDepEvent(Command *const Cmd, // ConnectCmd via its event. // Dismiss the result here as it's not a connection now, // 'cause ConnectCmd is host one. - (void)EmptyCmd->addDep(ConnectCmd->getEvent()); - (void)ConnectCmd->addDep(DepEvent); + (void)EmptyCmd->addDep(ConnectCmd->getEvent(), ToCleanUp); + (void)ConnectCmd->addDep(DepEvent, ToCleanUp); // Depend Cmd on empty command // Dismiss the result here as it's not a connection now, // 'cause EmptyCmd is host one - (void)Cmd->addDep(EmptyCmd->getEvent()); + (void)Cmd->addDep(EmptyCmd->getEvent(), ToCleanUp); } EmptyCmd->addUser(Cmd); diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index e7c11e6e8a814..30bb83a711c25 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -553,17 +553,20 @@ class Scheduler { /// Removes commands from leaves. void updateLeaves(const std::set &Cmds, MemObjRecord *Record, access::mode AccessMode, - std::vector *CommandsToCleanUp = nullptr); + std::vector &ToCleanUp); /// Perform connection of events in multiple contexts /// \param Cmd dependant command /// \param DepEvent event to depend on /// \param Dep optional DepDesc to perform connection properly + /// \param ToCleanUp container for commands that can be cleaned up due to + /// their removal from leaves /// \returns the connecting command which is to be enqueued /// /// Optionality of Dep is set by Dep.MDepCommand equal to nullptr. 
Command *connectDepEvent(Command *const Cmd, EventImplPtr DepEvent, - const DepDesc &Dep); + const DepDesc &Dep, + std::vector &ToCleanUp); std::vector MMemObjs; diff --git a/sycl/unittests/scheduler/BlockedCommands.cpp b/sycl/unittests/scheduler/BlockedCommands.cpp index 9ff6fce1460ef..c45b785980c8b 100644 --- a/sycl/unittests/scheduler/BlockedCommands.cpp +++ b/sycl/unittests/scheduler/BlockedCommands.cpp @@ -154,7 +154,8 @@ TEST_F(SchedulerTest, EnqueueHostDependency) { new cl::sycl::detail::event_impl(detail::getSyclObjImpl(MQueue))}; DepEvent->setCommand(&B); - (void)A.addDep(DepEvent); + std::vector ToCleanUp; + (void)A.addDep(DepEvent, ToCleanUp); // We have such a "graph": // diff --git a/sycl/unittests/scheduler/FailedCommands.cpp b/sycl/unittests/scheduler/FailedCommands.cpp index 37a7a71a4afdc..4d294b8eff741 100644 --- a/sycl/unittests/scheduler/FailedCommands.cpp +++ b/sycl/unittests/scheduler/FailedCommands.cpp @@ -16,7 +16,8 @@ TEST_F(SchedulerTest, FailedDependency) { MockCommand MDep(detail::getSyclObjImpl(MQueue)); MockCommand MUser(detail::getSyclObjImpl(MQueue)); MDep.addUser(&MUser); - (void)MUser.addDep(detail::DepDesc{&MDep, &MockReq, nullptr}); + std::vector ToCleanUp; + (void)MUser.addDep(detail::DepDesc{&MDep, &MockReq, nullptr}, ToCleanUp); MUser.MEnqueueStatus = detail::EnqueueResultT::SyclEnqueueReady; MDep.MEnqueueStatus = detail::EnqueueResultT::SyclEnqueueFailed; diff --git a/sycl/unittests/scheduler/LeafLimit.cpp b/sycl/unittests/scheduler/LeafLimit.cpp index d13e901d12314..333a06dca5bd3 100644 --- a/sycl/unittests/scheduler/LeafLimit.cpp +++ b/sycl/unittests/scheduler/LeafLimit.cpp @@ -52,10 +52,12 @@ TEST_F(SchedulerTest, LeafLimit) { std::make_unique(detail::getSyclObjImpl(MQueue), MockReq)); } // Create edges: all soon-to-be leaves are direct users of MockDep + std::vector ToCleanUp; for (auto &Leaf : LeavesToAdd) { MockDepCmd->addUser(Leaf.get()); (void)Leaf->addDep( - detail::DepDesc{MockDepCmd.get(), Leaf->getRequirement(), 
nullptr}); + detail::DepDesc{MockDepCmd.get(), Leaf->getRequirement(), nullptr}, + ToCleanUp); } std::vector ToEnqueue; // Add edges as leaves and exceed the leaf limit diff --git a/sycl/unittests/scheduler/utils.cpp b/sycl/unittests/scheduler/utils.cpp index b6bb23b4325d8..373b4572ecfb4 100644 --- a/sycl/unittests/scheduler/utils.cpp +++ b/sycl/unittests/scheduler/utils.cpp @@ -10,8 +10,10 @@ void addEdge(cl::sycl::detail::Command *User, cl::sycl::detail::Command *Dep, cl::sycl::detail::AllocaCommandBase *Alloca) { + std::vector ToCleanUp; (void)User->addDep( - cl::sycl::detail::DepDesc{Dep, User->getRequirement(), Alloca}); + cl::sycl::detail::DepDesc{Dep, User->getRequirement(), Alloca}, + ToCleanUp); Dep->addUser(User); } From 2c1e93994874246aaf76fd23c9c8b01ba791a5a6 Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Wed, 15 Dec 2021 18:30:53 +0300 Subject: [PATCH 13/23] Minor non-functional updates --- sycl/doc/EnvironmentVariables.md | 1 + sycl/source/detail/config.def | 2 +- sycl/source/detail/scheduler/commands.cpp | 11 ++-- sycl/source/detail/scheduler/commands.hpp | 8 ++- .../source/detail/scheduler/graph_builder.cpp | 2 - .../detail/scheduler/graph_processor.cpp | 18 +++--- sycl/source/detail/scheduler/scheduler.cpp | 64 +++++++++---------- sycl/source/detail/scheduler/scheduler.hpp | 8 ++- .../scheduler/SchedulerTestUtils.hpp | 8 +-- 9 files changed, 63 insertions(+), 59 deletions(-) diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md index 6de220275d116..0f6f4edb774da 100644 --- a/sycl/doc/EnvironmentVariables.md +++ b/sycl/doc/EnvironmentVariables.md @@ -92,6 +92,7 @@ variables in production code. | `SYCL_DEVICELIB_NO_FALLBACK` | Any(\*) | Disable loading and linking of device library images | | `SYCL_PRINT_EXECUTION_GRAPH` | Described [below](#sycl_print_execution_graph-options) | Print execution graph to DOT text file. 
| | `SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP` | Any(\*) | Disable cleanup of finished command nodes at host-device synchronization points. | +| `SYCL_DISABLE_POST_ENQUEUE_CLEANUP` | Any(\*) | Disable cleanup of enqueued command nodes. | | `SYCL_THROW_ON_BLOCK` | Any(\*) | Throw an exception on attempt to wait for a blocked command. | | `SYCL_DEVICELIB_INHIBIT_NATIVE` | String of device library extensions (separated by a whitespace) | Do not rely on device native support for devicelib extensions listed in this option. | | `SYCL_PROGRAM_COMPILE_OPTIONS` | String of valid OpenCL compile options | Override compile options for all programs. | diff --git a/sycl/source/detail/config.def b/sycl/source/detail/config.def index 0ed2d45d0b16c..95750092bfb2e 100644 --- a/sycl/source/detail/config.def +++ b/sycl/source/detail/config.def @@ -12,6 +12,7 @@ CONFIG(SYCL_PRINT_EXECUTION_GRAPH, 32, __SYCL_PRINT_EXECUTION_GRAPH) CONFIG(SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP, 1, __SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP) +CONFIG(SYCL_DISABLE_POST_ENQUEUE_CLEANUP, 1, __SYCL_DISABLE_POST_ENQUEUE_CLEANUP) CONFIG(SYCL_DEVICE_ALLOWLIST, 1024, __SYCL_DEVICE_ALLOWLIST) CONFIG(SYCL_BE, 16, __SYCL_BE) CONFIG(SYCL_PI_TRACE, 16, __SYCL_PI_TRACE) @@ -36,4 +37,3 @@ CONFIG(SYCL_CACHE_MAX_DEVICE_IMAGE_SIZE, 16, __SYCL_CACHE_MAX_DEVICE_IMAGE_SIZE) CONFIG(INTEL_ENABLE_OFFLOAD_ANNOTATIONS, 1, __SYCL_INTEL_ENABLE_OFFLOAD_ANNOTATIONS) CONFIG(SYCL_ENABLE_DEFAULT_CONTEXTS, 1, __SYCL_ENABLE_DEFAULT_CONTEXTS) CONFIG(SYCL_QUEUE_THREAD_POOL_SIZE, 4, __SYCL_QUEUE_THREAD_POOL_SIZE) -CONFIG(SYCL_DISABLE_POST_ENQUEUE_CLEANUP, 1, __SYCL_DISABLE_POST_ENQUEUE_CLEANUP) diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index bbcbd84fcb997..1527862ae46db 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -267,7 +267,7 @@ class DispatchHostTask { // of empty command. 
// Also, it's possible to have record deallocated prior to enqueue process. // Thus we employ read-lock of graph. - std::vector CmdsToCleanUp; + std::vector ToCleanUp; Scheduler &Sched = Scheduler::getInstance(); { Scheduler::ReadLockT Lock(Sched.MGraphLock); @@ -280,10 +280,9 @@ class DispatchHostTask { EmptyCmd->MEnqueueStatus = EnqueueResultT::SyclEnqueueReady; for (const DepDesc &Dep : Deps) - Scheduler::enqueueLeavesOfReqUnlocked(Dep.MDepRequirement, - CmdsToCleanUp); + Scheduler::enqueueLeavesOfReqUnlocked(Dep.MDepRequirement, ToCleanUp); } - Sched.cleanupCommands(CmdsToCleanUp); + Sched.cleanupCommands(ToCleanUp); } }; @@ -635,7 +634,7 @@ void Command::emitInstrumentation(uint16_t Type, const char *Txt) { } bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking, - std::vector &CmdsToCleanUp) { + std::vector &ToCleanUp) { // Exit if already enqueued if (MEnqueueStatus == EnqueueResultT::SyclEnqueueSuccess) return true; @@ -708,7 +707,7 @@ bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking, !SYCLConfig::get()) { assert(!MPostEnqueueCleanup); MPostEnqueueCleanup = true; - CmdsToCleanUp.push_back(this); + ToCleanUp.push_back(this); } } diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index d0bf97a2b73b9..dee4c4972f788 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -107,10 +107,14 @@ class Command { Command(CommandType Type, QueueImplPtr Queue); + /// \param NewDep dependency to be added + /// \param ToCleanUp container for commands that can be cleaned up. /// \return an optional connection cmd to enqueue [[nodiscard]] Command *addDep(DepDesc NewDep, std::vector &ToCleanUp); + /// \param NewDep dependency to be added + /// \param ToCleanUp container for commands that can be cleaned up. 
/// \return an optional connection cmd to enqueue [[nodiscard]] Command *addDep(EventImplPtr Event, std::vector &ToCleanUp); @@ -125,9 +129,10 @@ class Command { /// \param EnqueueResult is set to the specific status if enqueue failed. /// \param Blocking if this argument is true, function will wait for the /// command to be unblocked before calling enqueueImp. + /// \param ToCleanUp container for commands that can be cleaned up. /// \return true if the command is enqueued. virtual bool enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking, - std::vector &CmdsToCleanUp); + std::vector &ToCleanUp); bool isFinished(); @@ -218,6 +223,7 @@ class Command { /// Perform glueing of events from different contexts /// \param DepEvent event this commands should depend on /// \param Dep optional DepDesc to perform connection of events properly + /// \param ToCleanUp is a /// \return returns an optional connection command to enqueue /// /// Glueing (i.e. connecting) will be performed if and only if DepEvent is diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 321a8e7a8ff82..2de8e6fe2dc52 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -1000,8 +1000,6 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr CommandGroup, // Node dependencies can be modified further when adding the node to leaves, // iterate over their copy. // FIXME employ a reference here to eliminate copying of a vector - // Updating leaves might also clean up some of the dep commands, so update - // their users first. 
std::vector Deps = NewCmd->MDeps; for (DepDesc &Dep : Deps) { Dep.MDepCommand->addUser(NewCmd.get()); diff --git a/sycl/source/detail/scheduler/graph_processor.cpp b/sycl/source/detail/scheduler/graph_processor.cpp index 7c49697defe85..6e533df30a09c 100644 --- a/sycl/source/detail/scheduler/graph_processor.cpp +++ b/sycl/source/detail/scheduler/graph_processor.cpp @@ -21,9 +21,10 @@ static Command *getCommand(const EventImplPtr &Event) { return (Command *)Event->getCommand(); } -void Scheduler::GraphProcessor::waitForEvent( - EventImplPtr Event, ReadLockT &GraphReadLock, - std::vector &CmdsToCleanUp, bool LockTheLock) { +void Scheduler::GraphProcessor::waitForEvent(EventImplPtr Event, + ReadLockT &GraphReadLock, + std::vector &ToCleanUp, + bool LockTheLock) { Command *Cmd = getCommand(Event); // Command can be nullptr if user creates cl::sycl::event explicitly or the // event has been waited on by another thread @@ -31,7 +32,7 @@ void Scheduler::GraphProcessor::waitForEvent( return; EnqueueResultT Res; - bool Enqueued = enqueueCommand(Cmd, Res, CmdsToCleanUp, BLOCKING); + bool Enqueued = enqueueCommand(Cmd, Res, ToCleanUp, BLOCKING); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) // TODO: Reschedule commands. throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION); @@ -47,7 +48,7 @@ void Scheduler::GraphProcessor::waitForEvent( bool Scheduler::GraphProcessor::enqueueCommand( Command *Cmd, EnqueueResultT &EnqueueResult, - std::vector &CmdsToCleanUp, BlockingT Blocking) { + std::vector &ToCleanUp, BlockingT Blocking) { if (!Cmd || Cmd->isSuccessfullyEnqueued()) return true; @@ -60,8 +61,7 @@ bool Scheduler::GraphProcessor::enqueueCommand( // Recursively enqueue all the dependencies first and // exit immediately if any of the commands cannot be enqueued. 
for (DepDesc &Dep : Cmd->MDeps) { - if (!enqueueCommand(Dep.MDepCommand, EnqueueResult, CmdsToCleanUp, - Blocking)) + if (!enqueueCommand(Dep.MDepCommand, EnqueueResult, ToCleanUp, Blocking)) return false; } @@ -77,7 +77,7 @@ bool Scheduler::GraphProcessor::enqueueCommand( // implemented. for (const EventImplPtr &Event : Cmd->getPreparedHostDepsEvents()) { if (Command *DepCmd = static_cast(Event->getCommand())) - if (!enqueueCommand(DepCmd, EnqueueResult, CmdsToCleanUp, Blocking)) + if (!enqueueCommand(DepCmd, EnqueueResult, ToCleanUp, Blocking)) return false; } @@ -94,7 +94,7 @@ bool Scheduler::GraphProcessor::enqueueCommand( // on completion of C and starts cleanup process. This thread is still in the // middle of enqueue of B. The other thread modifies dependency list of A by // removing C out of it. Iterators become invalid. - return Cmd->enqueue(EnqueueResult, Blocking, CmdsToCleanUp); + return Cmd->enqueue(EnqueueResult, Blocking, ToCleanUp); } } // namespace detail diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index c4fef67665bc2..ea118470f9415 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -32,33 +32,32 @@ void Scheduler::waitForRecordToFinish(MemObjRecord *Record, // Will contain the list of dependencies for the Release Command std::set DepCommands; #endif - std::vector CmdsToCleanUp; + std::vector ToCleanUp; for (Command *Cmd : Record->MReadLeaves) { EnqueueResultT Res; - bool Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, CmdsToCleanUp); + bool Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, ToCleanUp); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION); #ifdef XPTI_ENABLE_INSTRUMENTATION // Capture the dependencies DepCommands.insert(Cmd); #endif - GraphProcessor::waitForEvent(Cmd->getEvent(), GraphReadLock, CmdsToCleanUp); + 
GraphProcessor::waitForEvent(Cmd->getEvent(), GraphReadLock, ToCleanUp); } for (Command *Cmd : Record->MWriteLeaves) { EnqueueResultT Res; - bool Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, CmdsToCleanUp); + bool Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, ToCleanUp); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION); #ifdef XPTI_ENABLE_INSTRUMENTATION DepCommands.insert(Cmd); #endif - GraphProcessor::waitForEvent(Cmd->getEvent(), GraphReadLock, CmdsToCleanUp); + GraphProcessor::waitForEvent(Cmd->getEvent(), GraphReadLock, ToCleanUp); } for (AllocaCommandBase *AllocaCmd : Record->MAllocaCommands) { Command *ReleaseCmd = AllocaCmd->getReleaseCmd(); EnqueueResultT Res; - bool Enqueued = - GraphProcessor::enqueueCommand(ReleaseCmd, Res, CmdsToCleanUp); + bool Enqueued = GraphProcessor::enqueueCommand(ReleaseCmd, Res, ToCleanUp); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION); #ifdef XPTI_ENABLE_INSTRUMENTATION @@ -67,7 +66,7 @@ void Scheduler::waitForRecordToFinish(MemObjRecord *Record, ReleaseCmd->resolveReleaseDependencies(DepCommands); #endif GraphProcessor::waitForEvent(ReleaseCmd->getEvent(), GraphReadLock, - CmdsToCleanUp); + ToCleanUp); } } @@ -111,7 +110,7 @@ EventImplPtr Scheduler::addCG(std::unique_ptr CommandGroup, NewEvent = NewCmd->getEvent(); } - std::vector CmdsToCleanUp; + std::vector ToCleanUp; { ReadLockT Lock(MGraphLock); @@ -131,7 +130,7 @@ EventImplPtr Scheduler::addCG(std::unique_ptr CommandGroup, }; for (Command *Cmd : AuxiliaryCmds) { - Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, CmdsToCleanUp); + Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, ToCleanUp); try { if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Auxiliary enqueue process failed.", @@ -148,8 +147,7 @@ EventImplPtr 
Scheduler::addCG(std::unique_ptr CommandGroup, // TODO: Check if lazy mode. EnqueueResultT Res; try { - bool Enqueued = - GraphProcessor::enqueueCommand(NewCmd, Res, CmdsToCleanUp); + bool Enqueued = GraphProcessor::enqueueCommand(NewCmd, Res, ToCleanUp); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION); } catch (...) { @@ -166,7 +164,7 @@ EventImplPtr Scheduler::addCG(std::unique_ptr CommandGroup, CleanUp(); } } - cleanupCommands(CmdsToCleanUp); + cleanupCommands(ToCleanUp); for (auto StreamImplPtr : Streams) { StreamImplPtr->flush(); @@ -188,26 +186,26 @@ EventImplPtr Scheduler::addCopyBack(Requirement *Req) { return nullptr; } - std::vector CmdsToCleanUp; + std::vector ToCleanUp; try { ReadLockT Lock(MGraphLock); EnqueueResultT Res; bool Enqueued; for (Command *Cmd : AuxiliaryCmds) { - Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, CmdsToCleanUp); + Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, ToCleanUp); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION); } - Enqueued = GraphProcessor::enqueueCommand(NewCmd, Res, CmdsToCleanUp); + Enqueued = GraphProcessor::enqueueCommand(NewCmd, Res, ToCleanUp); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION); } catch (...) 
{ NewCmd->getQueue()->reportAsyncException(std::current_exception()); } EventImplPtr NewEvent = NewCmd->getEvent(); - cleanupCommands(CmdsToCleanUp); + cleanupCommands(ToCleanUp); return NewEvent; } @@ -219,10 +217,10 @@ void Scheduler::waitForEvent(EventImplPtr Event) { ReadLockT Lock(MGraphLock); // It's fine to leave the lock unlocked upon return from waitForEvent as // there's no more actions to do here with graph - std::vector CmdsToCleanUp; - GraphProcessor::waitForEvent(std::move(Event), Lock, CmdsToCleanUp, + std::vector ToCleanUp; + GraphProcessor::waitForEvent(std::move(Event), Lock, ToCleanUp, /*LockTheLock=*/false); - cleanupCommands(CmdsToCleanUp); + cleanupCommands(ToCleanUp); } static void deallocateStreams( @@ -305,32 +303,32 @@ EventImplPtr Scheduler::addHostAccessor(Requirement *Req) { if (!NewCmd) return nullptr; - std::vector CmdsToCleanUp; + std::vector ToCleanUp; { ReadLockT ReadLock(MGraphLock); EnqueueResultT Res; bool Enqueued; for (Command *Cmd : AuxiliaryCmds) { - Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, CmdsToCleanUp); + Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, ToCleanUp); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION); } - Enqueued = GraphProcessor::enqueueCommand(NewCmd, Res, CmdsToCleanUp); + Enqueued = GraphProcessor::enqueueCommand(NewCmd, Res, ToCleanUp); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION); } EventImplPtr NewEvent = NewCmd->getEvent(); - cleanupCommands(CmdsToCleanUp); + cleanupCommands(ToCleanUp); return NewEvent; } void Scheduler::releaseHostAccessor(Requirement *Req) { Command *const BlockedCmd = Req->MBlockedCmd; - std::vector CmdsToCleanUp; + std::vector ToCleanUp; { ReadLockT Lock(MGraphLock); @@ -338,20 +336,20 @@ void Scheduler::releaseHostAccessor(Requirement *Req) { BlockedCmd->MEnqueueStatus = 
EnqueueResultT::SyclEnqueueReady; - enqueueLeavesOfReqUnlocked(Req, CmdsToCleanUp); + enqueueLeavesOfReqUnlocked(Req, ToCleanUp); } - cleanupCommands(CmdsToCleanUp); + cleanupCommands(ToCleanUp); } // static -void Scheduler::enqueueLeavesOfReqUnlocked( - const Requirement *const Req, std::vector &CmdsToCleanUp) { +void Scheduler::enqueueLeavesOfReqUnlocked(const Requirement *const Req, + std::vector &ToCleanUp) { // FIXME handle this as well MemObjRecord *Record = Req->MSYCLMemObj->MRecord.get(); - auto EnqueueLeaves = [&CmdsToCleanUp](LeavesCollection &Leaves) { + auto EnqueueLeaves = [&ToCleanUp](LeavesCollection &Leaves) { for (Command *Cmd : Leaves) { EnqueueResultT Res; - bool Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, CmdsToCleanUp); + bool Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, ToCleanUp); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION); } @@ -435,8 +433,8 @@ void Scheduler::cleanupCommands(const std::vector &Cmds) { if (Cmds.empty()) return; WriteLockT Lock(MGraphLock, std::try_to_lock); - // In order to avoid deadlocks related to block commands, defer cleanup if the - // lock wasn't acquired. + // In order to avoid deadlocks related to blocked commands, defer cleanup if + // the lock wasn't acquired. if (Lock.owns_lock()) { for (Command *Cmd : Cmds) { MGraphBuilder.cleanupCommand(Cmd); diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index 30bb83a711c25..63e2cccaabfd2 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -462,7 +462,7 @@ class Scheduler { void cleanupCommands(const std::vector &Cmds); static void enqueueLeavesOfReqUnlocked(const Requirement *const Req, - std::vector &CmdsToCleanUp); + std::vector &ToCleanUp); /// Graph builder class. 
/// @@ -729,23 +729,25 @@ class Scheduler { public: /// Waits for the command, associated with Event passed, is completed. /// \param GraphReadLock read-lock which is already acquired for reading + /// \param ToCleanUp container for commands that can be cleaned up. /// \param LockTheLock selects if graph lock should be locked upon return /// /// The function may unlock and lock GraphReadLock as needed. Upon return /// the lock is left in locked state if and only if LockTheLock is true. static void waitForEvent(EventImplPtr Event, ReadLockT &GraphReadLock, - std::vector &EnqueueCommands, + std::vector &ToCleanUp, bool LockTheLock = true); /// Enqueues the command and all its dependencies. /// /// \param EnqueueResult is set to specific status if enqueue failed. + /// \param ToCleanUp container for commands that can be cleaned up. /// \return true if the command is successfully enqueued. /// /// The function may unlock and lock GraphReadLock as needed. Upon return /// the lock is left in locked state. 
static bool enqueueCommand(Command *Cmd, EnqueueResultT &EnqueueResult, - std::vector &CmdsToCleanUp, + std::vector &ToCleanUp, BlockingT Blocking = NON_BLOCKING); }; diff --git a/sycl/unittests/scheduler/SchedulerTestUtils.hpp b/sycl/unittests/scheduler/SchedulerTestUtils.hpp index 8cc3da53ed4f2..7c012fa8893cc 100644 --- a/sycl/unittests/scheduler/SchedulerTestUtils.hpp +++ b/sycl/unittests/scheduler/SchedulerTestUtils.hpp @@ -68,9 +68,9 @@ class MockCommand : public cl::sycl::detail::Command { std::vector &)); bool enqueueOrigin(cl::sycl::detail::EnqueueResultT &EnqueueResult, cl::sycl::detail::BlockingT Blocking, - std::vector &CmdsToCleanUp) { + std::vector &ToCleanUp) { return cl::sycl::detail::Command::enqueue(EnqueueResult, Blocking, - CmdsToCleanUp); + ToCleanUp); } cl_int MRetVal = CL_SUCCESS; @@ -130,8 +130,8 @@ class MockScheduler : public cl::sycl::detail::Scheduler { static bool enqueueCommand(cl::sycl::detail::Command *Cmd, cl::sycl::detail::EnqueueResultT &EnqueueResult, cl::sycl::detail::BlockingT Blocking) { - std::vector CmdsToCleanUp; - return GraphProcessor::enqueueCommand(Cmd, EnqueueResult, CmdsToCleanUp, + std::vector ToCleanUp; + return GraphProcessor::enqueueCommand(Cmd, EnqueueResult, ToCleanUp, Blocking); } From 140fce79bfb95b9965bd4483b474bc435e232987 Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Wed, 15 Dec 2021 19:08:01 +0300 Subject: [PATCH 14/23] Adjust queue flushing unit test to the new changes --- sycl/unittests/scheduler/QueueFlushing.cpp | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/sycl/unittests/scheduler/QueueFlushing.cpp b/sycl/unittests/scheduler/QueueFlushing.cpp index 14e622dcdeb43..8078056803e7d 100644 --- a/sycl/unittests/scheduler/QueueFlushing.cpp +++ b/sycl/unittests/scheduler/QueueFlushing.cpp @@ -98,8 +98,9 @@ static void addDepAndEnqueue(detail::Command *Cmd, detail::QueueImplPtr &DepQueue, detail::Requirement &MockReq) { MockCommand DepCmd(DepQueue); + std::vector 
ToCleanUp; DepCmd.getEvent()->getHandleRef() = reinterpret_cast(new int{}); - (void)Cmd->addDep(detail::DepDesc{&DepCmd, &MockReq, nullptr}); + (void)Cmd->addDep(detail::DepDesc{&DepCmd, &MockReq, nullptr}, ToCleanUp); detail::EnqueueResultT Res; MockScheduler::enqueueCommand(Cmd, Res, detail::NON_BLOCKING); @@ -159,6 +160,7 @@ TEST_F(SchedulerTest, QueueFlushing) { detail::AllocaCommand AllocaCmd = detail::AllocaCommand(QueueImplA, MockReq); void *MockHostPtr; detail::EnqueueResultT Res; + std::vector ToCleanUp; // Check that each of the non-blocking commands flush the dependency queue { @@ -194,7 +196,7 @@ TEST_F(SchedulerTest, QueueFlushing) { /*Events*/ {})}; detail::ExecCGCommand ExecCGCmd{std::move(CG), QueueImplA}; MockReq.MDims = 1; - (void)ExecCGCmd.addDep(detail::DepDesc(&AllocaCmd, &MockReq, &AllocaCmd)); + (void)ExecCGCmd.addDep(detail::DepDesc(&AllocaCmd, &MockReq, &AllocaCmd), ToCleanUp); testCommandEnqueue(&ExecCGCmd, QueueImplB, MockReq); } @@ -206,7 +208,7 @@ TEST_F(SchedulerTest, QueueFlushing) { detail::EventImplPtr DepEvent{new detail::event_impl(QueueImplB)}; DepEvent->setContextImpl(QueueImplB->getContextImplPtr()); DepEvent->getHandleRef() = reinterpret_cast(new int{}); - (void)Cmd.addDep(DepEvent); + (void)Cmd.addDep(DepEvent, ToCleanUp); MockScheduler::enqueueCommand(&Cmd, Res, detail::NON_BLOCKING); EXPECT_TRUE(QueueFlushed); } @@ -224,7 +226,7 @@ TEST_F(SchedulerTest, QueueFlushing) { DepEvent->setContextImpl(TempQueueImpl->getContextImplPtr()); DepEvent->getHandleRef() = reinterpret_cast(new int{}); } - (void)Cmd.addDep(DepEvent); + (void)Cmd.addDep(DepEvent, ToCleanUp); MockScheduler::enqueueCommand(&Cmd, Res, detail::NON_BLOCKING); EXPECT_FALSE(EventStatusQueried); EXPECT_FALSE(QueueFlushed); @@ -244,10 +246,10 @@ TEST_F(SchedulerTest, QueueFlushing) { access::mode::read_write}; MockCommand DepCmdA(QueueImplB); DepCmdA.getEvent()->getHandleRef() = reinterpret_cast(new int{}); - (void)Cmd.addDep(detail::DepDesc{&DepCmdA, &MockReq, 
nullptr}); + (void)Cmd.addDep(detail::DepDesc{&DepCmdA, &MockReq, nullptr}, ToCleanUp); MockCommand DepCmdB(QueueImplB); DepCmdB.getEvent()->getHandleRef() = reinterpret_cast(new int{}); - (void)Cmd.addDep(detail::DepDesc{&DepCmdB, &MockReq, nullptr}); + (void)Cmd.addDep(detail::DepDesc{&DepCmdB, &MockReq, nullptr}, ToCleanUp); // The check is performed in redefinedQueueFlush MockScheduler::enqueueCommand(&Cmd, Res, detail::NON_BLOCKING); } @@ -259,13 +261,13 @@ TEST_F(SchedulerTest, QueueFlushing) { access::mode::read_write}; MockCommand DepCmd(QueueImplB); DepCmd.getEvent()->getHandleRef() = reinterpret_cast(new int{}); - (void)CmdA.addDep(detail::DepDesc{&DepCmd, &MockReq, nullptr}); + (void)CmdA.addDep(detail::DepDesc{&DepCmd, &MockReq, nullptr}, ToCleanUp); MockScheduler::enqueueCommand(&CmdA, Res, detail::NON_BLOCKING); EventStatusQueried = false; detail::MapMemObject CmdB{&AllocaCmd, MockReq, &MockHostPtr, QueueImplA, access::mode::read_write}; - (void)CmdB.addDep(detail::DepDesc{&DepCmd, &MockReq, nullptr}); + (void)CmdB.addDep(detail::DepDesc{&DepCmd, &MockReq, nullptr}, ToCleanUp); MockScheduler::enqueueCommand(&CmdB, Res, detail::NON_BLOCKING); EXPECT_FALSE(EventStatusQueried); } From b4e94639f6aab93f6ca1356f699470ea31c00879 Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Wed, 15 Dec 2021 19:11:32 +0300 Subject: [PATCH 15/23] Apply clang-format --- sycl/unittests/scheduler/QueueFlushing.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sycl/unittests/scheduler/QueueFlushing.cpp b/sycl/unittests/scheduler/QueueFlushing.cpp index 8078056803e7d..194d7c14fa59a 100644 --- a/sycl/unittests/scheduler/QueueFlushing.cpp +++ b/sycl/unittests/scheduler/QueueFlushing.cpp @@ -196,7 +196,8 @@ TEST_F(SchedulerTest, QueueFlushing) { /*Events*/ {})}; detail::ExecCGCommand ExecCGCmd{std::move(CG), QueueImplA}; MockReq.MDims = 1; - (void)ExecCGCmd.addDep(detail::DepDesc(&AllocaCmd, &MockReq, &AllocaCmd), ToCleanUp); + 
(void)ExecCGCmd.addDep(detail::DepDesc(&AllocaCmd, &MockReq, &AllocaCmd), + ToCleanUp); testCommandEnqueue(&ExecCGCmd, QueueImplB, MockReq); } From 909a634b3e9bd568000395d63d743a04185eff32 Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Thu, 16 Dec 2021 16:09:39 +0300 Subject: [PATCH 16/23] Remove unnecessary finished command cleanup & cut down on stored events --- sycl/source/detail/event_impl.hpp | 11 +++++++++++ sycl/source/detail/queue_impl.cpp | 19 ++++++++++++++----- sycl/source/detail/scheduler/commands.cpp | 7 ++++++- 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index fb8214ddf6b8d..0e7b896a596f7 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -201,6 +201,11 @@ class event_impl { /// \return true if this event is discarded. bool isDiscarded() const { return MState == HES_Discarded; } + void setNeedsCleanupAfterWait(bool NeedsCleanupAfterWait) { + MNeedsCleanupAfterWait = NeedsCleanupAfterWait; + } + bool needsCleanupAfterWait() { return MNeedsCleanupAfterWait; } + private: // When instrumentation is enabled emits trace event for event wait begin and // returns the telemetry event generated for the wait @@ -231,6 +236,12 @@ class event_impl { // HostEventState enum. std::atomic MState; + // A temporary workaround for the current limitations of post enqueue graph + // cleanup. Indicates that the command associated with this event isn't + // handled by post enqueue cleanup yet and has to be deleted by cleanup after + // wait. 
+ bool MNeedsCleanupAfterWait = false; + std::mutex MMutex; }; diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index c67645ec36c57..c7bd05481ad2f 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -134,8 +134,8 @@ event queue_impl::mem_advise(const std::shared_ptr &Self, } void queue_impl::addEvent(const event &Event) { - EventImplPtr Eimpl = getSyclObjImpl(Event); - Command *Cmd = (Command *)(Eimpl->getCommand()); + EventImplPtr EImpl = getSyclObjImpl(Event); + Command *Cmd = (Command *)(EImpl->getCommand()); if (!Cmd) { // if there is no command on the event, we cannot track it with MEventsWeak // as that will leave it with no owner. Track in MEventsShared only if we're @@ -146,8 +146,16 @@ void queue_impl::addEvent(const event &Event) { if (is_host() || !MSupportOOO || getPlugin().getBackend() == backend::ext_oneapi_level_zero) addSharedEvent(Event); - } else { - std::weak_ptr EventWeakPtr{Eimpl}; + } + // As long as the queue supports piQueueFinish we only need to store events + // with command nodes in the following cases: + // 1. Unenqueued commands, since they aren't covered by piQueueFinish. + // 2. Kernels with streams, since they are not supported by post enqueue + // cleanup. + // 3. Host tasks, for both reasons. 
+ else if (!is_host() || !MSupportOOO || EImpl->getHandleRef() == nullptr || + EImpl->needsCleanupAfterWait()) { + std::weak_ptr EventWeakPtr{EImpl}; std::lock_guard Lock{MMutex}; MEventsWeak.push_back(std::move(EventWeakPtr)); } @@ -323,7 +331,8 @@ void queue_impl::wait(const detail::code_location &CodeLoc) { for (std::weak_ptr &EventImplWeakPtr : WeakEvents) if (std::shared_ptr EventImplSharedPtr = EventImplWeakPtr.lock()) - EventImplSharedPtr->cleanupCommand(EventImplSharedPtr); + if (EventImplSharedPtr->needsCleanupAfterWait()) + EventImplSharedPtr->cleanupCommand(EventImplSharedPtr); // FIXME these events are stored for level zero until as a workaround, // remove once piEventRelease no longer calls wait on the event in the // plugin. diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 1527862ae46db..d48a0d8f7b300 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -1606,9 +1606,14 @@ ExecCGCommand::ExecCGCommand(std::unique_ptr CommandGroup, QueueImplPtr Queue) : Command(CommandType::RUN_CG, std::move(Queue)), MCommandGroup(std::move(CommandGroup)) { - if (MCommandGroup->getType() == detail::CG::CodeplayHostTask) + if (MCommandGroup->getType() == detail::CG::CodeplayHostTask) { MSubmittedQueue = static_cast(MCommandGroup.get())->MQueue; + MEvent->setNeedsCleanupAfterWait(true); + } else if (MCommandGroup->getType() == CG::CGTYPE::Kernel && + (static_cast(MCommandGroup.get()))->hasStreams()) + MEvent->setNeedsCleanupAfterWait(true); + emitInstrumentationDataProxy(); } From a2c30460e9b990e02a0bc20f0957c8c486b80c34 Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Thu, 16 Dec 2021 16:43:52 +0300 Subject: [PATCH 17/23] Remove Level Zero workaround in queue::wait --- sycl/source/detail/queue_impl.cpp | 44 ++++--------------------------- 1 file changed, 5 insertions(+), 39 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp 
b/sycl/source/detail/queue_impl.cpp index c7bd05481ad2f..6d1c21eb9e7cf 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -74,10 +74,7 @@ event queue_impl::memset(const std::shared_ptr &Self, event ResEvent = prepareUSMEvent(Self, NativeEvent); // Track only if we won't be able to handle it with piQueueFinish. - // FIXME these events are stored for level zero until as a workaround, remove - // once piEventRelease no longer calls wait on the event in the plugin. - if (!MSupportOOO || - getPlugin().getBackend() == backend::ext_oneapi_level_zero) + if (!MSupportOOO) addSharedEvent(ResEvent); return MDiscardEvents ? createDiscardedEvent() : ResEvent; } @@ -99,10 +96,7 @@ event queue_impl::memcpy(const std::shared_ptr &Self, event ResEvent = prepareUSMEvent(Self, NativeEvent); // Track only if we won't be able to handle it with piQueueFinish. - // FIXME these events are stored for level zero until as a workaround, remove - // once piEventRelease no longer calls wait on the event in the plugin. - if (!MSupportOOO || - getPlugin().getBackend() == backend::ext_oneapi_level_zero) + if (!MSupportOOO) addSharedEvent(ResEvent); return MDiscardEvents ? createDiscardedEvent() : ResEvent; } @@ -125,10 +119,7 @@ event queue_impl::mem_advise(const std::shared_ptr &Self, event ResEvent = prepareUSMEvent(Self, NativeEvent); // Track only if we won't be able to handle it with piQueueFinish. - // FIXME these events are stored for level zero until as a workaround, remove - // once piEventRelease no longer calls wait on the event in the plugin. - if (!MSupportOOO || - getPlugin().getBackend() == backend::ext_oneapi_level_zero) + if (!MSupportOOO) addSharedEvent(ResEvent); return MDiscardEvents ? createDiscardedEvent() : ResEvent; } @@ -140,11 +131,7 @@ void queue_impl::addEvent(const event &Event) { // if there is no command on the event, we cannot track it with MEventsWeak // as that will leave it with no owner. 
Track in MEventsShared only if we're // unable to call piQueueFinish during wait. - // FIXME these events are stored for level zero until as a workaround, - // remove once piEventRelease no longer calls wait on the event in the - // plugin. - if (is_host() || !MSupportOOO || - getPlugin().getBackend() == backend::ext_oneapi_level_zero) + if (is_host() || !MSupportOOO) addSharedEvent(Event); } // As long as the queue supports piQueueFinish we only need to store events @@ -165,10 +152,7 @@ void queue_impl::addEvent(const event &Event) { /// but some events have no other owner. In this case, /// addSharedEvent will have the queue track the events via a shared pointer. void queue_impl::addSharedEvent(const event &Event) { - // FIXME The assertion should be corrected once the Level Zero workaround is - // removed. - assert(is_host() || !MSupportOOO || - getPlugin().getBackend() == backend::ext_oneapi_level_zero); + assert(is_host() || !MSupportOOO); std::lock_guard Lock(MMutex); // Events stored in MEventsShared are not released anywhere else aside from // calls to queue::wait/wait_and_throw, which a user application might not @@ -301,17 +285,6 @@ void queue_impl::wait(const detail::code_location &CodeLoc) { // directly. Otherwise, only wait for unenqueued or host task events, starting // from the latest submitted task in order to minimize total amount of calls, // then handle the rest with piQueueFinish. 
- // TODO the new workflow has worse performance with Level Zero, keep the old - // behavior until this is addressed - if (!is_host() && - getPlugin().getBackend() == backend::ext_oneapi_level_zero) { - for (std::weak_ptr &EventImplWeakPtr : WeakEvents) - if (std::shared_ptr EventImplSharedPtr = - EventImplWeakPtr.lock()) - EventImplSharedPtr->wait(EventImplSharedPtr); - for (event &Event : SharedEvents) - Event.wait(); - } else { bool SupportsPiFinish = !is_host() && MSupportOOO; for (auto EventImplWeakPtrIt = WeakEvents.rbegin(); EventImplWeakPtrIt != WeakEvents.rend(); ++EventImplWeakPtrIt) { @@ -333,12 +306,6 @@ void queue_impl::wait(const detail::code_location &CodeLoc) { EventImplWeakPtr.lock()) if (EventImplSharedPtr->needsCleanupAfterWait()) EventImplSharedPtr->cleanupCommand(EventImplSharedPtr); - // FIXME these events are stored for level zero until as a workaround, - // remove once piEventRelease no longer calls wait on the event in the - // plugin. - if (Plugin.getBackend() == backend::ext_oneapi_level_zero) { - SharedEvents.clear(); - } assert(SharedEvents.empty() && "Queues that support calling piQueueFinish " "shouldn't have shared events"); @@ -346,7 +313,6 @@ void queue_impl::wait(const detail::code_location &CodeLoc) { for (event &Event : SharedEvents) Event.wait(); } - } #ifdef XPTI_ENABLE_INSTRUMENTATION instrumentationEpilog(TelemetryEvent, Name, StreamID, IId); #endif From 271210d43767422557e97672cc96d1d5da7bd847 Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Thu, 16 Dec 2021 17:02:17 +0300 Subject: [PATCH 18/23] Apply clang-format --- sycl/source/detail/queue_impl.cpp | 50 +++++++++++++++---------------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 6d1c21eb9e7cf..96107333588ae 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -285,34 +285,32 @@ void queue_impl::wait(const detail::code_location &CodeLoc) { 
// directly. Otherwise, only wait for unenqueued or host task events, starting // from the latest submitted task in order to minimize total amount of calls, // then handle the rest with piQueueFinish. - bool SupportsPiFinish = !is_host() && MSupportOOO; - for (auto EventImplWeakPtrIt = WeakEvents.rbegin(); - EventImplWeakPtrIt != WeakEvents.rend(); ++EventImplWeakPtrIt) { - if (std::shared_ptr EventImplSharedPtr = - EventImplWeakPtrIt->lock()) { - // A nullptr PI event indicates that piQueueFinish will not cover it, - // either because it's a host task event or an unenqueued one. - if (!SupportsPiFinish || - nullptr == EventImplSharedPtr->getHandleRef()) { - EventImplSharedPtr->wait(EventImplSharedPtr); - } + bool SupportsPiFinish = !is_host() && MSupportOOO; + for (auto EventImplWeakPtrIt = WeakEvents.rbegin(); + EventImplWeakPtrIt != WeakEvents.rend(); ++EventImplWeakPtrIt) { + if (std::shared_ptr EventImplSharedPtr = + EventImplWeakPtrIt->lock()) { + // A nullptr PI event indicates that piQueueFinish will not cover it, + // either because it's a host task event or an unenqueued one. 
+ if (!SupportsPiFinish || nullptr == EventImplSharedPtr->getHandleRef()) { + EventImplSharedPtr->wait(EventImplSharedPtr); } } - if (SupportsPiFinish) { - const detail::plugin &Plugin = getPlugin(); - Plugin.call(getHandleRef()); - for (std::weak_ptr &EventImplWeakPtr : WeakEvents) - if (std::shared_ptr EventImplSharedPtr = - EventImplWeakPtr.lock()) - if (EventImplSharedPtr->needsCleanupAfterWait()) - EventImplSharedPtr->cleanupCommand(EventImplSharedPtr); - assert(SharedEvents.empty() && - "Queues that support calling piQueueFinish " - "shouldn't have shared events"); - } else { - for (event &Event : SharedEvents) - Event.wait(); - } + } + if (SupportsPiFinish) { + const detail::plugin &Plugin = getPlugin(); + Plugin.call(getHandleRef()); + for (std::weak_ptr &EventImplWeakPtr : WeakEvents) + if (std::shared_ptr EventImplSharedPtr = + EventImplWeakPtr.lock()) + if (EventImplSharedPtr->needsCleanupAfterWait()) + EventImplSharedPtr->cleanupCommand(EventImplSharedPtr); + assert(SharedEvents.empty() && "Queues that support calling piQueueFinish " + "shouldn't have shared events"); + } else { + for (event &Event : SharedEvents) + Event.wait(); + } #ifdef XPTI_ENABLE_INSTRUMENTATION instrumentationEpilog(TelemetryEvent, Name, StreamID, IId); #endif From 1e40a05650c0fa1c3f02e5ebb0f2d0be96cf5bf5 Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Tue, 21 Dec 2021 12:55:36 +0300 Subject: [PATCH 19/23] Apply some comments --- sycl/doc/EnvironmentVariables.md | 2 +- sycl/source/detail/device_image_impl.hpp | 6 ++++-- sycl/source/detail/queue_impl.cpp | 4 ++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md index 0f6f4edb774da..bc0543a39c876 100644 --- a/sycl/doc/EnvironmentVariables.md +++ b/sycl/doc/EnvironmentVariables.md @@ -92,7 +92,7 @@ variables in production code. 
| `SYCL_DEVICELIB_NO_FALLBACK` | Any(\*) | Disable loading and linking of device library images | | `SYCL_PRINT_EXECUTION_GRAPH` | Described [below](#sycl_print_execution_graph-options) | Print execution graph to DOT text file. | | `SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP` | Any(\*) | Disable cleanup of finished command nodes at host-device synchronization points. | -| `SYCL_DISABLE_POST_ENQUEUE_CLEANUP` | Any(\*) | Disable cleanup of enqueued command nodes. | +| `SYCL_DISABLE_POST_ENQUEUE_CLEANUP` | Any(\*) | Disable cleanup of enqueued command nodes during submission. | | `SYCL_THROW_ON_BLOCK` | Any(\*) | Throw an exception on attempt to wait for a blocked command. | | `SYCL_DEVICELIB_INHIBIT_NATIVE` | String of device library extensions (separated by a whitespace) | Do not rely on device native support for devicelib extensions listed in this option. | | `SYCL_PROGRAM_COMPILE_OPTIONS` | String of valid OpenCL compile options | Override compile options for all programs. | diff --git a/sycl/source/detail/device_image_impl.hpp b/sycl/source/detail/device_image_impl.hpp index 58da062ccb5c2..08746a8d9561e 100644 --- a/sycl/source/detail/device_image_impl.hpp +++ b/sycl/source/detail/device_image_impl.hpp @@ -185,8 +185,10 @@ class device_image_impl { std::lock_guard Lock{MSpecConstAccessMtx}; if (nullptr == MSpecConstsBuffer && !MSpecConstsBlob.empty()) { const detail::plugin &Plugin = getSyclObjImpl(MContext)->getPlugin(); - // Uses PI_MEM_FLAGS_HOST_PTR_COPY since post-enqueue cleanup might - // destroy MSpecConstsBuffer. + // Uses PI_MEM_FLAGS_HOST_PTR_COPY instead of PI_MEM_FLAGS_HOST_PTR_USE + // since post-enqueue cleanup might trigger destruction of + // device_image_impl and, as a result, destruction of MSpecConstsBlob + // while MSpecConstsBuffer is still in use. 
// TODO consider changing the lifetime of device_image_impl instead Plugin.call( detail::getSyclObjImpl(MContext)->getHandleRef(), diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 96107333588ae..1d4b01cef3c51 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -140,7 +140,7 @@ void queue_impl::addEvent(const event &Event) { // 2. Kernels with streams, since they are not supported by post enqueue // cleanup. // 3. Host tasks, for both reasons. - else if (!is_host() || !MSupportOOO || EImpl->getHandleRef() == nullptr || + else if (is_host() || !MSupportOOO || EImpl->getHandleRef() == nullptr || EImpl->needsCleanupAfterWait()) { std::weak_ptr EventWeakPtr{EImpl}; std::lock_guard Lock{MMutex}; @@ -285,7 +285,7 @@ void queue_impl::wait(const detail::code_location &CodeLoc) { // directly. Otherwise, only wait for unenqueued or host task events, starting // from the latest submitted task in order to minimize total amount of calls, // then handle the rest with piQueueFinish. 
- bool SupportsPiFinish = !is_host() && MSupportOOO; + const bool SupportsPiFinish = !is_host() && MSupportOOO; for (auto EventImplWeakPtrIt = WeakEvents.rbegin(); EventImplWeakPtrIt != WeakEvents.rend(); ++EventImplWeakPtrIt) { if (std::shared_ptr EventImplSharedPtr = From c720274d3e61182c2f5d393986cf3c7e106d9235 Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Tue, 21 Dec 2021 15:16:53 +0300 Subject: [PATCH 20/23] Apply more comments --- sycl/source/detail/queue_impl.cpp | 2 +- sycl/source/detail/scheduler/commands.hpp | 2 +- sycl/source/detail/scheduler/scheduler.cpp | 2 -- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 1d4b01cef3c51..d5427ffc52020 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -126,7 +126,7 @@ event queue_impl::mem_advise(const std::shared_ptr &Self, void queue_impl::addEvent(const event &Event) { EventImplPtr EImpl = getSyclObjImpl(Event); - Command *Cmd = (Command *)(EImpl->getCommand()); + auto *Cmd = static_cast(EImpl->getCommand()); if (!Cmd) { // if there is no command on the event, we cannot track it with MEventsWeak // as that will leave it with no owner. Track in MEventsShared only if we're diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index dee4c4972f788..0f21a0e639491 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -223,7 +223,7 @@ class Command { /// Perform glueing of events from different contexts /// \param DepEvent event this commands should depend on /// \param Dep optional DepDesc to perform connection of events properly - /// \param ToCleanUp is a + /// \param ToCleanUp container for commands that can be cleaned up. /// \return returns an optional connection command to enqueue /// /// Glueing (i.e. 
connecting) will be performed if and only if DepEvent is diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index ea118470f9415..232ee0a5d6e47 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -341,10 +341,8 @@ void Scheduler::releaseHostAccessor(Requirement *Req) { cleanupCommands(ToCleanUp); } -// static void Scheduler::enqueueLeavesOfReqUnlocked(const Requirement *const Req, std::vector &ToCleanUp) { - // FIXME handle this as well MemObjRecord *Record = Req->MSYCLMemObj->MRecord.get(); auto EnqueueLeaves = [&ToCleanUp](LeavesCollection &Leaves) { for (Command *Cmd : Leaves) { From abdcc62eaaae9d35fc6292b1f3ea30124a7ea8a1 Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Tue, 21 Dec 2021 19:52:37 +0300 Subject: [PATCH 21/23] Fix unused variable with assertions disabled --- sycl/source/detail/scheduler/graph_builder.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 2de8e6fe2dc52..ec5f9713657c6 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -1164,6 +1164,7 @@ void Scheduler::GraphBuilder::cleanupCommand(Command *Cmd) { } } #endif + (void)CmdT; for (Command *UserCmd : Cmd->MUsers) { for (DepDesc &Dep : UserCmd->MDeps) { From f43f1f1cfa71dfde5efaa606ddd3a4608ee08164 Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Thu, 23 Dec 2021 15:18:22 +0300 Subject: [PATCH 22/23] Add unit tests --- sycl/source/detail/event_impl.cpp | 2 +- sycl/unittests/scheduler/CMakeLists.txt | 1 + .../scheduler/PostEnqueueCleanup.cpp | 280 ++++++++++++++++++ .../scheduler/SchedulerTestUtils.hpp | 38 +++ 4 files changed, 320 insertions(+), 1 deletion(-) create mode 100644 sycl/unittests/scheduler/PostEnqueueCleanup.cpp diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index 
4bafe6a55f38b..f615f37214208 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -364,7 +364,6 @@ std::vector event_impl::getWaitList() { } void event_impl::flushIfNeeded(const QueueImplPtr &UserQueue) { - assert(MEvent != nullptr); if (MIsFlushed) return; @@ -379,6 +378,7 @@ void event_impl::flushIfNeeded(const QueueImplPtr &UserQueue) { return; // Check if the task for this event has already been submitted. + assert(MEvent != nullptr); pi_event_status Status = PI_EVENT_QUEUED; getPlugin().call( MEvent, PI_EVENT_INFO_COMMAND_EXECUTION_STATUS, sizeof(pi_int32), &Status, diff --git a/sycl/unittests/scheduler/CMakeLists.txt b/sycl/unittests/scheduler/CMakeLists.txt index 1ec8c7b4d894a..ceb7c8990c3f2 100644 --- a/sycl/unittests/scheduler/CMakeLists.txt +++ b/sycl/unittests/scheduler/CMakeLists.txt @@ -16,5 +16,6 @@ add_sycl_unittest(SchedulerTests OBJECT AllocaLinking.cpp RequiredWGSize.cpp QueueFlushing.cpp + PostEnqueueCleanup.cpp utils.cpp ) diff --git a/sycl/unittests/scheduler/PostEnqueueCleanup.cpp b/sycl/unittests/scheduler/PostEnqueueCleanup.cpp new file mode 100644 index 0000000000000..1376cd5b3875b --- /dev/null +++ b/sycl/unittests/scheduler/PostEnqueueCleanup.cpp @@ -0,0 +1,280 @@ +//==------- PostEnqueueCleanup.cpp --- Scheduler unit tests ----------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "SchedulerTest.hpp" +#include "SchedulerTestUtils.hpp" + +#include +#include +#include + +#include +#include +#include +#include + +using namespace sycl; + +inline constexpr auto HostUnifiedMemoryName = "SYCL_HOST_UNIFIED_MEMORY"; + +int val; +static pi_result redefinedEnqueueMemBufferMap( + pi_queue command_queue, pi_mem buffer, pi_bool blocking_map, + pi_map_flags map_flags, size_t offset, size_t size, + pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, + pi_event *event, void **ret_map) { + *event = reinterpret_cast(new int{}); + *ret_map = &val; + return PI_SUCCESS; +} + +static pi_result redefinedEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj, + void *mapped_ptr, + pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, + pi_event *event) { + *event = reinterpret_cast(new int{}); + return PI_SUCCESS; +} + +static pi_result redefinedEnqueueMemBufferFill( + pi_queue command_queue, pi_mem buffer, const void *pattern, + size_t pattern_size, size_t offset, size_t size, + pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, + pi_event *event) { + *event = reinterpret_cast(new int{}); + return PI_SUCCESS; +} + +static void verifyCleanup(detail::MemObjRecord *Record, + detail::AllocaCommandBase *AllocaCmd, + detail::Command *DeletedCmd, bool &CmdDeletedFlag) { + EXPECT_TRUE(CmdDeletedFlag); + CmdDeletedFlag = false; + EXPECT_EQ( + std::find(AllocaCmd->MUsers.begin(), AllocaCmd->MUsers.end(), DeletedCmd), + AllocaCmd->MUsers.end()); + detail::Command *Leaf = *Record->MWriteLeaves.begin(); + EXPECT_FALSE(std::any_of(Leaf->MDeps.begin(), Leaf->MDeps.end(), + [&](const detail::DepDesc &Dep) { + return Dep.MDepCommand == DeletedCmd; + })); +} + +// Check that any non-leaf commands enqueued as part of high level scheduler +// calls are cleaned up. 
+static void checkCleanupOnEnqueue(MockScheduler &MS, + detail::QueueImplPtr &QueueImpl, + buffer &Buf, + detail::Requirement &MockReq) { + bool CommandDeleted = false; + std::vector AuxCmds; + std::vector ToCleanUp; + std::vector ToEnqueue; + detail::MemObjRecord *Record = + MS.getOrInsertMemObjRecord(QueueImpl, &MockReq, AuxCmds); + detail::AllocaCommandBase *AllocaCmd = + MS.getOrCreateAllocaForReq(Record, &MockReq, QueueImpl, AuxCmds); + std::function Callback = [&CommandDeleted]() { + CommandDeleted = true; + }; + + // Check addCG. + MockCommand *MockCmd = + new MockCommandWithCallback(QueueImpl, MockReq, Callback); + (void)MockCmd->addDep(detail::DepDesc(AllocaCmd, &MockReq, nullptr), + ToCleanUp); + EXPECT_TRUE(ToCleanUp.empty()); + MS.addNodeToLeaves(Record, MockCmd, access::mode::read_write, ToEnqueue); + MS.updateLeaves({AllocaCmd}, Record, access::mode::read_write, ToCleanUp); + + EXPECT_TRUE(ToCleanUp.empty()); + std::unique_ptr CG{new detail::CGFill(/*Pattern*/ {}, &MockReq, + /*ArgsStorage*/ {}, + /*AccStorage*/ {}, + /*SharedPtrStorage*/ {}, + /*Requirements*/ {&MockReq}, + /*Events*/ {})}; + detail::EventImplPtr Event = MS.addCG(std::move(CG), QueueImpl); + auto *Cmd = static_cast(Event->getCommand()); + verifyCleanup(Record, AllocaCmd, MockCmd, CommandDeleted); + + // Check add/releaseHostAccessor. 
+ CommandDeleted = false; + MockCmd = new MockCommandWithCallback(QueueImpl, MockReq, Callback); + addEdge(MockCmd, Cmd, AllocaCmd); + MS.addNodeToLeaves(Record, MockCmd, access::mode::read_write, ToEnqueue); + MS.updateLeaves({Cmd}, Record, access::mode::read_write, ToCleanUp); + MS.addHostAccessor(&MockReq); + verifyCleanup(Record, AllocaCmd, MockCmd, CommandDeleted); + + CommandDeleted = false; + MockCmd = new MockCommandWithCallback(QueueImpl, MockReq, Callback); + addEdge(MockCmd, AllocaCmd, AllocaCmd); + MockCommand *LeafMockCmd = + new MockCommandWithCallback(QueueImpl, MockReq, Callback); + addEdge(LeafMockCmd, MockCmd, AllocaCmd); + MS.addNodeToLeaves(Record, LeafMockCmd, access::mode::read_write, ToEnqueue); + MS.releaseHostAccessor(&MockReq); + MockReq.MBlockedCmd = nullptr; + verifyCleanup(Record, AllocaCmd, MockCmd, CommandDeleted); + + auto addNewMockCmds = [&]() -> MockCommand * { + CommandDeleted = false; + MockCmd = LeafMockCmd; + LeafMockCmd = new MockCommandWithCallback(QueueImpl, MockReq, Callback); + addEdge(LeafMockCmd, MockCmd, AllocaCmd); + MS.addNodeToLeaves(Record, LeafMockCmd, access::mode::read_write, + ToEnqueue); + // Since this mock command has already been enqueued, it's expected to be + // cleaned up during removal from leaves. 
+ ToCleanUp.clear(); + MS.updateLeaves({MockCmd}, Record, access::mode::read_write, ToCleanUp); + EXPECT_EQ(ToCleanUp.size(), 1U); + EXPECT_EQ(ToCleanUp[0], MockCmd); + MS.cleanupCommands({MockCmd}); + verifyCleanup(Record, AllocaCmd, MockCmd, CommandDeleted); + CommandDeleted = false; + MockCmd = LeafMockCmd; + LeafMockCmd = new MockCommandWithCallback(QueueImpl, MockReq, Callback); + addEdge(LeafMockCmd, MockCmd, AllocaCmd); + MS.addNodeToLeaves(Record, LeafMockCmd, access::mode::read_write, + ToEnqueue); + MS.updateLeaves({MockCmd}, Record, access::mode::read_write, ToCleanUp); + return MockCmd; + }; + + // Check waitForEvent + MockCmd = addNewMockCmds(); + MS.waitForEvent(LeafMockCmd->getEvent()); + verifyCleanup(Record, AllocaCmd, MockCmd, CommandDeleted); + + // Check addCopyBack + MockCmd = addNewMockCmds(); + LeafMockCmd->getEvent()->getHandleRef() = + reinterpret_cast(new int{}); + MS.addCopyBack(&MockReq); + verifyCleanup(Record, AllocaCmd, MockCmd, CommandDeleted); + + MS.removeRecordForMemObj(detail::getSyclObjImpl(Buf).get()); +} + +static void checkCleanupOnLeafUpdate( + MockScheduler &MS, detail::QueueImplPtr &QueueImpl, buffer &Buf, + detail::Requirement &MockReq, + std::function SchedulerCall) { + bool CommandDeleted = false; + std::vector AuxCmds; + std::vector ToCleanUp; + std::vector ToEnqueue; + detail::MemObjRecord *Record = + MS.getOrInsertMemObjRecord(QueueImpl, &MockReq, AuxCmds); + detail::AllocaCommandBase *AllocaCmd = + MS.getOrCreateAllocaForReq(Record, &MockReq, QueueImpl, AuxCmds); + std::function Callback = [&CommandDeleted]() { + CommandDeleted = true; + }; + + // Add a mock command as a leaf and enqueue it. 
+ MockCommand *MockCmd = + new MockCommandWithCallback(QueueImpl, MockReq, Callback); + (void)MockCmd->addDep(detail::DepDesc(AllocaCmd, &MockReq, nullptr), + ToCleanUp); + EXPECT_TRUE(ToCleanUp.empty()); + MS.addNodeToLeaves(Record, MockCmd, access::mode::read_write, ToEnqueue); + MS.updateLeaves({AllocaCmd}, Record, access::mode::read_write, ToCleanUp); + detail::EnqueueResultT Res; + MockScheduler::enqueueCommand(MockCmd, Res, detail::BLOCKING); + + EXPECT_FALSE(CommandDeleted); + SchedulerCall(Record); + EXPECT_TRUE(CommandDeleted); + MS.removeRecordForMemObj(detail::getSyclObjImpl(Buf).get()); +} + +TEST_F(SchedulerTest, PostEnqueueCleanup) { + // Enforce creation of linked commands to test all sites of calling cleanup. + unittest::ScopedEnvVar HostUnifiedMemoryVar{ + HostUnifiedMemoryName, "1", + detail::SYCLConfig::reset}; + default_selector Selector; + platform Plt{default_selector()}; + if (Plt.is_host()) { + std::cout << "Not run due to host-only environment\n"; + return; + } + + unittest::PiMock Mock{Plt}; + setupDefaultMockAPIs(Mock); + Mock.redefine( + redefinedEnqueueMemBufferMap); + Mock.redefine(redefinedEnqueueMemUnmap); + Mock.redefine( + redefinedEnqueueMemBufferFill); + + context Ctx{Plt}; + queue Queue{Ctx, Selector}; + detail::QueueImplPtr QueueImpl = detail::getSyclObjImpl(Queue); + MockScheduler MS; + + buffer Buf{range<1>(1)}; + std::shared_ptr BufImpl = detail::getSyclObjImpl(Buf); + detail::Requirement MockReq = getMockRequirement(Buf); + MockReq.MDims = 1; + MockReq.MSYCLMemObj = BufImpl.get(); + + checkCleanupOnEnqueue(MS, QueueImpl, Buf, MockReq); + std::vector ToEnqueue; + checkCleanupOnLeafUpdate(MS, QueueImpl, Buf, MockReq, + [&](detail::MemObjRecord *Record) { + MS.decrementLeafCountersForRecord(Record); + }); + checkCleanupOnLeafUpdate( + MS, QueueImpl, Buf, MockReq, [&](detail::MemObjRecord *Record) { + MS.insertMemoryMove(Record, &MockReq, QueueImpl, ToEnqueue); + }); + checkCleanupOnLeafUpdate(MS, QueueImpl, Buf, MockReq, + 
[&](detail::MemObjRecord *Record) { + Record->MMemModified = true; + MS.addCopyBack(&MockReq, ToEnqueue); + }); + checkCleanupOnLeafUpdate( + MS, QueueImpl, Buf, MockReq, [&](detail::MemObjRecord *Record) { + detail::Command *Leaf = *Record->MWriteLeaves.begin(); + MS.addEmptyCmd(Leaf, {&MockReq}, QueueImpl, + detail::Command::BlockReason::HostTask, ToEnqueue); + }); + device HostDevice; + detail::QueueImplPtr DefaultHostQueue{ + new detail::queue_impl(detail::getSyclObjImpl(HostDevice), {}, {})}; + checkCleanupOnLeafUpdate( + MS, DefaultHostQueue, Buf, MockReq, [&](detail::MemObjRecord *Record) { + MS.getOrCreateAllocaForReq(Record, &MockReq, QueueImpl, ToEnqueue); + }); + // Check cleanup on exceeding leaf limit. + checkCleanupOnLeafUpdate( + MS, QueueImpl, Buf, MockReq, [&](detail::MemObjRecord *Record) { + std::vector> Leaves; + for (std::size_t I = 0; + I < Record->MWriteLeaves.genericCommandsCapacity(); ++I) + Leaves.push_back(std::make_unique(QueueImpl, MockReq)); + + detail::AllocaCommandBase *AllocaCmd = Record->MAllocaCommands[0]; + std::vector ToCleanUp; + for (std::unique_ptr &MockCmd : Leaves) { + (void)MockCmd->addDep(detail::DepDesc(AllocaCmd, &MockReq, AllocaCmd), + ToCleanUp); + MS.addNodeToLeaves(Record, MockCmd.get(), access::mode::read_write, + ToEnqueue); + } + for (std::unique_ptr &MockCmd : Leaves) + MS.updateLeaves({MockCmd.get()}, Record, access::mode::read_write, + ToCleanUp); + EXPECT_TRUE(ToCleanUp.empty()); + }); +} diff --git a/sycl/unittests/scheduler/SchedulerTestUtils.hpp b/sycl/unittests/scheduler/SchedulerTestUtils.hpp index 7c012fa8893cc..60f5cbf05e04a 100644 --- a/sycl/unittests/scheduler/SchedulerTestUtils.hpp +++ b/sycl/unittests/scheduler/SchedulerTestUtils.hpp @@ -103,6 +103,10 @@ class MockCommandWithCallback : public MockCommand { class MockScheduler : public cl::sycl::detail::Scheduler { public: + using cl::sycl::detail::Scheduler::addCG; + using cl::sycl::detail::Scheduler::addCopyBack; + using 
cl::sycl::detail::Scheduler::cleanupCommands; + cl::sycl::detail::MemObjRecord * getOrInsertMemObjRecord(const cl::sycl::detail::QueueImplPtr &Queue, cl::sycl::detail::Requirement *Req, @@ -110,6 +114,10 @@ class MockScheduler : public cl::sycl::detail::Scheduler { return MGraphBuilder.getOrInsertMemObjRecord(Queue, Req, ToEnqueue); } + void decrementLeafCountersForRecord(cl::sycl::detail::MemObjRecord *Rec) { + MGraphBuilder.decrementLeafCountersForRecord(Rec); + } + void removeRecordForMemObj(cl::sycl::detail::SYCLMemObjI *MemObj) { MGraphBuilder.removeRecordForMemObj(MemObj); } @@ -127,6 +135,13 @@ class MockScheduler : public cl::sycl::detail::Scheduler { return MGraphBuilder.addNodeToLeaves(Rec, Cmd, Mode, ToEnqueue); } + void updateLeaves(const std::set &Cmds, + cl::sycl::detail::MemObjRecord *Record, + cl::sycl::access::mode AccessMode, + std::vector &ToCleanUp) { + return MGraphBuilder.updateLeaves(Cmds, Record, AccessMode, ToCleanUp); + } + static bool enqueueCommand(cl::sycl::detail::Command *Cmd, cl::sycl::detail::EnqueueResultT &EnqueueResult, cl::sycl::detail::BlockingT Blocking) { @@ -153,6 +168,29 @@ class MockScheduler : public cl::sycl::detail::Scheduler { return MGraphBuilder.insertMemoryMove(Record, Req, Queue, ToEnqueue); } + cl::sycl::detail::Command * + addCopyBack(cl::sycl::detail::Requirement *Req, + std::vector &ToEnqueue) { + return MGraphBuilder.addCopyBack(Req, ToEnqueue); + } + + cl::sycl::detail::UpdateHostRequirementCommand * + insertUpdateHostReqCmd(cl::sycl::detail::MemObjRecord *Record, + cl::sycl::detail::Requirement *Req, + const cl::sycl::detail::QueueImplPtr &Queue, + std::vector &ToEnqueue) { + return MGraphBuilder.insertUpdateHostReqCmd(Record, Req, Queue, ToEnqueue); + } + + cl::sycl::detail::EmptyCommand * + addEmptyCmd(cl::sycl::detail::Command *Cmd, + const std::vector &Reqs, + const cl::sycl::detail::QueueImplPtr &Queue, + cl::sycl::detail::Command::BlockReason Reason, + std::vector &ToEnqueue) { + return 
MGraphBuilder.addEmptyCmd(Cmd, Reqs, Queue, Reason, ToEnqueue); + } + cl::sycl::detail::Command * addCG(std::unique_ptr CommandGroup, cl::sycl::detail::QueueImplPtr Queue, From 5275fdf706851aeeb956aea30dd771dd978cc337 Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Thu, 23 Dec 2021 17:32:22 +0300 Subject: [PATCH 23/23] Non-functional test changes --- sycl/unittests/scheduler/PostEnqueueCleanup.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sycl/unittests/scheduler/PostEnqueueCleanup.cpp b/sycl/unittests/scheduler/PostEnqueueCleanup.cpp index 1376cd5b3875b..8386af6394628 100644 --- a/sycl/unittests/scheduler/PostEnqueueCleanup.cpp +++ b/sycl/unittests/scheduler/PostEnqueueCleanup.cpp @@ -1,4 +1,4 @@ -//==------------ QueueFlushing.cpp --- Scheduler unit tests ----------------==// +//==--------- PostEnqueueCleanup.cpp --- Scheduler unit tests --------------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -198,10 +198,6 @@ static void checkCleanupOnLeafUpdate( } TEST_F(SchedulerTest, PostEnqueueCleanup) { - // Enforce creation of linked commands to test all sites of calling cleanup. - unittest::ScopedEnvVar HostUnifiedMemoryVar{ - HostUnifiedMemoryName, "1", - detail::SYCLConfig::reset}; default_selector Selector; platform Plt{default_selector()}; if (Plt.is_host()) { @@ -209,6 +205,10 @@ TEST_F(SchedulerTest, PostEnqueueCleanup) { return; } + // Enforce creation of linked commands to test all sites of calling cleanup. + unittest::ScopedEnvVar HostUnifiedMemoryVar{ + HostUnifiedMemoryName, "1", + detail::SYCLConfig::reset}; unittest::PiMock Mock{Plt}; setupDefaultMockAPIs(Mock); Mock.redefine(