diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md
index 809814cbf71ef..7b81850917bc6 100644
--- a/sycl/doc/EnvironmentVariables.md
+++ b/sycl/doc/EnvironmentVariables.md
@@ -92,6 +92,7 @@ variables in production code.
 | `SYCL_DEVICELIB_NO_FALLBACK` | Any(\*) | Disable loading and linking of device library images |
 | `SYCL_PRINT_EXECUTION_GRAPH` | Described [below](#sycl_print_execution_graph-options) | Print execution graph to DOT text file. |
 | `SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP` | Any(\*) | Disable cleanup of finished command nodes at host-device synchronization points. |
+| `SYCL_DISABLE_POST_ENQUEUE_CLEANUP` | Any(\*) | Disable cleanup of enqueued command nodes during submission. |
 | `SYCL_THROW_ON_BLOCK` | Any(\*) | Throw an exception on attempt to wait for a blocked command. |
 | `SYCL_DEVICELIB_INHIBIT_NATIVE` | String of device library extensions (separated by a whitespace) | Do not rely on device native support for devicelib extensions listed in this option. |
 | `SYCL_PROGRAM_COMPILE_OPTIONS` | String of valid OpenCL compile options | Override compile options for all programs. |
diff --git a/sycl/include/CL/sycl/detail/cg.hpp b/sycl/include/CL/sycl/detail/cg.hpp
index 170c0f39906c3..f4938aa43120d 100644
--- a/sycl/include/CL/sycl/detail/cg.hpp
+++ b/sycl/include/CL/sycl/detail/cg.hpp
@@ -290,6 +290,7 @@ class CGExecKernel : public CG {
   }
 
   void clearStreams() { MStreams.clear(); }
+  bool hasStreams() { return !MStreams.empty(); }
 };
 
 /// "Copy memory" command group class.
diff --git a/sycl/source/detail/config.def b/sycl/source/detail/config.def
index fcfe345b52975..95750092bfb2e 100644
--- a/sycl/source/detail/config.def
+++ b/sycl/source/detail/config.def
@@ -12,6 +12,7 @@
 CONFIG(SYCL_PRINT_EXECUTION_GRAPH, 32, __SYCL_PRINT_EXECUTION_GRAPH)
 CONFIG(SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP, 1, __SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP)
+CONFIG(SYCL_DISABLE_POST_ENQUEUE_CLEANUP, 1, __SYCL_DISABLE_POST_ENQUEUE_CLEANUP)
 CONFIG(SYCL_DEVICE_ALLOWLIST, 1024, __SYCL_DEVICE_ALLOWLIST)
 CONFIG(SYCL_BE, 16, __SYCL_BE)
 CONFIG(SYCL_PI_TRACE, 16, __SYCL_PI_TRACE)
diff --git a/sycl/source/detail/device_image_impl.hpp b/sycl/source/detail/device_image_impl.hpp
index 503c71af62823..24b2809955fb0 100644
--- a/sycl/source/detail/device_image_impl.hpp
+++ b/sycl/source/detail/device_image_impl.hpp
@@ -186,9 +186,14 @@ class device_image_impl {
     std::lock_guard<std::mutex> Lock{MSpecConstAccessMtx};
     if (nullptr == MSpecConstsBuffer && !MSpecConstsBlob.empty()) {
       const detail::plugin &Plugin = getSyclObjImpl(MContext)->getPlugin();
+      // Uses PI_MEM_FLAGS_HOST_PTR_COPY instead of PI_MEM_FLAGS_HOST_PTR_USE
+      // since post-enqueue cleanup might trigger destruction of
+      // device_image_impl and, as a result, destruction of MSpecConstsBlob
+      // while MSpecConstsBuffer is still in use.
+      // TODO consider changing the lifetime of device_image_impl instead
       memBufferCreateHelper(Plugin,
                             detail::getSyclObjImpl(MContext)->getHandleRef(),
-                            PI_MEM_FLAGS_ACCESS_RW | PI_MEM_FLAGS_HOST_PTR_USE,
+                            PI_MEM_FLAGS_ACCESS_RW | PI_MEM_FLAGS_HOST_PTR_COPY,
                             MSpecConstsBlob.size(), MSpecConstsBlob.data(),
                             &MSpecConstsBuffer, nullptr);
     }
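The flag swap above is easier to see with a standalone model. The following is a hedged sketch, not PI code: it only assumes that `PI_MEM_FLAGS_HOST_PTR_USE` lets the runtime keep referencing the caller's memory while `PI_MEM_FLAGS_HOST_PTR_COPY` snapshots it at creation time (mirroring OpenCL's `CL_MEM_USE_HOST_PTR`/`CL_MEM_COPY_HOST_PTR`); the `FakeBuffer` type is invented for illustration.

```cpp
#include <vector>

// Toy model of the two host-pointer modes (illustrative, not the PI API).
struct FakeBuffer {
  const unsigned char *Borrowed = nullptr; // *_HOST_PTR_USE: refers to user memory
  std::vector<unsigned char> Owned;        // *_HOST_PTR_COPY: owns a snapshot
};

FakeBuffer createUse(const std::vector<unsigned char> &Blob) {
  return {Blob.data(), {}}; // dangles if Blob dies first
}

FakeBuffer createCopy(const std::vector<unsigned char> &Blob) {
  return {nullptr, Blob}; // copies at creation time
}

int main() {
  FakeBuffer B;
  {
    std::vector<unsigned char> Blob(16, 42);
    B = createCopy(Blob);
    // B = createUse(Blob); // would dangle once Blob is destroyed below
  }
  return B.Owned[0] == 42 ? 0 : 1; // the snapshot survives the blob
}
```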
diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp
index 4bafe6a55f38b..f615f37214208 100644
--- a/sycl/source/detail/event_impl.cpp
+++ b/sycl/source/detail/event_impl.cpp
@@ -364,7 +364,6 @@ std::vector<EventImplPtr> event_impl::getWaitList() {
 }
 
 void event_impl::flushIfNeeded(const QueueImplPtr &UserQueue) {
-  assert(MEvent != nullptr);
   if (MIsFlushed)
     return;
 
@@ -379,6 +378,7 @@ void event_impl::flushIfNeeded(const QueueImplPtr &UserQueue) {
     return;
 
   // Check if the task for this event has already been submitted.
+  assert(MEvent != nullptr);
   pi_event_status Status = PI_EVENT_QUEUED;
   getPlugin().call<PiApiKind::piEventGetInfo>(
       MEvent, PI_EVENT_INFO_COMMAND_EXECUTION_STATUS, sizeof(pi_int32), &Status,
diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp
index fb8214ddf6b8d..0e7b896a596f7 100644
--- a/sycl/source/detail/event_impl.hpp
+++ b/sycl/source/detail/event_impl.hpp
@@ -201,6 +201,11 @@ class event_impl {
   /// \return true if this event is discarded.
   bool isDiscarded() const { return MState == HES_Discarded; }
 
+  void setNeedsCleanupAfterWait(bool NeedsCleanupAfterWait) {
+    MNeedsCleanupAfterWait = NeedsCleanupAfterWait;
+  }
+  bool needsCleanupAfterWait() { return MNeedsCleanupAfterWait; }
+
 private:
   // When instrumentation is enabled emits trace event for event wait begin and
   // returns the telemetry event generated for the wait
@@ -231,6 +236,12 @@ class event_impl {
   // HostEventState enum.
   std::atomic<HostEventState> MState;
 
+  // A temporary workaround for the current limitations of post enqueue graph
+  // cleanup. Indicates that the command associated with this event isn't
+  // handled by post enqueue cleanup yet and has to be deleted by cleanup after
+  // wait.
+  bool MNeedsCleanupAfterWait = false;
+
   std::mutex MMutex;
 };
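The new flag forms a simple handshake between command construction and `queue::wait`. A hedged toy model of that handshake (the types below are stand-ins, not the real `event_impl`/`Command`; the mark-then-check flow mirrors the `ExecCGCommand` and `queue_impl::wait` changes elsewhere in this patch):

```cpp
#include <memory>

// Toy model: the command marks its event at construction; the queue's wait
// checks the mark after the backend queue has drained and deletes the
// command only then.
struct ToyEvent {
  bool NeedsCleanupAfterWait = false;
};

struct ToyCommand {
  std::shared_ptr<ToyEvent> Event = std::make_shared<ToyEvent>();
  explicit ToyCommand(bool IsHostTaskOrStreamKernel) {
    Event->NeedsCleanupAfterWait = IsHostTaskOrStreamKernel;
  }
};

int main() {
  ToyCommand C(/*IsHostTaskOrStreamKernel=*/true);
  // ... after the equivalent of piQueueFinish:
  if (C.Event->NeedsCleanupAfterWait) {
    /* the command would be deleted here */
  }
}
```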
diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp
index c67645ec36c57..d5427ffc52020 100644
--- a/sycl/source/detail/queue_impl.cpp
+++ b/sycl/source/detail/queue_impl.cpp
@@ -74,10 +74,7 @@ event queue_impl::memset(const std::shared_ptr<detail::queue_impl> &Self,
   event ResEvent = prepareUSMEvent(Self, NativeEvent);
   // Track only if we won't be able to handle it with piQueueFinish.
-  // FIXME these events are stored for level zero until as a workaround, remove
-  // once piEventRelease no longer calls wait on the event in the plugin.
-  if (!MSupportOOO ||
-      getPlugin().getBackend() == backend::ext_oneapi_level_zero)
+  if (!MSupportOOO)
     addSharedEvent(ResEvent);
   return MDiscardEvents ? createDiscardedEvent() : ResEvent;
 }
@@ -99,10 +96,7 @@ event queue_impl::memcpy(const std::shared_ptr<detail::queue_impl> &Self,
   event ResEvent = prepareUSMEvent(Self, NativeEvent);
   // Track only if we won't be able to handle it with piQueueFinish.
-  // FIXME these events are stored for level zero until as a workaround, remove
-  // once piEventRelease no longer calls wait on the event in the plugin.
-  if (!MSupportOOO ||
-      getPlugin().getBackend() == backend::ext_oneapi_level_zero)
+  if (!MSupportOOO)
     addSharedEvent(ResEvent);
   return MDiscardEvents ? createDiscardedEvent() : ResEvent;
 }
@@ -125,29 +119,30 @@ event queue_impl::mem_advise(const std::shared_ptr<detail::queue_impl> &Self,
   event ResEvent = prepareUSMEvent(Self, NativeEvent);
   // Track only if we won't be able to handle it with piQueueFinish.
-  // FIXME these events are stored for level zero until as a workaround, remove
-  // once piEventRelease no longer calls wait on the event in the plugin.
-  if (!MSupportOOO ||
-      getPlugin().getBackend() == backend::ext_oneapi_level_zero)
+  if (!MSupportOOO)
     addSharedEvent(ResEvent);
   return MDiscardEvents ? createDiscardedEvent() : ResEvent;
 }
 
 void queue_impl::addEvent(const event &Event) {
-  EventImplPtr Eimpl = getSyclObjImpl(Event);
-  Command *Cmd = (Command *)(Eimpl->getCommand());
+  EventImplPtr EImpl = getSyclObjImpl(Event);
+  auto *Cmd = static_cast<Command *>(EImpl->getCommand());
   if (!Cmd) {
     // if there is no command on the event, we cannot track it with MEventsWeak
     // as that will leave it with no owner. Track in MEventsShared only if we're
     // unable to call piQueueFinish during wait.
-    // FIXME these events are stored for level zero until as a workaround,
-    // remove once piEventRelease no longer calls wait on the event in the
-    // plugin.
-    if (is_host() || !MSupportOOO ||
-        getPlugin().getBackend() == backend::ext_oneapi_level_zero)
+    if (is_host() || !MSupportOOO)
       addSharedEvent(Event);
-  } else {
-    std::weak_ptr<event_impl> EventWeakPtr{Eimpl};
+  }
+  // As long as the queue supports piQueueFinish we only need to store events
+  // with command nodes in the following cases:
+  // 1. Unenqueued commands, since they aren't covered by piQueueFinish.
+  // 2. Kernels with streams, since they are not supported by post enqueue
+  // cleanup.
+  // 3. Host tasks, for both reasons.
+  else if (is_host() || !MSupportOOO || EImpl->getHandleRef() == nullptr ||
+           EImpl->needsCleanupAfterWait()) {
+    std::weak_ptr<event_impl> EventWeakPtr{EImpl};
     std::lock_guard<std::mutex> Lock{MMutex};
     MEventsWeak.push_back(std::move(EventWeakPtr));
   }
@@ -157,10 +152,7 @@ void queue_impl::addEvent(const event &Event) {
 /// but some events have no other owner. In this case,
 /// addSharedEvent will have the queue track the events via a shared pointer.
 void queue_impl::addSharedEvent(const event &Event) {
-  // FIXME The assertion should be corrected once the Level Zero workaround is
-  // removed.
-  assert(is_host() || !MSupportOOO ||
-         getPlugin().getBackend() == backend::ext_oneapi_level_zero);
+  assert(is_host() || !MSupportOOO);
   std::lock_guard<std::mutex> Lock(MMutex);
   // Events stored in MEventsShared are not released anywhere else aside from
   // calls to queue::wait/wait_and_throw, which a user application might not
@@ -293,50 +285,31 @@ void queue_impl::wait(const detail::code_location &CodeLoc) {
   // directly. Otherwise, only wait for unenqueued or host task events, starting
   // from the latest submitted task in order to minimize total amount of calls,
   // then handle the rest with piQueueFinish.
-  // TODO the new workflow has worse performance with Level Zero, keep the old
-  // behavior until this is addressed
-  if (!is_host() &&
-      getPlugin().getBackend() == backend::ext_oneapi_level_zero) {
+  const bool SupportsPiFinish = !is_host() && MSupportOOO;
+  for (auto EventImplWeakPtrIt = WeakEvents.rbegin();
+       EventImplWeakPtrIt != WeakEvents.rend(); ++EventImplWeakPtrIt) {
+    if (std::shared_ptr<event_impl> EventImplSharedPtr =
+            EventImplWeakPtrIt->lock()) {
+      // A nullptr PI event indicates that piQueueFinish will not cover it,
+      // either because it's a host task event or an unenqueued one.
+      if (!SupportsPiFinish || nullptr == EventImplSharedPtr->getHandleRef()) {
+        EventImplSharedPtr->wait(EventImplSharedPtr);
+      }
+    }
+  }
+  if (SupportsPiFinish) {
+    const detail::plugin &Plugin = getPlugin();
+    Plugin.call<detail::PiApiKind::piQueueFinish>(getHandleRef());
     for (std::weak_ptr<event_impl> &EventImplWeakPtr : WeakEvents)
       if (std::shared_ptr<event_impl> EventImplSharedPtr =
               EventImplWeakPtr.lock())
-        EventImplSharedPtr->wait(EventImplSharedPtr);
+        if (EventImplSharedPtr->needsCleanupAfterWait())
+          EventImplSharedPtr->cleanupCommand(EventImplSharedPtr);
+    assert(SharedEvents.empty() && "Queues that support calling piQueueFinish "
+                                   "shouldn't have shared events");
+  } else {
     for (event &Event : SharedEvents)
       Event.wait();
-  } else {
-    bool SupportsPiFinish = !is_host() && MSupportOOO;
-    for (auto EventImplWeakPtrIt = WeakEvents.rbegin();
-         EventImplWeakPtrIt != WeakEvents.rend(); ++EventImplWeakPtrIt) {
-      if (std::shared_ptr<event_impl> EventImplSharedPtr =
-              EventImplWeakPtrIt->lock()) {
-        // A nullptr PI event indicates that piQueueFinish will not cover it,
-        // either because it's a host task event or an unenqueued one.
-        if (!SupportsPiFinish ||
-            nullptr == EventImplSharedPtr->getHandleRef()) {
-          EventImplSharedPtr->wait(EventImplSharedPtr);
-        }
-      }
-    }
-    if (SupportsPiFinish) {
-      const detail::plugin &Plugin = getPlugin();
-      Plugin.call<detail::PiApiKind::piQueueFinish>(getHandleRef());
-      for (std::weak_ptr<event_impl> &EventImplWeakPtr : WeakEvents)
-        if (std::shared_ptr<event_impl> EventImplSharedPtr =
-                EventImplWeakPtr.lock())
-          EventImplSharedPtr->cleanupCommand(EventImplSharedPtr);
-      // FIXME these events are stored for level zero until as a workaround,
-      // remove once piEventRelease no longer calls wait on the event in the
-      // plugin.
-      if (Plugin.getBackend() == backend::ext_oneapi_level_zero) {
-        SharedEvents.clear();
-      }
-      assert(SharedEvents.empty() &&
-             "Queues that support calling piQueueFinish "
-             "shouldn't have shared events");
-    } else {
-      for (event &Event : SharedEvents)
-        Event.wait();
-    }
   }
 #ifdef XPTI_ENABLE_INSTRUMENTATION
   instrumentationEpilog(TelemetryEvent, Name, StreamID, IId);
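For a concrete picture of the two event classes `queue_impl::wait` now distinguishes, here is a hedged user-level sketch (the kernel name is illustrative, and `host_task` is assumed to be available in this implementation): the host task never acquires a PI event, so `piQueueFinish` alone would not cover it and its event must be waited on individually.

```cpp
#include <CL/sycl.hpp>

int main() {
  cl::sycl::queue Q;
  // Covered by piQueueFinish: an ordinary device kernel.
  Q.single_task<class IllustrativeKernel>([] {});
  // Not covered: a host task has no PI event in the backend queue.
  Q.submit([&](cl::sycl::handler &CGH) {
    CGH.host_task([] { /* runs on a host thread */ });
  });
  Q.wait(); // waits on the host-task event directly, then calls piQueueFinish
}
```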
diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp
index c5ff9eb30dfaa..b7ab6c4d69f84 100644
--- a/sycl/source/detail/scheduler/commands.cpp
+++ b/sycl/source/detail/scheduler/commands.cpp
@@ -267,8 +267,9 @@ class DispatchHostTask {
     // of empty command.
     // Also, it's possible to have record deallocated prior to enqueue process.
     // Thus we employ read-lock of graph.
+    std::vector<Command *> ToCleanUp;
+    Scheduler &Sched = Scheduler::getInstance();
     {
-      Scheduler &Sched = Scheduler::getInstance();
       Scheduler::ReadLockT Lock(Sched.MGraphLock);
 
       std::vector<DepDesc> Deps = MThisCmd->MDeps;
@@ -279,8 +280,9 @@ class DispatchHostTask {
       EmptyCmd->MEnqueueStatus = EnqueueResultT::SyclEnqueueReady;
 
       for (const DepDesc &Dep : Deps)
-        Scheduler::enqueueLeavesOfReqUnlocked(Dep.MDepRequirement);
+        Scheduler::enqueueLeavesOfReqUnlocked(Dep.MDepRequirement, ToCleanUp);
     }
+    Sched.cleanupCommands(ToCleanUp);
   }
 };
@@ -523,7 +525,8 @@ void Command::makeTraceEventEpilog() {
 #endif
 }
 
-Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep) {
+Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep,
+                                  std::vector<Command *> &ToCleanUp) {
   const QueueImplPtr &WorkerQueue = getWorkerQueue();
   const ContextImplPtr &WorkerContext = WorkerQueue->getContextImplPtr();
@@ -555,7 +558,7 @@ Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep,
   // If contexts don't match we'll connect them using host task
   if (DepEventContext != WorkerContext && !WorkerContext->is_host()) {
     Scheduler::GraphBuilder &GB = Scheduler::getInstance().MGraphBuilder;
-    ConnectionCmd = GB.connectDepEvent(this, DepEvent, Dep);
+    ConnectionCmd = GB.connectDepEvent(this, DepEvent, Dep, ToCleanUp);
   } else
     MPreparedDepsEvents.push_back(std::move(DepEvent));
@@ -570,11 +573,17 @@ const QueueImplPtr &Command::getWorkerQueue() const { return MQueue; }
 
 bool Command::producesPiEvent() const { return true; }
 
-Command *Command::addDep(DepDesc NewDep) {
+bool Command::supportsPostEnqueueCleanup() const {
+  // Isolated commands are cleaned up separately
+  return !MUsers.empty() || !MDeps.empty();
+}
+
+Command *Command::addDep(DepDesc NewDep, std::vector<Command *> &ToCleanUp) {
   Command *ConnectionCmd = nullptr;
   if (NewDep.MDepCommand) {
-    ConnectionCmd = processDepEvent(NewDep.MDepCommand->getEvent(), NewDep);
+    ConnectionCmd =
+        processDepEvent(NewDep.MDepCommand->getEvent(), NewDep, ToCleanUp);
   }
   MDeps.push_back(NewDep);
 #ifdef XPTI_ENABLE_INSTRUMENTATION
@@ -586,7 +595,8 @@ Command *Command::addDep(DepDesc NewDep, std::vector<Command *> &ToCleanUp) {
   return ConnectionCmd;
 }
 
-Command *Command::addDep(EventImplPtr Event) {
+Command *Command::addDep(EventImplPtr Event,
+                         std::vector<Command *> &ToCleanUp) {
 #ifdef XPTI_ENABLE_INSTRUMENTATION
   // We need this for just the instrumentation, so guarding it will prevent
   // unused variable warnings when instrumentation is turned off
@@ -596,7 +606,8 @@ Command *Command::addDep(EventImplPtr Event,
   emitEdgeEventForEventDependence(Cmd, PiEventAddr);
 #endif
 
-  return processDepEvent(std::move(Event), DepDesc{nullptr, nullptr, nullptr});
+  return processDepEvent(std::move(Event), DepDesc{nullptr, nullptr, nullptr},
+                         ToCleanUp);
 }
 
 void Command::emitEnqueuedEventSignal(RT::PiEvent &PiEventAddr) {
@@ -622,7 +633,8 @@ void Command::emitInstrumentation(uint16_t Type, const char *Txt) {
 #endif
 }
 
-bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking) {
+bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking,
+                      std::vector<Command *> &ToCleanUp) {
   // Exit if already enqueued
   if (MEnqueueStatus == EnqueueResultT::SyclEnqueueSuccess)
     return true;
@@ -691,6 +703,12 @@ bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking,
       // Consider the command is successfully enqueued if return code is
       // CL_SUCCESS
       MEnqueueStatus = EnqueueResultT::SyclEnqueueSuccess;
+      if (MLeafCounter == 0 && supportsPostEnqueueCleanup() &&
+          !SYCLConfig<SYCL_DISABLE_POST_ENQUEUE_CLEANUP>::get()) {
+        assert(!MPostEnqueueCleanup);
+        MPostEnqueueCleanup = true;
+        ToCleanUp.push_back(this);
+      }
     }
 
     // Emit this correlation signal before the task end
@@ -783,6 +801,8 @@ void AllocaCommandBase::emitInstrumentationData() {
 
 bool AllocaCommandBase::producesPiEvent() const { return false; }
 
+bool AllocaCommandBase::supportsPostEnqueueCleanup() const { return false; }
+
 AllocaCommand::AllocaCommand(QueueImplPtr Queue, Requirement Req,
                              bool InitFromUserData,
                              AllocaCommandBase *LinkedAllocaCmd)
@@ -793,8 +813,11 @@ AllocaCommand::AllocaCommand(QueueImplPtr Queue, Requirement Req,
   // so this call must be before the addDep() call.
   emitInstrumentationDataProxy();
   // "Nothing to depend on"
-  Command *ConnectionCmd = addDep(DepDesc(nullptr, getRequirement(), this));
+  std::vector<Command *> ToCleanUp;
+  Command *ConnectionCmd =
+      addDep(DepDesc(nullptr, getRequirement(), this), ToCleanUp);
   assert(ConnectionCmd == nullptr);
+  assert(ToCleanUp.empty());
   (void)ConnectionCmd;
 }
@@ -859,7 +882,8 @@ void AllocaCommand::printDot(std::ostream &Stream) const {
 
 AllocaSubBufCommand::AllocaSubBufCommand(QueueImplPtr Queue, Requirement Req,
                                          AllocaCommandBase *ParentAlloca,
-                                         std::vector<Command *> &ToEnqueue)
+                                         std::vector<Command *> &ToEnqueue,
+                                         std::vector<Command *> &ToCleanUp)
     : AllocaCommandBase(CommandType::ALLOCA_SUB_BUF, std::move(Queue),
                         std::move(Req),
                         /*LinkedAllocaCmd*/ nullptr),
@@ -868,8 +892,8 @@ AllocaSubBufCommand::AllocaSubBufCommand(QueueImplPtr Queue, Requirement Req,
   // is added to this node, so this call must be before
   // the addDep() call.
   emitInstrumentationDataProxy();
-  Command *ConnectionCmd =
-      addDep(DepDesc(MParentAlloca, getRequirement(), MParentAlloca));
+  Command *ConnectionCmd = addDep(
+      DepDesc(MParentAlloca, getRequirement(), MParentAlloca), ToCleanUp);
   if (ConnectionCmd)
     ToEnqueue.push_back(ConnectionCmd);
 }
@@ -1049,6 +1073,8 @@ void ReleaseCommand::printDot(std::ostream &Stream) const {
 
 bool ReleaseCommand::producesPiEvent() const { return false; }
 
+bool ReleaseCommand::supportsPostEnqueueCleanup() const { return false; }
+
 MapMemObject::MapMemObject(AllocaCommandBase *SrcAllocaCmd, Requirement Req,
                            void **DstPtr, QueueImplPtr Queue,
                            access::mode MapMode)
@@ -1441,8 +1467,11 @@ void EmptyCommand::addRequirement(Command *DepCmd, AllocaCommandBase *AllocaCmd,
   const Requirement *const StoredReq = &MRequirements.back();
 
   // EmptyCommand is always host one, so we believe that result of addDep is nil
-  Command *Cmd = addDep(DepDesc{DepCmd, StoredReq, AllocaCmd});
+  std::vector<Command *> ToCleanUp;
+  Command *Cmd = addDep(DepDesc{DepCmd, StoredReq, AllocaCmd}, ToCleanUp);
   assert(Cmd == nullptr && "Conection command should be null for EmptyCommand");
+  assert(ToCleanUp.empty() && "addDep should add a command for cleanup only if "
+                              "there's a connection command");
   (void)Cmd;
 }
@@ -1577,9 +1606,14 @@ ExecCGCommand::ExecCGCommand(std::unique_ptr<detail::CG> CommandGroup,
                              QueueImplPtr Queue)
     : Command(CommandType::RUN_CG, std::move(Queue)),
       MCommandGroup(std::move(CommandGroup)) {
-  if (MCommandGroup->getType() == detail::CG::CodeplayHostTask)
+  if (MCommandGroup->getType() == detail::CG::CodeplayHostTask) {
     MSubmittedQueue =
         static_cast<detail::CGHostTask *>(MCommandGroup.get())->MQueue;
+    MEvent->setNeedsCleanupAfterWait(true);
+  } else if (MCommandGroup->getType() == CG::CGTYPE::Kernel &&
+             (static_cast<CGExecKernel *>(MCommandGroup.get()))->hasStreams())
+    MEvent->setNeedsCleanupAfterWait(true);
+
   emitInstrumentationDataProxy();
 }
@@ -2355,6 +2389,15 @@ bool ExecCGCommand::producesPiEvent() const {
   return MCommandGroup->getType() != CG::CGTYPE::CodeplayHostTask;
 }
 
+bool ExecCGCommand::supportsPostEnqueueCleanup() const {
+  // TODO enable cleaning up host task commands and kernels with streams after
+  // enqueue
+  return Command::supportsPostEnqueueCleanup() &&
+         (MCommandGroup->getType() != CG::CGTYPE::CodeplayHostTask) &&
+         (MCommandGroup->getType() != CG::CGTYPE::Kernel ||
+          !(static_cast<CGExecKernel *>(MCommandGroup.get()))->hasStreams());
+}
+
 } // namespace detail
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
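The host-task and stream exclusions in `supportsPostEnqueueCleanup` have a user-visible counterpart. A hedged sketch (kernel name and stream sizes are illustrative): the command for the kernel below must outlive its enqueue because the stream's contents are flushed only when the command completes, so it is tagged for cleanup-after-wait instead of being freed at submission.

```cpp
#include <CL/sycl.hpp>

int main() {
  cl::sycl::queue Q;
  Q.submit([&](cl::sycl::handler &CGH) {
    // 1024-byte stream buffer, 256-byte max statement size (illustrative).
    cl::sycl::stream Out(1024, 256, CGH);
    CGH.single_task<class StreamKernel>([=] { Out << "hi\n"; });
  });
  Q.wait(); // flushes the stream; only now can the command be cleaned up
}
```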
diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp
index e90ff2422d3bc..0f21a0e639491 100644
--- a/sycl/source/detail/scheduler/commands.hpp
+++ b/sycl/source/detail/scheduler/commands.hpp
@@ -107,11 +107,17 @@ class Command {
   Command(CommandType Type, QueueImplPtr Queue);
 
+  /// \param NewDep dependency to be added
+  /// \param ToCleanUp container for commands that can be cleaned up.
   /// \return an optional connection cmd to enqueue
-  [[nodiscard]] Command *addDep(DepDesc NewDep);
+  [[nodiscard]] Command *addDep(DepDesc NewDep,
+                                std::vector<Command *> &ToCleanUp);
 
+  /// \param Event event to be added as a dependency
+  /// \param ToCleanUp container for commands that can be cleaned up.
   /// \return an optional connection cmd to enqueue
-  [[nodiscard]] Command *addDep(EventImplPtr Event);
+  [[nodiscard]] Command *addDep(EventImplPtr Event,
+                                std::vector<Command *> &ToCleanUp);
 
   void addUser(Command *NewUser) { MUsers.insert(NewUser); }
@@ -123,8 +129,10 @@ class Command {
   /// \param EnqueueResult is set to the specific status if enqueue failed.
   /// \param Blocking if this argument is true, function will wait for the
   /// command to be unblocked before calling enqueueImp.
+  /// \param ToCleanUp container for commands that can be cleaned up.
   /// \return true if the command is enqueued.
-  virtual bool enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking);
+  virtual bool enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking,
+                       std::vector<Command *> &ToCleanUp);
 
   bool isFinished();
@@ -194,6 +202,9 @@ class Command {
   /// Returns true iff the command produces a PI event on non-host devices.
   virtual bool producesPiEvent() const;
 
+  /// Returns true iff this command can be freed by post enqueue cleanup.
+  virtual bool supportsPostEnqueueCleanup() const;
+
 protected:
   QueueImplPtr MQueue;
   QueueImplPtr MSubmittedQueue;
@@ -212,6 +223,7 @@ class Command {
   /// Perform glueing of events from different contexts
   /// \param DepEvent event this commands should depend on
   /// \param Dep optional DepDesc to perform connection of events properly
+  /// \param ToCleanUp container for commands that can be cleaned up.
   /// \return returns an optional connection command to enqueue
   ///
   /// Glueing (i.e. connecting) will be performed if and only if DepEvent is
@@ -220,7 +232,8 @@ class Command {
   ///
   /// Optionality of Dep is set by Dep.MDepCommand not equal to nullptr.
   [[nodiscard]] Command *processDepEvent(EventImplPtr DepEvent,
-                                         const DepDesc &Dep);
+                                         const DepDesc &Dep,
+                                         std::vector<Command *> &ToCleanUp);
 
   /// Private interface. Derived classes should implement this method.
   virtual cl_int enqueueImp() = 0;
@@ -297,6 +310,10 @@ class Command {
   // By default the flag is set to true due to most of host operations are
   // synchronous. The only asynchronous operation currently is host-task.
   bool MShouldCompleteEventIfPossible = true;
+
+  /// Indicates that the node will be freed by cleanup after enqueue. Such
+  /// nodes should be ignored by other cleanup mechanisms.
+  bool MPostEnqueueCleanup = false;
 };
 
 /// The empty command does nothing during enqueue. The task can be used to
@@ -332,6 +349,7 @@ class ReleaseCommand : public Command {
   void printDot(std::ostream &Stream) const final;
   void emitInstrumentationData() override;
   bool producesPiEvent() const final;
+  bool supportsPostEnqueueCleanup() const final;
 
 private:
   cl_int enqueueImp() final;
@@ -358,6 +376,8 @@ class AllocaCommandBase : public Command {
 
   bool producesPiEvent() const final;
 
+  bool supportsPostEnqueueCleanup() const final;
+
   void *MMemAllocation = nullptr;
 
   /// Alloca command linked with current command.
@@ -403,7 +423,8 @@ class AllocaSubBufCommand : public AllocaCommandBase {
 public:
   AllocaSubBufCommand(QueueImplPtr Queue, Requirement Req,
                       AllocaCommandBase *ParentAlloca,
-                      std::vector<Command *> &ToEnqueue);
+                      std::vector<Command *> &ToEnqueue,
+                      std::vector<Command *> &ToCleanUp);
 
   void *getMemAllocation() const final;
   void printDot(std::ostream &Stream) const final;
@@ -541,6 +562,8 @@ class ExecCGCommand : public Command {
 
   bool producesPiEvent() const final;
 
+  bool supportsPostEnqueueCleanup() const final;
+
 private:
   cl_int enqueueImp() final;
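The `ToCleanUp` out-parameter threaded through these interfaces establishes a collect-then-free convention: candidates are gathered while the graph is being mutated and freed only after the traversal finishes. A standalone toy showing why the deferral matters (the `Cmd` type below is a simplified stand-in, not the real class):

```cpp
#include <set>
#include <vector>

// Toy command: one leaf reference, freed once it drops to zero.
struct Cmd {
  int LeafCounter = 1;
};

// Removing commands from a leaf set must not delete them mid-iteration;
// candidates are collected and freed by the caller afterwards.
void removeFromLeaves(std::set<Cmd *> &Leaves, std::vector<Cmd *> &ToCleanUp) {
  for (Cmd *C : Leaves)
    if (--C->LeafCounter == 0)
      ToCleanUp.push_back(C);
  Leaves.clear();
}

int main() {
  std::set<Cmd *> Leaves{new Cmd, new Cmd};
  std::vector<Cmd *> ToCleanUp;
  removeFromLeaves(Leaves, ToCleanUp);
  for (Cmd *C : ToCleanUp) // single cleanup pass once iteration is done
    delete C;
}
```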
diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp
index 61f10b41845cd..ec5f9713657c6 100644
--- a/sycl/source/detail/scheduler/graph_builder.cpp
+++ b/sycl/source/detail/scheduler/graph_builder.cpp
@@ -190,10 +190,17 @@ MemObjRecord *Scheduler::GraphBuilder::getOrInsertMemObjRecord(
     // of the requirements for the current record
     DepDesc Dep = findDepForRecord(Dependant, Record);
     Dep.MDepCommand = Dependency;
-    if (Command *ConnectionCmd = Dependant->addDep(Dep))
+    std::vector<Command *> ToCleanUp;
+    if (Command *ConnectionCmd = Dependant->addDep(Dep, ToCleanUp))
       ToEnqueue.push_back(ConnectionCmd);
     Dependency->addUser(Dependant);
     --(Dependency->MLeafCounter);
+    if (Dependency->MLeafCounter == 0 &&
+        Dependency->isSuccessfullyEnqueued() &&
+        Dependency->supportsPostEnqueueCleanup())
+      ToCleanUp.push_back(Dependency);
+    for (Command *Cmd : ToCleanUp)
+      cleanupCommand(Cmd);
   };
 
   const ContextImplPtr &InteropCtxPtr = Req->MSYCLMemObj->getInteropContext();
@@ -227,15 +234,21 @@ MemObjRecord *Scheduler::GraphBuilder::getOrInsertMemObjRecord(
 
 void Scheduler::GraphBuilder::updateLeaves(const std::set<Command *> &Cmds,
                                            MemObjRecord *Record,
-                                           access::mode AccessMode) {
+                                           access::mode AccessMode,
+                                           std::vector<Command *> &ToCleanUp) {
 
   const bool ReadOnlyReq = AccessMode == access::mode::read;
   if (ReadOnlyReq)
     return;
 
   for (Command *Cmd : Cmds) {
+    bool WasLeaf = Cmd->MLeafCounter > 0;
     Cmd->MLeafCounter -= Record->MReadLeaves.remove(Cmd);
     Cmd->MLeafCounter -= Record->MWriteLeaves.remove(Cmd);
+    if (WasLeaf && Cmd->MLeafCounter == 0 && Cmd->isSuccessfullyEnqueued() &&
+        Cmd->supportsPostEnqueueCleanup()) {
+      ToCleanUp.push_back(Cmd);
+    }
   }
 }
@@ -263,15 +276,18 @@ UpdateHostRequirementCommand *Scheduler::GraphBuilder::insertUpdateHostReqCmd(
   std::set<Command *> Deps =
       findDepsForReq(Record, Req, Queue->getContextImplPtr());
+  std::vector<Command *> ToCleanUp;
   for (Command *Dep : Deps) {
     Command *ConnCmd =
-        UpdateCommand->addDep(DepDesc{Dep, StoredReq, AllocaCmd});
+        UpdateCommand->addDep(DepDesc{Dep, StoredReq, AllocaCmd}, ToCleanUp);
     if (ConnCmd)
       ToEnqueue.push_back(ConnCmd);
     Dep->addUser(UpdateCommand);
   }
-  updateLeaves(Deps, Record, Req->MAccessMode);
+  updateLeaves(Deps, Record, Req->MAccessMode, ToCleanUp);
   addNodeToLeaves(Record, UpdateCommand, Req->MAccessMode, ToEnqueue);
+  for (Command *Cmd : ToCleanUp)
+    cleanupCommand(Cmd);
   return UpdateCommand;
 }
@@ -376,16 +392,18 @@ Command *Scheduler::GraphBuilder::insertMemoryMove(
                                AllocaCmdSrc->getQueue(),
                                AllocaCmdDst->getQueue());
     }
   }
-
+  std::vector<Command *> ToCleanUp;
   for (Command *Dep : Deps) {
-    Command *ConnCmd =
-        NewCmd->addDep(DepDesc{Dep, NewCmd->getRequirement(), AllocaCmdDst});
+    Command *ConnCmd = NewCmd->addDep(
+        DepDesc{Dep, NewCmd->getRequirement(), AllocaCmdDst}, ToCleanUp);
     if (ConnCmd)
       ToEnqueue.push_back(ConnCmd);
     Dep->addUser(NewCmd);
   }
-  updateLeaves(Deps, Record, access::mode::read_write);
+  updateLeaves(Deps, Record, access::mode::read_write, ToCleanUp);
   addNodeToLeaves(Record, NewCmd, access::mode::read_write, ToEnqueue);
+  for (Command *Cmd : ToCleanUp)
+    cleanupCommand(Cmd);
   Record->MCurContext = Queue->getContextImplPtr();
   return NewCmd;
 }
@@ -414,22 +432,25 @@ Command *Scheduler::GraphBuilder::remapMemoryObject(
       LinkedAllocaCmd, *LinkedAllocaCmd->getRequirement(),
       &HostAllocaCmd->MMemAllocation, LinkedAllocaCmd->getQueue(), MapMode);
 
+  std::vector<Command *> ToCleanUp;
   for (Command *Dep : Deps) {
     Command *ConnCmd = UnMapCmd->addDep(
-        DepDesc{Dep, UnMapCmd->getRequirement(), LinkedAllocaCmd});
+        DepDesc{Dep, UnMapCmd->getRequirement(), LinkedAllocaCmd}, ToCleanUp);
     if (ConnCmd)
       ToEnqueue.push_back(ConnCmd);
     Dep->addUser(UnMapCmd);
   }
 
   Command *ConnCmd = MapCmd->addDep(
-      DepDesc{UnMapCmd, MapCmd->getRequirement(), HostAllocaCmd});
+      DepDesc{UnMapCmd, MapCmd->getRequirement(), HostAllocaCmd}, ToCleanUp);
   if (ConnCmd)
     ToEnqueue.push_back(ConnCmd);
   UnMapCmd->addUser(MapCmd);
 
-  updateLeaves(Deps, Record, access::mode::read_write);
+  updateLeaves(Deps, Record, access::mode::read_write, ToCleanUp);
   addNodeToLeaves(Record, MapCmd, access::mode::read_write, ToEnqueue);
+  for (Command *Cmd : ToCleanUp)
+    cleanupCommand(Cmd);
   Record->MHostAccess = MapMode;
   return MapCmd;
 }
@@ -462,16 +483,20 @@ Scheduler::GraphBuilder::addCopyBack(Requirement *Req,
     throw runtime_error("Out of host memory", PI_OUT_OF_HOST_MEMORY);
 
   MemCpyCommandHost *MemCpyCmd = MemCpyCmdUniquePtr.release();
+
+  std::vector<Command *> ToCleanUp;
   for (Command *Dep : Deps) {
     Command *ConnCmd = MemCpyCmd->addDep(
-        DepDesc{Dep, MemCpyCmd->getRequirement(), SrcAllocaCmd});
+        DepDesc{Dep, MemCpyCmd->getRequirement(), SrcAllocaCmd}, ToCleanUp);
     if (ConnCmd)
       ToEnqueue.push_back(ConnCmd);
     Dep->addUser(MemCpyCmd);
   }
-  updateLeaves(Deps, Record, Req->MAccessMode);
+  updateLeaves(Deps, Record, Req->MAccessMode, ToCleanUp);
   addNodeToLeaves(Record, MemCpyCmd, Req->MAccessMode, ToEnqueue);
+  for (Command *Cmd : ToCleanUp)
+    cleanupCommand(Cmd);
   if (MPrintOptionsArray[AfterAddCopyBack])
     printGraphAsDot("after_addCopyBack");
   return MemCpyCmd;
@@ -651,6 +676,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
       findAllocaForReq(Record, Req, Queue->getContextImplPtr());
 
   if (!AllocaCmd) {
+    std::vector<Command *> ToCleanUp;
     if (IsSuitableSubReq(Req)) {
       // Get parent requirement. It's hard to get right parents' range
       // so full parent requirement has range represented in bytes
@@ -662,7 +688,8 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
       auto *ParentAlloca =
           getOrCreateAllocaForReq(Record, &ParentRequirement, Queue, ToEnqueue);
-      AllocaCmd = new AllocaSubBufCommand(Queue, *Req, ParentAlloca, ToEnqueue);
+      AllocaCmd = new AllocaSubBufCommand(Queue, *Req, ParentAlloca, ToEnqueue,
+                                          ToCleanUp);
     } else {
 
       const Requirement FullReq(/*Offset*/ {0, 0, 0}, Req->MMemoryRange,
@@ -748,8 +775,10 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
 
       // Update linked command
       if (LinkedAllocaCmd) {
-        Command *ConnCmd = AllocaCmd->addDep(DepDesc{
-            LinkedAllocaCmd, AllocaCmd->getRequirement(), LinkedAllocaCmd});
+        Command *ConnCmd = AllocaCmd->addDep(
+            DepDesc{LinkedAllocaCmd, AllocaCmd->getRequirement(),
+                    LinkedAllocaCmd},
+            ToCleanUp);
         if (ConnCmd)
           ToEnqueue.push_back(ConnCmd);
         LinkedAllocaCmd->addUser(AllocaCmd);
@@ -758,7 +787,8 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
         // To ensure that the leader allocation is removed first
         ConnCmd = AllocaCmd->getReleaseCmd()->addDep(
             DepDesc(LinkedAllocaCmd->getReleaseCmd(),
-                    AllocaCmd->getRequirement(), LinkedAllocaCmd));
+                    AllocaCmd->getRequirement(), LinkedAllocaCmd),
+            ToCleanUp);
         if (ConnCmd)
           ToEnqueue.push_back(ConnCmd);
@@ -775,13 +805,13 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
           std::set<Command *> Deps =
               findDepsForReq(Record, Req, Queue->getContextImplPtr());
           for (Command *Dep : Deps) {
-            Command *ConnCmd =
-                AllocaCmd->addDep(DepDesc{Dep, Req, LinkedAllocaCmd});
+            Command *ConnCmd = AllocaCmd->addDep(
+                DepDesc{Dep, Req, LinkedAllocaCmd}, ToCleanUp);
             if (ConnCmd)
               ToEnqueue.push_back(ConnCmd);
             Dep->addUser(AllocaCmd);
           }
-          updateLeaves(Deps, Record, Req->MAccessMode);
+          updateLeaves(Deps, Record, Req->MAccessMode, ToCleanUp);
           addNodeToLeaves(Record, AllocaCmd, Req->MAccessMode, ToEnqueue);
         }
       }
@@ -790,6 +820,8 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
     Record->MAllocaCommands.push_back(AllocaCmd);
     Record->MWriteLeaves.push_back(AllocaCmd, ToEnqueue);
     ++(AllocaCmd->MLeafCounter);
+    for (Command *Cmd : ToCleanUp)
+      cleanupCommand(Cmd);
   }
   return AllocaCmd;
 }
@@ -838,13 +870,16 @@ Scheduler::GraphBuilder::addEmptyCmd(Command *Cmd,
                                      const std::vector<Requirement *> &Reqs,
   Cmd->addUser(EmptyCmd);
 
   const std::vector<DepDesc> &Deps = Cmd->MDeps;
+  std::vector<Command *> ToCleanUp;
   for (const DepDesc &Dep : Deps) {
     const Requirement *Req = Dep.MDepRequirement;
     MemObjRecord *Record = getMemObjRecord(Req->MSYCLMemObj);
 
-    updateLeaves({Cmd}, Record, Req->MAccessMode);
+    updateLeaves({Cmd}, Record, Req->MAccessMode, ToCleanUp);
     addNodeToLeaves(Record, EmptyCmd, Req->MAccessMode, ToEnqueue);
   }
+  for (Command *Cmd : ToCleanUp)
+    cleanupCommand(Cmd);
 
   return EmptyCmd;
 }
@@ -898,6 +933,7 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr<detail::CG> CommandGroup,
   // AllocaCommand creation will be dependent on the access mode of the first
   // requirement. Combine these access modes to take all of them into account.
   combineAccessModesOfReqs(Reqs);
+  std::vector<Command *> ToCleanUp;
   for (Requirement *Req : Reqs) {
     MemObjRecord *Record = nullptr;
     AllocaCommandBase *AllocaCmd = nullptr;
@@ -955,7 +991,8 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr<detail::CG> CommandGroup,
         findDepsForReq(Record, Req, Queue->getContextImplPtr());
 
     for (Command *Dep : Deps)
-      if (Command *ConnCmd = NewCmd->addDep(DepDesc{Dep, Req, AllocaCmd}))
+      if (Command *ConnCmd =
+              NewCmd->addDep(DepDesc{Dep, Req, AllocaCmd}, ToCleanUp))
         ToEnqueue.push_back(ConnCmd);
   }
@@ -968,13 +1005,13 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr<detail::CG> CommandGroup,
     Dep.MDepCommand->addUser(NewCmd.get());
     const Requirement *Req = Dep.MDepRequirement;
     MemObjRecord *Record = getMemObjRecord(Req->MSYCLMemObj);
-    updateLeaves({Dep.MDepCommand}, Record, Req->MAccessMode);
+    updateLeaves({Dep.MDepCommand}, Record, Req->MAccessMode, ToCleanUp);
     addNodeToLeaves(Record, NewCmd.get(), Req->MAccessMode, ToEnqueue);
   }
 
   // Register all the events as dependencies
   for (detail::EventImplPtr e : Events) {
-    if (Command *ConnCmd = NewCmd->addDep(e))
+    if (Command *ConnCmd = NewCmd->addDep(e, ToCleanUp))
       ToEnqueue.push_back(ConnCmd);
   }
@@ -986,6 +1023,8 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr<detail::CG> CommandGroup,
   if (MPrintOptionsArray[AfterAddCG])
     printGraphAsDot("after_addCG");
 
+  for (Command *Cmd : ToCleanUp)
+    cleanupCommand(Cmd);
   return NewCmd.release();
 }
@@ -993,9 +1032,15 @@ void Scheduler::GraphBuilder::decrementLeafCountersForRecord(
     MemObjRecord *Record) {
   for (Command *Cmd : Record->MReadLeaves) {
     --(Cmd->MLeafCounter);
+    if (Cmd->MLeafCounter == 0 && Cmd->isSuccessfullyEnqueued() &&
+        Cmd->supportsPostEnqueueCleanup())
+      cleanupCommand(Cmd);
   }
   for (Command *Cmd : Record->MWriteLeaves) {
     --(Cmd->MLeafCounter);
+    if (Cmd->MLeafCounter == 0 && Cmd->isSuccessfullyEnqueued() &&
+        Cmd->supportsPostEnqueueCleanup())
+      cleanupCommand(Cmd);
   }
 }
@@ -1088,14 +1133,64 @@ void Scheduler::GraphBuilder::cleanupCommandsForRecord(
     // If all dependencies have been removed this way, mark the command for
     // deletion
     if (Cmd->MDeps.empty()) {
-      Cmd->MMarks.MToBeDeleted = true;
       Cmd->MUsers.clear();
+      // Do not delete the node if it's scheduled for post-enqueue cleanup to
+      // avoid double free.
+      if (!Cmd->MPostEnqueueCleanup)
+        Cmd->MMarks.MToBeDeleted = true;
     }
   }
 
   handleVisitedNodes(MVisitedCmds);
 }
 
+void Scheduler::GraphBuilder::cleanupCommand(Command *Cmd) {
+  if (SYCLConfig<SYCL_DISABLE_POST_ENQUEUE_CLEANUP>::get())
+    return;
+  assert(Cmd->MLeafCounter == 0 && Cmd->isSuccessfullyEnqueued());
+  Command::CommandType CmdT = Cmd->getType();
+
+  assert(CmdT != Command::ALLOCA && CmdT != Command::ALLOCA_SUB_BUF);
+  assert(CmdT != Command::RELEASE);
+  assert(CmdT != Command::RUN_CG ||
+         (static_cast<ExecCGCommand *>(Cmd))->getCG().getType() !=
+             CG::CGTYPE::CodeplayHostTask);
+#ifndef NDEBUG
+  if (CmdT == Command::RUN_CG) {
+    auto *ExecCGCmd = static_cast<ExecCGCommand *>(Cmd);
+    if (ExecCGCmd->getCG().getType() == CG::CGTYPE::Kernel) {
+      auto *ExecKernelCG = static_cast<CGExecKernel *>(&ExecCGCmd->getCG());
+      assert(!ExecKernelCG->hasStreams());
+    }
+  }
+#endif
+  (void)CmdT;
+
+  for (Command *UserCmd : Cmd->MUsers) {
+    for (DepDesc &Dep : UserCmd->MDeps) {
+      // Link the users of the command to the alloca command(s) instead
+      if (Dep.MDepCommand == Cmd) {
+        // ... unless the user is the alloca itself.
+        if (Dep.MAllocaCmd == UserCmd) {
+          Dep.MDepCommand = nullptr;
+        } else {
+          Dep.MDepCommand = Dep.MAllocaCmd;
+          Dep.MDepCommand->MUsers.insert(UserCmd);
+        }
+      }
+    }
+  }
+  // Update dependency users
+  for (DepDesc &Dep : Cmd->MDeps) {
+    Command *DepCmd = Dep.MDepCommand;
+    DepCmd->MUsers.erase(Cmd);
+  }
+
+  Cmd->getEvent()->setCommand(nullptr);
+  Cmd->getEvent()->cleanupDependencyEvents();
+  delete Cmd;
+}
+
 void Scheduler::GraphBuilder::cleanupFinishedCommands(
     Command *FinishedCmd,
     std::vector<std::shared_ptr<stream_impl>> &StreamsToDeallocate) {
@@ -1148,7 +1243,14 @@ void Scheduler::GraphBuilder::cleanupFinishedCommands(
       DepCmd->MUsers.erase(Cmd);
     }
 
-    Cmd->MMarks.MToBeDeleted = true;
+    // Isolate the node instead of deleting it if it's scheduled for
+    // post-enqueue cleanup to avoid double free.
+    if (Cmd->MPostEnqueueCleanup) {
+      Cmd->MDeps.clear();
+      Cmd->MUsers.clear();
+    } else {
+      Cmd->MMarks.MToBeDeleted = true;
+    }
   }
   handleVisitedNodes(MVisitedCmds);
 }
@@ -1175,9 +1277,9 @@ void Scheduler::GraphBuilder::removeRecordForMemObj(SYCLMemObjI *MemObject) {
 // requirement in Dep we make ConnectCmd depend on DepEvent's command with this
 // requirement.
 // Optionality of Dep is set by Dep.MDepCommand equal to nullptr.
-Command *Scheduler::GraphBuilder::connectDepEvent(Command *const Cmd,
-                                                  EventImplPtr DepEvent,
-                                                  const DepDesc &Dep) {
+Command *Scheduler::GraphBuilder::connectDepEvent(
    Command *const Cmd, EventImplPtr DepEvent, const DepDesc &Dep,
+    std::vector<Command *> &ToCleanUp) {
   assert(Cmd->getWorkerContext() != DepEvent->getContextImpl());
 
   // construct Host Task type command manually and make it depend on DepEvent
@@ -1206,14 +1308,15 @@ Command *Scheduler::GraphBuilder::connectDepEvent(
       // make ConnectCmd depend on requirement
       // Dismiss the result here as it's not a connection now,
       // 'cause ConnectCmd is host one
-      (void)ConnectCmd->addDep(Dep);
+      (void)ConnectCmd->addDep(Dep, ToCleanUp);
       assert(reinterpret_cast<Command *>(DepEvent->getCommand()) ==
              Dep.MDepCommand);
       // add user to Dep.MDepCommand is already performed beyond this if branch
 
       MemObjRecord *Record = getMemObjRecord(Dep.MDepRequirement->MSYCLMemObj);
+      updateLeaves({Dep.MDepCommand}, Record, Dep.MDepRequirement->MAccessMode,
+                   ToCleanUp);
 
-      updateLeaves({Dep.MDepCommand}, Record, Dep.MDepRequirement->MAccessMode);
       std::vector<Command *> ToEnqueue;
       addNodeToLeaves(Record, ConnectCmd, Dep.MDepRequirement->MAccessMode,
                       ToEnqueue);
@@ -1233,7 +1336,7 @@ Command *Scheduler::GraphBuilder::connectDepEvent(
 
       // Dismiss the result here as it's not a connection now,
       // 'cause EmptyCmd is host one
-      (void)Cmd->addDep(CmdDep);
+      (void)Cmd->addDep(CmdDep, ToCleanUp);
     }
   } else {
     std::vector<Command *> ToEnqueue;
@@ -1246,13 +1349,13 @@ Command *Scheduler::GraphBuilder::connectDepEvent(
     // ConnectCmd via its event.
     // Dismiss the result here as it's not a connection now,
    // 'cause ConnectCmd is host one.
-    (void)EmptyCmd->addDep(ConnectCmd->getEvent());
-    (void)ConnectCmd->addDep(DepEvent);
+    (void)EmptyCmd->addDep(ConnectCmd->getEvent(), ToCleanUp);
+    (void)ConnectCmd->addDep(DepEvent, ToCleanUp);
 
     // Depend Cmd on empty command
     // Dismiss the result here as it's not a connection now,
     // 'cause EmptyCmd is host one
-    (void)Cmd->addDep(EmptyCmd->getEvent());
+    (void)Cmd->addDep(EmptyCmd->getEvent(), ToCleanUp);
   }
 
   EmptyCmd->addUser(Cmd);
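The user-relinking loop in `cleanupCommand` above can be modeled on a toy graph: deleting the middle node of `Alloca <- Cmd <- User` re-points `User` at the alloca so later dependency walks still reach the allocation that owns the memory. A self-contained toy version (the `ToyCmd` type and `eraseKeepingAlloca` helper are invented for illustration, with simplified semantics):

```cpp
#include <algorithm>
#include <set>
#include <vector>

struct ToyCmd {
  std::vector<ToyCmd *> Deps; // commands this one depends on
  std::set<ToyCmd *> Users;   // commands that depend on this one
};

// Erase Cmd but keep the graph connected through its alloca command.
void eraseKeepingAlloca(ToyCmd *Cmd, ToyCmd *Alloca) {
  for (ToyCmd *User : Cmd->Users) {
    std::replace(User->Deps.begin(), User->Deps.end(), Cmd, Alloca);
    Alloca->Users.insert(User);
  }
  for (ToyCmd *Dep : Cmd->Deps)
    Dep->Users.erase(Cmd);
  delete Cmd;
}

int main() {
  auto *Alloca = new ToyCmd;
  auto *Mid = new ToyCmd{{Alloca}, {}};
  auto *User = new ToyCmd{{Mid}, {}};
  Alloca->Users.insert(Mid);
  Mid->Users.insert(User);
  eraseKeepingAlloca(Mid, Alloca); // now: Alloca <- User
  delete User;
  delete Alloca;
}
```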
diff --git a/sycl/source/detail/scheduler/graph_processor.cpp b/sycl/source/detail/scheduler/graph_processor.cpp
index 69dbb626f9dc2..6e533df30a09c 100644
--- a/sycl/source/detail/scheduler/graph_processor.cpp
+++ b/sycl/source/detail/scheduler/graph_processor.cpp
@@ -23,6 +23,7 @@ static Command *getCommand(const EventImplPtr &Event) {
 
 void Scheduler::GraphProcessor::waitForEvent(EventImplPtr Event,
                                              ReadLockT &GraphReadLock,
+                                             std::vector<Command *> &ToCleanUp,
                                              bool LockTheLock) {
   Command *Cmd = getCommand(Event);
   // Command can be nullptr if user creates cl::sycl::event explicitly or the
@@ -31,7 +32,7 @@ void Scheduler::GraphProcessor::waitForEvent(EventImplPtr Event,
     return;
 
   EnqueueResultT Res;
-  bool Enqueued = enqueueCommand(Cmd, Res, BLOCKING);
+  bool Enqueued = enqueueCommand(Cmd, Res, ToCleanUp, BLOCKING);
   if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult)
     // TODO: Reschedule commands.
     throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION);
@@ -45,9 +46,9 @@ void Scheduler::GraphProcessor::waitForEvent(EventImplPtr Event,
     GraphReadLock.lock();
 }
 
-bool Scheduler::GraphProcessor::enqueueCommand(Command *Cmd,
-                                               EnqueueResultT &EnqueueResult,
-                                               BlockingT Blocking) {
+bool Scheduler::GraphProcessor::enqueueCommand(
+    Command *Cmd, EnqueueResultT &EnqueueResult,
+    std::vector<Command *> &ToCleanUp, BlockingT Blocking) {
   if (!Cmd || Cmd->isSuccessfullyEnqueued())
     return true;
 
@@ -60,7 +61,7 @@ bool Scheduler::GraphProcessor::enqueueCommand(
   // Recursively enqueue all the dependencies first and
   // exit immediately if any of the commands cannot be enqueued.
   for (DepDesc &Dep : Cmd->MDeps) {
-    if (!enqueueCommand(Dep.MDepCommand, EnqueueResult, Blocking))
+    if (!enqueueCommand(Dep.MDepCommand, EnqueueResult, ToCleanUp, Blocking))
       return false;
   }
 
@@ -76,7 +77,7 @@ bool Scheduler::GraphProcessor::enqueueCommand(
   // implemented.
   for (const EventImplPtr &Event : Cmd->getPreparedHostDepsEvents()) {
     if (Command *DepCmd = static_cast<Command *>(Event->getCommand()))
-      if (!enqueueCommand(DepCmd, EnqueueResult, Blocking))
+      if (!enqueueCommand(DepCmd, EnqueueResult, ToCleanUp, Blocking))
         return false;
   }
 
@@ -93,7 +94,7 @@ bool Scheduler::GraphProcessor::enqueueCommand(
   // on completion of C and starts cleanup process. This thread is still in the
   // middle of enqueue of B. The other thread modifies dependency list of A by
   // removing C out of it. Iterators become invalid.
-  return Cmd->enqueue(EnqueueResult, Blocking);
+  return Cmd->enqueue(EnqueueResult, Blocking, ToCleanUp);
 }
 
 } // namespace detail
diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp
index c17beb8a3621d..232ee0a5d6e47 100644
--- a/sycl/source/detail/scheduler/scheduler.cpp
+++ b/sycl/source/detail/scheduler/scheduler.cpp
@@ -32,31 +32,32 @@ void Scheduler::waitForRecordToFinish(MemObjRecord *Record,
   // Will contain the list of dependencies for the Release Command
   std::set<Command *> DepCommands;
 #endif
+  std::vector<Command *> ToCleanUp;
   for (Command *Cmd : Record->MReadLeaves) {
     EnqueueResultT Res;
-    bool Enqueued = GraphProcessor::enqueueCommand(Cmd, Res);
+    bool Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, ToCleanUp);
     if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult)
       throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION);
 #ifdef XPTI_ENABLE_INSTRUMENTATION
     // Capture the dependencies
     DepCommands.insert(Cmd);
 #endif
-    GraphProcessor::waitForEvent(Cmd->getEvent(), GraphReadLock);
+    GraphProcessor::waitForEvent(Cmd->getEvent(), GraphReadLock, ToCleanUp);
   }
   for (Command *Cmd : Record->MWriteLeaves) {
     EnqueueResultT Res;
-    bool Enqueued = GraphProcessor::enqueueCommand(Cmd, Res);
+    bool Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, ToCleanUp);
     if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult)
       throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION);
 #ifdef XPTI_ENABLE_INSTRUMENTATION
     DepCommands.insert(Cmd);
 #endif
-    GraphProcessor::waitForEvent(Cmd->getEvent(), GraphReadLock);
+    GraphProcessor::waitForEvent(Cmd->getEvent(), GraphReadLock, ToCleanUp);
   }
   for (AllocaCommandBase *AllocaCmd : Record->MAllocaCommands) {
     Command *ReleaseCmd = AllocaCmd->getReleaseCmd();
     EnqueueResultT Res;
-    bool Enqueued = GraphProcessor::enqueueCommand(ReleaseCmd, Res);
+    bool Enqueued = GraphProcessor::enqueueCommand(ReleaseCmd, Res, ToCleanUp);
     if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult)
       throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION);
 #ifdef XPTI_ENABLE_INSTRUMENTATION
@@ -64,7 +65,8 @@ void Scheduler::waitForRecordToFinish(MemObjRecord *Record,
     // reported as edges
     ReleaseCmd->resolveReleaseDependencies(DepCommands);
 #endif
-    GraphProcessor::waitForEvent(ReleaseCmd->getEvent(), GraphReadLock);
+    GraphProcessor::waitForEvent(ReleaseCmd->getEvent(), GraphReadLock,
+                                 ToCleanUp);
   }
 }
@@ -108,6 +110,7 @@ EventImplPtr Scheduler::addCG(std::unique_ptr<detail::CG> CommandGroup,
     NewEvent = NewCmd->getEvent();
   }
 
+  std::vector<Command *> ToCleanUp;
   {
     ReadLockT Lock(MGraphLock);
@@ -127,7 +130,7 @@ EventImplPtr Scheduler::addCG(std::unique_ptr<detail::CG> CommandGroup,
     };
 
     for (Command *Cmd : AuxiliaryCmds) {
-      Enqueued = GraphProcessor::enqueueCommand(Cmd, Res);
+      Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, ToCleanUp);
       try {
         if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult)
           throw runtime_error("Auxiliary enqueue process failed.",
@@ -144,7 +147,7 @@ EventImplPtr Scheduler::addCG(std::unique_ptr<detail::CG> CommandGroup,
       // TODO: Check if lazy mode.
       EnqueueResultT Res;
      try {
-        bool Enqueued = GraphProcessor::enqueueCommand(NewCmd, Res);
+        bool Enqueued = GraphProcessor::enqueueCommand(NewCmd, Res, ToCleanUp);
        if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult)
          throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION);
      } catch (...) {
        CleanUp();
      }
    }
  }
+  cleanupCommands(ToCleanUp);
 
   for (auto StreamImplPtr : Streams) {
     StreamImplPtr->flush();
@@ -182,24 +186,27 @@ EventImplPtr Scheduler::addCopyBack(Requirement *Req) {
     return nullptr;
   }
 
+  std::vector<Command *> ToCleanUp;
   try {
     ReadLockT Lock(MGraphLock);
     EnqueueResultT Res;
     bool Enqueued;
 
     for (Command *Cmd : AuxiliaryCmds) {
-      Enqueued = GraphProcessor::enqueueCommand(Cmd, Res);
+      Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, ToCleanUp);
       if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult)
         throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION);
     }
 
-    Enqueued = GraphProcessor::enqueueCommand(NewCmd, Res);
+    Enqueued = GraphProcessor::enqueueCommand(NewCmd, Res, ToCleanUp);
     if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult)
       throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION);
   } catch (...) {
     NewCmd->getQueue()->reportAsyncException(std::current_exception());
   }
-  return NewCmd->getEvent();
+  EventImplPtr NewEvent = NewCmd->getEvent();
+  cleanupCommands(ToCleanUp);
+  return NewEvent;
 }
 
 Scheduler &Scheduler::getInstance() {
@@ -210,7 +217,10 @@ void Scheduler::waitForEvent(EventImplPtr Event) {
   ReadLockT Lock(MGraphLock);
   // It's fine to leave the lock unlocked upon return from waitForEvent as
   // there's no more actions to do here with graph
-  GraphProcessor::waitForEvent(std::move(Event), Lock, /*LockTheLock=*/false);
+  std::vector<Command *> ToCleanUp;
+  GraphProcessor::waitForEvent(std::move(Event), Lock, ToCleanUp,
+                               /*LockTheLock=*/false);
+  cleanupCommands(ToCleanUp);
 }
 
 static void deallocateStreams(
@@ -293,44 +303,51 @@ EventImplPtr Scheduler::addHostAccessor(Requirement *Req) {
   if (!NewCmd)
     return nullptr;
 
+  std::vector<Command *> ToCleanUp;
   {
     ReadLockT ReadLock(MGraphLock);
     EnqueueResultT Res;
     bool Enqueued;
 
     for (Command *Cmd : AuxiliaryCmds) {
-      Enqueued = GraphProcessor::enqueueCommand(Cmd, Res);
+      Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, ToCleanUp);
       if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult)
         throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION);
     }
 
-    Enqueued = GraphProcessor::enqueueCommand(NewCmd, Res);
+    Enqueued = GraphProcessor::enqueueCommand(NewCmd, Res, ToCleanUp);
     if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult)
       throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION);
   }
 
-  return NewCmd->getEvent();
+  EventImplPtr NewEvent = NewCmd->getEvent();
+  cleanupCommands(ToCleanUp);
+  return NewEvent;
 }
 
 void Scheduler::releaseHostAccessor(Requirement *Req) {
   Command *const BlockedCmd = Req->MBlockedCmd;
 
-  ReadLockT Lock(MGraphLock);
+  std::vector<Command *> ToCleanUp;
+  {
+    ReadLockT Lock(MGraphLock);
 
-  assert(BlockedCmd && "Can't find appropriate command to unblock");
+    assert(BlockedCmd && "Can't find appropriate command to unblock");
 
-  BlockedCmd->MEnqueueStatus = EnqueueResultT::SyclEnqueueReady;
+    BlockedCmd->MEnqueueStatus = EnqueueResultT::SyclEnqueueReady;
 
-  enqueueLeavesOfReqUnlocked(Req);
+    enqueueLeavesOfReqUnlocked(Req, ToCleanUp);
+  }
+  cleanupCommands(ToCleanUp);
 }
 
-// static
-void Scheduler::enqueueLeavesOfReqUnlocked(const Requirement *const Req) {
+void Scheduler::enqueueLeavesOfReqUnlocked(const Requirement *const Req,
+                                           std::vector<Command *> &ToCleanUp) {
   MemObjRecord *Record = Req->MSYCLMemObj->MRecord.get();
-  auto EnqueueLeaves = [](LeavesCollection &Leaves) {
+  auto EnqueueLeaves = [&ToCleanUp](LeavesCollection &Leaves) {
     for (Command *Cmd : Leaves) {
       EnqueueResultT Res;
-      bool Enqueued = GraphProcessor::enqueueCommand(Cmd, Res);
+      bool Enqueued = GraphProcessor::enqueueCommand(Cmd, Res, ToCleanUp);
       if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult)
         throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION);
     }
@@ -379,6 +396,10 @@ Scheduler::~Scheduler() {
         "not all resources were released. Please be sure that all kernels "
         "have synchronization points.\n\n");
   }
+  // There might be some commands scheduled for post enqueue cleanup that
+  // haven't been freed because of the graph mutex being locked at the time;
+  // clean them up now.
+  cleanupCommands({});
 }
 
 void Scheduler::acquireWriteLock(WriteLockT &Lock) {
@@ -406,6 +427,32 @@ MemObjRecord *Scheduler::getMemObjRecord(const Requirement *const Req) {
   return Req->MSYCLMemObj->MRecord.get();
 }
 
+void Scheduler::cleanupCommands(const std::vector<Command *> &Cmds) {
+  if (Cmds.empty()) {
+    // Even without new commands, drain any previously deferred ones; the
+    // destructor relies on this to free leftovers.
+    std::lock_guard<std::mutex> Lock{MDeferredCleanupMutex};
+    if (MDeferredCleanupCommands.empty())
+      return;
+  }
+
+  WriteLockT Lock(MGraphLock, std::try_to_lock);
+  // In order to avoid deadlocks related to blocked commands, defer cleanup if
+  // the lock wasn't acquired.
+  if (Lock.owns_lock()) {
+    for (Command *Cmd : Cmds) {
+      MGraphBuilder.cleanupCommand(Cmd);
+    }
+    std::vector<Command *> DeferredCleanupCommands;
+    {
+      std::lock_guard<std::mutex> Lock{MDeferredCleanupMutex};
+      std::swap(DeferredCleanupCommands, MDeferredCleanupCommands);
+    }
+    for (Command *Cmd : DeferredCleanupCommands) {
+      MGraphBuilder.cleanupCommand(Cmd);
+    }
+  } else {
+    std::lock_guard<std::mutex> Lock{MDeferredCleanupMutex};
+    MDeferredCleanupCommands.insert(MDeferredCleanupCommands.end(),
+                                    Cmds.begin(), Cmds.end());
+  }
+}
+
 } // namespace detail
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
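The `std::try_to_lock` in `cleanupCommands` above is the deadlock-avoidance core: a thread that fails to take the graph lock (possibly because it already holds it further up its own call stack) defers the deletions instead of blocking. A standalone sketch of the same pattern under simplified types (`Node` and `GraphLock` are stand-ins, not the real classes):

```cpp
#include <mutex>
#include <shared_mutex>
#include <vector>

struct Node {}; // stand-in for Command

std::shared_timed_mutex GraphLock; // stand-in for the scheduler's RW lock
std::mutex DeferredMutex;
std::vector<Node *> Deferred;

void cleanupNodes(const std::vector<Node *> &Nodes) {
  std::unique_lock<std::shared_timed_mutex> Lock(GraphLock, std::try_to_lock);
  if (!Lock.owns_lock()) {
    // Graph busy, possibly locked further up this call stack: defer instead
    // of deadlocking.
    std::lock_guard<std::mutex> Guard(DeferredMutex);
    Deferred.insert(Deferred.end(), Nodes.begin(), Nodes.end());
    return;
  }
  for (Node *N : Nodes)
    delete N;
  std::vector<Node *> Drained;
  {
    std::lock_guard<std::mutex> Guard(DeferredMutex);
    std::swap(Drained, Deferred);
  }
  for (Node *N : Drained) // drain anything deferred earlier, too
    delete N;
}

int main() { cleanupNodes({new Node, new Node}); }
```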
diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp
index 66e17d7862301..63e2cccaabfd2 100644
--- a/sycl/source/detail/scheduler/scheduler.hpp
+++ b/sycl/source/detail/scheduler/scheduler.hpp
@@ -459,7 +459,10 @@ class Scheduler {
   /// \param Lock is an instance of WriteLockT, created with \c std::defer_lock
   void acquireWriteLock(WriteLockT &Lock);
 
-  static void enqueueLeavesOfReqUnlocked(const Requirement *const Req);
+  void cleanupCommands(const std::vector<Command *> &Cmds);
+
+  static void enqueueLeavesOfReqUnlocked(const Requirement *const Req,
+                                         std::vector<Command *> &ToCleanUp);
 
   /// Graph builder class.
   ///
@@ -505,6 +508,8 @@ class Scheduler {
     /// with Event passed and its dependencies.
     void optimize(EventImplPtr Event);
 
+    void cleanupCommand(Command *Cmd);
+
     /// Removes finished non-leaf non-alloca commands from the subgraph
     /// (assuming that all its commands have been waited for).
     void cleanupFinishedCommands(
@@ -547,17 +552,21 @@ class Scheduler {
     /// Removes commands from leaves.
     void updateLeaves(const std::set<Command *> &Cmds, MemObjRecord *Record,
-                      access::mode AccessMode);
+                      access::mode AccessMode,
+                      std::vector<Command *> &ToCleanUp);
 
     /// Perform connection of events in multiple contexts
     /// \param Cmd dependant command
     /// \param DepEvent event to depend on
     /// \param Dep optional DepDesc to perform connection properly
+    /// \param ToCleanUp container for commands that can be cleaned up due to
+    /// their removal from leaves
     /// \returns the connecting command which is to be enqueued
     ///
     /// Optionality of Dep is set by Dep.MDepCommand equal to nullptr.
     Command *connectDepEvent(Command *const Cmd, EventImplPtr DepEvent,
-                             const DepDesc &Dep);
+                             const DepDesc &Dep,
+                             std::vector<Command *> &ToCleanUp);
 
     std::vector<SYCLMemObjI *> MMemObjs;
@@ -720,21 +729,25 @@ class Scheduler {
   public:
     /// Waits for the command, associated with Event passed, is completed.
     /// \param GraphReadLock read-lock which is already acquired for reading
+    /// \param ToCleanUp container for commands that can be cleaned up.
    /// \param LockTheLock selects if graph lock should be locked upon return
     ///
     /// The function may unlock and lock GraphReadLock as needed. Upon return
     /// the lock is left in locked state if and only if LockTheLock is true.
     static void waitForEvent(EventImplPtr Event, ReadLockT &GraphReadLock,
+                             std::vector<Command *> &ToCleanUp,
                              bool LockTheLock = true);
 
     /// Enqueues the command and all its dependencies.
     ///
     /// \param EnqueueResult is set to specific status if enqueue failed.
+    /// \param ToCleanUp container for commands that can be cleaned up.
     /// \return true if the command is successfully enqueued.
     ///
     /// The function may unlock and lock GraphReadLock as needed. Upon return
     /// the lock is left in locked state.
     static bool enqueueCommand(Command *Cmd, EnqueueResultT &EnqueueResult,
+                               std::vector<Command *> &ToCleanUp,
                                BlockingT Blocking = NON_BLOCKING);
   };
@@ -751,6 +764,9 @@ class Scheduler {
   GraphBuilder MGraphBuilder;
   RWLockT MGraphLock;
 
+  std::vector<Command *> MDeferredCleanupCommands;
+  std::mutex MDeferredCleanupMutex;
+
   QueueImplPtr DefaultHostQueue;
 
   friend class Command;
diff --git a/sycl/unittests/scheduler/BlockedCommands.cpp b/sycl/unittests/scheduler/BlockedCommands.cpp
index 967b3ee75531c..c45b785980c8b 100644
--- a/sycl/unittests/scheduler/BlockedCommands.cpp
+++ b/sycl/unittests/scheduler/BlockedCommands.cpp
@@ -81,10 +81,10 @@ TEST_F(SchedulerTest, DontEnqueueDepsIfOneOfThemIsBlocked) {
   //
   // If C is blocked, we should not try to enqueue D.
 
-  EXPECT_CALL(A, enqueue(_, _)).Times(0);
-  EXPECT_CALL(B, enqueue(_, _)).Times(1);
-  EXPECT_CALL(C, enqueue(_, _)).Times(0);
-  EXPECT_CALL(D, enqueue(_, _)).Times(0);
+  EXPECT_CALL(A, enqueue).Times(0);
+  EXPECT_CALL(B, enqueue).Times(1);
+  EXPECT_CALL(C, enqueue).Times(0);
+  EXPECT_CALL(D, enqueue).Times(0);
 
   MockScheduler MS;
   auto Lock = MS.acquireGraphReadLock();
@@ -113,8 +113,8 @@ TEST_F(SchedulerTest, EnqueueBlockedCommandEarlyExit) {
   //
   // If A is blocked, we should not try to enqueue B.
 
-  EXPECT_CALL(A, enqueue(_, _)).Times(0);
-  EXPECT_CALL(B, enqueue(_, _)).Times(0);
+  EXPECT_CALL(A, enqueue).Times(0);
+  EXPECT_CALL(B, enqueue).Times(0);
 
   MockScheduler MS;
   auto Lock = MS.acquireGraphReadLock();
@@ -127,8 +127,8 @@ TEST_F(SchedulerTest, EnqueueBlockedCommandEarlyExit) {
 
   // But if the enqueue type is blocking we should not exit early.
 
-  EXPECT_CALL(A, enqueue(_, _)).Times(0);
-  EXPECT_CALL(B, enqueue(_, _)).Times(1);
+  EXPECT_CALL(A, enqueue).Times(0);
+  EXPECT_CALL(B, enqueue).Times(1);
 
   Enqueued = MockScheduler::enqueueCommand(&A, Res, detail::BLOCKING);
   ASSERT_FALSE(Enqueued) << "Blocked command should not be enqueued\n";
@@ -154,7 +154,8 @@ TEST_F(SchedulerTest, EnqueueHostDependency) {
       new cl::sycl::detail::event_impl(detail::getSyclObjImpl(MQueue))};
   DepEvent->setCommand(&B);
 
-  (void)A.addDep(DepEvent);
+  std::vector<detail::Command *> ToCleanUp;
+  (void)A.addDep(DepEvent, ToCleanUp);
 
   // We have such a "graph":
   //
@@ -166,8 +167,8 @@ TEST_F(SchedulerTest, EnqueueHostDependency) {
   // "Graph" is quoted as we don't have this dependency in MDeps. Instead, we
   // have this dependecy as result of handler::depends_on() call.
-  EXPECT_CALL(A, enqueue(_, _)).Times(1);
-  EXPECT_CALL(B, enqueue(_, _)).Times(1);
+  EXPECT_CALL(A, enqueue).Times(1);
+  EXPECT_CALL(B, enqueue).Times(1);
 
   MockScheduler MS;
   auto Lock = MS.acquireGraphReadLock();
diff --git a/sycl/unittests/scheduler/CMakeLists.txt b/sycl/unittests/scheduler/CMakeLists.txt
index 1ec8c7b4d894a..ceb7c8990c3f2 100644
--- a/sycl/unittests/scheduler/CMakeLists.txt
+++ b/sycl/unittests/scheduler/CMakeLists.txt
@@ -16,5 +16,6 @@ add_sycl_unittest(SchedulerTests OBJECT
   AllocaLinking.cpp
   RequiredWGSize.cpp
   QueueFlushing.cpp
+  PostEnqueueCleanup.cpp
   utils.cpp
 )
diff --git a/sycl/unittests/scheduler/FailedCommands.cpp b/sycl/unittests/scheduler/FailedCommands.cpp
index 37a7a71a4afdc..4d294b8eff741 100644
--- a/sycl/unittests/scheduler/FailedCommands.cpp
+++ b/sycl/unittests/scheduler/FailedCommands.cpp
@@ -16,7 +16,8 @@ TEST_F(SchedulerTest, FailedDependency) {
   MockCommand MDep(detail::getSyclObjImpl(MQueue));
   MockCommand MUser(detail::getSyclObjImpl(MQueue));
   MDep.addUser(&MUser);
-  (void)MUser.addDep(detail::DepDesc{&MDep, &MockReq, nullptr});
+  std::vector<detail::Command *> ToCleanUp;
+  (void)MUser.addDep(detail::DepDesc{&MDep, &MockReq, nullptr}, ToCleanUp);
   MUser.MEnqueueStatus = detail::EnqueueResultT::SyclEnqueueReady;
   MDep.MEnqueueStatus = detail::EnqueueResultT::SyclEnqueueFailed;
diff --git a/sycl/unittests/scheduler/LeafLimit.cpp b/sycl/unittests/scheduler/LeafLimit.cpp
index ffed74ba0e1ec..333a06dca5bd3 100644
--- a/sycl/unittests/scheduler/LeafLimit.cpp
+++ b/sycl/unittests/scheduler/LeafLimit.cpp
@@ -9,6 +9,9 @@
 #include "SchedulerTest.hpp"
 #include "SchedulerTestUtils.hpp"
 
+#include <detail/config.hpp>
+#include <helpers/ScopedEnvVar.hpp>
+
 #include <algorithm>
 #include <memory>
 #include <vector>
@@ -16,10 +19,19 @@
 
 using namespace cl::sycl;
 
+inline constexpr auto DisablePostEnqueueCleanupName =
+    "SYCL_DISABLE_POST_ENQUEUE_CLEANUP";
+
 // Checks that scheduler's (or graph-builder's) addNodeToLeaves method works
 // correctly with dependency tracking when leaf-limit for generic commands is
 // overflowed.
 TEST_F(SchedulerTest, LeafLimit) {
+  // All of the mock commands are owned on the test side, prevent post enqueue
+  // cleanup from deleting some of them.
+  unittest::ScopedEnvVar DisabledCleanup{
+      DisablePostEnqueueCleanupName, "1",
+      detail::SYCLConfig<detail::SYCL_DISABLE_POST_ENQUEUE_CLEANUP>::reset};
+  cl::sycl::queue HQueue(host_selector{});
   MockScheduler MS;
   std::vector<std::unique_ptr<MockCommand>> LeavesToAdd;
   std::unique_ptr<MockCommand> MockDepCmd;
@@ -40,10 +52,12 @@ TEST_F(SchedulerTest, LeafLimit) {
         std::make_unique<MockCommand>(detail::getSyclObjImpl(MQueue), MockReq));
   }
   // Create edges: all soon-to-be leaves are direct users of MockDep
+  std::vector<detail::Command *> ToCleanUp;
   for (auto &Leaf : LeavesToAdd) {
     MockDepCmd->addUser(Leaf.get());
     (void)Leaf->addDep(
-        detail::DepDesc{MockDepCmd.get(), Leaf->getRequirement(), nullptr});
+        detail::DepDesc{MockDepCmd.get(), Leaf->getRequirement(), nullptr},
+        ToCleanUp);
   }
   std::vector<detail::Command *> ToEnqueue;
   // Add edges as leaves and exceed the leaf limit
diff --git a/sycl/unittests/scheduler/PostEnqueueCleanup.cpp b/sycl/unittests/scheduler/PostEnqueueCleanup.cpp
new file mode 100644
index 0000000000000..8386af6394628
--- /dev/null
+++ b/sycl/unittests/scheduler/PostEnqueueCleanup.cpp
@@ -0,0 +1,280 @@
+//==--------- PostEnqueueCleanup.cpp --- Scheduler unit tests -------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
diff --git a/sycl/unittests/scheduler/PostEnqueueCleanup.cpp b/sycl/unittests/scheduler/PostEnqueueCleanup.cpp
new file mode 100644
index 0000000000000..8386af6394628
--- /dev/null
+++ b/sycl/unittests/scheduler/PostEnqueueCleanup.cpp
@@ -0,0 +1,280 @@
+//==--------- PostEnqueueCleanup.cpp --- Scheduler unit tests --------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SchedulerTest.hpp"
+#include "SchedulerTestUtils.hpp"
+
+#include <detail/config.hpp>
+#include <helpers/PiMock.hpp>
+#include <helpers/ScopedEnvVar.hpp>
+
+#include <algorithm>
+#include <functional>
+#include <iostream>
+#include <memory>
+
+using namespace sycl;
+
+inline constexpr auto HostUnifiedMemoryName = "SYCL_HOST_UNIFIED_MEMORY";
+
+int val;
+static pi_result redefinedEnqueueMemBufferMap(
+    pi_queue command_queue, pi_mem buffer, pi_bool blocking_map,
+    pi_map_flags map_flags, size_t offset, size_t size,
+    pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list,
+    pi_event *event, void **ret_map) {
+  *event = reinterpret_cast<pi_event>(new int{});
+  *ret_map = &val;
+  return PI_SUCCESS;
+}
+
+static pi_result redefinedEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj,
+                                          void *mapped_ptr,
+                                          pi_uint32 num_events_in_wait_list,
+                                          const pi_event *event_wait_list,
+                                          pi_event *event) {
+  *event = reinterpret_cast<pi_event>(new int{});
+  return PI_SUCCESS;
+}
+
+static pi_result redefinedEnqueueMemBufferFill(
+    pi_queue command_queue, pi_mem buffer, const void *pattern,
+    size_t pattern_size, size_t offset, size_t size,
+    pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list,
+    pi_event *event) {
+  *event = reinterpret_cast<pi_event>(new int{});
+  return PI_SUCCESS;
+}
+
+static void verifyCleanup(detail::MemObjRecord *Record,
+                          detail::AllocaCommandBase *AllocaCmd,
+                          detail::Command *DeletedCmd, bool &CmdDeletedFlag) {
+  EXPECT_TRUE(CmdDeletedFlag);
+  CmdDeletedFlag = false;
+  EXPECT_EQ(
+      std::find(AllocaCmd->MUsers.begin(), AllocaCmd->MUsers.end(), DeletedCmd),
+      AllocaCmd->MUsers.end());
+  detail::Command *Leaf = *Record->MWriteLeaves.begin();
+  EXPECT_FALSE(std::any_of(Leaf->MDeps.begin(), Leaf->MDeps.end(),
+                           [&](const detail::DepDesc &Dep) {
+                             return Dep.MDepCommand == DeletedCmd;
+                           }));
+}
+
+// Check that any non-leaf commands enqueued as part of high level scheduler
+// calls are cleaned up.
+static void checkCleanupOnEnqueue(MockScheduler &MS,
+                                  detail::QueueImplPtr &QueueImpl,
+                                  buffer<int> &Buf,
+                                  detail::Requirement &MockReq) {
+  bool CommandDeleted = false;
+  std::vector<detail::Command *> AuxCmds;
+  std::vector<detail::Command *> ToCleanUp;
+  std::vector<detail::Command *> ToEnqueue;
+  detail::MemObjRecord *Record =
+      MS.getOrInsertMemObjRecord(QueueImpl, &MockReq, AuxCmds);
+  detail::AllocaCommandBase *AllocaCmd =
+      MS.getOrCreateAllocaForReq(Record, &MockReq, QueueImpl, AuxCmds);
+  std::function<void()> Callback = [&CommandDeleted]() {
+    CommandDeleted = true;
+  };
+
+  // Check addCG.
+  MockCommand *MockCmd =
+      new MockCommandWithCallback(QueueImpl, MockReq, Callback);
+  (void)MockCmd->addDep(detail::DepDesc(AllocaCmd, &MockReq, nullptr),
+                        ToCleanUp);
+  EXPECT_TRUE(ToCleanUp.empty());
+  MS.addNodeToLeaves(Record, MockCmd, access::mode::read_write, ToEnqueue);
+  MS.updateLeaves({AllocaCmd}, Record, access::mode::read_write, ToCleanUp);
+
+  EXPECT_TRUE(ToCleanUp.empty());
+  std::unique_ptr<detail::CG> CG{new detail::CGFill(/*Pattern*/ {}, &MockReq,
+                                                    /*ArgsStorage*/ {},
+                                                    /*AccStorage*/ {},
+                                                    /*SharedPtrStorage*/ {},
+                                                    /*Requirements*/ {&MockReq},
+                                                    /*Events*/ {})};
+  detail::EventImplPtr Event = MS.addCG(std::move(CG), QueueImpl);
+  auto *Cmd = static_cast<detail::Command *>(Event->getCommand());
+  verifyCleanup(Record, AllocaCmd, MockCmd, CommandDeleted);
+
+  // Check add/releaseHostAccessor.
+  CommandDeleted = false;
+  MockCmd = new MockCommandWithCallback(QueueImpl, MockReq, Callback);
+  addEdge(MockCmd, Cmd, AllocaCmd);
+  MS.addNodeToLeaves(Record, MockCmd, access::mode::read_write, ToEnqueue);
+  MS.updateLeaves({Cmd}, Record, access::mode::read_write, ToCleanUp);
+  MS.addHostAccessor(&MockReq);
+  verifyCleanup(Record, AllocaCmd, MockCmd, CommandDeleted);
+
+  CommandDeleted = false;
+  MockCmd = new MockCommandWithCallback(QueueImpl, MockReq, Callback);
+  addEdge(MockCmd, AllocaCmd, AllocaCmd);
+  MockCommand *LeafMockCmd =
+      new MockCommandWithCallback(QueueImpl, MockReq, Callback);
+  addEdge(LeafMockCmd, MockCmd, AllocaCmd);
+  MS.addNodeToLeaves(Record, LeafMockCmd, access::mode::read_write, ToEnqueue);
+  MS.releaseHostAccessor(&MockReq);
+  MockReq.MBlockedCmd = nullptr;
+  verifyCleanup(Record, AllocaCmd, MockCmd, CommandDeleted);
+
+  auto addNewMockCmds = [&]() -> MockCommand * {
+    CommandDeleted = false;
+    MockCmd = LeafMockCmd;
+    LeafMockCmd = new MockCommandWithCallback(QueueImpl, MockReq, Callback);
+    addEdge(LeafMockCmd, MockCmd, AllocaCmd);
+    MS.addNodeToLeaves(Record, LeafMockCmd, access::mode::read_write,
+                       ToEnqueue);
+    // Since this mock command has already been enqueued, it's expected to be
+    // cleaned up during removal from leaves.
+    ToCleanUp.clear();
+    MS.updateLeaves({MockCmd}, Record, access::mode::read_write, ToCleanUp);
+    EXPECT_EQ(ToCleanUp.size(), 1U);
+    EXPECT_EQ(ToCleanUp[0], MockCmd);
+    MS.cleanupCommands({MockCmd});
+    verifyCleanup(Record, AllocaCmd, MockCmd, CommandDeleted);
+    CommandDeleted = false;
+    MockCmd = LeafMockCmd;
+    LeafMockCmd = new MockCommandWithCallback(QueueImpl, MockReq, Callback);
+    addEdge(LeafMockCmd, MockCmd, AllocaCmd);
+    MS.addNodeToLeaves(Record, LeafMockCmd, access::mode::read_write,
+                       ToEnqueue);
+    MS.updateLeaves({MockCmd}, Record, access::mode::read_write, ToCleanUp);
+    return MockCmd;
+  };
+
+  // Check waitForEvent
+  MockCmd = addNewMockCmds();
+  MS.waitForEvent(LeafMockCmd->getEvent());
+  verifyCleanup(Record, AllocaCmd, MockCmd, CommandDeleted);
+
+  // Check addCopyBack
+  MockCmd = addNewMockCmds();
+  LeafMockCmd->getEvent()->getHandleRef() =
+      reinterpret_cast<pi_event>(new int{});
+  MS.addCopyBack(&MockReq);
+  verifyCleanup(Record, AllocaCmd, MockCmd, CommandDeleted);
+
+  MS.removeRecordForMemObj(detail::getSyclObjImpl(Buf).get());
+}
+
+static void checkCleanupOnLeafUpdate(
+    MockScheduler &MS, detail::QueueImplPtr &QueueImpl, buffer<int> &Buf,
+    detail::Requirement &MockReq,
+    std::function<void(detail::MemObjRecord *)> SchedulerCall) {
+  bool CommandDeleted = false;
+  std::vector<detail::Command *> AuxCmds;
+  std::vector<detail::Command *> ToCleanUp;
+  std::vector<detail::Command *> ToEnqueue;
+  detail::MemObjRecord *Record =
+      MS.getOrInsertMemObjRecord(QueueImpl, &MockReq, AuxCmds);
+  detail::AllocaCommandBase *AllocaCmd =
+      MS.getOrCreateAllocaForReq(Record, &MockReq, QueueImpl, AuxCmds);
+  std::function<void()> Callback = [&CommandDeleted]() {
+    CommandDeleted = true;
+  };
+
+  // Add a mock command as a leaf and enqueue it.
+  MockCommand *MockCmd =
+      new MockCommandWithCallback(QueueImpl, MockReq, Callback);
+  (void)MockCmd->addDep(detail::DepDesc(AllocaCmd, &MockReq, nullptr),
+                        ToCleanUp);
+  EXPECT_TRUE(ToCleanUp.empty());
+  MS.addNodeToLeaves(Record, MockCmd, access::mode::read_write, ToEnqueue);
+  MS.updateLeaves({AllocaCmd}, Record, access::mode::read_write, ToCleanUp);
+  detail::EnqueueResultT Res;
+  MockScheduler::enqueueCommand(MockCmd, Res, detail::BLOCKING);
+
+  EXPECT_FALSE(CommandDeleted);
+  SchedulerCall(Record);
+  EXPECT_TRUE(CommandDeleted);
+  MS.removeRecordForMemObj(detail::getSyclObjImpl(Buf).get());
+}
+
+TEST_F(SchedulerTest, PostEnqueueCleanup) {
+  default_selector Selector;
+  platform Plt{default_selector()};
+  if (Plt.is_host()) {
+    std::cout << "Not run due to host-only environment\n";
+    return;
+  }
+
+  // Enforce creation of linked commands to test all sites of calling cleanup.
+  unittest::ScopedEnvVar HostUnifiedMemoryVar{
+      HostUnifiedMemoryName, "1",
+      detail::SYCLConfig<detail::SYCL_HOST_UNIFIED_MEMORY>::reset};
+  unittest::PiMock Mock{Plt};
+  setupDefaultMockAPIs(Mock);
+  Mock.redefine<detail::PiApiKind::piEnqueueMemBufferMap>(
+      redefinedEnqueueMemBufferMap);
+  Mock.redefine<detail::PiApiKind::piEnqueueMemUnmap>(redefinedEnqueueMemUnmap);
+  Mock.redefine<detail::PiApiKind::piEnqueueMemBufferFill>(
+      redefinedEnqueueMemBufferFill);
+
+  context Ctx{Plt};
+  queue Queue{Ctx, Selector};
+  detail::QueueImplPtr QueueImpl = detail::getSyclObjImpl(Queue);
+  MockScheduler MS;
+
+  buffer<int> Buf{range<1>(1)};
+  std::shared_ptr<detail::buffer_impl> BufImpl = detail::getSyclObjImpl(Buf);
+  detail::Requirement MockReq = getMockRequirement(Buf);
+  MockReq.MDims = 1;
+  MockReq.MSYCLMemObj = BufImpl.get();
+
+  checkCleanupOnEnqueue(MS, QueueImpl, Buf, MockReq);
+  std::vector<detail::Command *> ToEnqueue;
+  checkCleanupOnLeafUpdate(MS, QueueImpl, Buf, MockReq,
+                           [&](detail::MemObjRecord *Record) {
+                             MS.decrementLeafCountersForRecord(Record);
+                           });
+  checkCleanupOnLeafUpdate(
+      MS, QueueImpl, Buf, MockReq, [&](detail::MemObjRecord *Record) {
+        MS.insertMemoryMove(Record, &MockReq, QueueImpl, ToEnqueue);
+      });
+  checkCleanupOnLeafUpdate(MS, QueueImpl, Buf, MockReq,
+                           [&](detail::MemObjRecord *Record) {
+                             Record->MMemModified = true;
+                             MS.addCopyBack(&MockReq, ToEnqueue);
+                           });
+  checkCleanupOnLeafUpdate(
+      MS, QueueImpl, Buf, MockReq, [&](detail::MemObjRecord *Record) {
+        detail::Command *Leaf = *Record->MWriteLeaves.begin();
+        MS.addEmptyCmd(Leaf, {&MockReq}, QueueImpl,
+                       detail::Command::BlockReason::HostTask, ToEnqueue);
+      });
+  device HostDevice;
+  detail::QueueImplPtr DefaultHostQueue{
+      new detail::queue_impl(detail::getSyclObjImpl(HostDevice), {}, {})};
+  checkCleanupOnLeafUpdate(
+      MS, DefaultHostQueue, Buf, MockReq, [&](detail::MemObjRecord *Record) {
+        MS.getOrCreateAllocaForReq(Record, &MockReq, QueueImpl, ToEnqueue);
+      });
+  // Check cleanup on exceeding leaf limit.
+  checkCleanupOnLeafUpdate(
+      MS, QueueImpl, Buf, MockReq, [&](detail::MemObjRecord *Record) {
+        std::vector<std::unique_ptr<MockCommand>> Leaves;
+        for (std::size_t I = 0;
+             I < Record->MWriteLeaves.genericCommandsCapacity(); ++I)
+          Leaves.push_back(std::make_unique<MockCommand>(QueueImpl, MockReq));
+
+        detail::AllocaCommandBase *AllocaCmd = Record->MAllocaCommands[0];
+        std::vector<detail::Command *> ToCleanUp;
+        for (std::unique_ptr<MockCommand> &MockCmd : Leaves) {
+          (void)MockCmd->addDep(detail::DepDesc(AllocaCmd, &MockReq, AllocaCmd),
+                                ToCleanUp);
+          MS.addNodeToLeaves(Record, MockCmd.get(), access::mode::read_write,
+                             ToEnqueue);
+        }
+        for (std::unique_ptr<MockCommand> &MockCmd : Leaves)
+          MS.updateLeaves({MockCmd.get()}, Record, access::mode::read_write,
+                          ToCleanUp);
+        EXPECT_TRUE(ToCleanUp.empty());
+      });
+}
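What the `MockCommandWithCallback` instances in the test above provide is a deletion probe: the scheduler owns and frees these commands, and the only observable side effect of that is the callback firing. The class itself is defined in SchedulerTestUtils.hpp; the sketch below shows only the assumed mechanism (callback invoked from the destructor), not the real class:

```c++
#include <cassert>
#include <functional>
#include <utility>

// Minimal deletion probe: runs the callback when the object is destroyed,
// letting a test assert that some owner really freed it.
struct DeletionProbe {
  std::function<void()> OnDelete;
  explicit DeletionProbe(std::function<void()> Cb) : OnDelete{std::move(Cb)} {}
  ~DeletionProbe() {
    if (OnDelete)
      OnDelete();
  }
};

// Usage mirroring the tests above (hypothetical driver code):
//   bool CommandDeleted = false;
//   auto *Probe = new DeletionProbe([&] { CommandDeleted = true; });
//   delete Probe;            // stands in for scheduler-side cleanup
//   assert(CommandDeleted);  // the deletion was observed
```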
diff --git a/sycl/unittests/scheduler/QueueFlushing.cpp b/sycl/unittests/scheduler/QueueFlushing.cpp
index 14e622dcdeb43..194d7c14fa59a 100644
--- a/sycl/unittests/scheduler/QueueFlushing.cpp
+++ b/sycl/unittests/scheduler/QueueFlushing.cpp
@@ -98,8 +98,9 @@ static void addDepAndEnqueue(detail::Command *Cmd,
                              detail::QueueImplPtr &DepQueue,
                              detail::Requirement &MockReq) {
   MockCommand DepCmd(DepQueue);
+  std::vector<detail::Command *> ToCleanUp;
   DepCmd.getEvent()->getHandleRef() = reinterpret_cast<pi_event>(new int{});
-  (void)Cmd->addDep(detail::DepDesc{&DepCmd, &MockReq, nullptr});
+  (void)Cmd->addDep(detail::DepDesc{&DepCmd, &MockReq, nullptr}, ToCleanUp);
   detail::EnqueueResultT Res;
   MockScheduler::enqueueCommand(Cmd, Res, detail::NON_BLOCKING);
@@ -159,6 +160,7 @@ TEST_F(SchedulerTest, QueueFlushing) {
   detail::AllocaCommand AllocaCmd = detail::AllocaCommand(QueueImplA, MockReq);
   void *MockHostPtr;
   detail::EnqueueResultT Res;
+  std::vector<detail::Command *> ToCleanUp;

   // Check that each of the non-blocking commands flush the dependency queue
   {
@@ -194,7 +196,8 @@ TEST_F(SchedulerTest, QueueFlushing) {
                                            /*Events*/ {})};
     detail::ExecCGCommand ExecCGCmd{std::move(CG), QueueImplA};
     MockReq.MDims = 1;
-    (void)ExecCGCmd.addDep(detail::DepDesc(&AllocaCmd, &MockReq, &AllocaCmd));
+    (void)ExecCGCmd.addDep(detail::DepDesc(&AllocaCmd, &MockReq, &AllocaCmd),
+                           ToCleanUp);
     testCommandEnqueue(&ExecCGCmd, QueueImplB, MockReq);
   }

@@ -206,7 +209,7 @@ TEST_F(SchedulerTest, QueueFlushing) {
     detail::EventImplPtr DepEvent{new detail::event_impl(QueueImplB)};
     DepEvent->setContextImpl(QueueImplB->getContextImplPtr());
     DepEvent->getHandleRef() = reinterpret_cast<pi_event>(new int{});
-    (void)Cmd.addDep(DepEvent);
+    (void)Cmd.addDep(DepEvent, ToCleanUp);
     MockScheduler::enqueueCommand(&Cmd, Res, detail::NON_BLOCKING);
     EXPECT_TRUE(QueueFlushed);
   }
@@ -224,7 +227,7 @@ TEST_F(SchedulerTest, QueueFlushing) {
       DepEvent->setContextImpl(TempQueueImpl->getContextImplPtr());
       DepEvent->getHandleRef() = reinterpret_cast<pi_event>(new int{});
     }
-    (void)Cmd.addDep(DepEvent);
+    (void)Cmd.addDep(DepEvent, ToCleanUp);
     MockScheduler::enqueueCommand(&Cmd, Res, detail::NON_BLOCKING);
     EXPECT_FALSE(EventStatusQueried);
     EXPECT_FALSE(QueueFlushed);
@@ -244,10 +247,10 @@ TEST_F(SchedulerTest, QueueFlushing) {
                                 access::mode::read_write};
     MockCommand DepCmdA(QueueImplB);
     DepCmdA.getEvent()->getHandleRef() = reinterpret_cast<pi_event>(new int{});
-    (void)Cmd.addDep(detail::DepDesc{&DepCmdA, &MockReq, nullptr});
+    (void)Cmd.addDep(detail::DepDesc{&DepCmdA, &MockReq, nullptr}, ToCleanUp);
     MockCommand DepCmdB(QueueImplB);
     DepCmdB.getEvent()->getHandleRef() = reinterpret_cast<pi_event>(new int{});
-    (void)Cmd.addDep(detail::DepDesc{&DepCmdB, &MockReq, nullptr});
+    (void)Cmd.addDep(detail::DepDesc{&DepCmdB, &MockReq, nullptr}, ToCleanUp);
     // The check is performed in redefinedQueueFlush
     MockScheduler::enqueueCommand(&Cmd, Res, detail::NON_BLOCKING);
   }
@@ -259,13 +262,13 @@ TEST_F(SchedulerTest, QueueFlushing) {
                                 access::mode::read_write};
     MockCommand DepCmd(QueueImplB);
     DepCmd.getEvent()->getHandleRef() = reinterpret_cast<pi_event>(new int{});
-    (void)CmdA.addDep(detail::DepDesc{&DepCmd, &MockReq, nullptr});
+    (void)CmdA.addDep(detail::DepDesc{&DepCmd, &MockReq, nullptr}, ToCleanUp);
     MockScheduler::enqueueCommand(&CmdA, Res, detail::NON_BLOCKING);
     EventStatusQueried = false;

     detail::MapMemObject CmdB{&AllocaCmd, MockReq, &MockHostPtr, QueueImplA,
                               access::mode::read_write};
-    (void)CmdB.addDep(detail::DepDesc{&DepCmd, &MockReq, nullptr});
+    (void)CmdB.addDep(detail::DepDesc{&DepCmd, &MockReq, nullptr}, ToCleanUp);
     MockScheduler::enqueueCommand(&CmdB, Res, detail::NON_BLOCKING);
     EXPECT_FALSE(EventStatusQueried);
   }
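A side note on the gmock changes running through these test files: `EXPECT_CALL(X, enqueue(_, _))` has to be rewritten every time `enqueue()` changes arity, while the matcher-free form `EXPECT_CALL(X, enqueue)` matches a call with any arguments. The mock definition that follows makes exactly this switch (MOCK_METHOD2 to MOCK_METHOD3, `(_, _)` dropped). A self-contained illustration with toy types, assuming a gmock version recent enough to support the matcher-free form:

```c++
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include <vector>

// Toy interface whose enqueue() recently grew a third parameter, like the
// scheduler's Command::enqueue in this patch.
struct ToyCommand {
  virtual ~ToyCommand() = default;
  virtual bool enqueue(int &Result, bool Blocking,
                       std::vector<int> &ToCleanUp) = 0;
};

struct MockToyCommand : ToyCommand {
  MOCK_METHOD3(enqueue, bool(int &, bool, std::vector<int> &));
};

TEST(MatcherFreeForm, SurvivesSignatureChanges) {
  MockToyCommand Cmd;
  // No (_, _, _) argument list: this expectation would not have needed an
  // update when the third parameter was added.
  EXPECT_CALL(Cmd, enqueue).Times(1);

  int Result = 0;
  std::vector<int> ToCleanUp;
  Cmd.enqueue(Result, /*Blocking=*/false, ToCleanUp);
}
```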
diff --git a/sycl/unittests/scheduler/SchedulerTestUtils.hpp b/sycl/unittests/scheduler/SchedulerTestUtils.hpp
index bd80f24820f8f..60f5cbf05e04a 100644
--- a/sycl/unittests/scheduler/SchedulerTestUtils.hpp
+++ b/sycl/unittests/scheduler/SchedulerTestUtils.hpp
@@ -10,6 +10,7 @@
 #include <CL/sycl.hpp>
 #include <detail/scheduler/commands.hpp>
 #include <detail/scheduler/scheduler.hpp>
+#include <detail/stream_impl.hpp>
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
@@ -38,9 +39,9 @@ class MockCommand : public cl::sycl::detail::Command {
                   cl::sycl::detail::Command::RUN_CG)
       : Command{Type, Queue}, MRequirement{std::move(Req)} {
     using namespace testing;
-    ON_CALL(*this, enqueue(_, _))
-        .WillByDefault(Invoke(this, &MockCommand::enqueueOrigin));
-    EXPECT_CALL(*this, enqueue(_, _)).Times(AnyNumber());
+    ON_CALL(*this, enqueue)
+        .WillByDefault(Invoke(this, &MockCommand::enqueueOrigin));
+    EXPECT_CALL(*this, enqueue).Times(AnyNumber());
   }

   MockCommand(cl::sycl::detail::QueueImplPtr Queue,
@@ -48,9 +49,9 @@ class MockCommand : public cl::sycl::detail::Command {
                   cl::sycl::detail::Command::RUN_CG)
       : Command{Type, Queue}, MRequirement{std::move(getMockRequirement())} {
     using namespace testing;
-    ON_CALL(*this, enqueue(_, _))
-        .WillByDefault(Invoke(this, &MockCommand::enqueueOrigin));
-    EXPECT_CALL(*this, enqueue(_, _)).Times(AnyNumber());
+    ON_CALL(*this, enqueue)
+        .WillByDefault(Invoke(this, &MockCommand::enqueueOrigin));
+    EXPECT_CALL(*this, enqueue).Times(AnyNumber());
   }

   void printDot(std::ostream &) const override {}
@@ -62,11 +63,14 @@ class MockCommand : public cl::sycl::detail::Command {

   cl_int enqueueImp() override { return MRetVal; }

-  MOCK_METHOD2(enqueue, bool(cl::sycl::detail::EnqueueResultT &,
-                             cl::sycl::detail::BlockingT));
+  MOCK_METHOD3(enqueue, bool(cl::sycl::detail::EnqueueResultT &,
+                             cl::sycl::detail::BlockingT,
+                             std::vector<cl::sycl::detail::Command *> &));
   bool enqueueOrigin(cl::sycl::detail::EnqueueResultT &EnqueueResult,
-                     cl::sycl::detail::BlockingT Blocking) {
-    return cl::sycl::detail::Command::enqueue(EnqueueResult, Blocking);
+                     cl::sycl::detail::BlockingT Blocking,
+                     std::vector<cl::sycl::detail::Command *> &ToCleanUp) {
+    return cl::sycl::detail::Command::enqueue(EnqueueResult, Blocking,
+                                              ToCleanUp);
   }

   cl_int MRetVal = CL_SUCCESS;
@@ -99,6 +103,10 @@ class MockCommandWithCallback : public MockCommand {

 class MockScheduler : public cl::sycl::detail::Scheduler {
 public:
+  using cl::sycl::detail::Scheduler::addCG;
+  using cl::sycl::detail::Scheduler::addCopyBack;
+  using cl::sycl::detail::Scheduler::cleanupCommands;
+
   cl::sycl::detail::MemObjRecord *
   getOrInsertMemObjRecord(const cl::sycl::detail::QueueImplPtr &Queue,
                           cl::sycl::detail::Requirement *Req,
@@ -106,6 +114,10 @@ class MockScheduler : public cl::sycl::detail::Scheduler {
     return MGraphBuilder.getOrInsertMemObjRecord(Queue, Req, ToEnqueue);
   }

+  void decrementLeafCountersForRecord(cl::sycl::detail::MemObjRecord *Rec) {
+    MGraphBuilder.decrementLeafCountersForRecord(Rec);
+  }
+
   void
   removeRecordForMemObj(cl::sycl::detail::SYCLMemObjI *MemObj) {
     MGraphBuilder.removeRecordForMemObj(MemObj);
   }
@@ -123,10 +135,19 @@ class MockScheduler : public cl::sycl::detail::Scheduler {
     return MGraphBuilder.addNodeToLeaves(Rec, Cmd, Mode, ToEnqueue);
   }

+  void updateLeaves(const std::set<cl::sycl::detail::Command *> &Cmds,
+                    cl::sycl::detail::MemObjRecord *Record,
+                    cl::sycl::access::mode AccessMode,
+                    std::vector<cl::sycl::detail::Command *> &ToCleanUp) {
+    return MGraphBuilder.updateLeaves(Cmds, Record, AccessMode, ToCleanUp);
+  }
+
   static bool enqueueCommand(cl::sycl::detail::Command *Cmd,
                              cl::sycl::detail::EnqueueResultT &EnqueueResult,
                              cl::sycl::detail::BlockingT Blocking) {
-    return GraphProcessor::enqueueCommand(Cmd, EnqueueResult, Blocking);
+    std::vector<cl::sycl::detail::Command *> ToCleanUp;
+    return GraphProcessor::enqueueCommand(Cmd, EnqueueResult, ToCleanUp,
+                                          Blocking);
   }

   cl::sycl::detail::AllocaCommandBase *
@@ -147,6 +168,29 @@ class MockScheduler : public cl::sycl::detail::Scheduler {
     return MGraphBuilder.insertMemoryMove(Record, Req, Queue, ToEnqueue);
   }

+  cl::sycl::detail::Command *
+  addCopyBack(cl::sycl::detail::Requirement *Req,
+              std::vector<cl::sycl::detail::Command *> &ToEnqueue) {
+    return MGraphBuilder.addCopyBack(Req, ToEnqueue);
+  }
+
+  cl::sycl::detail::UpdateHostRequirementCommand *
+  insertUpdateHostReqCmd(cl::sycl::detail::MemObjRecord *Record,
+                         cl::sycl::detail::Requirement *Req,
+                         const cl::sycl::detail::QueueImplPtr &Queue,
+                         std::vector<cl::sycl::detail::Command *> &ToEnqueue) {
+    return MGraphBuilder.insertUpdateHostReqCmd(Record, Req, Queue, ToEnqueue);
+  }
+
+  cl::sycl::detail::EmptyCommand *
+  addEmptyCmd(cl::sycl::detail::Command *Cmd,
+              const std::vector<cl::sycl::detail::Requirement *> &Reqs,
+              const cl::sycl::detail::QueueImplPtr &Queue,
+              cl::sycl::detail::Command::BlockReason Reason,
+              std::vector<cl::sycl::detail::Command *> &ToEnqueue) {
+    return MGraphBuilder.addEmptyCmd(Cmd, Reqs, Queue, Reason, ToEnqueue);
+  }
+
   cl::sycl::detail::Command *
   addCG(std::unique_ptr<cl::sycl::detail::CG> CommandGroup,
         cl::sycl::detail::QueueImplPtr Queue,
diff --git a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp
index c992919bab32b..478465603199a 100644
--- a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp
+++ b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp
@@ -9,10 +9,15 @@
 #include "SchedulerTest.hpp"
 #include "SchedulerTestUtils.hpp"

+#include <detail/config.hpp>
 #include <detail/stream_impl.hpp>
+#include <helpers/ScopedEnvVar.hpp>

 using namespace cl::sycl;

+inline constexpr auto DisablePostEnqueueCleanupName =
+    "SYCL_DISABLE_POST_ENQUEUE_CLEANUP";
+
 class MockHandler : public sycl::handler {
 public:
   MockHandler(std::shared_ptr<detail::queue_impl> Queue, bool IsHost)
@@ -91,6 +96,11 @@ static bool ValidateDepCommandsTree(const detail::Command *Cmd,
 }

 TEST_F(SchedulerTest, StreamInitDependencyOnHost) {
+  // Disable post enqueue cleanup so that it doesn't interfere with dependency
+  // checks.
+  unittest::ScopedEnvVar DisabledCleanup{
+      DisablePostEnqueueCleanupName, "1",
+      detail::SYCLConfig<detail::SYCL_DISABLE_POST_ENQUEUE_CLEANUP>::reset};
   cl::sycl::queue HQueue(host_selector{});
   detail::QueueImplPtr HQueueImpl = detail::getSyclObjImpl(HQueue);
diff --git a/sycl/unittests/scheduler/utils.cpp b/sycl/unittests/scheduler/utils.cpp
index b6bb23b4325d8..373b4572ecfb4 100644
--- a/sycl/unittests/scheduler/utils.cpp
+++ b/sycl/unittests/scheduler/utils.cpp
@@ -10,8 +10,10 @@
 void addEdge(cl::sycl::detail::Command *User, cl::sycl::detail::Command *Dep,
              cl::sycl::detail::AllocaCommandBase *Alloca) {
+  std::vector<cl::sycl::detail::Command *> ToCleanUp;
   (void)User->addDep(
-      cl::sycl::detail::DepDesc{Dep, User->getRequirement(), Alloca});
+      cl::sycl::detail::DepDesc{Dep, User->getRequirement(), Alloca},
+      ToCleanUp);
   Dep->addUser(User);
 }
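Taken together, the helper changes encode one contract worth spelling out: graph maintenance functions never delete commands inline, they only report candidates through `ToCleanUp` (the `addEdge` helper above simply discards them, which is safe for mocks owned by the test). A toy version of that contract, modeling only the behavior the tests assert: an already-enqueued command removed from the leaf list is handed back for cleanup, while a never-enqueued one is not (the real scheduler checks more conditions than this single flag):

```c++
#include <algorithm>
#include <vector>

struct ToyCmd {
  bool Enqueued = false;
};

// Removing a command from the leaf list must not free it: ownership is
// handed back through ToCleanUp so the caller can delete it once no graph
// lock is held. Commands that were never enqueued are not returned, which
// is why the leaf-limit check above expects ToCleanUp to stay empty.
void updateLeavesSketch(std::vector<ToyCmd *> &Leaves, ToyCmd *Cmd,
                        std::vector<ToyCmd *> &ToCleanUp) {
  auto It = std::find(Leaves.begin(), Leaves.end(), Cmd);
  if (It == Leaves.end())
    return;
  Leaves.erase(It);
  if (Cmd->Enqueued)
    ToCleanUp.push_back(Cmd);
}
```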