From c1d95954805e5612f1493e51bf0480efd27020cc Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Tue, 6 May 2025 17:30:49 +0100 Subject: [PATCH 1/5] [SYCL][Graph] Memory reuse for graph allocations in a single graph - Optimize memory use for allocations within a single graph by reusing memory where possible - New handler impl member for node dependency access with the CGF - New E2E tests for memory reuse - Add design doc information on memory reuse - Add missing CGType -> string conversion for graph printing alloc and free nodes --- sycl/doc/design/CommandGraph.md | 20 ++ sycl/source/CMakeLists.txt | 1 + sycl/source/detail/async_alloc.cpp | 52 ++++- sycl/source/detail/graph_impl.cpp | 20 +- sycl/source/detail/graph_impl.hpp | 36 +++ sycl/source/detail/graph_memory_pool.cpp | 189 ++++++++++++++++ sycl/source/detail/graph_memory_pool.hpp | 89 ++++---- sycl/source/detail/handler_impl.hpp | 3 + sycl/source/queue.cpp | 6 +- .../async_alloc_device_memory_reuse.cpp | 10 + ...ync_alloc_device_memory_reuse_multiple.cpp | 10 + ...nc_alloc_device_memory_reuse_zero_init.cpp | 10 + .../async_alloc_device_memory_reuse.cpp | 200 ++++++++++++++++ ...ync_alloc_device_memory_reuse_multiple.cpp | 163 ++++++++++++++ ...nc_alloc_device_memory_reuse_zero_init.cpp | 213 ++++++++++++++++++ .../async_alloc_device_memory_reuse.cpp | 10 + ...ync_alloc_device_memory_reuse_in_order.cpp | 161 +++++++++++++ ...ync_alloc_device_memory_reuse_multiple.cpp | 10 + ...nc_alloc_device_memory_reuse_zero_init.cpp | 10 + 19 files changed, 1150 insertions(+), 63 deletions(-) create mode 100644 sycl/source/detail/graph_memory_pool.cpp create mode 100644 sycl/test-e2e/Graph/AsyncAlloc/Explicit/async_alloc_device_memory_reuse.cpp create mode 100644 sycl/test-e2e/Graph/AsyncAlloc/Explicit/async_alloc_device_memory_reuse_multiple.cpp create mode 100644 sycl/test-e2e/Graph/AsyncAlloc/Explicit/async_alloc_device_memory_reuse_zero_init.cpp create mode 100644 sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device_memory_reuse.cpp create mode 100644 sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device_memory_reuse_multiple.cpp create mode 100644 sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device_memory_reuse_zero_init.cpp create mode 100644 sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse.cpp create mode 100644 sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse_in_order.cpp create mode 100644 sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse_multiple.cpp create mode 100644 sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse_zero_init.cpp diff --git a/sycl/doc/design/CommandGraph.md b/sycl/doc/design/CommandGraph.md index 956082b31097f..c02c73a66d2bb 100644 --- a/sycl/doc/design/CommandGraph.md +++ b/sycl/doc/design/CommandGraph.md @@ -349,6 +349,26 @@ safely assumed to be more performant. It is not likely we'll try to allow in-order execution in more scenarios through a complicated (and imperfect) heuristic but rather expose this as a hint the user can provide. +### Graph Allocation Memory Reuse + +Within a single graph memory allocations which have previously been freed are +checked when adding a new allocation to see if they can be reused. Because we +have to return a pointer to the user immediately when the CGF for a node is +processed when adding the node, we have to do this eagerly anytime +`async_malloc()` is called. Allocations track the last free node associated with +them to represent the most recent use of that allocation. 
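+
+For illustration, a minimal record-and-replay sketch of the pattern this
+targets (assuming the usual SYCL and extension headers, an in-order `Queue`,
+and a modifiable `Graph` recording from it, as in the extension tests). The
+criteria below decide whether the second allocation may reuse the first:
+
+```c++
+constexpr size_t Size = 1024;
+
+Graph.begin_recording(Queue);
+
+// First allocation: a new virtual reservation is created.
+void *TmpA = sycl::ext::oneapi::experimental::async_malloc(
+    Queue, sycl::usm::alloc::device, Size);
+
+Queue.submit([&](sycl::handler &CGH) {
+  CGH.parallel_for(sycl::range<1>(Size / sizeof(int)), [=](sycl::item<1> It) {
+    static_cast<int *>(TmpA)[It.get_linear_id()] = 42;
+  });
+});
+
+Queue.submit([&](sycl::handler &CGH) {
+  sycl::ext::oneapi::experimental::async_free(CGH, TmpA);
+});
+
+// Same allocation kind and size, and the free node above is a predecessor of
+// this new allocation node, so the implementation may return TmpA's address
+// again here.
+void *TmpB = sycl::ext::oneapi::experimental::async_malloc(
+    Queue, sycl::usm::alloc::device, Size);
+
+Graph.end_recording(Queue);
+```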
+ + To be reused the two allocations must meet these criteria: + +- They must be of the same allocation type (device/host/shared). +- They must have a matching size. +- They must have the same properties (currently only read-only matters). +- There must be a path from the last free node associated with a given + allocation to the new allocation node being added. + +If these criteria are met we update the last free node for the allocation and +return the existing pointer to the user. + ## Backend Implementation Implementation of UR command-buffers for each of the supported SYCL 2020 diff --git a/sycl/source/CMakeLists.txt b/sycl/source/CMakeLists.txt index dac4e71af45af..40686655b71b4 100644 --- a/sycl/source/CMakeLists.txt +++ b/sycl/source/CMakeLists.txt @@ -322,6 +322,7 @@ set(SYCL_COMMON_SOURCES "detail/memory_pool_impl.cpp" "detail/async_alloc.cpp" "detail/memory_pool.cpp" + "detail/graph_memory_pool.cpp" "$<$:detail/windows_ur.cpp>" "$<$,$>:detail/posix_ur.cpp>" ) diff --git a/sycl/source/detail/async_alloc.cpp b/sycl/source/detail/async_alloc.cpp index b21f6d13c42e3..b3be1e4ac6c78 100644 --- a/sycl/source/detail/async_alloc.cpp +++ b/sycl/source/detail/async_alloc.cpp @@ -46,21 +46,36 @@ void *async_malloc(sycl::handler &h, sycl::usm::alloc kind, size_t size) { auto &Adapter = h.getContextImplPtr()->getAdapter(); - // Get events to wait on. - auto depEvents = getUrEvents(h.impl->CGData.MEvents); - uint32_t numEvents = h.impl->CGData.MEvents.size(); + // Get CG event dependencies for this allocation. + const auto &DepEvents = h.impl->CGData.MEvents; + auto UREvents = getUrEvents(DepEvents); + uint32_t numEvents = DepEvents.size(); void *alloc = nullptr; ur_event_handle_t Event = nullptr; // If a graph is present do the allocation from the graph memory pool instead. if (auto Graph = h.getCommandGraph(); Graph) { - alloc = Graph->getMemPool().malloc(size, kind); + // Get dependent graph nodes from any events + auto DepNodes = Graph->getNodesForEvents(DepEvents); + // If this node was added explicitly we may have node deps in the handler as + // well, so add them to the list + DepNodes.insert(DepNodes.end(), h.impl->MNodeDeps.begin(), + h.impl->MNodeDeps.end()); + // If this is being recorded from an in-order queue we need to get the last + // in-order node if any, since this will later become a dependency of the + // node being processed here. + if (const auto &LastInOrderNode = Graph->getLastInorderNode(h.MQueue); + LastInOrderNode) { + DepNodes.push_back(LastInOrderNode); + } + + alloc = Graph->getMemPool().malloc(size, kind, DepNodes); } else { auto &Q = h.MQueue->getHandleRef(); Adapter->call( - Q, (ur_usm_pool_handle_t)0, size, nullptr, numEvents, depEvents.data(), + Q, (ur_usm_pool_handle_t)0, size, nullptr, numEvents, UREvents.data(), &alloc, &Event); } @@ -95,24 +110,38 @@ __SYCL_EXPORT void *async_malloc_from_pool(sycl::handler &h, size_t size, auto &Adapter = h.getContextImplPtr()->getAdapter(); auto &memPoolImpl = sycl::detail::getSyclObjImpl(pool); - // Get events to wait on. - auto depEvents = getUrEvents(h.impl->CGData.MEvents); - uint32_t numEvents = h.impl->CGData.MEvents.size(); + // Get CG event dependencies for this allocation. + const auto &DepEvents = h.impl->CGData.MEvents; + auto UREvents = getUrEvents(DepEvents); + uint32_t numEvents = DepEvents.size(); void *alloc = nullptr; ur_event_handle_t Event = nullptr; // If a graph is present do the allocation from the graph memory pool instead. 
if (auto Graph = h.getCommandGraph(); Graph) { + // Get dependent graph nodes from any events + auto DepNodes = Graph->getNodesForEvents(DepEvents); + // If this node was added explicitly we may have node deps in the handler as + // well, so add them to the list + DepNodes.insert(DepNodes.end(), h.impl->MNodeDeps.begin(), + h.impl->MNodeDeps.end()); + // If this is being recorded from an in-order queue we need to get the last + // in-order node if any, since this will later become a dependency of the + // node being processed here. + if (const auto &LastInOrderNode = Graph->getLastInorderNode(h.MQueue); + LastInOrderNode) { + DepNodes.push_back(LastInOrderNode); + } // Memory pool is passed as the graph may use some properties of it. - alloc = Graph->getMemPool().malloc(size, pool.get_alloc_kind(), + alloc = Graph->getMemPool().malloc(size, pool.get_alloc_kind(), DepNodes, sycl::detail::getSyclObjImpl(pool)); } else { auto &Q = h.MQueue->getHandleRef(); Adapter->call( Q, memPoolImpl.get()->get_handle(), size, nullptr, numEvents, - depEvents.data(), &alloc, &Event); + UREvents.data(), &alloc, &Event); } // Async malloc must return a void* immediately. // Set up CommandGroup which is a no-op and pass the event from the alloc. @@ -140,6 +169,9 @@ async_malloc_from_pool(const sycl::queue &q, size_t size, } __SYCL_EXPORT void async_free(sycl::handler &h, void *ptr) { + // We only check for errors for the graph here because marking the allocation + // as free in the graph memory pool requires a node object which doesn't exist + // at this point. if (auto Graph = h.getCommandGraph(); Graph) { // Check if the pointer to be freed has an associated allocation node, and // error if not diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 252fc220df70d..e3ba7f6846098 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -344,7 +344,8 @@ graph_impl::graph_impl(const sycl::context &SyclContext, const sycl::device &SyclDevice, const sycl::property_list &PropList) : MContext(SyclContext), MDevice(SyclDevice), MRecordingQueues(), - MEventsMap(), MInorderQueueMap(), MGraphMemPool(SyclContext, SyclDevice), + MEventsMap(), MInorderQueueMap(), + MGraphMemPool(*this, SyclContext, SyclDevice), MID(NextAvailableID.fetch_add(1, std::memory_order_relaxed)) { checkGraphPropertiesAndThrow(PropList); if (PropList.has_property()) { @@ -504,6 +505,10 @@ graph_impl::add(std::function CGF, (void)Args; sycl::handler Handler{shared_from_this()}; + // Pass the node deps to the handler so they are available when processing the + // CGF, need for async_malloc nodes. + Handler.impl->MNodeDeps = Deps; + #if XPTI_ENABLE_INSTRUMENTATION // Save code location if one was set in TLS. // Ideally it would be nice to capture user's call code location @@ -527,6 +532,10 @@ graph_impl::add(std::function CGF, Handler.finalize(); + // In explicit mode the handler processing of the CGF does not need a write + // lock as it does not modify the graph, we extract information from it here + // and modify the graph. + graph_impl::WriteLock Lock(MMutex); node_type NodeType = Handler.impl->MUserFacingNodeType != ext::oneapi::experimental::node_type::empty @@ -597,6 +606,14 @@ graph_impl::add(node_type NodeType, addDepsToNode(NodeImpl, Deps); + if (NodeType == node_type::async_free) { + auto AsyncFreeCG = + static_cast(NodeImpl->MCommandGroup.get()); + // If this is an async free node mark that it is now available for reuse, + // and pass the async free node for tracking. 
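+    // The pool keeps a weak reference to this free node; a later allocation
+    // can only reuse this memory if its dependency chain reaches the node.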
+ MGraphMemPool.markAllocationAsAvailable(AsyncFreeCG->getPtr(), NodeImpl); + } + return NodeImpl; } @@ -1778,7 +1795,6 @@ node modifiable_command_graph::addImpl(std::function CGF, DepImpls.push_back(sycl::detail::getSyclObjImpl(D)); } - graph_impl::WriteLock Lock(impl->MMutex); std::shared_ptr NodeImpl = impl->add(CGF, {}, DepImpls); return sycl::detail::createSyclObjFromImpl(std::move(NodeImpl)); } diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index ccc6455f6296f..2fc9ec93c9798 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -735,6 +735,12 @@ class node_impl : public std::enable_shared_from_this { case sycl::detail::CGType::EnqueueNativeCommand: Stream << "CGNativeCommand \\n"; break; + case sycl::detail::CGType::AsyncAlloc: + Stream << "CGAsyncAlloc \\n"; + break; + case sycl::detail::CGType::AsyncFree: + Stream << "CGAsyncFree \\n"; + break; default: Stream << "Other \\n"; break; @@ -937,6 +943,30 @@ class graph_impl : public std::enable_shared_from_this { "No node in this graph is associated with this event"); } + /// Find the nodes associated with a list of SYCL events. Throws if no node is + /// found for a given event. + /// @param Events Events to find nodes for. + /// @return A list of node counterparts for each event, in the same order. + std::vector> getNodesForEvents( + const std::vector> Events) { + std::vector> NodeList{Events.size()}; + + ReadLock Lock(MMutex); + + for (size_t i = 0; i < Events.size(); i++) { + if (auto NodeFound = MEventsMap.find(Events[i]); + NodeFound != std::end(MEventsMap)) { + NodeList[i] = NodeFound->second; + } else { + throw sycl::exception( + sycl::make_error_code(errc::invalid), + "No node in this graph is associated with this event"); + } + } + + return NodeList; + } + /// Query for the context tied to this graph. /// @return Context associated with graph. sycl::context getContext() const { return MContext; } @@ -1191,6 +1221,12 @@ class graph_impl : public std::enable_shared_from_this { /// this graph. size_t getExecGraphCount() const { return MExecGraphCount; } + void resetNodeVisitedEdges() { + for (auto &Node : MNodeStorage) { + Node->MTotalVisitedEdges = 0; + } + } + private: /// Check the graph for cycles by performing a depth-first search of the /// graph. If a node is visited more than once in a given path through the diff --git a/sycl/source/detail/graph_memory_pool.cpp b/sycl/source/detail/graph_memory_pool.cpp new file mode 100644 index 0000000000000..56d813e6294e7 --- /dev/null +++ b/sycl/source/detail/graph_memory_pool.cpp @@ -0,0 +1,189 @@ +//==--------- graph_memory_pool.cpp --- SYCL graph extension ---------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "graph_memory_pool.hpp" + +#include +#include + +#include "graph_impl.hpp" + +namespace sycl { +inline namespace _V1 { +namespace ext { +namespace oneapi { +namespace experimental { +namespace detail { + +void * +graph_mem_pool::malloc(size_t Size, usm::alloc AllocType, + const std::vector> &DepNodes, + const std::shared_ptr &MemPool) { + // We are potentially modifying contents of this memory pool and the owning + // graph, so take a lock here. 
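+  // The reuse search below also walks and temporarily marks nodes owned by
+  // the graph (MTotalVisitedEdges), hence the graph-wide lock rather than a
+  // pool-local one.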
+ graph_impl::WriteLock Lock(MGraph.MMutex); + + void *Alloc = nullptr; + alloc_info AllocInfo = {}; + AllocInfo.Kind = AllocType; + // Collect relevant properties from memory pool + if (MemPool) { + auto PropList = MemPool->getPropList(); + if (PropList.has_property()) { + AllocInfo.ZeroInit = true; + } + if (PropList.has_property()) { + AllocInfo.ReadOnly = true; + } + } + + switch (AllocType) { + case usm::alloc::device: { + + auto &CtxImpl = sycl::detail::getSyclObjImpl(MContext); + auto &Adapter = CtxImpl->getAdapter(); + + size_t Granularity = get_mem_granularity(MDevice, MContext); + uintptr_t StartPtr = 0; + size_t AlignedSize = alignByteSize(Size, Granularity); + // See if we can find an allocation to reuse + auto AllocOpt = tryReuseExistingAllocation(AlignedSize, AllocType, + AllocInfo.ReadOnly, DepNodes); + + // If we got a value back then an allocation was available for reuse and we + // can just return that pointer + if (AllocOpt) { + return AllocOpt.value().Ptr; + } + + // If no allocation could be reused, do a new virtual reservation + Adapter->call( + CtxImpl->getHandleRef(), reinterpret_cast(StartPtr), + AlignedSize, &Alloc); + + AllocInfo.Size = AlignedSize; + AllocInfo.Ptr = Alloc; + + MAllocations[Alloc] = AllocInfo; + break; + } + + default: + throw sycl::exception(sycl::make_error_code(errc::feature_not_supported), + "Only device allocations are currently supported " + "in graph allocation nodes!"); + break; + } + + return Alloc; +} + +std::optional +graph_mem_pool::tryReuseExistingAllocation( + size_t Size, usm::alloc AllocType, bool ReadOnly, + const std::vector> &DepNodes) { + + std::vector CompatibleAllocs; + // Compatible allocs can only be as big as MFreeAllocations + CompatibleAllocs.reserve(MFreeAllocations.size()); + + // Loop over free allocation list, search for ones that are compatible for + // reuse Currently that means they have the same alloc kind, size and read + // only property. + + for (auto &Ptr : MFreeAllocations) { + alloc_info &Info = MAllocations.at(Ptr); + if (Info.Kind == AllocType && Info.Size == Size && + Info.ReadOnly == ReadOnly) { + // Store the alloc info since it is compatible + CompatibleAllocs.push_back(Info); + } + } + + // Traverse graph back from each DepNode to try and find any of the suitable + // free nodes. We do this in a breadth-first approach because we want to find + // the shortest path to a reusable allocation. + + std::queue> NodesToCheck; + + // Add all the dependent nodes to the queue, they will be popped first + for (auto &Dep : DepNodes) { + NodesToCheck.push(Dep); + } + + std::optional AllocInfo = {}; + + // Called when traversing over nodes to check if the current node + auto CheckNodeEqual = + [&CompatibleAllocs, + &AllocInfo](const std::shared_ptr &CurrentNode) -> bool { + for (auto &Alloc : CompatibleAllocs) { + const auto &AllocFreeNode = Alloc.LastFreeNode; + // Compare control blocks without having to lock AllocFreeNode to check + // for node equality + if (!CurrentNode.owner_before(AllocFreeNode) && + !AllocFreeNode.owner_before(CurrentNode)) { + Alloc.LastFreeNode.reset(); + AllocInfo = Alloc; + return true; + } + } + return false; + }; + + while (!NodesToCheck.empty()) { + auto CurrentNode = NodesToCheck.front().lock(); + NodesToCheck.pop(); + + if (CurrentNode->MTotalVisitedEdges > 0) { + continue; + } + + // Check if the node is a free node, and if so is a free node for any of the + // allocations which are free for reuse. 
We should not bother checking nodes + // that are free nodes, so we continue and check their predecessors. + if (CurrentNode->MNodeType == node_type::async_free && + CheckNodeEqual(CurrentNode)) { + // If we found an allocation AllocInfo has already been populated in + // CheckNodeEqual(), so we simply break out of the loop + break; + } + + // Add CurrentNode predecessors to queue + for (auto &Pred : CurrentNode->MPredecessors) { + NodesToCheck.push(Pred); + } + + // Mark node as visited + CurrentNode->MTotalVisitedEdges = 1; + } + // Reset visited nodes tracking + MGraph.resetNodeVisitedEdges(); + // If we found an allocation, remove it from the free list. + if (AllocInfo) { + MFreeAllocations.erase(std::find(MFreeAllocations.begin(), + MFreeAllocations.end(), + AllocInfo.value().Ptr)); + } + + return AllocInfo; +} + +void graph_mem_pool::markAllocationAsAvailable( + void *Ptr, const std::shared_ptr &FreeNode) { + MFreeAllocations.push_back(Ptr); + MAllocations.at(Ptr).LastFreeNode = FreeNode; +} + +} // namespace detail +} // namespace experimental +} // namespace oneapi +} // namespace ext +} // namespace _V1 +} // namespace sycl diff --git a/sycl/source/detail/graph_memory_pool.hpp b/sycl/source/detail/graph_memory_pool.hpp index e9133bb7f7d0c..6da1b14526b12 100644 --- a/sycl/source/detail/graph_memory_pool.hpp +++ b/sycl/source/detail/graph_memory_pool.hpp @@ -20,15 +20,20 @@ namespace oneapi { namespace experimental { namespace detail { +// Forward declarations +class node_impl; + /// Class handling graph-owned memory allocations. Device allocations are /// managed using virtual memory. class graph_mem_pool { // Info descriptor for various properties of an allocation struct alloc_info { + // Pointer to the start of the allocation + void *Ptr = nullptr; // Allocation kind - usm::alloc Kind; + usm::alloc Kind = usm::alloc::unknown; // Size of the allocation - size_t Size; + size_t Size = 0; // Is currently mapped to physical memory bool Mapped = false; // Index into the array of physical memory @@ -37,11 +42,14 @@ class graph_mem_pool { bool ReadOnly = false; // Should the allocation be zero initialized during initial allocation bool ZeroInit = false; + // Last free node for this allocation in the graph + std::weak_ptr LastFreeNode = {}; }; public: - graph_mem_pool(const context &Context, const device &Device) - : MContext(Context), MDevice(Device) {} + graph_mem_pool(graph_impl &GraphImpl, const context &Context, + const device &Device) + : MGraph(GraphImpl), MContext(Context), MDevice(Device) {} ~graph_mem_pool() { for (auto &[Ptr, AllocInfo] : MAllocations) { // Unmap allocations if required before physical memory is released @@ -64,55 +72,14 @@ class graph_mem_pool { /// memory before use by calling allocateAndMapAll() /// @param Size Size of the allocation /// @param AllocType Type of the allocation + /// @param DepNodes Any node dependencies for this allocation, used to + /// identify allocations that can be reused. /// @param MemPool Optional memory pool from which allocations will not be /// made directly but properties may be respected. 
/// @return A pointer to the start of the allocation void *malloc(size_t Size, usm::alloc AllocType, - const std::shared_ptr &MemPool = nullptr) { - void *Alloc = nullptr; - switch (AllocType) { - case usm::alloc::device: { - - auto &CtxImpl = sycl::detail::getSyclObjImpl(MContext); - auto &Adapter = CtxImpl->getAdapter(); - - size_t Granularity = get_mem_granularity(MDevice, MContext); - uintptr_t StartPtr = 0; - size_t AlignedSize = alignByteSize(Size, Granularity); - // Do virtual reservation - Adapter->call( - CtxImpl->getHandleRef(), reinterpret_cast(StartPtr), - AlignedSize, &Alloc); - - alloc_info AllocInfo = {}; - AllocInfo.Kind = AllocType; - AllocInfo.Size = AlignedSize; - - // Collect relevant properties from memory pool - if (MemPool) { - auto PropList = MemPool->getPropList(); - if (PropList.has_property()) { - AllocInfo.ZeroInit = true; - } - if (PropList.has_property()) { - AllocInfo.ReadOnly = true; - } - } - - MAllocations[Alloc] = AllocInfo; - break; - } - - default: - throw sycl::exception(sycl::make_error_code(errc::feature_not_supported), - "Only device allocations are currently supported " - "in graph allocation nodes!"); - break; - } - - return Alloc; - } + const std::vector> &DepNodes, + const std::shared_ptr &MemPool = nullptr); /// Return the total amount of memory being used by this pool size_t getMemUseCurrent() const { @@ -184,7 +151,27 @@ class graph_mem_pool { return MAllocations.find(Ptr) != MAllocations.end(); } + /// Mark that this allocation has been freed and is available for reuse. + /// @param Ptr The pointer to the allocation. + /// @param FreeNode The graph node of node_type::async_free which is freeing + /// the allocation. + void markAllocationAsAvailable(void *Ptr, + const std::shared_ptr &FreeNode); + private: + /// Tries to reuse an existing allocation which has been marked free in the + /// graph. + /// @param Size Size of the allocation. + /// @param AllocType USM type of the allocation. + /// @param ReadOnly True if the allocation is read only + /// @param DepNodes Node dependencies of this allocation, used to search for + /// reusable allocations. + /// @returns An optional allocation info value, where a null value indicates + /// that no allocation could be reused. 
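+  /// The search walks the graph breadth-first from the given dependency nodes
+  /// towards the roots, stopping at the first compatible allocation whose last
+  /// free node is reached.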
+ std::optional tryReuseExistingAllocation( + size_t Size, usm::alloc AllocType, bool ReadOnly, + const std::vector> &DepNodes); + /// Returns an aligned byte size given a required granularity /// @param UnalignedByteSize The original requested allocation size /// @param Granularity The required granularity for this allocation @@ -193,6 +180,8 @@ class graph_mem_pool { return ((UnalignedByteSize + Granularity - 1) / Granularity) * Granularity; } + /// Graph that owns this mem pool + graph_impl &MGraph; /// Context associated with allocations from this pool context MContext; /// Device associated with allocations from this pool @@ -201,6 +190,8 @@ class graph_mem_pool { std::unordered_map MAllocations; /// List of physical memory allocations used for virtual device reservations std::vector> MPhysicalMem; + /// List of pointers to allocations which are currently free for reuse + std::vector MFreeAllocations; }; } // namespace detail } // namespace experimental diff --git a/sycl/source/detail/handler_impl.hpp b/sycl/source/detail/handler_impl.hpp index 42b1991f153f5..7d87cc4abda7a 100644 --- a/sycl/source/detail/handler_impl.hpp +++ b/sycl/source/detail/handler_impl.hpp @@ -175,6 +175,9 @@ class handler_impl { std::shared_ptr MSubgraphNode; /// Storage for the CG created when handling graph nodes added explicitly. std::unique_ptr MGraphNodeCG; + /// Storage for node dependencies passed when adding a graph node explicitly + std::vector> + MNodeDeps; /// Storage for lambda/function when using HostTask std::shared_ptr MHostTask; diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index 4383bb4a2d49a..623e122351c3a 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -440,9 +440,11 @@ sycl::detail::optional queue::ext_oneapi_get_last_event_impl() const { return std::nullopt; // If the last event was discarded or a NOP, we insert a marker to represent - // an event at end. + // an event at end. If the event comes from a graph we must skip this because + // the original event is used for tracking nodes in the graph. 
auto LastEventImpl = detail::getSyclObjImpl(*LastEvent); - if (LastEventImpl->isDiscarded() || LastEventImpl->isNOP()) + if (!LastEventImpl->getCommandGraph() && + (LastEventImpl->isDiscarded() || LastEventImpl->isNOP())) LastEvent = detail::createSyclObjFromImpl(impl->insertMarkerEvent(impl)); return LastEvent; diff --git a/sycl/test-e2e/Graph/AsyncAlloc/Explicit/async_alloc_device_memory_reuse.cpp b/sycl/test-e2e/Graph/AsyncAlloc/Explicit/async_alloc_device_memory_reuse.cpp new file mode 100644 index 0000000000000..944e7449d5d05 --- /dev/null +++ b/sycl/test-e2e/Graph/AsyncAlloc/Explicit/async_alloc_device_memory_reuse.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/async_alloc_device_memory_reuse.cpp" diff --git a/sycl/test-e2e/Graph/AsyncAlloc/Explicit/async_alloc_device_memory_reuse_multiple.cpp b/sycl/test-e2e/Graph/AsyncAlloc/Explicit/async_alloc_device_memory_reuse_multiple.cpp new file mode 100644 index 0000000000000..bb093866ce70b --- /dev/null +++ b/sycl/test-e2e/Graph/AsyncAlloc/Explicit/async_alloc_device_memory_reuse_multiple.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/async_alloc_device_memory_reuse_multiple.cpp" diff --git a/sycl/test-e2e/Graph/AsyncAlloc/Explicit/async_alloc_device_memory_reuse_zero_init.cpp b/sycl/test-e2e/Graph/AsyncAlloc/Explicit/async_alloc_device_memory_reuse_zero_init.cpp new file mode 100644 index 0000000000000..c00dab11f52cb --- /dev/null +++ b/sycl/test-e2e/Graph/AsyncAlloc/Explicit/async_alloc_device_memory_reuse_zero_init.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/async_alloc_device_memory_reuse_zero_init.cpp" diff --git a/sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device_memory_reuse.cpp b/sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device_memory_reuse.cpp new file mode 100644 index 0000000000000..9887aa0bce9b8 --- /dev/null +++ b/sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device_memory_reuse.cpp @@ -0,0 +1,200 @@ +// Tests memory reuse behaviour of device graph 
allocations. +// NOTE: This test partially relies on knowing how the implementation works and +// that the contents of memory will persist when allocations are reused. This is +// useful for testing but is not an assumption that a user can or should make. + +#include "../../graph_common.hpp" +#include + +using T = int; +void add_nodes_to_graph( + exp_ext::command_graph &Graph, + queue &Queue, size_t Size, T *PtrInput) { + // Create 2 pointers for async allocations + T *AsyncPtrA = nullptr; + T *AsyncPtrB = nullptr; + // Add alloc nodes at the root of the graph for each allocation, this should + // result in three unique allocations + auto AllocNodeA = add_node(Graph, Queue, [&](handler &CGH) { + AsyncPtrA = static_cast( + exp_ext::async_malloc(CGH, usm::alloc::device, Size * sizeof(T))); + }); + auto AllocNodeB = add_node(Graph, Queue, [&](handler &CGH) { + AsyncPtrB = static_cast( + exp_ext::async_malloc(CGH, usm::alloc::device, Size * sizeof(T))); + }); + + // Assert that we have received unique ptr values, this should always be true + // regardless of implementation. + assert((AsyncPtrA != AsyncPtrB)); + + // Store pointer values for later comparison + void *FirstAsyncPtrA = AsyncPtrA; + void *FirstAsyncPtrB = AsyncPtrB; + // Add kernel that fills the async allocs with values + auto KernelFillPtrs = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {AllocNodeA, AllocNodeB}); + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + AsyncPtrA[LinID] = (1 + LinID); + AsyncPtrB[LinID] = (2 + LinID); + }); + }, + AllocNodeA, AllocNodeB); + // Free all the async allocations, making it possible to reuse them + auto FreeNodeA = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, KernelFillPtrs); + exp_ext::async_free(CGH, AsyncPtrA); + }, + KernelFillPtrs); + auto FreeNodeB = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, KernelFillPtrs); + exp_ext::async_free(CGH, AsyncPtrB); + }, + KernelFillPtrs); + + // Create 2 connected layers in the graph, in each layer we will do a new + // async_alloc which should reuse one of the 2 previously freed pointers. The + // other kernel in the layer will simply operate on the output ptr. + + // The first layer will have a direct dependency on a free node, but the + // subsequent layer will have an indirect dependency. We do not test the order + // in which allocations are picked for reuse, but we can assume both will be + // reused by the implementation (same size and properties so they are + // compatible). + + // First layer, allocation is added first + auto AllocNodeA1 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {FreeNodeA, FreeNodeB}); + AsyncPtrA = static_cast( + exp_ext::async_malloc(CGH, usm::alloc::device, Size * sizeof(T))); + }, + FreeNodeA, FreeNodeB); + + // Check that the new allocation matches one of the previously returned + // values. 
+ assert((AsyncPtrA == FirstAsyncPtrA) || (AsyncPtrA == FirstAsyncPtrB)); + // Increment output pointer + auto KernelInc1 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {FreeNodeA, FreeNodeB}); + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + PtrInput[LinID] += 1; + }); + }, + FreeNodeA, FreeNodeB); + + // Second layer, allocation has an indirect dependency on a free node + // Increment output pointer + auto KernelInc2 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {KernelInc1}); + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + PtrInput[LinID] += 1; + }); + }, + KernelInc1); + auto AllocNodeB2 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {KernelInc1, AllocNodeA1}); + AsyncPtrB = static_cast( + exp_ext::async_malloc(CGH, usm::alloc::device, Size * sizeof(T))); + }, + KernelInc1, AllocNodeA1); + // Check that the new allocation matches one of the previously returned + // values. + assert((AsyncPtrB == FirstAsyncPtrA) || (AsyncPtrB == FirstAsyncPtrB)); + + // Add a final kernel that adds the async allocation values to the output + // pointer + auto KernelAddToOutput = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {KernelInc2, AllocNodeB2}); + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + PtrInput[LinID] += (AsyncPtrA[LinID] + AsyncPtrB[LinID]); + }); + }, + KernelInc2, AllocNodeB2); + + // Free the allocations again + + add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, KernelAddToOutput); + exp_ext::async_free(CGH, AsyncPtrA); + }, + KernelAddToOutput); + add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, KernelAddToOutput); + exp_ext::async_free(CGH, AsyncPtrB); + }, + KernelAddToOutput); +} + +void calculate_reference_data(size_t Iterations, size_t Size, + std::vector &ReferenceOutput) { + for (size_t i = 0; i < Iterations; i++) { + for (size_t j = 0; j < Size; j++) { + ReferenceOutput[j] += 2; + ReferenceOutput[j] += (j + 1); + ReferenceOutput[j] += (j + 2); + } + } +} + +int main() { + queue Queue{}; + + std::vector DataInput(Size); + + std::iota(DataInput.begin(), DataInput.end(), 1); + + std::vector ReferenceOutput(DataInput); + calculate_reference_data(Iterations, Size, ReferenceOutput); + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + T *PtrInput = malloc_device(Size, Queue); + + std::vector OutputData(Size); + + Queue.copy(DataInput.data(), PtrInput, Size); + Queue.wait_and_throw(); + + // Add commands to graph + add_nodes_to_graph(Graph, Queue, Size, PtrInput); + + auto GraphExec = Graph.finalize(); + + for (unsigned n = 0; n < Iterations; n++) { + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + } + + Queue.wait_and_throw(); + Queue.copy(PtrInput, OutputData.data(), Size).wait_and_throw(); + for (size_t i = 0; i < Size; i++) { + assert(check_value(i, ReferenceOutput[i], OutputData[i], "OutputData")); + } + + free(PtrInput, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device_memory_reuse_multiple.cpp b/sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device_memory_reuse_multiple.cpp new file mode 100644 index 0000000000000..e791371544677 --- /dev/null +++ b/sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device_memory_reuse_multiple.cpp @@ -0,0 +1,163 @@ +// Tests memory reuse behaviour of 
device graph allocations when an allocation +// is reused multiple times. NOTE: This test partially relies on knowing how the +// implementation works and that the contents of memory will persist when +// allocations are reused. This is useful for testing but is not an assumption +// that a user can or should make. + +#include "../../graph_common.hpp" +#include + +using T = int; +void add_nodes_to_graph( + exp_ext::command_graph &Graph, + queue &Queue, size_t Size, T *PtrInput) { + // Create 1 pointers for async allocations + T *AsyncPtr = nullptr; + // Add alloc nodes at the root of the graph for each allocation, this should + // result in three unique allocations + auto AllocNode1 = add_node(Graph, Queue, [&](handler &CGH) { + AsyncPtr = static_cast( + exp_ext::async_malloc(CGH, usm::alloc::device, Size * sizeof(T))); + }); + + // Store pointer value for later comparison + void *FirstAsyncPtr = AsyncPtr; + + // Add kernel that fills the async alloc with values + auto KernelFillPtrs = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {AllocNode1}); + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + AsyncPtr[LinID] = (1 + LinID); + }); + }, + AllocNode1); + // Free all the async allocation, making it possible to reuse it + auto FreeNode1 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, KernelFillPtrs); + exp_ext::async_free(CGH, AsyncPtr); + }, + KernelFillPtrs); + + // Add an allocation node which should reuse the previous allocation, followed + // by a kernel that uses the data, and then freeing the allocation. + + auto AllocNode2 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {FreeNode1}); + AsyncPtr = static_cast( + exp_ext::async_malloc(CGH, usm::alloc::device, Size * sizeof(T))); + }, + FreeNode1); + + // Check that the new allocation matches one of the previously returned + // values. + assert(AsyncPtr == FirstAsyncPtr); + + auto KernelAdd1 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {AllocNode2}); + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + PtrInput[LinID] += AsyncPtr[LinID]; + }); + }, + AllocNode2); + + auto FreeNode2 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, KernelAdd1); + exp_ext::async_free(CGH, AsyncPtr); + }, + KernelAdd1); + + // Repeat the previous 3 nodes to test reuse an allocation which has + // previously been reused. + + auto AllocNode3 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {FreeNode2}); + AsyncPtr = static_cast( + exp_ext::async_malloc(CGH, usm::alloc::device, Size * sizeof(T))); + }, + FreeNode2); + + // Check that the new allocation matches one of the previously returned + // values. 
+ assert(AsyncPtr == FirstAsyncPtr); + + auto KernelAdd2 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {AllocNode3}); + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + PtrInput[LinID] += AsyncPtr[LinID]; + }); + }, + AllocNode3); + + add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, KernelAdd2); + exp_ext::async_free(CGH, AsyncPtr); + }, + KernelAdd2); +} + +void calculate_reference_data(size_t Iterations, size_t Size, + std::vector &ReferenceOutput) { + for (size_t i = 0; i < Iterations; i++) { + for (size_t j = 0; j < Size; j++) { + ReferenceOutput[j] += (j + 1) * 2; + } + } +} + +int main() { + queue Queue{}; + + std::vector DataInput(Size); + + std::iota(DataInput.begin(), DataInput.end(), 1); + + std::vector ReferenceOutput(DataInput); + calculate_reference_data(Iterations, Size, ReferenceOutput); + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + T *PtrInput = malloc_device(Size, Queue); + + std::vector OutputData(Size); + + Queue.copy(DataInput.data(), PtrInput, Size); + Queue.wait_and_throw(); + + // Add commands to graph + add_nodes_to_graph(Graph, Queue, Size, PtrInput); + + auto GraphExec = Graph.finalize(); + + for (unsigned n = 0; n < Iterations; n++) { + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + } + + Queue.wait_and_throw(); + Queue.copy(PtrInput, OutputData.data(), Size).wait_and_throw(); + for (size_t i = 0; i < Size; i++) { + assert(check_value(i, ReferenceOutput[i], OutputData[i], "OutputData")); + } + + free(PtrInput, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device_memory_reuse_zero_init.cpp b/sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device_memory_reuse_zero_init.cpp new file mode 100644 index 0000000000000..3f9bfb6443b1a --- /dev/null +++ b/sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device_memory_reuse_zero_init.cpp @@ -0,0 +1,213 @@ +// Tests memory reuse behaviour of device graph allocations from a zero-init +// memory pool. +// NOTE: This test partially relies on knowing how the implementation works and +// that the contents of memory will persist when allocations are reused. This is +// useful for testing but is not an assumption that a user can or should make. + +#include "../../graph_common.hpp" +#include +#include +#include + +#define __SYCL_USE_VARIADIC_SPIRV_OCL_PRINTF__ + +using T = int; +void add_nodes_to_graph( + exp_ext::command_graph &Graph, + queue &Queue, size_t Size, T *PtrInput) { + // Create a memory pool for async allocations with the zero init property + exp_ext::memory_pool MemPool{ + Queue, usm::alloc::device, {exp_ext::property::memory_pool::zero_init{}}}; + // Create 2 pointers for async allocations + T *AsyncPtrA = nullptr; + T *AsyncPtrB = nullptr; + // Add alloc nodes at the root of the graph for each allocation, this should + // result in three unique allocations + auto AllocNodeA = add_node(Graph, Queue, [&](handler &CGH) { + AsyncPtrA = static_cast( + exp_ext::async_malloc_from_pool(CGH, Size * sizeof(T), MemPool)); + }); + auto AllocNodeB = add_node(Graph, Queue, [&](handler &CGH) { + AsyncPtrB = static_cast( + exp_ext::async_malloc_from_pool(CGH, Size * sizeof(T), MemPool)); + }); + + // Assert that we have received unique ptr values, this should always be true + // regardless of implementation. 
+ assert((AsyncPtrA != AsyncPtrB)); + + // Store pointer values for later comparison + void *FirstAsyncPtrA = AsyncPtrA; + void *FirstAsyncPtrB = AsyncPtrB; + // Add kernel that fills the async allocs with values + auto KernelFillPtrs = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {AllocNodeA, AllocNodeB}); + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + // Allocations should be zero initialized on this first use, so we + // should be able to just add on values rather than set them + // explicitly. + AsyncPtrA[LinID] += (1 + LinID); + AsyncPtrB[LinID] += (2 + LinID); + }); + }, + AllocNodeA, AllocNodeB); + // Free all the async allocations, making it possible to reuse them + auto FreeNodeA = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, KernelFillPtrs); + exp_ext::async_free(CGH, AsyncPtrA); + }, + KernelFillPtrs); + auto FreeNodeB = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, KernelFillPtrs); + exp_ext::async_free(CGH, AsyncPtrB); + }, + KernelFillPtrs); + + // Create 2 connected layers in the graph, in each layer we will do a new + // async_alloc which should reuse one of the 2 previously freed pointers. The + // other kernel in the layer will simply operate on the output ptr. + + // The first layer will have a direct dependency on a free node, but the + // subsequent layer will have an indirect dependency. We do not test the order + // in which allocations are picked for reuse, but we can assume both will be + // reused by the implementation (same size and properties so they are + // compatible). + + // First layer, allocation is added first + auto AllocNodeA1 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {FreeNodeA, FreeNodeB}); + AsyncPtrA = static_cast( + exp_ext::async_malloc_from_pool(CGH, Size * sizeof(T), MemPool)); + }, + FreeNodeA, FreeNodeB); + + // Check that the new allocation matches one of the previously returned + // values. + assert((AsyncPtrA == FirstAsyncPtrA) || (AsyncPtrA == FirstAsyncPtrB)); + // Increment output pointer + auto KernelInc1 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {FreeNodeA, FreeNodeB}); + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + PtrInput[LinID] += 1; + }); + }, + FreeNodeA, FreeNodeB); + + // Second layer, allocation has an indirect dependency on a free node + // Increment output pointer + auto KernelInc2 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {KernelInc1}); + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + PtrInput[LinID] += 1; + }); + }, + KernelInc1); + auto AllocNodeB2 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {KernelInc1, AllocNodeA1}); + AsyncPtrB = static_cast( + exp_ext::async_malloc_from_pool(CGH, Size * sizeof(T), MemPool)); + }, + KernelInc1, AllocNodeA1); + // Check that the new allocation matches one of the previously returned + // values. 
+ assert((AsyncPtrB == FirstAsyncPtrA) || (AsyncPtrB == FirstAsyncPtrB)); + + // Add a final kernel that adds the async allocation values to the output + // pointer + auto KernelAddToOutput = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {KernelInc2, AllocNodeB2}); + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + PtrInput[LinID] += (AsyncPtrA[LinID] + AsyncPtrB[LinID]); + }); + }, + KernelInc2, AllocNodeB2); + + // Free the allocations again + + add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, KernelAddToOutput); + exp_ext::async_free(CGH, AsyncPtrA); + }, + KernelAddToOutput); + add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, KernelAddToOutput); + exp_ext::async_free(CGH, AsyncPtrB); + }, + KernelAddToOutput); +} + +void calculate_reference_data(size_t Iterations, size_t Size, + std::vector &ReferenceOutput) { + for (size_t i = 0; i < Iterations; i++) { + for (size_t j = 0; j < Size; j++) { + ReferenceOutput[j] += 2; + ReferenceOutput[j] += (j + 1) * (i + 1); + ReferenceOutput[j] += (j + 2) * (i + 1); + } + } +} + +int main() { + queue Queue{}; + + std::vector DataInput(Size); + + std::iota(DataInput.begin(), DataInput.end(), 1); + + std::vector ReferenceOutput(DataInput); + calculate_reference_data(Iterations, Size, ReferenceOutput); + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + T *PtrInput = malloc_device(Size, Queue); + + std::vector OutputData(Size); + + Queue.copy(DataInput.data(), PtrInput, Size); + Queue.wait_and_throw(); + + // Add commands to graph + add_nodes_to_graph(Graph, Queue, Size, PtrInput); + + auto GraphExec = Graph.finalize(); + + Graph.print_graph("test.dot"); + + for (unsigned n = 0; n < Iterations; n++) { + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + } + + Queue.wait_and_throw(); + Queue.copy(PtrInput, OutputData.data(), Size).wait_and_throw(); + for (size_t i = 0; i < Size; i++) { + assert(check_value(i, ReferenceOutput[i], OutputData[i], "OutputData")); + } + + free(PtrInput, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse.cpp b/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse.cpp new file mode 100644 index 0000000000000..cfdc7e664acc5 --- /dev/null +++ b/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/async_alloc_device_memory_reuse.cpp" diff --git a/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse_in_order.cpp b/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse_in_order.cpp new file mode 100644 index 0000000000000..a40da6fb026c5 --- /dev/null +++ b/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse_in_order.cpp @@ -0,0 +1,161 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in 
Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// Tests memory reuse behaviour of device graph allocations when an in-order +// queue is used. +// NOTE: This test partially relies on knowing how the +// implementation works and that the contents of memory will persist when +// allocations are reused. This is useful for testing but is not an assumption +// that a user can or should make. + +#include "../../graph_common.hpp" +#include +#include + +using T = int; +void add_nodes_to_graph(queue &Queue, size_t Size, T *PtrInput) { + // Create 2 pointers for async allocations + T *AsyncPtrA = nullptr; + T *AsyncPtrB = nullptr; + // Add alloc nodes at the root of the graph for each allocation, this should + // result in three unique allocations + Queue.submit([&](handler &CGH) { + AsyncPtrA = static_cast( + exp_ext::async_malloc(CGH, usm::alloc::device, Size * sizeof(T))); + }); + // Use the free function instead of handle version + AsyncPtrB = static_cast( + exp_ext::async_malloc(Queue, usm::alloc::device, Size * sizeof(T))); + + // Assert that we have received unique ptr values, this should always be true + // regardless of implementation. + assert((AsyncPtrA != AsyncPtrB)); + + // Store pointer values for later comparison + void *FirstAsyncPtrA = AsyncPtrA; + void *FirstAsyncPtrB = AsyncPtrB; + // Add kernel that fills the async allocs with values + Queue.submit([&](handler &CGH) { + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + AsyncPtrA[LinID] = (1 + LinID); + AsyncPtrB[LinID] = (2 + LinID); + }); + }); + // Free all the async allocations, making it possible to reuse them + Queue.submit([&](handler &CGH) { exp_ext::async_free(CGH, AsyncPtrA); }); + Queue.submit([&](handler &CGH) { exp_ext::async_free(CGH, AsyncPtrB); }); + + // Create 2 connected layers in the graph, in each layer we will do a new + // async_alloc which should reuse one of the 2 previously freed pointers. The + // other kernel in the layer will simply operate on the output ptr. + + // The first layer will have a direct dependency on a free node, but the + // subsequent layer will have an indirect dependency. We do not test the order + // in which allocations are picked for reuse, but we can assume both will be + // reused by the implementation (same size and properties so they are + // compatible). + + // First layer, allocation is added first + Queue.submit([&](handler &CGH) { + AsyncPtrA = static_cast( + exp_ext::async_malloc(CGH, usm::alloc::device, Size * sizeof(T))); + }); + + // Check that the new allocation matches one of the previously returned + // values. 
+ assert((AsyncPtrA == FirstAsyncPtrA) || (AsyncPtrA == FirstAsyncPtrB)); + // Increment output pointer + Queue.submit([&](handler &CGH) { + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + PtrInput[LinID] += 1; + }); + }); + + // Second layer, allocation has an indirect dependency on a free node + // Increment output pointer + Queue.submit([&](handler &CGH) { + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + PtrInput[LinID] += 1; + }); + }); + Queue.submit([&](handler &CGH) { + AsyncPtrB = static_cast( + exp_ext::async_malloc(CGH, usm::alloc::device, Size * sizeof(T))); + }); + // Check that the new allocation matches one of the previously returned + // values. + assert((AsyncPtrB == FirstAsyncPtrA) || (AsyncPtrB == FirstAsyncPtrB)); + + // Add a final kernel that adds the async allocation values to the output + // pointer + Queue.submit([&](handler &CGH) { + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + PtrInput[LinID] += (AsyncPtrA[LinID] + AsyncPtrB[LinID]); + }); + }); + + // Free the allocations again + + Queue.submit([&](handler &CGH) { exp_ext::async_free(CGH, AsyncPtrA); }); + Queue.submit([&](handler &CGH) { exp_ext::async_free(CGH, AsyncPtrB); }); +} + +void calculate_reference_data(size_t Iterations, size_t Size, + std::vector &ReferenceOutput) { + for (size_t i = 0; i < Iterations; i++) { + for (size_t j = 0; j < Size; j++) { + ReferenceOutput[j] += 2; + ReferenceOutput[j] += (j + 1); + ReferenceOutput[j] += (j + 2); + } + } +} + +int main() { + queue Queue{{property::queue::in_order{}}}; + + std::vector DataInput(Size); + + std::iota(DataInput.begin(), DataInput.end(), 1); + + std::vector ReferenceOutput(DataInput); + calculate_reference_data(Iterations, Size, ReferenceOutput); + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + T *PtrInput = malloc_device(Size, Queue); + + std::vector OutputData(Size); + + Queue.copy(DataInput.data(), PtrInput, Size); + Queue.wait_and_throw(); + + // Add commands to graph + Graph.begin_recording(Queue); + add_nodes_to_graph(Queue, Size, PtrInput); + Graph.end_recording(Queue); + + auto GraphExec = Graph.finalize(); + + for (unsigned n = 0; n < Iterations; n++) { + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + } + + Queue.wait_and_throw(); + Queue.copy(PtrInput, OutputData.data(), Size).wait_and_throw(); + for (size_t i = 0; i < Size; i++) { + assert(check_value(i, ReferenceOutput[i], OutputData[i], "OutputData")); + } + + free(PtrInput, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse_multiple.cpp b/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse_multiple.cpp new file mode 100644 index 0000000000000..533121c33e83e --- /dev/null +++ b/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse_multiple.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define 
GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/async_alloc_device_memory_reuse_multiple.cpp" diff --git a/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse_zero_init.cpp b/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse_zero_init.cpp new file mode 100644 index 0000000000000..3506691046f15 --- /dev/null +++ b/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse_zero_init.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/async_alloc_device_memory_reuse_zero_init.cpp" From 09e1322e8f5e3ec8e082afc61aa0a9e9e16d749f Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Thu, 8 May 2025 13:37:58 +0100 Subject: [PATCH 2/5] Addressing review comments --- sycl/source/detail/graph_impl.hpp | 12 +++++++----- sycl/source/queue.cpp | 2 +- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 2fc9ec93c9798..3a7084109185a 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -948,15 +948,16 @@ class graph_impl : public std::enable_shared_from_this { /// @param Events Events to find nodes for. /// @return A list of node counterparts for each event, in the same order. std::vector> getNodesForEvents( - const std::vector> Events) { - std::vector> NodeList{Events.size()}; + const std::vector> &Events) { + std::vector> NodeList{}; + NodeList.reserve(Events.size()); ReadLock Lock(MMutex); - for (size_t i = 0; i < Events.size(); i++) { - if (auto NodeFound = MEventsMap.find(Events[i]); + for (const auto &Event : Events) { + if (auto NodeFound = MEventsMap.find(Event); NodeFound != std::end(MEventsMap)) { - NodeList[i] = NodeFound->second; + NodeList.push_back(NodeFound->second); } else { throw sycl::exception( sycl::make_error_code(errc::invalid), @@ -1221,6 +1222,7 @@ class graph_impl : public std::enable_shared_from_this { /// this graph. size_t getExecGraphCount() const { return MExecGraphCount; } + /// Resets the visited edges variable across all nodes in the graph to 0. void resetNodeVisitedEdges() { for (auto &Node : MNodeStorage) { Node->MTotalVisitedEdges = 0; diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index 623e122351c3a..8b1b1af51ee11 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -443,7 +443,7 @@ sycl::detail::optional queue::ext_oneapi_get_last_event_impl() const { // an event at end. If the event comes from a graph we must skip this because // the original event is used for tracking nodes in the graph. 
auto LastEventImpl = detail::getSyclObjImpl(*LastEvent); - if (!LastEventImpl->getCommandGraph() && + if (!LastEventImpl->hasCommandGraph() && (LastEventImpl->isDiscarded() || LastEventImpl->isNOP())) LastEvent = detail::createSyclObjFromImpl(impl->insertMarkerEvent(impl)); From d01c9da57219ee8dc848973973fa2601ad44b32b Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Mon, 12 May 2025 12:44:47 +0100 Subject: [PATCH 3/5] Addressing review comments - Fix wording in design doc - Remove code duplication is async_alloc calls - Early return from tryReuseExistingAllocation() --- sycl/doc/design/CommandGraph.md | 16 +++--- sycl/source/detail/async_alloc.cpp | 62 ++++++++++++------------ sycl/source/detail/graph_memory_pool.cpp | 18 +++++-- 3 files changed, 51 insertions(+), 45 deletions(-) diff --git a/sycl/doc/design/CommandGraph.md b/sycl/doc/design/CommandGraph.md index c02c73a66d2bb..776016c22e52a 100644 --- a/sycl/doc/design/CommandGraph.md +++ b/sycl/doc/design/CommandGraph.md @@ -351,14 +351,14 @@ heuristic but rather expose this as a hint the user can provide. ### Graph Allocation Memory Reuse -Within a single graph memory allocations which have previously been freed are -checked when adding a new allocation to see if they can be reused. Because we -have to return a pointer to the user immediately when the CGF for a node is -processed when adding the node, we have to do this eagerly anytime -`async_malloc()` is called. Allocations track the last free node associated with -them to represent the most recent use of that allocation. - - To be reused the two allocations must meet these criteria: +When adding a new allocation node to a graph, memory allocations which have +previously been freed are checked to see if they can be reused. Because we have +to return a pointer to the user immediately when the CGF for a node is +processed, we have to do this eagerly anytime `async_malloc()` is called. +Allocations track the last free node associated with them to represent the most +recent use of that allocation. + + To be reused, the two allocations must meet these criteria: - They must be of the same allocation type (device/host/shared). - They must have a matching size. diff --git a/sycl/source/detail/async_alloc.cpp b/sycl/source/detail/async_alloc.cpp index b3be1e4ac6c78..1e530f0b01cc0 100644 --- a/sycl/source/detail/async_alloc.cpp +++ b/sycl/source/detail/async_alloc.cpp @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -29,6 +30,27 @@ getUrEvents(const std::vector> &DepEvents) { } return RetUrEvents; } + +std::vector> getDepGraphNodes( + const std::shared_ptr &Handler, + const std::shared_ptr &Queue, + const std::shared_ptr &Graph, + const std::vector> &DepEvents) { + // Get dependent graph nodes from any events + auto DepNodes = Graph->getNodesForEvents(DepEvents); + // If this node was added explicitly we may have node deps in the handler as + // well, so add them to the list + DepNodes.insert(DepNodes.end(), Handler->MNodeDeps.begin(), + Handler->MNodeDeps.end()); + // If this is being recorded from an in-order queue we need to get the last + // in-order node if any, since this will later become a dependency of the + // node being processed here. + if (const auto &LastInOrderNode = Graph->getLastInorderNode(Queue); + LastInOrderNode) { + DepNodes.push_back(LastInOrderNode); + } + return DepNodes; +} } // namespace __SYCL_EXPORT @@ -49,34 +71,21 @@ void *async_malloc(sycl::handler &h, sycl::usm::alloc kind, size_t size) { // Get CG event dependencies for this allocation. 
const auto &DepEvents = h.impl->CGData.MEvents; auto UREvents = getUrEvents(DepEvents); - uint32_t numEvents = DepEvents.size(); void *alloc = nullptr; ur_event_handle_t Event = nullptr; // If a graph is present do the allocation from the graph memory pool instead. if (auto Graph = h.getCommandGraph(); Graph) { - // Get dependent graph nodes from any events - auto DepNodes = Graph->getNodesForEvents(DepEvents); - // If this node was added explicitly we may have node deps in the handler as - // well, so add them to the list - DepNodes.insert(DepNodes.end(), h.impl->MNodeDeps.begin(), - h.impl->MNodeDeps.end()); - // If this is being recorded from an in-order queue we need to get the last - // in-order node if any, since this will later become a dependency of the - // node being processed here. - if (const auto &LastInOrderNode = Graph->getLastInorderNode(h.MQueue); - LastInOrderNode) { - DepNodes.push_back(LastInOrderNode); - } - + auto DepNodes = getDepGraphNodes(sycl::detail::getSyclObjImpl(h), h.MQueue, + Graph, DepEvents); alloc = Graph->getMemPool().malloc(size, kind, DepNodes); } else { auto &Q = h.MQueue->getHandleRef(); Adapter->call( - Q, (ur_usm_pool_handle_t)0, size, nullptr, numEvents, UREvents.data(), - &alloc, &Event); + Q, (ur_usm_pool_handle_t)0, size, nullptr, UREvents.size(), + UREvents.data(), &alloc, &Event); } // Async malloc must return a void* immediately. @@ -113,26 +122,15 @@ __SYCL_EXPORT void *async_malloc_from_pool(sycl::handler &h, size_t size, // Get CG event dependencies for this allocation. const auto &DepEvents = h.impl->CGData.MEvents; auto UREvents = getUrEvents(DepEvents); - uint32_t numEvents = DepEvents.size(); void *alloc = nullptr; ur_event_handle_t Event = nullptr; // If a graph is present do the allocation from the graph memory pool instead. if (auto Graph = h.getCommandGraph(); Graph) { - // Get dependent graph nodes from any events - auto DepNodes = Graph->getNodesForEvents(DepEvents); - // If this node was added explicitly we may have node deps in the handler as - // well, so add them to the list - DepNodes.insert(DepNodes.end(), h.impl->MNodeDeps.begin(), - h.impl->MNodeDeps.end()); - // If this is being recorded from an in-order queue we need to get the last - // in-order node if any, since this will later become a dependency of the - // node being processed here. - if (const auto &LastInOrderNode = Graph->getLastInorderNode(h.MQueue); - LastInOrderNode) { - DepNodes.push_back(LastInOrderNode); - } + auto DepNodes = getDepGraphNodes(sycl::detail::getSyclObjImpl(h), h.MQueue, + Graph, DepEvents); + // Memory pool is passed as the graph may use some properties of it. alloc = Graph->getMemPool().malloc(size, pool.get_alloc_kind(), DepNodes, sycl::detail::getSyclObjImpl(pool)); @@ -140,7 +138,7 @@ __SYCL_EXPORT void *async_malloc_from_pool(sycl::handler &h, size_t size, auto &Q = h.MQueue->getHandleRef(); Adapter->call( - Q, memPoolImpl.get()->get_handle(), size, nullptr, numEvents, + Q, memPoolImpl.get()->get_handle(), size, nullptr, UREvents.size(), UREvents.data(), &alloc, &Event); } // Async malloc must return a void* immediately. 
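
[Illustrative aside, not part of the patch] The dependency gathering above exists so that a recorded allocation node can reach an earlier free node through its predecessors and reuse that memory. Below is a minimal record-and-replay sketch of the user-facing pattern this code path serves; the function name, the 1024-byte size and the exact extension header path are assumptions for illustration, not taken from this change.

#include <sycl/sycl.hpp>
// Header path assumed for the async_alloc extension entry points.
#include <sycl/ext/oneapi/experimental/async_alloc/async_alloc.hpp>

namespace exp_ext = sycl::ext::oneapi::experimental;

void record_reuse_sketch() {
  // In-order queue so each recorded node depends on the previous one.
  sycl::queue Q{{sycl::property::queue::in_order{}}};
  exp_ext::command_graph Graph{Q.get_context(), Q.get_device()};
  Graph.begin_recording(Q);

  void *PtrA = nullptr;
  // Allocation node: the pointer must be returned eagerly while recording.
  Q.submit([&](sycl::handler &CGH) {
    PtrA = exp_ext::async_malloc(CGH, sycl::usm::alloc::device, 1024);
  });
  // Free node: marks the allocation as a reuse candidate within this graph.
  Q.submit([&](sycl::handler &CGH) { exp_ext::async_free(CGH, PtrA); });

  void *PtrB = nullptr;
  // Same kind and size, and this node (transitively) depends on the free
  // node above, so the graph memory pool may return the same address.
  Q.submit([&](sycl::handler &CGH) {
    PtrB = exp_ext::async_malloc(CGH, sycl::usm::alloc::device, 1024);
  });
  Q.submit([&](sycl::handler &CGH) { exp_ext::async_free(CGH, PtrB); });

  Graph.end_recording(Q);
  auto Exec = Graph.finalize();
  Q.submit([&](sycl::handler &CGH) { CGH.ext_oneapi_graph(Exec); }).wait();
}

With an out-of-order queue the same effect would need explicit event or node dependencies between the free and the later allocation; without a path to a free node the pool has nothing to reuse, which is what the early exit in tryReuseExistingAllocation below covers.
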
diff --git a/sycl/source/detail/graph_memory_pool.cpp b/sycl/source/detail/graph_memory_pool.cpp index 56d813e6294e7..bda278c88780f 100644 --- a/sycl/source/detail/graph_memory_pool.cpp +++ b/sycl/source/detail/graph_memory_pool.cpp @@ -94,7 +94,7 @@ graph_mem_pool::tryReuseExistingAllocation( CompatibleAllocs.reserve(MFreeAllocations.size()); // Loop over free allocation list, search for ones that are compatible for - // reuse Currently that means they have the same alloc kind, size and read + // reuse. Currently that means they have the same alloc kind, size and read // only property. for (auto &Ptr : MFreeAllocations) { @@ -106,6 +106,11 @@ graph_mem_pool::tryReuseExistingAllocation( } } + // If we have no suitable allocs to reuse, return early + if (CompatibleAllocs.size() == 0) { + return std::nullopt; + } + // Traverse graph back from each DepNode to try and find any of the suitable // free nodes. We do this in a breadth-first approach because we want to find // the shortest path to a reusable allocation. @@ -119,7 +124,9 @@ graph_mem_pool::tryReuseExistingAllocation( std::optional AllocInfo = {}; - // Called when traversing over nodes to check if the current node + // Called when traversing over nodes to check if the current node is a free + // node for one of the available allocations. If it is we populate AllocInfo + // with the allocation to be reused. auto CheckNodeEqual = [&CompatibleAllocs, &AllocInfo](const std::shared_ptr &CurrentNode) -> bool { @@ -145,9 +152,10 @@ graph_mem_pool::tryReuseExistingAllocation( continue; } - // Check if the node is a free node, and if so is a free node for any of the - // allocations which are free for reuse. We should not bother checking nodes - // that are free nodes, so we continue and check their predecessors. + // Check if the node is a free node and, if so, check if it is a free node + // for any of the allocations which are free for reuse. We should not bother + // checking nodes that are not free nodes, so we continue and check their + // predecessors. if (CurrentNode->MNodeType == node_type::async_free && CheckNodeEqual(CurrentNode)) { // If we found an allocation AllocInfo has already been populated in From 32a7a35a6e95d090bb6bddf3676eb4b9718c1299 Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Thu, 15 May 2025 16:04:20 +0100 Subject: [PATCH 4/5] Add early exit from tryReuseExistingAllocation if no dep nodes - No dep nodes means that we cannot be connected to any free nodes for reuse --- sycl/source/detail/graph_memory_pool.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sycl/source/detail/graph_memory_pool.cpp b/sycl/source/detail/graph_memory_pool.cpp index bda278c88780f..a036efd3a287c 100644 --- a/sycl/source/detail/graph_memory_pool.cpp +++ b/sycl/source/detail/graph_memory_pool.cpp @@ -88,6 +88,11 @@ std::optional graph_mem_pool::tryReuseExistingAllocation( size_t Size, usm::alloc AllocType, bool ReadOnly, const std::vector> &DepNodes) { + // If we have no dependencies this is a no-op because allocations must connect + // to a free node for reuse to be possible. 
+  if (DepNodes.empty()) {
+    return std::nullopt;
+  }
   std::vector CompatibleAllocs;
   // Compatible allocs can only be as big as MFreeAllocations

From 753186e05d32cc7ba84d767c7d3eebc53a4a5306 Mon Sep 17 00:00:00 2001
From: Ben Tracy 
Date: Thu, 15 May 2025 17:13:44 +0100
Subject: [PATCH 5/5] Adjust getDepGraphNodes for handler changes

---
 sycl/source/detail/async_alloc.cpp | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/sycl/source/detail/async_alloc.cpp b/sycl/source/detail/async_alloc.cpp
index 1e530f0b01cc0..46d2f3f5589bf 100644
--- a/sycl/source/detail/async_alloc.cpp
+++ b/sycl/source/detail/async_alloc.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "sycl/accessor.hpp"
 #include 
 #include 
 #include 
@@ -32,16 +33,16 @@ getUrEvents(const std::vector> &DepEvents) {
 }
 
 std::vector> getDepGraphNodes(
-    const std::shared_ptr &Handler,
-    const std::shared_ptr &Queue,
+    sycl::handler &Handler, const std::shared_ptr &Queue,
     const std::shared_ptr &Graph,
     const std::vector> &DepEvents) {
+  auto HandlerImpl = detail::getSyclObjImpl(Handler);
   // Get dependent graph nodes from any events
   auto DepNodes = Graph->getNodesForEvents(DepEvents);
   // If this node was added explicitly we may have node deps in the handler as
   // well, so add them to the list
-  DepNodes.insert(DepNodes.end(), Handler->MNodeDeps.begin(),
-                  Handler->MNodeDeps.end());
+  DepNodes.insert(DepNodes.end(), HandlerImpl->MNodeDeps.begin(),
+                  HandlerImpl->MNodeDeps.end());
   // If this is being recorded from an in-order queue we need to get the last
   // in-order node if any, since this will later become a dependency of the
   // node being processed here.
@@ -77,8 +78,7 @@ void *async_malloc(sycl::handler &h, sycl::usm::alloc kind, size_t size) {
   ur_event_handle_t Event = nullptr;
   // If a graph is present do the allocation from the graph memory pool instead.
   if (auto Graph = h.getCommandGraph(); Graph) {
-    auto DepNodes = getDepGraphNodes(sycl::detail::getSyclObjImpl(h), h.MQueue,
-                                     Graph, DepEvents);
+    auto DepNodes = getDepGraphNodes(h, h.MQueue, Graph, DepEvents);
     alloc = Graph->getMemPool().malloc(size, kind, DepNodes);
   } else {
     auto &Q = h.MQueue->getHandleRef();
@@ -128,8 +128,7 @@ __SYCL_EXPORT void *async_malloc_from_pool(sycl::handler &h, size_t size,
   ur_event_handle_t Event = nullptr;
   // If a graph is present do the allocation from the graph memory pool instead.
   if (auto Graph = h.getCommandGraph(); Graph) {
-    auto DepNodes = getDepGraphNodes(sycl::detail::getSyclObjImpl(h), h.MQueue,
-                                     Graph, DepEvents);
+    auto DepNodes = getDepGraphNodes(h, h.MQueue, Graph, DepEvents);
     // Memory pool is passed as the graph may use some properties of it.
 
     alloc = Graph->getMemPool().malloc(size, pool.get_alloc_kind(), DepNodes,