From c1d95954805e5612f1493e51bf0480efd27020cc Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Tue, 6 May 2025 17:30:49 +0100 Subject: [PATCH 1/5] [SYCL][Graph] Memory reuse for graph allocations in a single graph - Optimize memory use for allocations within a single graph by reusing memory where possible - New handler impl member for node dependency access with the CGF - New E2E tests for memory reuse - Add design doc information on memory reuse - Add missing CGType -> string conversion for graph printing alloc and free nodes --- sycl/doc/design/CommandGraph.md | 20 ++ sycl/source/CMakeLists.txt | 1 + sycl/source/detail/async_alloc.cpp | 52 ++++- sycl/source/detail/graph_impl.cpp | 20 +- sycl/source/detail/graph_impl.hpp | 36 +++ sycl/source/detail/graph_memory_pool.cpp | 189 ++++++++++++++++ sycl/source/detail/graph_memory_pool.hpp | 89 ++++---- sycl/source/detail/handler_impl.hpp | 3 + sycl/source/queue.cpp | 6 +- .../async_alloc_device_memory_reuse.cpp | 10 + ...ync_alloc_device_memory_reuse_multiple.cpp | 10 + ...nc_alloc_device_memory_reuse_zero_init.cpp | 10 + .../async_alloc_device_memory_reuse.cpp | 200 ++++++++++++++++ ...ync_alloc_device_memory_reuse_multiple.cpp | 163 ++++++++++++++ ...nc_alloc_device_memory_reuse_zero_init.cpp | 213 ++++++++++++++++++ .../async_alloc_device_memory_reuse.cpp | 10 + ...ync_alloc_device_memory_reuse_in_order.cpp | 161 +++++++++++++ ...ync_alloc_device_memory_reuse_multiple.cpp | 10 + ...nc_alloc_device_memory_reuse_zero_init.cpp | 10 + 19 files changed, 1150 insertions(+), 63 deletions(-) create mode 100644 sycl/source/detail/graph_memory_pool.cpp create mode 100644 sycl/test-e2e/Graph/AsyncAlloc/Explicit/async_alloc_device_memory_reuse.cpp create mode 100644 sycl/test-e2e/Graph/AsyncAlloc/Explicit/async_alloc_device_memory_reuse_multiple.cpp create mode 100644 sycl/test-e2e/Graph/AsyncAlloc/Explicit/async_alloc_device_memory_reuse_zero_init.cpp create mode 100644 sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device_memory_reuse.cpp create mode 100644 sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device_memory_reuse_multiple.cpp create mode 100644 sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device_memory_reuse_zero_init.cpp create mode 100644 sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse.cpp create mode 100644 sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse_in_order.cpp create mode 100644 sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse_multiple.cpp create mode 100644 sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse_zero_init.cpp diff --git a/sycl/doc/design/CommandGraph.md b/sycl/doc/design/CommandGraph.md index 956082b31097f..c02c73a66d2bb 100644 --- a/sycl/doc/design/CommandGraph.md +++ b/sycl/doc/design/CommandGraph.md @@ -349,6 +349,26 @@ safely assumed to be more performant. It is not likely we'll try to allow in-order execution in more scenarios through a complicated (and imperfect) heuristic but rather expose this as a hint the user can provide. +### Graph Allocation Memory Reuse + +Within a single graph memory allocations which have previously been freed are +checked when adding a new allocation to see if they can be reused. Because we +have to return a pointer to the user immediately when the CGF for a node is +processed when adding the node, we have to do this eagerly anytime +`async_malloc()` is called. Allocations track the last free node associated with +them to represent the most recent use of that allocation. 
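+
+For illustration, a minimal record-and-replay sketch of the pattern this
+targets (assuming the usual SYCL and extension headers, an in-order `Queue`,
+and a modifiable `Graph` recording from it, as in the extension tests). The
+criteria below decide whether the second allocation may reuse the first:
+
+```c++
+constexpr size_t Size = 1024;
+
+Graph.begin_recording(Queue);
+
+// First allocation: a new virtual reservation is created.
+void *TmpA = sycl::ext::oneapi::experimental::async_malloc(
+    Queue, sycl::usm::alloc::device, Size);
+
+Queue.submit([&](sycl::handler &CGH) {
+  CGH.parallel_for(sycl::range<1>(Size / sizeof(int)), [=](sycl::item<1> It) {
+    static_cast<int *>(TmpA)[It.get_linear_id()] = 42;
+  });
+});
+
+Queue.submit([&](sycl::handler &CGH) {
+  sycl::ext::oneapi::experimental::async_free(CGH, TmpA);
+});
+
+// Same allocation kind and size, and the free node above is a predecessor of
+// this new allocation node, so the implementation may return TmpA's address
+// again here.
+void *TmpB = sycl::ext::oneapi::experimental::async_malloc(
+    Queue, sycl::usm::alloc::device, Size);
+
+Graph.end_recording(Queue);
+```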
+ + To be reused the two allocations must meet these criteria: + +- They must be of the same allocation type (device/host/shared). +- They must have a matching size. +- They must have the same properties (currently only read-only matters). +- There must be a path from the last free node associated with a given + allocation to the new allocation node being added. + +If these criteria are met we update the last free node for the allocation and +return the existing pointer to the user. + ## Backend Implementation Implementation of UR command-buffers for each of the supported SYCL 2020 diff --git a/sycl/source/CMakeLists.txt b/sycl/source/CMakeLists.txt index dac4e71af45af..40686655b71b4 100644 --- a/sycl/source/CMakeLists.txt +++ b/sycl/source/CMakeLists.txt @@ -322,6 +322,7 @@ set(SYCL_COMMON_SOURCES "detail/memory_pool_impl.cpp" "detail/async_alloc.cpp" "detail/memory_pool.cpp" + "detail/graph_memory_pool.cpp" "$<$:detail/windows_ur.cpp>" "$<$,$>:detail/posix_ur.cpp>" ) diff --git a/sycl/source/detail/async_alloc.cpp b/sycl/source/detail/async_alloc.cpp index b21f6d13c42e3..b3be1e4ac6c78 100644 --- a/sycl/source/detail/async_alloc.cpp +++ b/sycl/source/detail/async_alloc.cpp @@ -46,21 +46,36 @@ void *async_malloc(sycl::handler &h, sycl::usm::alloc kind, size_t size) { auto &Adapter = h.getContextImplPtr()->getAdapter(); - // Get events to wait on. - auto depEvents = getUrEvents(h.impl->CGData.MEvents); - uint32_t numEvents = h.impl->CGData.MEvents.size(); + // Get CG event dependencies for this allocation. + const auto &DepEvents = h.impl->CGData.MEvents; + auto UREvents = getUrEvents(DepEvents); + uint32_t numEvents = DepEvents.size(); void *alloc = nullptr; ur_event_handle_t Event = nullptr; // If a graph is present do the allocation from the graph memory pool instead. if (auto Graph = h.getCommandGraph(); Graph) { - alloc = Graph->getMemPool().malloc(size, kind); + // Get dependent graph nodes from any events + auto DepNodes = Graph->getNodesForEvents(DepEvents); + // If this node was added explicitly we may have node deps in the handler as + // well, so add them to the list + DepNodes.insert(DepNodes.end(), h.impl->MNodeDeps.begin(), + h.impl->MNodeDeps.end()); + // If this is being recorded from an in-order queue we need to get the last + // in-order node if any, since this will later become a dependency of the + // node being processed here. + if (const auto &LastInOrderNode = Graph->getLastInorderNode(h.MQueue); + LastInOrderNode) { + DepNodes.push_back(LastInOrderNode); + } + + alloc = Graph->getMemPool().malloc(size, kind, DepNodes); } else { auto &Q = h.MQueue->getHandleRef(); Adapter->call( - Q, (ur_usm_pool_handle_t)0, size, nullptr, numEvents, depEvents.data(), + Q, (ur_usm_pool_handle_t)0, size, nullptr, numEvents, UREvents.data(), &alloc, &Event); } @@ -95,24 +110,38 @@ __SYCL_EXPORT void *async_malloc_from_pool(sycl::handler &h, size_t size, auto &Adapter = h.getContextImplPtr()->getAdapter(); auto &memPoolImpl = sycl::detail::getSyclObjImpl(pool); - // Get events to wait on. - auto depEvents = getUrEvents(h.impl->CGData.MEvents); - uint32_t numEvents = h.impl->CGData.MEvents.size(); + // Get CG event dependencies for this allocation. + const auto &DepEvents = h.impl->CGData.MEvents; + auto UREvents = getUrEvents(DepEvents); + uint32_t numEvents = DepEvents.size(); void *alloc = nullptr; ur_event_handle_t Event = nullptr; // If a graph is present do the allocation from the graph memory pool instead. 
if (auto Graph = h.getCommandGraph(); Graph) { + // Get dependent graph nodes from any events + auto DepNodes = Graph->getNodesForEvents(DepEvents); + // If this node was added explicitly we may have node deps in the handler as + // well, so add them to the list + DepNodes.insert(DepNodes.end(), h.impl->MNodeDeps.begin(), + h.impl->MNodeDeps.end()); + // If this is being recorded from an in-order queue we need to get the last + // in-order node if any, since this will later become a dependency of the + // node being processed here. + if (const auto &LastInOrderNode = Graph->getLastInorderNode(h.MQueue); + LastInOrderNode) { + DepNodes.push_back(LastInOrderNode); + } // Memory pool is passed as the graph may use some properties of it. - alloc = Graph->getMemPool().malloc(size, pool.get_alloc_kind(), + alloc = Graph->getMemPool().malloc(size, pool.get_alloc_kind(), DepNodes, sycl::detail::getSyclObjImpl(pool)); } else { auto &Q = h.MQueue->getHandleRef(); Adapter->call( Q, memPoolImpl.get()->get_handle(), size, nullptr, numEvents, - depEvents.data(), &alloc, &Event); + UREvents.data(), &alloc, &Event); } // Async malloc must return a void* immediately. // Set up CommandGroup which is a no-op and pass the event from the alloc. @@ -140,6 +169,9 @@ async_malloc_from_pool(const sycl::queue &q, size_t size, } __SYCL_EXPORT void async_free(sycl::handler &h, void *ptr) { + // We only check for errors for the graph here because marking the allocation + // as free in the graph memory pool requires a node object which doesn't exist + // at this point. if (auto Graph = h.getCommandGraph(); Graph) { // Check if the pointer to be freed has an associated allocation node, and // error if not diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 252fc220df70d..e3ba7f6846098 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -344,7 +344,8 @@ graph_impl::graph_impl(const sycl::context &SyclContext, const sycl::device &SyclDevice, const sycl::property_list &PropList) : MContext(SyclContext), MDevice(SyclDevice), MRecordingQueues(), - MEventsMap(), MInorderQueueMap(), MGraphMemPool(SyclContext, SyclDevice), + MEventsMap(), MInorderQueueMap(), + MGraphMemPool(*this, SyclContext, SyclDevice), MID(NextAvailableID.fetch_add(1, std::memory_order_relaxed)) { checkGraphPropertiesAndThrow(PropList); if (PropList.has_property()) { @@ -504,6 +505,10 @@ graph_impl::add(std::function CGF, (void)Args; sycl::handler Handler{shared_from_this()}; + // Pass the node deps to the handler so they are available when processing the + // CGF, need for async_malloc nodes. + Handler.impl->MNodeDeps = Deps; + #if XPTI_ENABLE_INSTRUMENTATION // Save code location if one was set in TLS. // Ideally it would be nice to capture user's call code location @@ -527,6 +532,10 @@ graph_impl::add(std::function CGF, Handler.finalize(); + // In explicit mode the handler processing of the CGF does not need a write + // lock as it does not modify the graph, we extract information from it here + // and modify the graph. + graph_impl::WriteLock Lock(MMutex); node_type NodeType = Handler.impl->MUserFacingNodeType != ext::oneapi::experimental::node_type::empty @@ -597,6 +606,14 @@ graph_impl::add(node_type NodeType, addDepsToNode(NodeImpl, Deps); + if (NodeType == node_type::async_free) { + auto AsyncFreeCG = + static_cast(NodeImpl->MCommandGroup.get()); + // If this is an async free node mark that it is now available for reuse, + // and pass the async free node for tracking. 
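+    // The pool keeps a weak reference to this free node; a later allocation
+    // can only reuse this memory if its dependency chain reaches the node.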
+ MGraphMemPool.markAllocationAsAvailable(AsyncFreeCG->getPtr(), NodeImpl); + } + return NodeImpl; } @@ -1778,7 +1795,6 @@ node modifiable_command_graph::addImpl(std::function CGF, DepImpls.push_back(sycl::detail::getSyclObjImpl(D)); } - graph_impl::WriteLock Lock(impl->MMutex); std::shared_ptr NodeImpl = impl->add(CGF, {}, DepImpls); return sycl::detail::createSyclObjFromImpl(std::move(NodeImpl)); } diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index ccc6455f6296f..2fc9ec93c9798 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -735,6 +735,12 @@ class node_impl : public std::enable_shared_from_this { case sycl::detail::CGType::EnqueueNativeCommand: Stream << "CGNativeCommand \\n"; break; + case sycl::detail::CGType::AsyncAlloc: + Stream << "CGAsyncAlloc \\n"; + break; + case sycl::detail::CGType::AsyncFree: + Stream << "CGAsyncFree \\n"; + break; default: Stream << "Other \\n"; break; @@ -937,6 +943,30 @@ class graph_impl : public std::enable_shared_from_this { "No node in this graph is associated with this event"); } + /// Find the nodes associated with a list of SYCL events. Throws if no node is + /// found for a given event. + /// @param Events Events to find nodes for. + /// @return A list of node counterparts for each event, in the same order. + std::vector> getNodesForEvents( + const std::vector> Events) { + std::vector> NodeList{Events.size()}; + + ReadLock Lock(MMutex); + + for (size_t i = 0; i < Events.size(); i++) { + if (auto NodeFound = MEventsMap.find(Events[i]); + NodeFound != std::end(MEventsMap)) { + NodeList[i] = NodeFound->second; + } else { + throw sycl::exception( + sycl::make_error_code(errc::invalid), + "No node in this graph is associated with this event"); + } + } + + return NodeList; + } + /// Query for the context tied to this graph. /// @return Context associated with graph. sycl::context getContext() const { return MContext; } @@ -1191,6 +1221,12 @@ class graph_impl : public std::enable_shared_from_this { /// this graph. size_t getExecGraphCount() const { return MExecGraphCount; } + void resetNodeVisitedEdges() { + for (auto &Node : MNodeStorage) { + Node->MTotalVisitedEdges = 0; + } + } + private: /// Check the graph for cycles by performing a depth-first search of the /// graph. If a node is visited more than once in a given path through the diff --git a/sycl/source/detail/graph_memory_pool.cpp b/sycl/source/detail/graph_memory_pool.cpp new file mode 100644 index 0000000000000..56d813e6294e7 --- /dev/null +++ b/sycl/source/detail/graph_memory_pool.cpp @@ -0,0 +1,189 @@ +//==--------- graph_memory_pool.cpp --- SYCL graph extension ---------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "graph_memory_pool.hpp" + +#include +#include + +#include "graph_impl.hpp" + +namespace sycl { +inline namespace _V1 { +namespace ext { +namespace oneapi { +namespace experimental { +namespace detail { + +void * +graph_mem_pool::malloc(size_t Size, usm::alloc AllocType, + const std::vector> &DepNodes, + const std::shared_ptr &MemPool) { + // We are potentially modifying contents of this memory pool and the owning + // graph, so take a lock here. 
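+  // The reuse search below also walks and temporarily marks nodes owned by
+  // the graph (MTotalVisitedEdges), hence the graph-wide lock rather than a
+  // pool-local one.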
+ graph_impl::WriteLock Lock(MGraph.MMutex); + + void *Alloc = nullptr; + alloc_info AllocInfo = {}; + AllocInfo.Kind = AllocType; + // Collect relevant properties from memory pool + if (MemPool) { + auto PropList = MemPool->getPropList(); + if (PropList.has_property()) { + AllocInfo.ZeroInit = true; + } + if (PropList.has_property()) { + AllocInfo.ReadOnly = true; + } + } + + switch (AllocType) { + case usm::alloc::device: { + + auto &CtxImpl = sycl::detail::getSyclObjImpl(MContext); + auto &Adapter = CtxImpl->getAdapter(); + + size_t Granularity = get_mem_granularity(MDevice, MContext); + uintptr_t StartPtr = 0; + size_t AlignedSize = alignByteSize(Size, Granularity); + // See if we can find an allocation to reuse + auto AllocOpt = tryReuseExistingAllocation(AlignedSize, AllocType, + AllocInfo.ReadOnly, DepNodes); + + // If we got a value back then an allocation was available for reuse and we + // can just return that pointer + if (AllocOpt) { + return AllocOpt.value().Ptr; + } + + // If no allocation could be reused, do a new virtual reservation + Adapter->call( + CtxImpl->getHandleRef(), reinterpret_cast(StartPtr), + AlignedSize, &Alloc); + + AllocInfo.Size = AlignedSize; + AllocInfo.Ptr = Alloc; + + MAllocations[Alloc] = AllocInfo; + break; + } + + default: + throw sycl::exception(sycl::make_error_code(errc::feature_not_supported), + "Only device allocations are currently supported " + "in graph allocation nodes!"); + break; + } + + return Alloc; +} + +std::optional +graph_mem_pool::tryReuseExistingAllocation( + size_t Size, usm::alloc AllocType, bool ReadOnly, + const std::vector> &DepNodes) { + + std::vector CompatibleAllocs; + // Compatible allocs can only be as big as MFreeAllocations + CompatibleAllocs.reserve(MFreeAllocations.size()); + + // Loop over free allocation list, search for ones that are compatible for + // reuse Currently that means they have the same alloc kind, size and read + // only property. + + for (auto &Ptr : MFreeAllocations) { + alloc_info &Info = MAllocations.at(Ptr); + if (Info.Kind == AllocType && Info.Size == Size && + Info.ReadOnly == ReadOnly) { + // Store the alloc info since it is compatible + CompatibleAllocs.push_back(Info); + } + } + + // Traverse graph back from each DepNode to try and find any of the suitable + // free nodes. We do this in a breadth-first approach because we want to find + // the shortest path to a reusable allocation. + + std::queue> NodesToCheck; + + // Add all the dependent nodes to the queue, they will be popped first + for (auto &Dep : DepNodes) { + NodesToCheck.push(Dep); + } + + std::optional AllocInfo = {}; + + // Called when traversing over nodes to check if the current node + auto CheckNodeEqual = + [&CompatibleAllocs, + &AllocInfo](const std::shared_ptr &CurrentNode) -> bool { + for (auto &Alloc : CompatibleAllocs) { + const auto &AllocFreeNode = Alloc.LastFreeNode; + // Compare control blocks without having to lock AllocFreeNode to check + // for node equality + if (!CurrentNode.owner_before(AllocFreeNode) && + !AllocFreeNode.owner_before(CurrentNode)) { + Alloc.LastFreeNode.reset(); + AllocInfo = Alloc; + return true; + } + } + return false; + }; + + while (!NodesToCheck.empty()) { + auto CurrentNode = NodesToCheck.front().lock(); + NodesToCheck.pop(); + + if (CurrentNode->MTotalVisitedEdges > 0) { + continue; + } + + // Check if the node is a free node, and if so is a free node for any of the + // allocations which are free for reuse. 
We should not bother checking nodes + // that are free nodes, so we continue and check their predecessors. + if (CurrentNode->MNodeType == node_type::async_free && + CheckNodeEqual(CurrentNode)) { + // If we found an allocation AllocInfo has already been populated in + // CheckNodeEqual(), so we simply break out of the loop + break; + } + + // Add CurrentNode predecessors to queue + for (auto &Pred : CurrentNode->MPredecessors) { + NodesToCheck.push(Pred); + } + + // Mark node as visited + CurrentNode->MTotalVisitedEdges = 1; + } + // Reset visited nodes tracking + MGraph.resetNodeVisitedEdges(); + // If we found an allocation, remove it from the free list. + if (AllocInfo) { + MFreeAllocations.erase(std::find(MFreeAllocations.begin(), + MFreeAllocations.end(), + AllocInfo.value().Ptr)); + } + + return AllocInfo; +} + +void graph_mem_pool::markAllocationAsAvailable( + void *Ptr, const std::shared_ptr &FreeNode) { + MFreeAllocations.push_back(Ptr); + MAllocations.at(Ptr).LastFreeNode = FreeNode; +} + +} // namespace detail +} // namespace experimental +} // namespace oneapi +} // namespace ext +} // namespace _V1 +} // namespace sycl diff --git a/sycl/source/detail/graph_memory_pool.hpp b/sycl/source/detail/graph_memory_pool.hpp index e9133bb7f7d0c..6da1b14526b12 100644 --- a/sycl/source/detail/graph_memory_pool.hpp +++ b/sycl/source/detail/graph_memory_pool.hpp @@ -20,15 +20,20 @@ namespace oneapi { namespace experimental { namespace detail { +// Forward declarations +class node_impl; + /// Class handling graph-owned memory allocations. Device allocations are /// managed using virtual memory. class graph_mem_pool { // Info descriptor for various properties of an allocation struct alloc_info { + // Pointer to the start of the allocation + void *Ptr = nullptr; // Allocation kind - usm::alloc Kind; + usm::alloc Kind = usm::alloc::unknown; // Size of the allocation - size_t Size; + size_t Size = 0; // Is currently mapped to physical memory bool Mapped = false; // Index into the array of physical memory @@ -37,11 +42,14 @@ class graph_mem_pool { bool ReadOnly = false; // Should the allocation be zero initialized during initial allocation bool ZeroInit = false; + // Last free node for this allocation in the graph + std::weak_ptr LastFreeNode = {}; }; public: - graph_mem_pool(const context &Context, const device &Device) - : MContext(Context), MDevice(Device) {} + graph_mem_pool(graph_impl &GraphImpl, const context &Context, + const device &Device) + : MGraph(GraphImpl), MContext(Context), MDevice(Device) {} ~graph_mem_pool() { for (auto &[Ptr, AllocInfo] : MAllocations) { // Unmap allocations if required before physical memory is released @@ -64,55 +72,14 @@ class graph_mem_pool { /// memory before use by calling allocateAndMapAll() /// @param Size Size of the allocation /// @param AllocType Type of the allocation + /// @param DepNodes Any node dependencies for this allocation, used to + /// identify allocations that can be reused. /// @param MemPool Optional memory pool from which allocations will not be /// made directly but properties may be respected. 
/// @return A pointer to the start of the allocation void *malloc(size_t Size, usm::alloc AllocType, - const std::shared_ptr &MemPool = nullptr) { - void *Alloc = nullptr; - switch (AllocType) { - case usm::alloc::device: { - - auto &CtxImpl = sycl::detail::getSyclObjImpl(MContext); - auto &Adapter = CtxImpl->getAdapter(); - - size_t Granularity = get_mem_granularity(MDevice, MContext); - uintptr_t StartPtr = 0; - size_t AlignedSize = alignByteSize(Size, Granularity); - // Do virtual reservation - Adapter->call( - CtxImpl->getHandleRef(), reinterpret_cast(StartPtr), - AlignedSize, &Alloc); - - alloc_info AllocInfo = {}; - AllocInfo.Kind = AllocType; - AllocInfo.Size = AlignedSize; - - // Collect relevant properties from memory pool - if (MemPool) { - auto PropList = MemPool->getPropList(); - if (PropList.has_property()) { - AllocInfo.ZeroInit = true; - } - if (PropList.has_property()) { - AllocInfo.ReadOnly = true; - } - } - - MAllocations[Alloc] = AllocInfo; - break; - } - - default: - throw sycl::exception(sycl::make_error_code(errc::feature_not_supported), - "Only device allocations are currently supported " - "in graph allocation nodes!"); - break; - } - - return Alloc; - } + const std::vector> &DepNodes, + const std::shared_ptr &MemPool = nullptr); /// Return the total amount of memory being used by this pool size_t getMemUseCurrent() const { @@ -184,7 +151,27 @@ class graph_mem_pool { return MAllocations.find(Ptr) != MAllocations.end(); } + /// Mark that this allocation has been freed and is available for reuse. + /// @param Ptr The pointer to the allocation. + /// @param FreeNode The graph node of node_type::async_free which is freeing + /// the allocation. + void markAllocationAsAvailable(void *Ptr, + const std::shared_ptr &FreeNode); + private: + /// Tries to reuse an existing allocation which has been marked free in the + /// graph. + /// @param Size Size of the allocation. + /// @param AllocType USM type of the allocation. + /// @param ReadOnly True if the allocation is read only + /// @param DepNodes Node dependencies of this allocation, used to search for + /// reusable allocations. + /// @returns An optional allocation info value, where a null value indicates + /// that no allocation could be reused. 
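+  /// The search walks the graph breadth-first from the given dependency nodes
+  /// towards the roots, stopping at the first compatible allocation whose last
+  /// free node is reached.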
+ std::optional tryReuseExistingAllocation( + size_t Size, usm::alloc AllocType, bool ReadOnly, + const std::vector> &DepNodes); + /// Returns an aligned byte size given a required granularity /// @param UnalignedByteSize The original requested allocation size /// @param Granularity The required granularity for this allocation @@ -193,6 +180,8 @@ class graph_mem_pool { return ((UnalignedByteSize + Granularity - 1) / Granularity) * Granularity; } + /// Graph that owns this mem pool + graph_impl &MGraph; /// Context associated with allocations from this pool context MContext; /// Device associated with allocations from this pool @@ -201,6 +190,8 @@ class graph_mem_pool { std::unordered_map MAllocations; /// List of physical memory allocations used for virtual device reservations std::vector> MPhysicalMem; + /// List of pointers to allocations which are currently free for reuse + std::vector MFreeAllocations; }; } // namespace detail } // namespace experimental diff --git a/sycl/source/detail/handler_impl.hpp b/sycl/source/detail/handler_impl.hpp index 42b1991f153f5..7d87cc4abda7a 100644 --- a/sycl/source/detail/handler_impl.hpp +++ b/sycl/source/detail/handler_impl.hpp @@ -175,6 +175,9 @@ class handler_impl { std::shared_ptr MSubgraphNode; /// Storage for the CG created when handling graph nodes added explicitly. std::unique_ptr MGraphNodeCG; + /// Storage for node dependencies passed when adding a graph node explicitly + std::vector> + MNodeDeps; /// Storage for lambda/function when using HostTask std::shared_ptr MHostTask; diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index 4383bb4a2d49a..623e122351c3a 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -440,9 +440,11 @@ sycl::detail::optional queue::ext_oneapi_get_last_event_impl() const { return std::nullopt; // If the last event was discarded or a NOP, we insert a marker to represent - // an event at end. + // an event at end. If the event comes from a graph we must skip this because + // the original event is used for tracking nodes in the graph. 
auto LastEventImpl = detail::getSyclObjImpl(*LastEvent); - if (LastEventImpl->isDiscarded() || LastEventImpl->isNOP()) + if (!LastEventImpl->getCommandGraph() && + (LastEventImpl->isDiscarded() || LastEventImpl->isNOP())) LastEvent = detail::createSyclObjFromImpl(impl->insertMarkerEvent(impl)); return LastEvent; diff --git a/sycl/test-e2e/Graph/AsyncAlloc/Explicit/async_alloc_device_memory_reuse.cpp b/sycl/test-e2e/Graph/AsyncAlloc/Explicit/async_alloc_device_memory_reuse.cpp new file mode 100644 index 0000000000000..944e7449d5d05 --- /dev/null +++ b/sycl/test-e2e/Graph/AsyncAlloc/Explicit/async_alloc_device_memory_reuse.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/async_alloc_device_memory_reuse.cpp" diff --git a/sycl/test-e2e/Graph/AsyncAlloc/Explicit/async_alloc_device_memory_reuse_multiple.cpp b/sycl/test-e2e/Graph/AsyncAlloc/Explicit/async_alloc_device_memory_reuse_multiple.cpp new file mode 100644 index 0000000000000..bb093866ce70b --- /dev/null +++ b/sycl/test-e2e/Graph/AsyncAlloc/Explicit/async_alloc_device_memory_reuse_multiple.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/async_alloc_device_memory_reuse_multiple.cpp" diff --git a/sycl/test-e2e/Graph/AsyncAlloc/Explicit/async_alloc_device_memory_reuse_zero_init.cpp b/sycl/test-e2e/Graph/AsyncAlloc/Explicit/async_alloc_device_memory_reuse_zero_init.cpp new file mode 100644 index 0000000000000..c00dab11f52cb --- /dev/null +++ b/sycl/test-e2e/Graph/AsyncAlloc/Explicit/async_alloc_device_memory_reuse_zero_init.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/async_alloc_device_memory_reuse_zero_init.cpp" diff --git a/sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device_memory_reuse.cpp b/sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device_memory_reuse.cpp new file mode 100644 index 0000000000000..9887aa0bce9b8 --- /dev/null +++ b/sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device_memory_reuse.cpp @@ -0,0 +1,200 @@ +// Tests memory reuse behaviour of device graph 
allocations. +// NOTE: This test partially relies on knowing how the implementation works and +// that the contents of memory will persist when allocations are reused. This is +// useful for testing but is not an assumption that a user can or should make. + +#include "../../graph_common.hpp" +#include + +using T = int; +void add_nodes_to_graph( + exp_ext::command_graph &Graph, + queue &Queue, size_t Size, T *PtrInput) { + // Create 2 pointers for async allocations + T *AsyncPtrA = nullptr; + T *AsyncPtrB = nullptr; + // Add alloc nodes at the root of the graph for each allocation, this should + // result in three unique allocations + auto AllocNodeA = add_node(Graph, Queue, [&](handler &CGH) { + AsyncPtrA = static_cast( + exp_ext::async_malloc(CGH, usm::alloc::device, Size * sizeof(T))); + }); + auto AllocNodeB = add_node(Graph, Queue, [&](handler &CGH) { + AsyncPtrB = static_cast( + exp_ext::async_malloc(CGH, usm::alloc::device, Size * sizeof(T))); + }); + + // Assert that we have received unique ptr values, this should always be true + // regardless of implementation. + assert((AsyncPtrA != AsyncPtrB)); + + // Store pointer values for later comparison + void *FirstAsyncPtrA = AsyncPtrA; + void *FirstAsyncPtrB = AsyncPtrB; + // Add kernel that fills the async allocs with values + auto KernelFillPtrs = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {AllocNodeA, AllocNodeB}); + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + AsyncPtrA[LinID] = (1 + LinID); + AsyncPtrB[LinID] = (2 + LinID); + }); + }, + AllocNodeA, AllocNodeB); + // Free all the async allocations, making it possible to reuse them + auto FreeNodeA = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, KernelFillPtrs); + exp_ext::async_free(CGH, AsyncPtrA); + }, + KernelFillPtrs); + auto FreeNodeB = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, KernelFillPtrs); + exp_ext::async_free(CGH, AsyncPtrB); + }, + KernelFillPtrs); + + // Create 2 connected layers in the graph, in each layer we will do a new + // async_alloc which should reuse one of the 2 previously freed pointers. The + // other kernel in the layer will simply operate on the output ptr. + + // The first layer will have a direct dependency on a free node, but the + // subsequent layer will have an indirect dependency. We do not test the order + // in which allocations are picked for reuse, but we can assume both will be + // reused by the implementation (same size and properties so they are + // compatible). + + // First layer, allocation is added first + auto AllocNodeA1 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {FreeNodeA, FreeNodeB}); + AsyncPtrA = static_cast( + exp_ext::async_malloc(CGH, usm::alloc::device, Size * sizeof(T))); + }, + FreeNodeA, FreeNodeB); + + // Check that the new allocation matches one of the previously returned + // values. 
+ assert((AsyncPtrA == FirstAsyncPtrA) || (AsyncPtrA == FirstAsyncPtrB)); + // Increment output pointer + auto KernelInc1 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {FreeNodeA, FreeNodeB}); + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + PtrInput[LinID] += 1; + }); + }, + FreeNodeA, FreeNodeB); + + // Second layer, allocation has an indirect dependency on a free node + // Increment output pointer + auto KernelInc2 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {KernelInc1}); + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + PtrInput[LinID] += 1; + }); + }, + KernelInc1); + auto AllocNodeB2 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {KernelInc1, AllocNodeA1}); + AsyncPtrB = static_cast( + exp_ext::async_malloc(CGH, usm::alloc::device, Size * sizeof(T))); + }, + KernelInc1, AllocNodeA1); + // Check that the new allocation matches one of the previously returned + // values. + assert((AsyncPtrB == FirstAsyncPtrA) || (AsyncPtrB == FirstAsyncPtrB)); + + // Add a final kernel that adds the async allocation values to the output + // pointer + auto KernelAddToOutput = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {KernelInc2, AllocNodeB2}); + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + PtrInput[LinID] += (AsyncPtrA[LinID] + AsyncPtrB[LinID]); + }); + }, + KernelInc2, AllocNodeB2); + + // Free the allocations again + + add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, KernelAddToOutput); + exp_ext::async_free(CGH, AsyncPtrA); + }, + KernelAddToOutput); + add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, KernelAddToOutput); + exp_ext::async_free(CGH, AsyncPtrB); + }, + KernelAddToOutput); +} + +void calculate_reference_data(size_t Iterations, size_t Size, + std::vector &ReferenceOutput) { + for (size_t i = 0; i < Iterations; i++) { + for (size_t j = 0; j < Size; j++) { + ReferenceOutput[j] += 2; + ReferenceOutput[j] += (j + 1); + ReferenceOutput[j] += (j + 2); + } + } +} + +int main() { + queue Queue{}; + + std::vector DataInput(Size); + + std::iota(DataInput.begin(), DataInput.end(), 1); + + std::vector ReferenceOutput(DataInput); + calculate_reference_data(Iterations, Size, ReferenceOutput); + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + T *PtrInput = malloc_device(Size, Queue); + + std::vector OutputData(Size); + + Queue.copy(DataInput.data(), PtrInput, Size); + Queue.wait_and_throw(); + + // Add commands to graph + add_nodes_to_graph(Graph, Queue, Size, PtrInput); + + auto GraphExec = Graph.finalize(); + + for (unsigned n = 0; n < Iterations; n++) { + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + } + + Queue.wait_and_throw(); + Queue.copy(PtrInput, OutputData.data(), Size).wait_and_throw(); + for (size_t i = 0; i < Size; i++) { + assert(check_value(i, ReferenceOutput[i], OutputData[i], "OutputData")); + } + + free(PtrInput, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device_memory_reuse_multiple.cpp b/sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device_memory_reuse_multiple.cpp new file mode 100644 index 0000000000000..e791371544677 --- /dev/null +++ b/sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device_memory_reuse_multiple.cpp @@ -0,0 +1,163 @@ +// Tests memory reuse behaviour of 
device graph allocations when an allocation +// is reused multiple times. NOTE: This test partially relies on knowing how the +// implementation works and that the contents of memory will persist when +// allocations are reused. This is useful for testing but is not an assumption +// that a user can or should make. + +#include "../../graph_common.hpp" +#include + +using T = int; +void add_nodes_to_graph( + exp_ext::command_graph &Graph, + queue &Queue, size_t Size, T *PtrInput) { + // Create 1 pointers for async allocations + T *AsyncPtr = nullptr; + // Add alloc nodes at the root of the graph for each allocation, this should + // result in three unique allocations + auto AllocNode1 = add_node(Graph, Queue, [&](handler &CGH) { + AsyncPtr = static_cast( + exp_ext::async_malloc(CGH, usm::alloc::device, Size * sizeof(T))); + }); + + // Store pointer value for later comparison + void *FirstAsyncPtr = AsyncPtr; + + // Add kernel that fills the async alloc with values + auto KernelFillPtrs = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {AllocNode1}); + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + AsyncPtr[LinID] = (1 + LinID); + }); + }, + AllocNode1); + // Free all the async allocation, making it possible to reuse it + auto FreeNode1 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, KernelFillPtrs); + exp_ext::async_free(CGH, AsyncPtr); + }, + KernelFillPtrs); + + // Add an allocation node which should reuse the previous allocation, followed + // by a kernel that uses the data, and then freeing the allocation. + + auto AllocNode2 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {FreeNode1}); + AsyncPtr = static_cast( + exp_ext::async_malloc(CGH, usm::alloc::device, Size * sizeof(T))); + }, + FreeNode1); + + // Check that the new allocation matches one of the previously returned + // values. + assert(AsyncPtr == FirstAsyncPtr); + + auto KernelAdd1 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {AllocNode2}); + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + PtrInput[LinID] += AsyncPtr[LinID]; + }); + }, + AllocNode2); + + auto FreeNode2 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, KernelAdd1); + exp_ext::async_free(CGH, AsyncPtr); + }, + KernelAdd1); + + // Repeat the previous 3 nodes to test reuse an allocation which has + // previously been reused. + + auto AllocNode3 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {FreeNode2}); + AsyncPtr = static_cast( + exp_ext::async_malloc(CGH, usm::alloc::device, Size * sizeof(T))); + }, + FreeNode2); + + // Check that the new allocation matches one of the previously returned + // values. 
+ assert(AsyncPtr == FirstAsyncPtr); + + auto KernelAdd2 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {AllocNode3}); + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + PtrInput[LinID] += AsyncPtr[LinID]; + }); + }, + AllocNode3); + + add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, KernelAdd2); + exp_ext::async_free(CGH, AsyncPtr); + }, + KernelAdd2); +} + +void calculate_reference_data(size_t Iterations, size_t Size, + std::vector &ReferenceOutput) { + for (size_t i = 0; i < Iterations; i++) { + for (size_t j = 0; j < Size; j++) { + ReferenceOutput[j] += (j + 1) * 2; + } + } +} + +int main() { + queue Queue{}; + + std::vector DataInput(Size); + + std::iota(DataInput.begin(), DataInput.end(), 1); + + std::vector ReferenceOutput(DataInput); + calculate_reference_data(Iterations, Size, ReferenceOutput); + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + T *PtrInput = malloc_device(Size, Queue); + + std::vector OutputData(Size); + + Queue.copy(DataInput.data(), PtrInput, Size); + Queue.wait_and_throw(); + + // Add commands to graph + add_nodes_to_graph(Graph, Queue, Size, PtrInput); + + auto GraphExec = Graph.finalize(); + + for (unsigned n = 0; n < Iterations; n++) { + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + } + + Queue.wait_and_throw(); + Queue.copy(PtrInput, OutputData.data(), Size).wait_and_throw(); + for (size_t i = 0; i < Size; i++) { + assert(check_value(i, ReferenceOutput[i], OutputData[i], "OutputData")); + } + + free(PtrInput, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device_memory_reuse_zero_init.cpp b/sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device_memory_reuse_zero_init.cpp new file mode 100644 index 0000000000000..3f9bfb6443b1a --- /dev/null +++ b/sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device_memory_reuse_zero_init.cpp @@ -0,0 +1,213 @@ +// Tests memory reuse behaviour of device graph allocations from a zero-init +// memory pool. +// NOTE: This test partially relies on knowing how the implementation works and +// that the contents of memory will persist when allocations are reused. This is +// useful for testing but is not an assumption that a user can or should make. + +#include "../../graph_common.hpp" +#include +#include +#include + +#define __SYCL_USE_VARIADIC_SPIRV_OCL_PRINTF__ + +using T = int; +void add_nodes_to_graph( + exp_ext::command_graph &Graph, + queue &Queue, size_t Size, T *PtrInput) { + // Create a memory pool for async allocations with the zero init property + exp_ext::memory_pool MemPool{ + Queue, usm::alloc::device, {exp_ext::property::memory_pool::zero_init{}}}; + // Create 2 pointers for async allocations + T *AsyncPtrA = nullptr; + T *AsyncPtrB = nullptr; + // Add alloc nodes at the root of the graph for each allocation, this should + // result in three unique allocations + auto AllocNodeA = add_node(Graph, Queue, [&](handler &CGH) { + AsyncPtrA = static_cast( + exp_ext::async_malloc_from_pool(CGH, Size * sizeof(T), MemPool)); + }); + auto AllocNodeB = add_node(Graph, Queue, [&](handler &CGH) { + AsyncPtrB = static_cast( + exp_ext::async_malloc_from_pool(CGH, Size * sizeof(T), MemPool)); + }); + + // Assert that we have received unique ptr values, this should always be true + // regardless of implementation. 
+ assert((AsyncPtrA != AsyncPtrB)); + + // Store pointer values for later comparison + void *FirstAsyncPtrA = AsyncPtrA; + void *FirstAsyncPtrB = AsyncPtrB; + // Add kernel that fills the async allocs with values + auto KernelFillPtrs = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {AllocNodeA, AllocNodeB}); + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + // Allocations should be zero initialized on this first use, so we + // should be able to just add on values rather than set them + // explicitly. + AsyncPtrA[LinID] += (1 + LinID); + AsyncPtrB[LinID] += (2 + LinID); + }); + }, + AllocNodeA, AllocNodeB); + // Free all the async allocations, making it possible to reuse them + auto FreeNodeA = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, KernelFillPtrs); + exp_ext::async_free(CGH, AsyncPtrA); + }, + KernelFillPtrs); + auto FreeNodeB = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, KernelFillPtrs); + exp_ext::async_free(CGH, AsyncPtrB); + }, + KernelFillPtrs); + + // Create 2 connected layers in the graph, in each layer we will do a new + // async_alloc which should reuse one of the 2 previously freed pointers. The + // other kernel in the layer will simply operate on the output ptr. + + // The first layer will have a direct dependency on a free node, but the + // subsequent layer will have an indirect dependency. We do not test the order + // in which allocations are picked for reuse, but we can assume both will be + // reused by the implementation (same size and properties so they are + // compatible). + + // First layer, allocation is added first + auto AllocNodeA1 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {FreeNodeA, FreeNodeB}); + AsyncPtrA = static_cast( + exp_ext::async_malloc_from_pool(CGH, Size * sizeof(T), MemPool)); + }, + FreeNodeA, FreeNodeB); + + // Check that the new allocation matches one of the previously returned + // values. + assert((AsyncPtrA == FirstAsyncPtrA) || (AsyncPtrA == FirstAsyncPtrB)); + // Increment output pointer + auto KernelInc1 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {FreeNodeA, FreeNodeB}); + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + PtrInput[LinID] += 1; + }); + }, + FreeNodeA, FreeNodeB); + + // Second layer, allocation has an indirect dependency on a free node + // Increment output pointer + auto KernelInc2 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {KernelInc1}); + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + PtrInput[LinID] += 1; + }); + }, + KernelInc1); + auto AllocNodeB2 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {KernelInc1, AllocNodeA1}); + AsyncPtrB = static_cast( + exp_ext::async_malloc_from_pool(CGH, Size * sizeof(T), MemPool)); + }, + KernelInc1, AllocNodeA1); + // Check that the new allocation matches one of the previously returned + // values. 
+ assert((AsyncPtrB == FirstAsyncPtrA) || (AsyncPtrB == FirstAsyncPtrB)); + + // Add a final kernel that adds the async allocation values to the output + // pointer + auto KernelAddToOutput = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {KernelInc2, AllocNodeB2}); + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + PtrInput[LinID] += (AsyncPtrA[LinID] + AsyncPtrB[LinID]); + }); + }, + KernelInc2, AllocNodeB2); + + // Free the allocations again + + add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, KernelAddToOutput); + exp_ext::async_free(CGH, AsyncPtrA); + }, + KernelAddToOutput); + add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, KernelAddToOutput); + exp_ext::async_free(CGH, AsyncPtrB); + }, + KernelAddToOutput); +} + +void calculate_reference_data(size_t Iterations, size_t Size, + std::vector &ReferenceOutput) { + for (size_t i = 0; i < Iterations; i++) { + for (size_t j = 0; j < Size; j++) { + ReferenceOutput[j] += 2; + ReferenceOutput[j] += (j + 1) * (i + 1); + ReferenceOutput[j] += (j + 2) * (i + 1); + } + } +} + +int main() { + queue Queue{}; + + std::vector DataInput(Size); + + std::iota(DataInput.begin(), DataInput.end(), 1); + + std::vector ReferenceOutput(DataInput); + calculate_reference_data(Iterations, Size, ReferenceOutput); + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + T *PtrInput = malloc_device(Size, Queue); + + std::vector OutputData(Size); + + Queue.copy(DataInput.data(), PtrInput, Size); + Queue.wait_and_throw(); + + // Add commands to graph + add_nodes_to_graph(Graph, Queue, Size, PtrInput); + + auto GraphExec = Graph.finalize(); + + Graph.print_graph("test.dot"); + + for (unsigned n = 0; n < Iterations; n++) { + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + } + + Queue.wait_and_throw(); + Queue.copy(PtrInput, OutputData.data(), Size).wait_and_throw(); + for (size_t i = 0; i < Size; i++) { + assert(check_value(i, ReferenceOutput[i], OutputData[i], "OutputData")); + } + + free(PtrInput, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse.cpp b/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse.cpp new file mode 100644 index 0000000000000..cfdc7e664acc5 --- /dev/null +++ b/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/async_alloc_device_memory_reuse.cpp" diff --git a/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse_in_order.cpp b/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse_in_order.cpp new file mode 100644 index 0000000000000..a40da6fb026c5 --- /dev/null +++ b/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse_in_order.cpp @@ -0,0 +1,161 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in 
Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// Tests memory reuse behaviour of device graph allocations when an in-order +// queue is used. +// NOTE: This test partially relies on knowing how the +// implementation works and that the contents of memory will persist when +// allocations are reused. This is useful for testing but is not an assumption +// that a user can or should make. + +#include "../../graph_common.hpp" +#include +#include + +using T = int; +void add_nodes_to_graph(queue &Queue, size_t Size, T *PtrInput) { + // Create 2 pointers for async allocations + T *AsyncPtrA = nullptr; + T *AsyncPtrB = nullptr; + // Add alloc nodes at the root of the graph for each allocation, this should + // result in three unique allocations + Queue.submit([&](handler &CGH) { + AsyncPtrA = static_cast( + exp_ext::async_malloc(CGH, usm::alloc::device, Size * sizeof(T))); + }); + // Use the free function instead of handle version + AsyncPtrB = static_cast( + exp_ext::async_malloc(Queue, usm::alloc::device, Size * sizeof(T))); + + // Assert that we have received unique ptr values, this should always be true + // regardless of implementation. + assert((AsyncPtrA != AsyncPtrB)); + + // Store pointer values for later comparison + void *FirstAsyncPtrA = AsyncPtrA; + void *FirstAsyncPtrB = AsyncPtrB; + // Add kernel that fills the async allocs with values + Queue.submit([&](handler &CGH) { + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + AsyncPtrA[LinID] = (1 + LinID); + AsyncPtrB[LinID] = (2 + LinID); + }); + }); + // Free all the async allocations, making it possible to reuse them + Queue.submit([&](handler &CGH) { exp_ext::async_free(CGH, AsyncPtrA); }); + Queue.submit([&](handler &CGH) { exp_ext::async_free(CGH, AsyncPtrB); }); + + // Create 2 connected layers in the graph, in each layer we will do a new + // async_alloc which should reuse one of the 2 previously freed pointers. The + // other kernel in the layer will simply operate on the output ptr. + + // The first layer will have a direct dependency on a free node, but the + // subsequent layer will have an indirect dependency. We do not test the order + // in which allocations are picked for reuse, but we can assume both will be + // reused by the implementation (same size and properties so they are + // compatible). + + // First layer, allocation is added first + Queue.submit([&](handler &CGH) { + AsyncPtrA = static_cast( + exp_ext::async_malloc(CGH, usm::alloc::device, Size * sizeof(T))); + }); + + // Check that the new allocation matches one of the previously returned + // values. 
+ assert((AsyncPtrA == FirstAsyncPtrA) || (AsyncPtrA == FirstAsyncPtrB)); + // Increment output pointer + Queue.submit([&](handler &CGH) { + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + PtrInput[LinID] += 1; + }); + }); + + // Second layer, allocation has an indirect dependency on a free node + // Increment output pointer + Queue.submit([&](handler &CGH) { + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + PtrInput[LinID] += 1; + }); + }); + Queue.submit([&](handler &CGH) { + AsyncPtrB = static_cast( + exp_ext::async_malloc(CGH, usm::alloc::device, Size * sizeof(T))); + }); + // Check that the new allocation matches one of the previously returned + // values. + assert((AsyncPtrB == FirstAsyncPtrA) || (AsyncPtrB == FirstAsyncPtrB)); + + // Add a final kernel that adds the async allocation values to the output + // pointer + Queue.submit([&](handler &CGH) { + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + PtrInput[LinID] += (AsyncPtrA[LinID] + AsyncPtrB[LinID]); + }); + }); + + // Free the allocations again + + Queue.submit([&](handler &CGH) { exp_ext::async_free(CGH, AsyncPtrA); }); + Queue.submit([&](handler &CGH) { exp_ext::async_free(CGH, AsyncPtrB); }); +} + +void calculate_reference_data(size_t Iterations, size_t Size, + std::vector &ReferenceOutput) { + for (size_t i = 0; i < Iterations; i++) { + for (size_t j = 0; j < Size; j++) { + ReferenceOutput[j] += 2; + ReferenceOutput[j] += (j + 1); + ReferenceOutput[j] += (j + 2); + } + } +} + +int main() { + queue Queue{{property::queue::in_order{}}}; + + std::vector DataInput(Size); + + std::iota(DataInput.begin(), DataInput.end(), 1); + + std::vector ReferenceOutput(DataInput); + calculate_reference_data(Iterations, Size, ReferenceOutput); + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + T *PtrInput = malloc_device(Size, Queue); + + std::vector OutputData(Size); + + Queue.copy(DataInput.data(), PtrInput, Size); + Queue.wait_and_throw(); + + // Add commands to graph + Graph.begin_recording(Queue); + add_nodes_to_graph(Queue, Size, PtrInput); + Graph.end_recording(Queue); + + auto GraphExec = Graph.finalize(); + + for (unsigned n = 0; n < Iterations; n++) { + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + } + + Queue.wait_and_throw(); + Queue.copy(PtrInput, OutputData.data(), Size).wait_and_throw(); + for (size_t i = 0; i < Size; i++) { + assert(check_value(i, ReferenceOutput[i], OutputData[i], "OutputData")); + } + + free(PtrInput, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse_multiple.cpp b/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse_multiple.cpp new file mode 100644 index 0000000000000..533121c33e83e --- /dev/null +++ b/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse_multiple.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define 
GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/async_alloc_device_memory_reuse_multiple.cpp" diff --git a/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse_zero_init.cpp b/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse_zero_init.cpp new file mode 100644 index 0000000000000..3506691046f15 --- /dev/null +++ b/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse_zero_init.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/async_alloc_device_memory_reuse_zero_init.cpp" From 09e1322e8f5e3ec8e082afc61aa0a9e9e16d749f Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Thu, 8 May 2025 13:37:58 +0100 Subject: [PATCH 2/5] Addressing review comments --- sycl/source/detail/graph_impl.hpp | 12 +++++++----- sycl/source/queue.cpp | 2 +- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 2fc9ec93c9798..3a7084109185a 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -948,15 +948,16 @@ class graph_impl : public std::enable_shared_from_this { /// @param Events Events to find nodes for. /// @return A list of node counterparts for each event, in the same order. std::vector> getNodesForEvents( - const std::vector> Events) { - std::vector> NodeList{Events.size()}; + const std::vector> &Events) { + std::vector> NodeList{}; + NodeList.reserve(Events.size()); ReadLock Lock(MMutex); - for (size_t i = 0; i < Events.size(); i++) { - if (auto NodeFound = MEventsMap.find(Events[i]); + for (const auto &Event : Events) { + if (auto NodeFound = MEventsMap.find(Event); NodeFound != std::end(MEventsMap)) { - NodeList[i] = NodeFound->second; + NodeList.push_back(NodeFound->second); } else { throw sycl::exception( sycl::make_error_code(errc::invalid), @@ -1221,6 +1222,7 @@ class graph_impl : public std::enable_shared_from_this { /// this graph. size_t getExecGraphCount() const { return MExecGraphCount; } + /// Resets the visited edges variable across all nodes in the graph to 0. void resetNodeVisitedEdges() { for (auto &Node : MNodeStorage) { Node->MTotalVisitedEdges = 0; diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index 623e122351c3a..8b1b1af51ee11 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -443,7 +443,7 @@ sycl::detail::optional queue::ext_oneapi_get_last_event_impl() const { // an event at end. If the event comes from a graph we must skip this because // the original event is used for tracking nodes in the graph. 
auto LastEventImpl = detail::getSyclObjImpl(*LastEvent); - if (!LastEventImpl->getCommandGraph() && + if (!LastEventImpl->hasCommandGraph() && (LastEventImpl->isDiscarded() || LastEventImpl->isNOP())) LastEvent = detail::createSyclObjFromImpl(impl->insertMarkerEvent(impl)); From d01c9da57219ee8dc848973973fa2601ad44b32b Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Mon, 12 May 2025 12:44:47 +0100 Subject: [PATCH 3/5] Addressing review comments - Fix wording in design doc - Remove code duplication is async_alloc calls - Early return from tryReuseExistingAllocation() --- sycl/doc/design/CommandGraph.md | 16 +++--- sycl/source/detail/async_alloc.cpp | 62 ++++++++++++------------ sycl/source/detail/graph_memory_pool.cpp | 18 +++++-- 3 files changed, 51 insertions(+), 45 deletions(-) diff --git a/sycl/doc/design/CommandGraph.md b/sycl/doc/design/CommandGraph.md index c02c73a66d2bb..776016c22e52a 100644 --- a/sycl/doc/design/CommandGraph.md +++ b/sycl/doc/design/CommandGraph.md @@ -351,14 +351,14 @@ heuristic but rather expose this as a hint the user can provide. ### Graph Allocation Memory Reuse -Within a single graph memory allocations which have previously been freed are -checked when adding a new allocation to see if they can be reused. Because we -have to return a pointer to the user immediately when the CGF for a node is -processed when adding the node, we have to do this eagerly anytime -`async_malloc()` is called. Allocations track the last free node associated with -them to represent the most recent use of that allocation. - - To be reused the two allocations must meet these criteria: +When adding a new allocation node to a graph, memory allocations which have +previously been freed are checked to see if they can be reused. Because we have +to return a pointer to the user immediately when the CGF for a node is +processed, we have to do this eagerly anytime `async_malloc()` is called. +Allocations track the last free node associated with them to represent the most +recent use of that allocation. + + To be reused, the two allocations must meet these criteria: - They must be of the same allocation type (device/host/shared). - They must have a matching size. diff --git a/sycl/source/detail/async_alloc.cpp b/sycl/source/detail/async_alloc.cpp index b3be1e4ac6c78..1e530f0b01cc0 100644 --- a/sycl/source/detail/async_alloc.cpp +++ b/sycl/source/detail/async_alloc.cpp @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -29,6 +30,27 @@ getUrEvents(const std::vector> &DepEvents) { } return RetUrEvents; } + +std::vector> getDepGraphNodes( + const std::shared_ptr &Handler, + const std::shared_ptr &Queue, + const std::shared_ptr &Graph, + const std::vector> &DepEvents) { + // Get dependent graph nodes from any events + auto DepNodes = Graph->getNodesForEvents(DepEvents); + // If this node was added explicitly we may have node deps in the handler as + // well, so add them to the list + DepNodes.insert(DepNodes.end(), Handler->MNodeDeps.begin(), + Handler->MNodeDeps.end()); + // If this is being recorded from an in-order queue we need to get the last + // in-order node if any, since this will later become a dependency of the + // node being processed here. + if (const auto &LastInOrderNode = Graph->getLastInorderNode(Queue); + LastInOrderNode) { + DepNodes.push_back(LastInOrderNode); + } + return DepNodes; +} } // namespace __SYCL_EXPORT @@ -49,34 +71,21 @@ void *async_malloc(sycl::handler &h, sycl::usm::alloc kind, size_t size) { // Get CG event dependencies for this allocation. 
const auto &DepEvents = h.impl->CGData.MEvents; auto UREvents = getUrEvents(DepEvents); - uint32_t numEvents = DepEvents.size(); void *alloc = nullptr; ur_event_handle_t Event = nullptr; // If a graph is present do the allocation from the graph memory pool instead. if (auto Graph = h.getCommandGraph(); Graph) { - // Get dependent graph nodes from any events - auto DepNodes = Graph->getNodesForEvents(DepEvents); - // If this node was added explicitly we may have node deps in the handler as - // well, so add them to the list - DepNodes.insert(DepNodes.end(), h.impl->MNodeDeps.begin(), - h.impl->MNodeDeps.end()); - // If this is being recorded from an in-order queue we need to get the last - // in-order node if any, since this will later become a dependency of the - // node being processed here. - if (const auto &LastInOrderNode = Graph->getLastInorderNode(h.MQueue); - LastInOrderNode) { - DepNodes.push_back(LastInOrderNode); - } - + auto DepNodes = getDepGraphNodes(sycl::detail::getSyclObjImpl(h), h.MQueue, + Graph, DepEvents); alloc = Graph->getMemPool().malloc(size, kind, DepNodes); } else { auto &Q = h.MQueue->getHandleRef(); Adapter->call( - Q, (ur_usm_pool_handle_t)0, size, nullptr, numEvents, UREvents.data(), - &alloc, &Event); + Q, (ur_usm_pool_handle_t)0, size, nullptr, UREvents.size(), + UREvents.data(), &alloc, &Event); } // Async malloc must return a void* immediately. @@ -113,26 +122,15 @@ __SYCL_EXPORT void *async_malloc_from_pool(sycl::handler &h, size_t size, // Get CG event dependencies for this allocation. const auto &DepEvents = h.impl->CGData.MEvents; auto UREvents = getUrEvents(DepEvents); - uint32_t numEvents = DepEvents.size(); void *alloc = nullptr; ur_event_handle_t Event = nullptr; // If a graph is present do the allocation from the graph memory pool instead. if (auto Graph = h.getCommandGraph(); Graph) { - // Get dependent graph nodes from any events - auto DepNodes = Graph->getNodesForEvents(DepEvents); - // If this node was added explicitly we may have node deps in the handler as - // well, so add them to the list - DepNodes.insert(DepNodes.end(), h.impl->MNodeDeps.begin(), - h.impl->MNodeDeps.end()); - // If this is being recorded from an in-order queue we need to get the last - // in-order node if any, since this will later become a dependency of the - // node being processed here. - if (const auto &LastInOrderNode = Graph->getLastInorderNode(h.MQueue); - LastInOrderNode) { - DepNodes.push_back(LastInOrderNode); - } + auto DepNodes = getDepGraphNodes(sycl::detail::getSyclObjImpl(h), h.MQueue, + Graph, DepEvents); + // Memory pool is passed as the graph may use some properties of it. alloc = Graph->getMemPool().malloc(size, pool.get_alloc_kind(), DepNodes, sycl::detail::getSyclObjImpl(pool)); @@ -140,7 +138,7 @@ __SYCL_EXPORT void *async_malloc_from_pool(sycl::handler &h, size_t size, auto &Q = h.MQueue->getHandleRef(); Adapter->call( - Q, memPoolImpl.get()->get_handle(), size, nullptr, numEvents, + Q, memPoolImpl.get()->get_handle(), size, nullptr, UREvents.size(), UREvents.data(), &alloc, &Event); } // Async malloc must return a void* immediately. 
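
[Illustrative aside, not part of the patch] The dependency gathering above exists so that a recorded allocation node can reach an earlier free node through its predecessors and reuse that memory. Below is a minimal record-and-replay sketch of the user-facing pattern this code path serves; the function name, the 1024-byte size and the exact extension header path are assumptions for illustration, not taken from this change.

#include <sycl/sycl.hpp>
// Header path assumed for the async_alloc extension entry points.
#include <sycl/ext/oneapi/experimental/async_alloc/async_alloc.hpp>

namespace exp_ext = sycl::ext::oneapi::experimental;

void record_reuse_sketch() {
  // In-order queue so each recorded node depends on the previous one.
  sycl::queue Q{{sycl::property::queue::in_order{}}};
  exp_ext::command_graph Graph{Q.get_context(), Q.get_device()};
  Graph.begin_recording(Q);

  void *PtrA = nullptr;
  // Allocation node: the pointer must be returned eagerly while recording.
  Q.submit([&](sycl::handler &CGH) {
    PtrA = exp_ext::async_malloc(CGH, sycl::usm::alloc::device, 1024);
  });
  // Free node: marks the allocation as a reuse candidate within this graph.
  Q.submit([&](sycl::handler &CGH) { exp_ext::async_free(CGH, PtrA); });

  void *PtrB = nullptr;
  // Same kind and size, and this node (transitively) depends on the free
  // node above, so the graph memory pool may return the same address.
  Q.submit([&](sycl::handler &CGH) {
    PtrB = exp_ext::async_malloc(CGH, sycl::usm::alloc::device, 1024);
  });
  Q.submit([&](sycl::handler &CGH) { exp_ext::async_free(CGH, PtrB); });

  Graph.end_recording(Q);
  auto Exec = Graph.finalize();
  Q.submit([&](sycl::handler &CGH) { CGH.ext_oneapi_graph(Exec); }).wait();
}

With an out-of-order queue the same effect would need explicit event or node dependencies between the free and the later allocation; without a path to a free node the pool has nothing to reuse, which is what the early exit in tryReuseExistingAllocation below covers.
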
diff --git a/sycl/source/detail/graph_memory_pool.cpp b/sycl/source/detail/graph_memory_pool.cpp index 56d813e6294e7..bda278c88780f 100644 --- a/sycl/source/detail/graph_memory_pool.cpp +++ b/sycl/source/detail/graph_memory_pool.cpp @@ -94,7 +94,7 @@ graph_mem_pool::tryReuseExistingAllocation( CompatibleAllocs.reserve(MFreeAllocations.size()); // Loop over free allocation list, search for ones that are compatible for - // reuse Currently that means they have the same alloc kind, size and read + // reuse. Currently that means they have the same alloc kind, size and read // only property. for (auto &Ptr : MFreeAllocations) { @@ -106,6 +106,11 @@ graph_mem_pool::tryReuseExistingAllocation( } } + // If we have no suitable allocs to reuse, return early + if (CompatibleAllocs.size() == 0) { + return std::nullopt; + } + // Traverse graph back from each DepNode to try and find any of the suitable // free nodes. We do this in a breadth-first approach because we want to find // the shortest path to a reusable allocation. @@ -119,7 +124,9 @@ graph_mem_pool::tryReuseExistingAllocation( std::optional AllocInfo = {}; - // Called when traversing over nodes to check if the current node + // Called when traversing over nodes to check if the current node is a free + // node for one of the available allocations. If it is we populate AllocInfo + // with the allocation to be reused. auto CheckNodeEqual = [&CompatibleAllocs, &AllocInfo](const std::shared_ptr &CurrentNode) -> bool { @@ -145,9 +152,10 @@ graph_mem_pool::tryReuseExistingAllocation( continue; } - // Check if the node is a free node, and if so is a free node for any of the - // allocations which are free for reuse. We should not bother checking nodes - // that are free nodes, so we continue and check their predecessors. + // Check if the node is a free node and, if so, check if it is a free node + // for any of the allocations which are free for reuse. We should not bother + // checking nodes that are not free nodes, so we continue and check their + // predecessors. if (CurrentNode->MNodeType == node_type::async_free && CheckNodeEqual(CurrentNode)) { // If we found an allocation AllocInfo has already been populated in From 32a7a35a6e95d090bb6bddf3676eb4b9718c1299 Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Thu, 15 May 2025 16:04:20 +0100 Subject: [PATCH 4/5] Add early exit from tryReuseExistingAllocation if no dep nodes - No dep nodes means that we cannot be connected to any free nodes for reuse --- sycl/source/detail/graph_memory_pool.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sycl/source/detail/graph_memory_pool.cpp b/sycl/source/detail/graph_memory_pool.cpp index bda278c88780f..a036efd3a287c 100644 --- a/sycl/source/detail/graph_memory_pool.cpp +++ b/sycl/source/detail/graph_memory_pool.cpp @@ -88,6 +88,11 @@ std::optional graph_mem_pool::tryReuseExistingAllocation( size_t Size, usm::alloc AllocType, bool ReadOnly, const std::vector> &DepNodes) { + // If we have no dependencies this is a no-op because allocations must connect + // to a free node for reuse to be possible. 
+  if (DepNodes.empty()) {
+    return std::nullopt;
+  }
   std::vector CompatibleAllocs;
   // Compatible allocs can only be as big as MFreeAllocations

From 753186e05d32cc7ba84d767c7d3eebc53a4a5306 Mon Sep 17 00:00:00 2001
From: Ben Tracy 
Date: Thu, 15 May 2025 17:13:44 +0100
Subject: [PATCH 5/5] Adjust getDepGraphNodes for handler changes

---
 sycl/source/detail/async_alloc.cpp | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/sycl/source/detail/async_alloc.cpp b/sycl/source/detail/async_alloc.cpp
index 1e530f0b01cc0..46d2f3f5589bf 100644
--- a/sycl/source/detail/async_alloc.cpp
+++ b/sycl/source/detail/async_alloc.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "sycl/accessor.hpp"
 #include 
 #include 
 #include 
@@ -32,16 +33,16 @@ getUrEvents(const std::vector> &DepEvents) {
 }
 
 std::vector> getDepGraphNodes(
-    const std::shared_ptr &Handler,
-    const std::shared_ptr &Queue,
+    sycl::handler &Handler, const std::shared_ptr &Queue,
     const std::shared_ptr &Graph,
     const std::vector> &DepEvents) {
+  auto HandlerImpl = detail::getSyclObjImpl(Handler);
   // Get dependent graph nodes from any events
   auto DepNodes = Graph->getNodesForEvents(DepEvents);
   // If this node was added explicitly we may have node deps in the handler as
   // well, so add them to the list
-  DepNodes.insert(DepNodes.end(), Handler->MNodeDeps.begin(),
-                  Handler->MNodeDeps.end());
+  DepNodes.insert(DepNodes.end(), HandlerImpl->MNodeDeps.begin(),
+                  HandlerImpl->MNodeDeps.end());
   // If this is being recorded from an in-order queue we need to get the last
   // in-order node if any, since this will later become a dependency of the
   // node being processed here.
@@ -77,8 +78,7 @@ void *async_malloc(sycl::handler &h, sycl::usm::alloc kind, size_t size) {
   ur_event_handle_t Event = nullptr;
   // If a graph is present do the allocation from the graph memory pool instead.
   if (auto Graph = h.getCommandGraph(); Graph) {
-    auto DepNodes = getDepGraphNodes(sycl::detail::getSyclObjImpl(h), h.MQueue,
-                                     Graph, DepEvents);
+    auto DepNodes = getDepGraphNodes(h, h.MQueue, Graph, DepEvents);
     alloc = Graph->getMemPool().malloc(size, kind, DepNodes);
   } else {
     auto &Q = h.MQueue->getHandleRef();
@@ -128,8 +128,7 @@ __SYCL_EXPORT void *async_malloc_from_pool(sycl::handler &h, size_t size,
   ur_event_handle_t Event = nullptr;
   // If a graph is present do the allocation from the graph memory pool instead.
   if (auto Graph = h.getCommandGraph(); Graph) {
-    auto DepNodes = getDepGraphNodes(sycl::detail::getSyclObjImpl(h), h.MQueue,
-                                     Graph, DepEvents);
+    auto DepNodes = getDepGraphNodes(h, h.MQueue, Graph, DepEvents);
     // Memory pool is passed as the graph may use some properties of it.
 
     alloc = Graph->getMemPool().malloc(size, pool.get_alloc_kind(), DepNodes,