diff --git a/sycl/doc/design/CommandGraph.md b/sycl/doc/design/CommandGraph.md index 956082b31097f..776016c22e52a 100644 --- a/sycl/doc/design/CommandGraph.md +++ b/sycl/doc/design/CommandGraph.md @@ -349,6 +349,26 @@ safely assumed to be more performant. It is not likely we'll try to allow in-order execution in more scenarios through a complicated (and imperfect) heuristic but rather expose this as a hint the user can provide. +### Graph Allocation Memory Reuse + +When adding a new allocation node to a graph, memory allocations which have +previously been freed are checked to see if they can be reused. Because we have +to return a pointer to the user immediately when the CGF for a node is +processed, we have to do this eagerly anytime `async_malloc()` is called. +Allocations track the last free node associated with them to represent the most +recent use of that allocation. + + To be reused, the two allocations must meet these criteria: + +- They must be of the same allocation type (device/host/shared). +- They must have a matching size. +- They must have the same properties (currently only read-only matters). +- There must be a path from the last free node associated with a given + allocation to the new allocation node being added. + +If these criteria are met we update the last free node for the allocation and +return the existing pointer to the user. + ## Backend Implementation Implementation of UR command-buffers for each of the supported SYCL 2020 diff --git a/sycl/source/CMakeLists.txt b/sycl/source/CMakeLists.txt index e3b62fcea33c6..8a947c90a91fc 100644 --- a/sycl/source/CMakeLists.txt +++ b/sycl/source/CMakeLists.txt @@ -323,6 +323,7 @@ set(SYCL_COMMON_SOURCES "detail/memory_pool_impl.cpp" "detail/async_alloc.cpp" "detail/memory_pool.cpp" + "detail/graph_memory_pool.cpp" "$<$:detail/windows_ur.cpp>" "$<$,$>:detail/posix_ur.cpp>" ) diff --git a/sycl/source/detail/async_alloc.cpp b/sycl/source/detail/async_alloc.cpp index b21f6d13c42e3..46d2f3f5589bf 100644 --- a/sycl/source/detail/async_alloc.cpp +++ b/sycl/source/detail/async_alloc.cpp @@ -6,8 +6,10 @@ // //===----------------------------------------------------------------------===// +#include "sycl/accessor.hpp" #include #include +#include #include #include #include @@ -29,6 +31,27 @@ getUrEvents(const std::vector> &DepEvents) { } return RetUrEvents; } + +std::vector> getDepGraphNodes( + sycl::handler &Handler, const std::shared_ptr &Queue, + const std::shared_ptr &Graph, + const std::vector> &DepEvents) { + auto HandlerImpl = detail::getSyclObjImpl(Handler); + // Get dependent graph nodes from any events + auto DepNodes = Graph->getNodesForEvents(DepEvents); + // If this node was added explicitly we may have node deps in the handler as + // well, so add them to the list + DepNodes.insert(DepNodes.end(), HandlerImpl->MNodeDeps.begin(), + HandlerImpl->MNodeDeps.end()); + // If this is being recorded from an in-order queue we need to get the last + // in-order node if any, since this will later become a dependency of the + // node being processed here. + if (const auto &LastInOrderNode = Graph->getLastInorderNode(Queue); + LastInOrderNode) { + DepNodes.push_back(LastInOrderNode); + } + return DepNodes; +} } // namespace __SYCL_EXPORT @@ -46,22 +69,23 @@ void *async_malloc(sycl::handler &h, sycl::usm::alloc kind, size_t size) { auto &Adapter = h.getContextImplPtr()->getAdapter(); - // Get events to wait on. 
- auto depEvents = getUrEvents(h.impl->CGData.MEvents); - uint32_t numEvents = h.impl->CGData.MEvents.size(); + // Get CG event dependencies for this allocation. + const auto &DepEvents = h.impl->CGData.MEvents; + auto UREvents = getUrEvents(DepEvents); void *alloc = nullptr; ur_event_handle_t Event = nullptr; // If a graph is present do the allocation from the graph memory pool instead. if (auto Graph = h.getCommandGraph(); Graph) { - alloc = Graph->getMemPool().malloc(size, kind); + auto DepNodes = getDepGraphNodes(h, h.MQueue, Graph, DepEvents); + alloc = Graph->getMemPool().malloc(size, kind, DepNodes); } else { auto &Q = h.MQueue->getHandleRef(); Adapter->call( - Q, (ur_usm_pool_handle_t)0, size, nullptr, numEvents, depEvents.data(), - &alloc, &Event); + Q, (ur_usm_pool_handle_t)0, size, nullptr, UREvents.size(), + UREvents.data(), &alloc, &Event); } // Async malloc must return a void* immediately. @@ -95,24 +119,26 @@ __SYCL_EXPORT void *async_malloc_from_pool(sycl::handler &h, size_t size, auto &Adapter = h.getContextImplPtr()->getAdapter(); auto &memPoolImpl = sycl::detail::getSyclObjImpl(pool); - // Get events to wait on. - auto depEvents = getUrEvents(h.impl->CGData.MEvents); - uint32_t numEvents = h.impl->CGData.MEvents.size(); + // Get CG event dependencies for this allocation. + const auto &DepEvents = h.impl->CGData.MEvents; + auto UREvents = getUrEvents(DepEvents); void *alloc = nullptr; ur_event_handle_t Event = nullptr; // If a graph is present do the allocation from the graph memory pool instead. if (auto Graph = h.getCommandGraph(); Graph) { + auto DepNodes = getDepGraphNodes(h, h.MQueue, Graph, DepEvents); + // Memory pool is passed as the graph may use some properties of it. - alloc = Graph->getMemPool().malloc(size, pool.get_alloc_kind(), + alloc = Graph->getMemPool().malloc(size, pool.get_alloc_kind(), DepNodes, sycl::detail::getSyclObjImpl(pool)); } else { auto &Q = h.MQueue->getHandleRef(); Adapter->call( - Q, memPoolImpl.get()->get_handle(), size, nullptr, numEvents, - depEvents.data(), &alloc, &Event); + Q, memPoolImpl.get()->get_handle(), size, nullptr, UREvents.size(), + UREvents.data(), &alloc, &Event); } // Async malloc must return a void* immediately. // Set up CommandGroup which is a no-op and pass the event from the alloc. @@ -140,6 +166,9 @@ async_malloc_from_pool(const sycl::queue &q, size_t size, } __SYCL_EXPORT void async_free(sycl::handler &h, void *ptr) { + // We only check for errors for the graph here because marking the allocation + // as free in the graph memory pool requires a node object which doesn't exist + // at this point. 
if (auto Graph = h.getCommandGraph(); Graph) { // Check if the pointer to be freed has an associated allocation node, and // error if not diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 931816551d95e..8cbf89ea8987a 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -344,7 +344,8 @@ graph_impl::graph_impl(const sycl::context &SyclContext, const sycl::device &SyclDevice, const sycl::property_list &PropList) : MContext(SyclContext), MDevice(SyclDevice), MRecordingQueues(), - MEventsMap(), MInorderQueueMap(), MGraphMemPool(SyclContext, SyclDevice), + MEventsMap(), MInorderQueueMap(), + MGraphMemPool(*this, SyclContext, SyclDevice), MID(NextAvailableID.fetch_add(1, std::memory_order_relaxed)) { checkGraphPropertiesAndThrow(PropList); if (PropList.has_property()) { @@ -509,6 +510,10 @@ graph_impl::add(std::function CGF, sycl::handler Handler{shared_from_this()}; #endif + // Pass the node deps to the handler so they are available when processing the + // CGF, need for async_malloc nodes. + Handler.impl->MNodeDeps = Deps; + #if XPTI_ENABLE_INSTRUMENTATION // Save code location if one was set in TLS. // Ideally it would be nice to capture user's call code location @@ -532,6 +537,10 @@ graph_impl::add(std::function CGF, Handler.finalize(); + // In explicit mode the handler processing of the CGF does not need a write + // lock as it does not modify the graph, we extract information from it here + // and modify the graph. + graph_impl::WriteLock Lock(MMutex); node_type NodeType = Handler.impl->MUserFacingNodeType != ext::oneapi::experimental::node_type::empty @@ -602,6 +611,14 @@ graph_impl::add(node_type NodeType, addDepsToNode(NodeImpl, Deps); + if (NodeType == node_type::async_free) { + auto AsyncFreeCG = + static_cast(NodeImpl->MCommandGroup.get()); + // If this is an async free node mark that it is now available for reuse, + // and pass the async free node for tracking. + MGraphMemPool.markAllocationAsAvailable(AsyncFreeCG->getPtr(), NodeImpl); + } + return NodeImpl; } @@ -1789,7 +1806,6 @@ node modifiable_command_graph::addImpl(std::function CGF, DepImpls.push_back(sycl::detail::getSyclObjImpl(D)); } - graph_impl::WriteLock Lock(impl->MMutex); std::shared_ptr NodeImpl = impl->add(CGF, {}, DepImpls); return sycl::detail::createSyclObjFromImpl(std::move(NodeImpl)); } diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 1600b76f7b991..83dfa3af62f2f 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -735,6 +735,12 @@ class node_impl : public std::enable_shared_from_this { case sycl::detail::CGType::EnqueueNativeCommand: Stream << "CGNativeCommand \\n"; break; + case sycl::detail::CGType::AsyncAlloc: + Stream << "CGAsyncAlloc \\n"; + break; + case sycl::detail::CGType::AsyncFree: + Stream << "CGAsyncFree \\n"; + break; default: Stream << "Other \\n"; break; @@ -937,6 +943,31 @@ class graph_impl : public std::enable_shared_from_this { "No node in this graph is associated with this event"); } + /// Find the nodes associated with a list of SYCL events. Throws if no node is + /// found for a given event. + /// @param Events Events to find nodes for. + /// @return A list of node counterparts for each event, in the same order. 
+ std::vector> getNodesForEvents( + const std::vector> &Events) { + std::vector> NodeList{}; + NodeList.reserve(Events.size()); + + ReadLock Lock(MMutex); + + for (const auto &Event : Events) { + if (auto NodeFound = MEventsMap.find(Event); + NodeFound != std::end(MEventsMap)) { + NodeList.push_back(NodeFound->second); + } else { + throw sycl::exception( + sycl::make_error_code(errc::invalid), + "No node in this graph is associated with this event"); + } + } + + return NodeList; + } + /// Query for the context tied to this graph. /// @return Context associated with graph. sycl::context getContext() const { return MContext; } @@ -1191,6 +1222,13 @@ class graph_impl : public std::enable_shared_from_this { /// this graph. size_t getExecGraphCount() const { return MExecGraphCount; } + /// Resets the visited edges variable across all nodes in the graph to 0. + void resetNodeVisitedEdges() { + for (auto &Node : MNodeStorage) { + Node->MTotalVisitedEdges = 0; + } + } + private: /// Check the graph for cycles by performing a depth-first search of the /// graph. If a node is visited more than once in a given path through the diff --git a/sycl/source/detail/graph_memory_pool.cpp b/sycl/source/detail/graph_memory_pool.cpp new file mode 100644 index 0000000000000..a036efd3a287c --- /dev/null +++ b/sycl/source/detail/graph_memory_pool.cpp @@ -0,0 +1,202 @@ +//==--------- graph_memory_pool.cpp --- SYCL graph extension ---------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "graph_memory_pool.hpp" + +#include +#include + +#include "graph_impl.hpp" + +namespace sycl { +inline namespace _V1 { +namespace ext { +namespace oneapi { +namespace experimental { +namespace detail { + +void * +graph_mem_pool::malloc(size_t Size, usm::alloc AllocType, + const std::vector> &DepNodes, + const std::shared_ptr &MemPool) { + // We are potentially modifying contents of this memory pool and the owning + // graph, so take a lock here. 
+  graph_impl::WriteLock Lock(MGraph.MMutex);
+
+  void *Alloc = nullptr;
+  alloc_info AllocInfo = {};
+  AllocInfo.Kind = AllocType;
+  // Collect relevant properties from memory pool
+  if (MemPool) {
+    auto PropList = MemPool->getPropList();
+    if (PropList.has_property()) {
+      AllocInfo.ZeroInit = true;
+    }
+    if (PropList.has_property()) {
+      AllocInfo.ReadOnly = true;
+    }
+  }
+
+  switch (AllocType) {
+  case usm::alloc::device: {
+
+    auto &CtxImpl = sycl::detail::getSyclObjImpl(MContext);
+    auto &Adapter = CtxImpl->getAdapter();
+
+    size_t Granularity = get_mem_granularity(MDevice, MContext);
+    uintptr_t StartPtr = 0;
+    size_t AlignedSize = alignByteSize(Size, Granularity);
+    // See if we can find an allocation to reuse
+    auto AllocOpt = tryReuseExistingAllocation(AlignedSize, AllocType,
+                                               AllocInfo.ReadOnly, DepNodes);
+
+    // If we got a value back then an allocation was available for reuse and we
+    // can just return that pointer
+    if (AllocOpt) {
+      return AllocOpt.value().Ptr;
+    }
+
+    // If no allocation could be reused, do a new virtual reservation
+    Adapter->call(
+        CtxImpl->getHandleRef(), reinterpret_cast(StartPtr),
+        AlignedSize, &Alloc);
+
+    AllocInfo.Size = AlignedSize;
+    AllocInfo.Ptr = Alloc;
+
+    MAllocations[Alloc] = AllocInfo;
+    break;
+  }
+
+  default:
+    throw sycl::exception(sycl::make_error_code(errc::feature_not_supported),
+                          "Only device allocations are currently supported "
+                          "in graph allocation nodes!");
+    break;
+  }
+
+  return Alloc;
+}
+
+std::optional<graph_mem_pool::alloc_info>
+graph_mem_pool::tryReuseExistingAllocation(
+    size_t Size, usm::alloc AllocType, bool ReadOnly,
+    const std::vector<std::shared_ptr<node_impl>> &DepNodes) {
+  // If we have no dependencies, no reuse is possible: a reusable allocation
+  // must be reachable through its last free node, so return early.
+  if (DepNodes.empty()) {
+    return std::nullopt;
+  }
+
+  std::vector<alloc_info> CompatibleAllocs;
+  // Compatible allocs can only be as big as MFreeAllocations
+  CompatibleAllocs.reserve(MFreeAllocations.size());
+
+  // Loop over the free allocation list, searching for ones that are compatible
+  // for reuse. Currently that means they have the same alloc kind, size and
+  // read-only property.
+
+  for (auto &Ptr : MFreeAllocations) {
+    alloc_info &Info = MAllocations.at(Ptr);
+    if (Info.Kind == AllocType && Info.Size == Size &&
+        Info.ReadOnly == ReadOnly) {
+      // Store the alloc info since it is compatible
+      CompatibleAllocs.push_back(Info);
+    }
+  }
+
+  // If we have no suitable allocs to reuse, return early
+  if (CompatibleAllocs.size() == 0) {
+    return std::nullopt;
+  }
+
+  // Traverse the graph back from each DepNode to try and find any of the
+  // suitable free nodes. We do this breadth-first because we want to find the
+  // shortest path to a reusable allocation.
+
+  std::queue<std::weak_ptr<node_impl>> NodesToCheck;
+
+  // Add all the dependent nodes to the queue, they will be popped first
+  for (auto &Dep : DepNodes) {
+    NodesToCheck.push(Dep);
+  }
+
+  std::optional<alloc_info> AllocInfo = {};
+
+  // Called when traversing over nodes to check if the current node is a free
+  // node for one of the available allocations. If it is we populate AllocInfo
+  // with the allocation to be reused.
+ auto CheckNodeEqual = + [&CompatibleAllocs, + &AllocInfo](const std::shared_ptr &CurrentNode) -> bool { + for (auto &Alloc : CompatibleAllocs) { + const auto &AllocFreeNode = Alloc.LastFreeNode; + // Compare control blocks without having to lock AllocFreeNode to check + // for node equality + if (!CurrentNode.owner_before(AllocFreeNode) && + !AllocFreeNode.owner_before(CurrentNode)) { + Alloc.LastFreeNode.reset(); + AllocInfo = Alloc; + return true; + } + } + return false; + }; + + while (!NodesToCheck.empty()) { + auto CurrentNode = NodesToCheck.front().lock(); + NodesToCheck.pop(); + + if (CurrentNode->MTotalVisitedEdges > 0) { + continue; + } + + // Check if the node is a free node and, if so, check if it is a free node + // for any of the allocations which are free for reuse. We should not bother + // checking nodes that are not free nodes, so we continue and check their + // predecessors. + if (CurrentNode->MNodeType == node_type::async_free && + CheckNodeEqual(CurrentNode)) { + // If we found an allocation AllocInfo has already been populated in + // CheckNodeEqual(), so we simply break out of the loop + break; + } + + // Add CurrentNode predecessors to queue + for (auto &Pred : CurrentNode->MPredecessors) { + NodesToCheck.push(Pred); + } + + // Mark node as visited + CurrentNode->MTotalVisitedEdges = 1; + } + // Reset visited nodes tracking + MGraph.resetNodeVisitedEdges(); + // If we found an allocation, remove it from the free list. + if (AllocInfo) { + MFreeAllocations.erase(std::find(MFreeAllocations.begin(), + MFreeAllocations.end(), + AllocInfo.value().Ptr)); + } + + return AllocInfo; +} + +void graph_mem_pool::markAllocationAsAvailable( + void *Ptr, const std::shared_ptr &FreeNode) { + MFreeAllocations.push_back(Ptr); + MAllocations.at(Ptr).LastFreeNode = FreeNode; +} + +} // namespace detail +} // namespace experimental +} // namespace oneapi +} // namespace ext +} // namespace _V1 +} // namespace sycl diff --git a/sycl/source/detail/graph_memory_pool.hpp b/sycl/source/detail/graph_memory_pool.hpp index e9133bb7f7d0c..6da1b14526b12 100644 --- a/sycl/source/detail/graph_memory_pool.hpp +++ b/sycl/source/detail/graph_memory_pool.hpp @@ -20,15 +20,20 @@ namespace oneapi { namespace experimental { namespace detail { +// Forward declarations +class node_impl; + /// Class handling graph-owned memory allocations. Device allocations are /// managed using virtual memory. 
class graph_mem_pool { // Info descriptor for various properties of an allocation struct alloc_info { + // Pointer to the start of the allocation + void *Ptr = nullptr; // Allocation kind - usm::alloc Kind; + usm::alloc Kind = usm::alloc::unknown; // Size of the allocation - size_t Size; + size_t Size = 0; // Is currently mapped to physical memory bool Mapped = false; // Index into the array of physical memory @@ -37,11 +42,14 @@ class graph_mem_pool { bool ReadOnly = false; // Should the allocation be zero initialized during initial allocation bool ZeroInit = false; + // Last free node for this allocation in the graph + std::weak_ptr LastFreeNode = {}; }; public: - graph_mem_pool(const context &Context, const device &Device) - : MContext(Context), MDevice(Device) {} + graph_mem_pool(graph_impl &GraphImpl, const context &Context, + const device &Device) + : MGraph(GraphImpl), MContext(Context), MDevice(Device) {} ~graph_mem_pool() { for (auto &[Ptr, AllocInfo] : MAllocations) { // Unmap allocations if required before physical memory is released @@ -64,55 +72,14 @@ class graph_mem_pool { /// memory before use by calling allocateAndMapAll() /// @param Size Size of the allocation /// @param AllocType Type of the allocation + /// @param DepNodes Any node dependencies for this allocation, used to + /// identify allocations that can be reused. /// @param MemPool Optional memory pool from which allocations will not be /// made directly but properties may be respected. /// @return A pointer to the start of the allocation void *malloc(size_t Size, usm::alloc AllocType, - const std::shared_ptr &MemPool = nullptr) { - void *Alloc = nullptr; - switch (AllocType) { - case usm::alloc::device: { - - auto &CtxImpl = sycl::detail::getSyclObjImpl(MContext); - auto &Adapter = CtxImpl->getAdapter(); - - size_t Granularity = get_mem_granularity(MDevice, MContext); - uintptr_t StartPtr = 0; - size_t AlignedSize = alignByteSize(Size, Granularity); - // Do virtual reservation - Adapter->call( - CtxImpl->getHandleRef(), reinterpret_cast(StartPtr), - AlignedSize, &Alloc); - - alloc_info AllocInfo = {}; - AllocInfo.Kind = AllocType; - AllocInfo.Size = AlignedSize; - - // Collect relevant properties from memory pool - if (MemPool) { - auto PropList = MemPool->getPropList(); - if (PropList.has_property()) { - AllocInfo.ZeroInit = true; - } - if (PropList.has_property()) { - AllocInfo.ReadOnly = true; - } - } - - MAllocations[Alloc] = AllocInfo; - break; - } - - default: - throw sycl::exception(sycl::make_error_code(errc::feature_not_supported), - "Only device allocations are currently supported " - "in graph allocation nodes!"); - break; - } - - return Alloc; - } + const std::vector> &DepNodes, + const std::shared_ptr &MemPool = nullptr); /// Return the total amount of memory being used by this pool size_t getMemUseCurrent() const { @@ -184,7 +151,27 @@ class graph_mem_pool { return MAllocations.find(Ptr) != MAllocations.end(); } + /// Mark that this allocation has been freed and is available for reuse. + /// @param Ptr The pointer to the allocation. + /// @param FreeNode The graph node of node_type::async_free which is freeing + /// the allocation. + void markAllocationAsAvailable(void *Ptr, + const std::shared_ptr &FreeNode); + private: + /// Tries to reuse an existing allocation which has been marked free in the + /// graph. + /// @param Size Size of the allocation. + /// @param AllocType USM type of the allocation. 
+ /// @param ReadOnly True if the allocation is read only + /// @param DepNodes Node dependencies of this allocation, used to search for + /// reusable allocations. + /// @returns An optional allocation info value, where a null value indicates + /// that no allocation could be reused. + std::optional tryReuseExistingAllocation( + size_t Size, usm::alloc AllocType, bool ReadOnly, + const std::vector> &DepNodes); + /// Returns an aligned byte size given a required granularity /// @param UnalignedByteSize The original requested allocation size /// @param Granularity The required granularity for this allocation @@ -193,6 +180,8 @@ class graph_mem_pool { return ((UnalignedByteSize + Granularity - 1) / Granularity) * Granularity; } + /// Graph that owns this mem pool + graph_impl &MGraph; /// Context associated with allocations from this pool context MContext; /// Device associated with allocations from this pool @@ -201,6 +190,8 @@ class graph_mem_pool { std::unordered_map MAllocations; /// List of physical memory allocations used for virtual device reservations std::vector> MPhysicalMem; + /// List of pointers to allocations which are currently free for reuse + std::vector MFreeAllocations; }; } // namespace detail } // namespace experimental diff --git a/sycl/source/detail/handler_impl.hpp b/sycl/source/detail/handler_impl.hpp index 42b1991f153f5..7d87cc4abda7a 100644 --- a/sycl/source/detail/handler_impl.hpp +++ b/sycl/source/detail/handler_impl.hpp @@ -175,6 +175,9 @@ class handler_impl { std::shared_ptr MSubgraphNode; /// Storage for the CG created when handling graph nodes added explicitly. std::unique_ptr MGraphNodeCG; + /// Storage for node dependencies passed when adding a graph node explicitly + std::vector> + MNodeDeps; /// Storage for lambda/function when using HostTask std::shared_ptr MHostTask; diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index 983966937db62..79abe45c0bc07 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -461,9 +461,11 @@ sycl::detail::optional queue::ext_oneapi_get_last_event_impl() const { return std::nullopt; // If the last event was discarded or a NOP, we insert a marker to represent - // an event at end. + // an event at end. If the event comes from a graph we must skip this because + // the original event is used for tracking nodes in the graph. 
auto LastEventImpl = detail::getSyclObjImpl(*LastEvent); - if (LastEventImpl->isDiscarded() || LastEventImpl->isNOP()) + if (!LastEventImpl->hasCommandGraph() && + (LastEventImpl->isDiscarded() || LastEventImpl->isNOP())) LastEvent = detail::createSyclObjFromImpl(impl->insertMarkerEvent(impl)); return LastEvent; diff --git a/sycl/test-e2e/Graph/AsyncAlloc/Explicit/async_alloc_device_memory_reuse.cpp b/sycl/test-e2e/Graph/AsyncAlloc/Explicit/async_alloc_device_memory_reuse.cpp new file mode 100644 index 0000000000000..944e7449d5d05 --- /dev/null +++ b/sycl/test-e2e/Graph/AsyncAlloc/Explicit/async_alloc_device_memory_reuse.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/async_alloc_device_memory_reuse.cpp" diff --git a/sycl/test-e2e/Graph/AsyncAlloc/Explicit/async_alloc_device_memory_reuse_multiple.cpp b/sycl/test-e2e/Graph/AsyncAlloc/Explicit/async_alloc_device_memory_reuse_multiple.cpp new file mode 100644 index 0000000000000..bb093866ce70b --- /dev/null +++ b/sycl/test-e2e/Graph/AsyncAlloc/Explicit/async_alloc_device_memory_reuse_multiple.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/async_alloc_device_memory_reuse_multiple.cpp" diff --git a/sycl/test-e2e/Graph/AsyncAlloc/Explicit/async_alloc_device_memory_reuse_zero_init.cpp b/sycl/test-e2e/Graph/AsyncAlloc/Explicit/async_alloc_device_memory_reuse_zero_init.cpp new file mode 100644 index 0000000000000..c00dab11f52cb --- /dev/null +++ b/sycl/test-e2e/Graph/AsyncAlloc/Explicit/async_alloc_device_memory_reuse_zero_init.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/async_alloc_device_memory_reuse_zero_init.cpp" diff --git a/sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device_memory_reuse.cpp b/sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device_memory_reuse.cpp new file mode 100644 index 0000000000000..9887aa0bce9b8 --- /dev/null +++ b/sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device_memory_reuse.cpp @@ -0,0 +1,200 @@ +// Tests memory reuse behaviour of device graph 
allocations. +// NOTE: This test partially relies on knowing how the implementation works and +// that the contents of memory will persist when allocations are reused. This is +// useful for testing but is not an assumption that a user can or should make. + +#include "../../graph_common.hpp" +#include + +using T = int; +void add_nodes_to_graph( + exp_ext::command_graph &Graph, + queue &Queue, size_t Size, T *PtrInput) { + // Create 2 pointers for async allocations + T *AsyncPtrA = nullptr; + T *AsyncPtrB = nullptr; + // Add alloc nodes at the root of the graph for each allocation, this should + // result in three unique allocations + auto AllocNodeA = add_node(Graph, Queue, [&](handler &CGH) { + AsyncPtrA = static_cast( + exp_ext::async_malloc(CGH, usm::alloc::device, Size * sizeof(T))); + }); + auto AllocNodeB = add_node(Graph, Queue, [&](handler &CGH) { + AsyncPtrB = static_cast( + exp_ext::async_malloc(CGH, usm::alloc::device, Size * sizeof(T))); + }); + + // Assert that we have received unique ptr values, this should always be true + // regardless of implementation. + assert((AsyncPtrA != AsyncPtrB)); + + // Store pointer values for later comparison + void *FirstAsyncPtrA = AsyncPtrA; + void *FirstAsyncPtrB = AsyncPtrB; + // Add kernel that fills the async allocs with values + auto KernelFillPtrs = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {AllocNodeA, AllocNodeB}); + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + AsyncPtrA[LinID] = (1 + LinID); + AsyncPtrB[LinID] = (2 + LinID); + }); + }, + AllocNodeA, AllocNodeB); + // Free all the async allocations, making it possible to reuse them + auto FreeNodeA = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, KernelFillPtrs); + exp_ext::async_free(CGH, AsyncPtrA); + }, + KernelFillPtrs); + auto FreeNodeB = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, KernelFillPtrs); + exp_ext::async_free(CGH, AsyncPtrB); + }, + KernelFillPtrs); + + // Create 2 connected layers in the graph, in each layer we will do a new + // async_alloc which should reuse one of the 2 previously freed pointers. The + // other kernel in the layer will simply operate on the output ptr. + + // The first layer will have a direct dependency on a free node, but the + // subsequent layer will have an indirect dependency. We do not test the order + // in which allocations are picked for reuse, but we can assume both will be + // reused by the implementation (same size and properties so they are + // compatible). + + // First layer, allocation is added first + auto AllocNodeA1 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {FreeNodeA, FreeNodeB}); + AsyncPtrA = static_cast( + exp_ext::async_malloc(CGH, usm::alloc::device, Size * sizeof(T))); + }, + FreeNodeA, FreeNodeB); + + // Check that the new allocation matches one of the previously returned + // values. 
+ assert((AsyncPtrA == FirstAsyncPtrA) || (AsyncPtrA == FirstAsyncPtrB)); + // Increment output pointer + auto KernelInc1 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {FreeNodeA, FreeNodeB}); + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + PtrInput[LinID] += 1; + }); + }, + FreeNodeA, FreeNodeB); + + // Second layer, allocation has an indirect dependency on a free node + // Increment output pointer + auto KernelInc2 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {KernelInc1}); + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + PtrInput[LinID] += 1; + }); + }, + KernelInc1); + auto AllocNodeB2 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {KernelInc1, AllocNodeA1}); + AsyncPtrB = static_cast( + exp_ext::async_malloc(CGH, usm::alloc::device, Size * sizeof(T))); + }, + KernelInc1, AllocNodeA1); + // Check that the new allocation matches one of the previously returned + // values. + assert((AsyncPtrB == FirstAsyncPtrA) || (AsyncPtrB == FirstAsyncPtrB)); + + // Add a final kernel that adds the async allocation values to the output + // pointer + auto KernelAddToOutput = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {KernelInc2, AllocNodeB2}); + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + PtrInput[LinID] += (AsyncPtrA[LinID] + AsyncPtrB[LinID]); + }); + }, + KernelInc2, AllocNodeB2); + + // Free the allocations again + + add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, KernelAddToOutput); + exp_ext::async_free(CGH, AsyncPtrA); + }, + KernelAddToOutput); + add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, KernelAddToOutput); + exp_ext::async_free(CGH, AsyncPtrB); + }, + KernelAddToOutput); +} + +void calculate_reference_data(size_t Iterations, size_t Size, + std::vector &ReferenceOutput) { + for (size_t i = 0; i < Iterations; i++) { + for (size_t j = 0; j < Size; j++) { + ReferenceOutput[j] += 2; + ReferenceOutput[j] += (j + 1); + ReferenceOutput[j] += (j + 2); + } + } +} + +int main() { + queue Queue{}; + + std::vector DataInput(Size); + + std::iota(DataInput.begin(), DataInput.end(), 1); + + std::vector ReferenceOutput(DataInput); + calculate_reference_data(Iterations, Size, ReferenceOutput); + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + T *PtrInput = malloc_device(Size, Queue); + + std::vector OutputData(Size); + + Queue.copy(DataInput.data(), PtrInput, Size); + Queue.wait_and_throw(); + + // Add commands to graph + add_nodes_to_graph(Graph, Queue, Size, PtrInput); + + auto GraphExec = Graph.finalize(); + + for (unsigned n = 0; n < Iterations; n++) { + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + } + + Queue.wait_and_throw(); + Queue.copy(PtrInput, OutputData.data(), Size).wait_and_throw(); + for (size_t i = 0; i < Size; i++) { + assert(check_value(i, ReferenceOutput[i], OutputData[i], "OutputData")); + } + + free(PtrInput, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device_memory_reuse_multiple.cpp b/sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device_memory_reuse_multiple.cpp new file mode 100644 index 0000000000000..e791371544677 --- /dev/null +++ b/sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device_memory_reuse_multiple.cpp @@ -0,0 +1,163 @@ +// Tests memory reuse behaviour of 
device graph allocations when an allocation +// is reused multiple times. NOTE: This test partially relies on knowing how the +// implementation works and that the contents of memory will persist when +// allocations are reused. This is useful for testing but is not an assumption +// that a user can or should make. + +#include "../../graph_common.hpp" +#include + +using T = int; +void add_nodes_to_graph( + exp_ext::command_graph &Graph, + queue &Queue, size_t Size, T *PtrInput) { + // Create 1 pointers for async allocations + T *AsyncPtr = nullptr; + // Add alloc nodes at the root of the graph for each allocation, this should + // result in three unique allocations + auto AllocNode1 = add_node(Graph, Queue, [&](handler &CGH) { + AsyncPtr = static_cast( + exp_ext::async_malloc(CGH, usm::alloc::device, Size * sizeof(T))); + }); + + // Store pointer value for later comparison + void *FirstAsyncPtr = AsyncPtr; + + // Add kernel that fills the async alloc with values + auto KernelFillPtrs = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {AllocNode1}); + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + AsyncPtr[LinID] = (1 + LinID); + }); + }, + AllocNode1); + // Free all the async allocation, making it possible to reuse it + auto FreeNode1 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, KernelFillPtrs); + exp_ext::async_free(CGH, AsyncPtr); + }, + KernelFillPtrs); + + // Add an allocation node which should reuse the previous allocation, followed + // by a kernel that uses the data, and then freeing the allocation. + + auto AllocNode2 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {FreeNode1}); + AsyncPtr = static_cast( + exp_ext::async_malloc(CGH, usm::alloc::device, Size * sizeof(T))); + }, + FreeNode1); + + // Check that the new allocation matches one of the previously returned + // values. + assert(AsyncPtr == FirstAsyncPtr); + + auto KernelAdd1 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {AllocNode2}); + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + PtrInput[LinID] += AsyncPtr[LinID]; + }); + }, + AllocNode2); + + auto FreeNode2 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, KernelAdd1); + exp_ext::async_free(CGH, AsyncPtr); + }, + KernelAdd1); + + // Repeat the previous 3 nodes to test reuse an allocation which has + // previously been reused. + + auto AllocNode3 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {FreeNode2}); + AsyncPtr = static_cast( + exp_ext::async_malloc(CGH, usm::alloc::device, Size * sizeof(T))); + }, + FreeNode2); + + // Check that the new allocation matches one of the previously returned + // values. 
+ assert(AsyncPtr == FirstAsyncPtr); + + auto KernelAdd2 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {AllocNode3}); + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + PtrInput[LinID] += AsyncPtr[LinID]; + }); + }, + AllocNode3); + + add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, KernelAdd2); + exp_ext::async_free(CGH, AsyncPtr); + }, + KernelAdd2); +} + +void calculate_reference_data(size_t Iterations, size_t Size, + std::vector &ReferenceOutput) { + for (size_t i = 0; i < Iterations; i++) { + for (size_t j = 0; j < Size; j++) { + ReferenceOutput[j] += (j + 1) * 2; + } + } +} + +int main() { + queue Queue{}; + + std::vector DataInput(Size); + + std::iota(DataInput.begin(), DataInput.end(), 1); + + std::vector ReferenceOutput(DataInput); + calculate_reference_data(Iterations, Size, ReferenceOutput); + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + T *PtrInput = malloc_device(Size, Queue); + + std::vector OutputData(Size); + + Queue.copy(DataInput.data(), PtrInput, Size); + Queue.wait_and_throw(); + + // Add commands to graph + add_nodes_to_graph(Graph, Queue, Size, PtrInput); + + auto GraphExec = Graph.finalize(); + + for (unsigned n = 0; n < Iterations; n++) { + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + } + + Queue.wait_and_throw(); + Queue.copy(PtrInput, OutputData.data(), Size).wait_and_throw(); + for (size_t i = 0; i < Size; i++) { + assert(check_value(i, ReferenceOutput[i], OutputData[i], "OutputData")); + } + + free(PtrInput, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device_memory_reuse_zero_init.cpp b/sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device_memory_reuse_zero_init.cpp new file mode 100644 index 0000000000000..3f9bfb6443b1a --- /dev/null +++ b/sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device_memory_reuse_zero_init.cpp @@ -0,0 +1,213 @@ +// Tests memory reuse behaviour of device graph allocations from a zero-init +// memory pool. +// NOTE: This test partially relies on knowing how the implementation works and +// that the contents of memory will persist when allocations are reused. This is +// useful for testing but is not an assumption that a user can or should make. + +#include "../../graph_common.hpp" +#include +#include +#include + +#define __SYCL_USE_VARIADIC_SPIRV_OCL_PRINTF__ + +using T = int; +void add_nodes_to_graph( + exp_ext::command_graph &Graph, + queue &Queue, size_t Size, T *PtrInput) { + // Create a memory pool for async allocations with the zero init property + exp_ext::memory_pool MemPool{ + Queue, usm::alloc::device, {exp_ext::property::memory_pool::zero_init{}}}; + // Create 2 pointers for async allocations + T *AsyncPtrA = nullptr; + T *AsyncPtrB = nullptr; + // Add alloc nodes at the root of the graph for each allocation, this should + // result in three unique allocations + auto AllocNodeA = add_node(Graph, Queue, [&](handler &CGH) { + AsyncPtrA = static_cast( + exp_ext::async_malloc_from_pool(CGH, Size * sizeof(T), MemPool)); + }); + auto AllocNodeB = add_node(Graph, Queue, [&](handler &CGH) { + AsyncPtrB = static_cast( + exp_ext::async_malloc_from_pool(CGH, Size * sizeof(T), MemPool)); + }); + + // Assert that we have received unique ptr values, this should always be true + // regardless of implementation. 
+ assert((AsyncPtrA != AsyncPtrB)); + + // Store pointer values for later comparison + void *FirstAsyncPtrA = AsyncPtrA; + void *FirstAsyncPtrB = AsyncPtrB; + // Add kernel that fills the async allocs with values + auto KernelFillPtrs = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {AllocNodeA, AllocNodeB}); + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + // Allocations should be zero initialized on this first use, so we + // should be able to just add on values rather than set them + // explicitly. + AsyncPtrA[LinID] += (1 + LinID); + AsyncPtrB[LinID] += (2 + LinID); + }); + }, + AllocNodeA, AllocNodeB); + // Free all the async allocations, making it possible to reuse them + auto FreeNodeA = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, KernelFillPtrs); + exp_ext::async_free(CGH, AsyncPtrA); + }, + KernelFillPtrs); + auto FreeNodeB = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, KernelFillPtrs); + exp_ext::async_free(CGH, AsyncPtrB); + }, + KernelFillPtrs); + + // Create 2 connected layers in the graph, in each layer we will do a new + // async_alloc which should reuse one of the 2 previously freed pointers. The + // other kernel in the layer will simply operate on the output ptr. + + // The first layer will have a direct dependency on a free node, but the + // subsequent layer will have an indirect dependency. We do not test the order + // in which allocations are picked for reuse, but we can assume both will be + // reused by the implementation (same size and properties so they are + // compatible). + + // First layer, allocation is added first + auto AllocNodeA1 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {FreeNodeA, FreeNodeB}); + AsyncPtrA = static_cast( + exp_ext::async_malloc_from_pool(CGH, Size * sizeof(T), MemPool)); + }, + FreeNodeA, FreeNodeB); + + // Check that the new allocation matches one of the previously returned + // values. + assert((AsyncPtrA == FirstAsyncPtrA) || (AsyncPtrA == FirstAsyncPtrB)); + // Increment output pointer + auto KernelInc1 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {FreeNodeA, FreeNodeB}); + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + PtrInput[LinID] += 1; + }); + }, + FreeNodeA, FreeNodeB); + + // Second layer, allocation has an indirect dependency on a free node + // Increment output pointer + auto KernelInc2 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {KernelInc1}); + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + PtrInput[LinID] += 1; + }); + }, + KernelInc1); + auto AllocNodeB2 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {KernelInc1, AllocNodeA1}); + AsyncPtrB = static_cast( + exp_ext::async_malloc_from_pool(CGH, Size * sizeof(T), MemPool)); + }, + KernelInc1, AllocNodeA1); + // Check that the new allocation matches one of the previously returned + // values. 
+ assert((AsyncPtrB == FirstAsyncPtrA) || (AsyncPtrB == FirstAsyncPtrB)); + + // Add a final kernel that adds the async allocation values to the output + // pointer + auto KernelAddToOutput = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {KernelInc2, AllocNodeB2}); + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + PtrInput[LinID] += (AsyncPtrA[LinID] + AsyncPtrB[LinID]); + }); + }, + KernelInc2, AllocNodeB2); + + // Free the allocations again + + add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, KernelAddToOutput); + exp_ext::async_free(CGH, AsyncPtrA); + }, + KernelAddToOutput); + add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, KernelAddToOutput); + exp_ext::async_free(CGH, AsyncPtrB); + }, + KernelAddToOutput); +} + +void calculate_reference_data(size_t Iterations, size_t Size, + std::vector &ReferenceOutput) { + for (size_t i = 0; i < Iterations; i++) { + for (size_t j = 0; j < Size; j++) { + ReferenceOutput[j] += 2; + ReferenceOutput[j] += (j + 1) * (i + 1); + ReferenceOutput[j] += (j + 2) * (i + 1); + } + } +} + +int main() { + queue Queue{}; + + std::vector DataInput(Size); + + std::iota(DataInput.begin(), DataInput.end(), 1); + + std::vector ReferenceOutput(DataInput); + calculate_reference_data(Iterations, Size, ReferenceOutput); + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + T *PtrInput = malloc_device(Size, Queue); + + std::vector OutputData(Size); + + Queue.copy(DataInput.data(), PtrInput, Size); + Queue.wait_and_throw(); + + // Add commands to graph + add_nodes_to_graph(Graph, Queue, Size, PtrInput); + + auto GraphExec = Graph.finalize(); + + Graph.print_graph("test.dot"); + + for (unsigned n = 0; n < Iterations; n++) { + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + } + + Queue.wait_and_throw(); + Queue.copy(PtrInput, OutputData.data(), Size).wait_and_throw(); + for (size_t i = 0; i < Size; i++) { + assert(check_value(i, ReferenceOutput[i], OutputData[i], "OutputData")); + } + + free(PtrInput, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse.cpp b/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse.cpp new file mode 100644 index 0000000000000..cfdc7e664acc5 --- /dev/null +++ b/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/async_alloc_device_memory_reuse.cpp" diff --git a/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse_in_order.cpp b/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse_in_order.cpp new file mode 100644 index 0000000000000..a40da6fb026c5 --- /dev/null +++ b/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse_in_order.cpp @@ -0,0 +1,161 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in 
Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// Tests memory reuse behaviour of device graph allocations when an in-order +// queue is used. +// NOTE: This test partially relies on knowing how the +// implementation works and that the contents of memory will persist when +// allocations are reused. This is useful for testing but is not an assumption +// that a user can or should make. + +#include "../../graph_common.hpp" +#include +#include + +using T = int; +void add_nodes_to_graph(queue &Queue, size_t Size, T *PtrInput) { + // Create 2 pointers for async allocations + T *AsyncPtrA = nullptr; + T *AsyncPtrB = nullptr; + // Add alloc nodes at the root of the graph for each allocation, this should + // result in three unique allocations + Queue.submit([&](handler &CGH) { + AsyncPtrA = static_cast( + exp_ext::async_malloc(CGH, usm::alloc::device, Size * sizeof(T))); + }); + // Use the free function instead of handle version + AsyncPtrB = static_cast( + exp_ext::async_malloc(Queue, usm::alloc::device, Size * sizeof(T))); + + // Assert that we have received unique ptr values, this should always be true + // regardless of implementation. + assert((AsyncPtrA != AsyncPtrB)); + + // Store pointer values for later comparison + void *FirstAsyncPtrA = AsyncPtrA; + void *FirstAsyncPtrB = AsyncPtrB; + // Add kernel that fills the async allocs with values + Queue.submit([&](handler &CGH) { + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + AsyncPtrA[LinID] = (1 + LinID); + AsyncPtrB[LinID] = (2 + LinID); + }); + }); + // Free all the async allocations, making it possible to reuse them + Queue.submit([&](handler &CGH) { exp_ext::async_free(CGH, AsyncPtrA); }); + Queue.submit([&](handler &CGH) { exp_ext::async_free(CGH, AsyncPtrB); }); + + // Create 2 connected layers in the graph, in each layer we will do a new + // async_alloc which should reuse one of the 2 previously freed pointers. The + // other kernel in the layer will simply operate on the output ptr. + + // The first layer will have a direct dependency on a free node, but the + // subsequent layer will have an indirect dependency. We do not test the order + // in which allocations are picked for reuse, but we can assume both will be + // reused by the implementation (same size and properties so they are + // compatible). + + // First layer, allocation is added first + Queue.submit([&](handler &CGH) { + AsyncPtrA = static_cast( + exp_ext::async_malloc(CGH, usm::alloc::device, Size * sizeof(T))); + }); + + // Check that the new allocation matches one of the previously returned + // values. 
+ assert((AsyncPtrA == FirstAsyncPtrA) || (AsyncPtrA == FirstAsyncPtrB)); + // Increment output pointer + Queue.submit([&](handler &CGH) { + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + PtrInput[LinID] += 1; + }); + }); + + // Second layer, allocation has an indirect dependency on a free node + // Increment output pointer + Queue.submit([&](handler &CGH) { + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + PtrInput[LinID] += 1; + }); + }); + Queue.submit([&](handler &CGH) { + AsyncPtrB = static_cast( + exp_ext::async_malloc(CGH, usm::alloc::device, Size * sizeof(T))); + }); + // Check that the new allocation matches one of the previously returned + // values. + assert((AsyncPtrB == FirstAsyncPtrA) || (AsyncPtrB == FirstAsyncPtrB)); + + // Add a final kernel that adds the async allocation values to the output + // pointer + Queue.submit([&](handler &CGH) { + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + PtrInput[LinID] += (AsyncPtrA[LinID] + AsyncPtrB[LinID]); + }); + }); + + // Free the allocations again + + Queue.submit([&](handler &CGH) { exp_ext::async_free(CGH, AsyncPtrA); }); + Queue.submit([&](handler &CGH) { exp_ext::async_free(CGH, AsyncPtrB); }); +} + +void calculate_reference_data(size_t Iterations, size_t Size, + std::vector &ReferenceOutput) { + for (size_t i = 0; i < Iterations; i++) { + for (size_t j = 0; j < Size; j++) { + ReferenceOutput[j] += 2; + ReferenceOutput[j] += (j + 1); + ReferenceOutput[j] += (j + 2); + } + } +} + +int main() { + queue Queue{{property::queue::in_order{}}}; + + std::vector DataInput(Size); + + std::iota(DataInput.begin(), DataInput.end(), 1); + + std::vector ReferenceOutput(DataInput); + calculate_reference_data(Iterations, Size, ReferenceOutput); + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + T *PtrInput = malloc_device(Size, Queue); + + std::vector OutputData(Size); + + Queue.copy(DataInput.data(), PtrInput, Size); + Queue.wait_and_throw(); + + // Add commands to graph + Graph.begin_recording(Queue); + add_nodes_to_graph(Queue, Size, PtrInput); + Graph.end_recording(Queue); + + auto GraphExec = Graph.finalize(); + + for (unsigned n = 0; n < Iterations; n++) { + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + } + + Queue.wait_and_throw(); + Queue.copy(PtrInput, OutputData.data(), Size).wait_and_throw(); + for (size_t i = 0; i < Size; i++) { + assert(check_value(i, ReferenceOutput[i], OutputData[i], "OutputData")); + } + + free(PtrInput, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse_multiple.cpp b/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse_multiple.cpp new file mode 100644 index 0000000000000..533121c33e83e --- /dev/null +++ b/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse_multiple.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define 
GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/async_alloc_device_memory_reuse_multiple.cpp" diff --git a/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse_zero_init.cpp b/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse_zero_init.cpp new file mode 100644 index 0000000000000..3506691046f15 --- /dev/null +++ b/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_memory_reuse_zero_init.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/async_alloc_device_memory_reuse_zero_init.cpp"
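
The new "Graph Allocation Memory Reuse" section of CommandGraph.md above describes the reuse criteria in prose; the sketch below (not part of the patch) shows the user-facing pattern those criteria enable, modelled on the in-order record-and-replay test added in this change. It is illustrative only: the extension header paths are assumptions, and whether the second `async_malloc` actually returns the first pointer is an implementation detail that the tests check but user code should not rely on.

// Illustrative sketch only, not part of the patch.
#include <sycl/sycl.hpp>
#include <sycl/ext/oneapi/experimental/async_alloc/async_alloc.hpp> // assumed path
#include <sycl/ext/oneapi/experimental/graph.hpp>                   // assumed path

namespace exp_ext = sycl::ext::oneapi::experimental;

int main() {
  // Record from an in-order queue so each command depends on the previous one,
  // giving the second allocation node a path back to the earlier free node.
  sycl::queue Queue{{sycl::property::queue::in_order{}}};
  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
  constexpr size_t Size = 1024;

  Graph.begin_recording(Queue);

  // First allocation node: reserves new device memory owned by the graph.
  int *PtrA = nullptr;
  Queue.submit([&](sycl::handler &CGH) {
    PtrA = static_cast<int *>(
        exp_ext::async_malloc(CGH, sycl::usm::alloc::device, Size * sizeof(int)));
  });

  // Use the allocation, then free it; the free node becomes its last free node.
  Queue.submit([&](sycl::handler &CGH) {
    CGH.parallel_for(sycl::range<1>(Size),
                     [=](sycl::item<1> It) { PtrA[It.get_linear_id()] = 1; });
  });
  Queue.submit([&](sycl::handler &CGH) { exp_ext::async_free(CGH, PtrA); });

  // Second allocation node: same kind, size and properties, and a path exists
  // from the free node above, so the implementation may return the same
  // pointer instead of reserving new memory.
  int *PtrB = nullptr;
  Queue.submit([&](sycl::handler &CGH) {
    PtrB = static_cast<int *>(
        exp_ext::async_malloc(CGH, sycl::usm::alloc::device, Size * sizeof(int)));
  });
  Queue.submit([&](sycl::handler &CGH) { exp_ext::async_free(CGH, PtrB); });

  Graph.end_recording(Queue);

  auto GraphExec = Graph.finalize();
  Queue.submit([&](sycl::handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });
  Queue.wait_and_throw();
  return 0;
}

If any of the criteria are not met (different kind, size or properties, or no path from the allocation's last free node to the new allocation node), the pool falls back to making a fresh virtual reservation as before.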