diff --git a/sycl/doc/design/CommandGraph.md b/sycl/doc/design/CommandGraph.md index 4e5b80b593e46..956082b31097f 100644 --- a/sycl/doc/design/CommandGraph.md +++ b/sycl/doc/design/CommandGraph.md @@ -305,6 +305,28 @@ from the same dynamic command-group object. This allows the SYCL runtime to access the list of alternative kernels when calling the UR API to append a kernel command to a command-buffer. +## Graph-Owned Memory Allocations +### Device Allocations + +Device allocations for graphs are implemented using virtual memory. Allocation +commands performing a virtual reservation for the provided size, and physical +memory is created and mapped only during graph finalization. This allows valid +device addresses to be returned immediately when building the graph without the +penalty of doing any memory allocations during graph building, which could have +a negative impact on features such as whole-graph update through increased +overhead. + +### Behaviour of async_free + +`async_free` nodes are treated as hints rather than an actual memory free +operation. This is because deallocating during graph execution is both +undesirable for performance and not feasible with the current +implementation/backends. Instead a free node represents a promise from the user +that the memory is no longer in use. This enables optimizations such as +potentially reusing that memory for subsequent allocation nodes in the graph. +This allows us to reduce the total amount of concurrent memory required by a +single graph. + ## Optimizations ### Interactions with Profiling diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp index 53800bbfb4380..a902381de3c5f 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -121,7 +121,9 @@ enum class node_type { memadvise = 7, ext_oneapi_barrier = 8, host_task = 9, - native_command = 10 + native_command = 10, + async_malloc = 11, + async_free = 12 }; /// Class representing a node in the graph, returned by command_graph::add(). @@ -429,6 +431,10 @@ class __SYCL_EXPORT executable_command_graph /// @param Nodes The nodes to use for updating the graph. void update(const std::vector &Nodes); + /// Return the total amount of memory required by this graph for graph-owned + /// memory allocations. + size_t get_required_mem_size() const; + /// Common Reference Semantics friend bool operator==(const executable_command_graph &LHS, const executable_command_graph &RHS) { diff --git a/sycl/source/detail/async_alloc.cpp b/sycl/source/detail/async_alloc.cpp index 8251da36ea492..b21f6d13c42e3 100644 --- a/sycl/source/detail/async_alloc.cpp +++ b/sycl/source/detail/async_alloc.cpp @@ -44,23 +44,25 @@ void *async_malloc(sycl::handler &h, sycl::usm::alloc kind, size_t size) { sycl::make_error_code(sycl::errc::feature_not_supported), "Only device backed asynchronous allocations are supported!"); - h.throwIfGraphAssociated< - ext::oneapi::experimental::detail::UnsupportedGraphFeatures:: - sycl_ext_oneapi_async_alloc>(); - auto &Adapter = h.getContextImplPtr()->getAdapter(); - auto &Q = h.MQueue->getHandleRef(); // Get events to wait on. auto depEvents = getUrEvents(h.impl->CGData.MEvents); uint32_t numEvents = h.impl->CGData.MEvents.size(); void *alloc = nullptr; - ur_event_handle_t Event; - Adapter->call( - Q, (ur_usm_pool_handle_t)0, size, nullptr, numEvents, depEvents.data(), - &alloc, &Event); + + ur_event_handle_t Event = nullptr; + // If a graph is present do the allocation from the graph memory pool instead. + if (auto Graph = h.getCommandGraph(); Graph) { + alloc = Graph->getMemPool().malloc(size, kind); + } else { + auto &Q = h.MQueue->getHandleRef(); + Adapter->call( + Q, (ur_usm_pool_handle_t)0, size, nullptr, numEvents, depEvents.data(), + &alloc, &Event); + } // Async malloc must return a void* immediately. // Set up CommandGroup which is a no-op and pass the @@ -90,12 +92,7 @@ __SYCL_EXPORT void *async_malloc(const sycl::queue &q, sycl::usm::alloc kind, __SYCL_EXPORT void *async_malloc_from_pool(sycl::handler &h, size_t size, const memory_pool &pool) { - h.throwIfGraphAssociated< - ext::oneapi::experimental::detail::UnsupportedGraphFeatures:: - sycl_ext_oneapi_async_alloc>(); - auto &Adapter = h.getContextImplPtr()->getAdapter(); - auto &Q = h.MQueue->getHandleRef(); auto &memPoolImpl = sycl::detail::getSyclObjImpl(pool); // Get events to wait on. @@ -103,12 +100,20 @@ __SYCL_EXPORT void *async_malloc_from_pool(sycl::handler &h, size_t size, uint32_t numEvents = h.impl->CGData.MEvents.size(); void *alloc = nullptr; - ur_event_handle_t Event; - Adapter->call( - Q, memPoolImpl.get()->get_handle(), size, nullptr, numEvents, - depEvents.data(), &alloc, &Event); + ur_event_handle_t Event = nullptr; + // If a graph is present do the allocation from the graph memory pool instead. + if (auto Graph = h.getCommandGraph(); Graph) { + // Memory pool is passed as the graph may use some properties of it. + alloc = Graph->getMemPool().malloc(size, pool.get_alloc_kind(), + sycl::detail::getSyclObjImpl(pool)); + } else { + auto &Q = h.MQueue->getHandleRef(); + Adapter->call( + Q, memPoolImpl.get()->get_handle(), size, nullptr, numEvents, + depEvents.data(), &alloc, &Event); + } // Async malloc must return a void* immediately. // Set up CommandGroup which is a no-op and pass the event from the alloc. h.impl->MAsyncAllocEvent = Event; @@ -135,9 +140,15 @@ async_malloc_from_pool(const sycl::queue &q, size_t size, } __SYCL_EXPORT void async_free(sycl::handler &h, void *ptr) { - h.throwIfGraphAssociated< - ext::oneapi::experimental::detail::UnsupportedGraphFeatures:: - sycl_ext_oneapi_async_alloc>(); + if (auto Graph = h.getCommandGraph(); Graph) { + // Check if the pointer to be freed has an associated allocation node, and + // error if not + if (!Graph->getMemPool().hasAllocation(ptr)) { + throw sycl::exception(sycl::make_error_code(sycl::errc::invalid), + "Cannot add a free node to a graph for which " + "there is no associated allocation node!"); + } + } h.impl->MFreePtr = ptr; h.setType(detail::CGType::AsyncFree); diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 8cb8647c49ee9..ce5985275ac21 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -57,6 +57,10 @@ inline const char *nodeTypeToString(node_type NodeType) { return "host_task"; case node_type::native_command: return "native_command"; + case node_type::async_malloc: + return "async_malloc"; + case node_type::async_free: + return "async_free"; } assert(false && "Unhandled node type"); return {}; @@ -340,7 +344,7 @@ graph_impl::graph_impl(const sycl::context &SyclContext, const sycl::device &SyclDevice, const sycl::property_list &PropList) : MContext(SyclContext), MDevice(SyclDevice), MRecordingQueues(), - MEventsMap(), MInorderQueueMap(), + MEventsMap(), MInorderQueueMap(), MGraphMemPool(SyclContext, SyclDevice), MID(NextAvailableID.fetch_add(1, std::memory_order_relaxed)) { checkGraphPropertiesAndThrow(PropList); if (PropList.has_property()) { @@ -750,12 +754,12 @@ void graph_impl::beginRecording( } } -// Check if nodes are empty and if so loop back through predecessors until we -// find the real dependency. +// Check if nodes do not require enqueueing and if so loop back through +// predecessors until we find the real dependency. void exec_graph_impl::findRealDeps( std::vector &Deps, std::shared_ptr CurrentNode, int ReferencePartitionNum) { - if (CurrentNode->isEmpty()) { + if (!CurrentNode->requiresEnqueue()) { for (auto &N : CurrentNode->MPredecessors) { auto NodeImpl = N.lock(); findRealDeps(Deps, NodeImpl, ReferencePartitionNum); @@ -875,9 +879,9 @@ void exec_graph_impl::createCommandBuffers( Partition->MCommandBuffers[Device] = OutCommandBuffer; for (const auto &Node : Partition->MSchedule) { - // Empty nodes are not processed as other nodes, but only their + // Some nodes are not scheduled like other nodes, and only their // dependencies are propagated in findRealDeps - if (Node->isEmpty()) + if (!Node->requiresEnqueue()) continue; sycl::detail::CGType type = Node->MCGType; @@ -943,6 +947,8 @@ exec_graph_impl::exec_graph_impl(sycl::context Context, exec_graph_impl::~exec_graph_impl() { try { + MGraphImpl->markExecGraphDestroyed(); + const sycl::detail::AdapterPtr &Adapter = sycl::detail::getSyclObjImpl(MContext)->getAdapter(); MSchedule.clear(); @@ -952,6 +958,9 @@ exec_graph_impl::~exec_graph_impl() { Event->wait(Event); } + // Clean up any graph-owned allocations that were allocated + MGraphImpl->getMemPool().deallocateAndUnmapAll(); + for (const auto &Partition : MPartitions) { Partition->MSchedule.clear(); for (const auto &Iter : Partition->MCommandBuffers) { @@ -1810,6 +1819,14 @@ modifiable_command_graph::finalize(const sycl::property_list &PropList) const { // Graph is read and written in this scope so we lock // this graph with full priviledges. graph_impl::WriteLock Lock(impl->MMutex); + // If the graph uses graph-owned allocations and an executable graph already + // exists we must throw an error. + if (impl->getMemPool().hasAllocations() && impl->getExecGraphCount() > 0) { + throw sycl::exception(sycl::make_error_code(errc::invalid), + "Graphs containing allocations can only have a " + "single executable graph alive at any one time."); + } + return command_graph{ this->impl, this->impl->getContext(), PropList}; } @@ -1937,11 +1954,16 @@ executable_command_graph::executable_command_graph( const property_list &PropList) : impl(std::make_shared(Ctx, Graph, PropList)) { finalizeImpl(); // Create backend representation for executable graph + // Mark that we have created an executable graph from the modifiable graph. + Graph->markExecGraphCreated(); } void executable_command_graph::finalizeImpl() { impl->makePartitions(); + // Handle any work required for graph-owned memory allocations + impl->finalizeMemoryAllocations(); + auto Device = impl->getGraphImpl()->getDevice(); for (auto Partition : impl->getPartitions()) { if (!Partition->isHostTask()) { @@ -1969,6 +1991,13 @@ void executable_command_graph::update(const std::vector &Nodes) { impl->update(NodeImpls); } +size_t executable_command_graph::get_required_mem_size() const { + // Since each graph has a unique mem pool, return the current memory usage for + // now. This call my change if we move to being able to share memory between + // unique graphs. + return impl->getGraphImpl()->getMemPool().getMemUseCurrent(); +} + dynamic_parameter_base::dynamic_parameter_base( command_graph Graph) : impl(std::make_shared( diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 3234721626a10..acc334f61cc3a 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -73,6 +74,11 @@ inline node_type getNodeTypeFromCG(sycl::detail::CGType CGType) { return node_type::subgraph; case sycl::detail::CGType::EnqueueNativeCommand: return node_type::native_command; + case sycl::detail::CGType::AsyncAlloc: + return node_type::async_malloc; + case sycl::detail::CGType::AsyncFree: + return node_type::async_free; + default: assert(false && "Invalid Graph Node Type"); return node_type::empty; @@ -473,6 +479,21 @@ class node_impl : public std::enable_shared_from_this { } } + /// Returns true if this node should be enqueued to the backend, if not only + /// its dependencies are considered. + bool requiresEnqueue() const { + switch (MNodeType) { + case node_type::empty: + case node_type::ext_oneapi_barrier: + case node_type::async_malloc: + case node_type::async_free: + return false; + + default: + return true; + } + } + private: void rebuildArgStorage(std::vector &Args, const std::vector> &OldArgStorage, @@ -919,6 +940,12 @@ class graph_impl : public std::enable_shared_from_this { /// @return Context associated with graph. sycl::context getContext() const { return MContext; } + /// Query for the context impl tied to this graph. + /// @return shared_ptr ref for the context impl associated with graph. + const std::shared_ptr &getContextImplPtr() const { + return sycl::detail::getSyclObjImpl(MContext); + } + /// Query for the device_impl tied to this graph. /// @return device_impl shared ptr reference associated with graph. const DeviceImplPtr &getDeviceImplPtr() const { @@ -1139,6 +1166,32 @@ class graph_impl : public std::enable_shared_from_this { unsigned long long getID() const { return MID; } + /// Get the memory pool used for graph-owned allocations. + graph_mem_pool &getMemPool() { return MGraphMemPool; } + + /// Mark that an executable graph was created from this modifiable graph, used + /// for tracking live graphs for graph-owned allocations. + void markExecGraphCreated() { MExecGraphCount++; } + + /// Mark that an executable graph created from this modifiable graph was + /// destroyed, used for tracking live graphs for graph-owned allocations. + void markExecGraphDestroyed() { + while (true) { + size_t CurrentVal = MExecGraphCount; + if (CurrentVal == 0) { + break; + } + if (MExecGraphCount.compare_exchange_strong(CurrentVal, CurrentVal - 1) == + false) { + continue; + } + } + } + + /// Get the number of unique executable graph instances currently alive for + /// this graph. + size_t getExecGraphCount() const { return MExecGraphCount; } + private: /// Check the graph for cycles by performing a depth-first search of the /// graph. If a node is visited more than once in a given path through the @@ -1206,10 +1259,17 @@ class graph_impl : public std::enable_shared_from_this { std::map, std::shared_ptr, std::owner_less>> MBarrierDependencyMap; + /// Graph memory pool for handling graph-owned memory allocations for this + /// graph. + graph_mem_pool MGraphMemPool; unsigned long long MID; // Used for std::hash in order to create a unique hash for the instance. inline static std::atomic NextAvailableID = 0; + + // The number of live executable graphs that have been created from this + // modifiable graph + std::atomic MExecGraphCount = 0; }; /// Class representing the implementation of command_graph. @@ -1334,6 +1394,14 @@ class exec_graph_impl { unsigned long long getID() const { return MID; } + /// Do any work required during finalization to finalize graph-owned memory + /// allocations. + void finalizeMemoryAllocations() { + // This call allocates physical memory and maps all virtual device + // allocations + MGraphImpl->getMemPool().allocateAndMapAll(); + } + private: /// Create a command-group for the node and add it to command-buffer by going /// through the scheduler. diff --git a/sycl/source/detail/graph_memory_pool.hpp b/sycl/source/detail/graph_memory_pool.hpp new file mode 100644 index 0000000000000..be90c3b58892a --- /dev/null +++ b/sycl/source/detail/graph_memory_pool.hpp @@ -0,0 +1,210 @@ +//==--------- graph_memory_pool.hpp --- SYCL graph extension ---------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include // For physical_mem_impl +#include // For context +#include // For device +#include // For get_mem_granularity + +namespace sycl { +inline namespace _V1 { +namespace ext { +namespace oneapi { +namespace experimental { +namespace detail { + +/// Class handling graph-owned memory allocations. Device allocations are +/// managed using virtual memory. +class graph_mem_pool { + // Info descriptor for various properties of an allocation + struct alloc_info { + // Allocation kind + usm::alloc Kind; + // Size of the allocation + size_t Size; + // Is currently mapped to physical memory + bool Mapped = false; + // Index into the array of physical memory + size_t PhysicalMemID = 0; + // Is the allocation read only + bool ReadOnly = false; + // Should the allocation be zero initialized during initial allocation + bool ZeroInit = false; + }; + +public: + graph_mem_pool(const context &Context, const device &Device) + : MContext(Context), MDevice(Device) {} + ~graph_mem_pool() { + for (auto &[Ptr, AllocInfo] : MAllocations) { + // Unmap allocations if required before physical memory is released + // Physical mem is released when MPhysicalMem is cleared + if (AllocInfo.Mapped) { + unmap(Ptr, AllocInfo.Size, MContext); + } + // Free the VA range + free_virtual_mem(reinterpret_cast(Ptr), AllocInfo.Size, + MContext); + } + MPhysicalMem.clear(); + } + + /// Memory pool cannot be copied + graph_mem_pool(graph_mem_pool &) = delete; + + /// Get a pointer to a new allocation. For device allocations these are + /// virtual reservations which must be later mapped to allocated physical + /// memory before use by calling allocateAndMapAll() + /// @param Size Size of the allocation + /// @param AllocType Type of the allocation + /// @param MemPool Optional memory pool from which allocations will not be + /// made directly but properties may be respected. + /// @return A pointer to the start of the allocation + void *malloc(size_t Size, usm::alloc AllocType, + const std::shared_ptr &MemPool = nullptr) { + void *Alloc = nullptr; + switch (AllocType) { + case usm::alloc::device: { + + auto &CtxImpl = sycl::detail::getSyclObjImpl(MContext); + auto &Adapter = CtxImpl->getAdapter(); + + size_t Granularity = get_mem_granularity(MDevice, MContext); + uintptr_t StartPtr = 0; + size_t AlignedSize = alignByteSize(Size, Granularity); + // Do virtual reservation + Adapter->call( + CtxImpl->getHandleRef(), reinterpret_cast(StartPtr), + AlignedSize, &Alloc); + + alloc_info AllocInfo = {}; + AllocInfo.Kind = AllocType; + AllocInfo.Size = AlignedSize; + + // Collect relevant properties from memory pool + if (MemPool) { + auto PropList = MemPool->getPropList(); + if (PropList.has_property()) { + AllocInfo.ZeroInit = true; + } + if (PropList.has_property()) { + AllocInfo.ReadOnly = true; + } + } + + MAllocations[Alloc] = AllocInfo; + break; + } + + default: + throw sycl::exception(sycl::make_error_code(errc::feature_not_supported), + "Only device allocations are currently supported " + "in graph allocation nodes!"); + break; + } + + return Alloc; + } + + /// Return the total amount of memory being used by this pool + size_t getMemUseCurrent() const { + size_t TotalMem = 0; + for (auto &PhysicalMem : MPhysicalMem) { + TotalMem += PhysicalMem->size(); + } + + return TotalMem; + } + + /// For device allocations using virtual memory this function allocates + /// physical memory and maps each virtual range to it, should be called during + /// graph finalization. + void allocateAndMapAll() { + // Queue used for any initializing of memory, zero-init etc. + sycl::queue Queue{MContext, MDevice}; + + for (auto &Allocation : MAllocations) { + // Set access mode + void *Ptr = Allocation.first; + alloc_info &AllocInfo = Allocation.second; + address_access_mode AccessMode = AllocInfo.ReadOnly + ? address_access_mode::read + : address_access_mode::read_write; + + // Create physical memory + auto PhysicalMem = std::make_shared(MDevice, MContext, + AllocInfo.Size); + // Map the virtual reservation to it + PhysicalMem->map(reinterpret_cast(Ptr), AllocInfo.Size, + AccessMode, 0); + + // Zero init if required + if (AllocInfo.ZeroInit) { + Queue.submit( + [&](sycl::handler &CGH) { CGH.memset(Ptr, 0, AllocInfo.Size); }); + } + + MPhysicalMem.push_back(PhysicalMem); + AllocInfo.PhysicalMemID = MPhysicalMem.size() - 1; + AllocInfo.Mapped = true; + } + + // Wait on any operations we enqueued. + Queue.wait_and_throw(); + } + + /// For device virtual reservations unmap and deallocate physical memory. + /// Virtual reservations are not released and can be reallocated/mapped again. + /// Typically called on executable graph destruction. + void deallocateAndUnmapAll() { + for (auto &[Ptr, AllocInfo] : MAllocations) { + // Unmap allocations before physical memory is released + // Physical mem is released when MPhysicalMem is cleared + unmap(Ptr, AllocInfo.Size, MContext); + AllocInfo.PhysicalMemID = 0; + AllocInfo.Mapped = false; + } + + MPhysicalMem.clear(); + } + + /// True if this pool has any allocations + bool hasAllocations() const { return MAllocations.size(); } + + // True if an allocation exists for this pointer + bool hasAllocation(void *Ptr) const { + return MAllocations.find(Ptr) != MAllocations.end(); + } + +private: + /// Returns an aligned byte size given a required granularity + /// @param UnalignedByteSize The original requested allocation size + /// @param Granularity The required granularity for this allocation + /// @returns The aligned size + static size_t alignByteSize(size_t UnalignedByteSize, size_t Granularity) { + return ((UnalignedByteSize + Granularity - 1) / Granularity) * Granularity; + } + + /// Context associated with allocations from this pool + context MContext; + /// Device associated with allocations from this pool + device MDevice; + /// Map of allocated pointers to an info struct + std::unordered_map MAllocations; + /// List of physical memory allocations used for virtual device reservations + std::vector> MPhysicalMem; +}; +} // namespace detail +} // namespace experimental +} // namespace oneapi +} // namespace ext +} // namespace _V1 +} // namespace sycl diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index a6057ac5ceaf4..0f60442aa9e18 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -2052,6 +2052,9 @@ void handler::memcpyFromHostOnlyDeviceGlobal(void *Dest, const std::shared_ptr & handler::getContextImplPtr() const { + if (impl->MGraph) { + return impl->MGraph->getContextImplPtr(); + } return MQueue->getContextImplPtr(); } diff --git a/sycl/test-e2e/Graph/AsyncAlloc/Explicit/async_alloc_device.cpp b/sycl/test-e2e/Graph/AsyncAlloc/Explicit/async_alloc_device.cpp new file mode 100644 index 0000000000000..108ef34fe3489 --- /dev/null +++ b/sycl/test-e2e/Graph/AsyncAlloc/Explicit/async_alloc_device.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/async_alloc_device.cpp" diff --git a/sycl/test-e2e/Graph/AsyncAlloc/Explicit/async_alloc_device_zero_init.cpp b/sycl/test-e2e/Graph/AsyncAlloc/Explicit/async_alloc_device_zero_init.cpp new file mode 100644 index 0000000000000..0595482a24731 --- /dev/null +++ b/sycl/test-e2e/Graph/AsyncAlloc/Explicit/async_alloc_device_zero_init.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/async_alloc_device_zero_init.cpp" diff --git a/sycl/test-e2e/Graph/AsyncAlloc/Explicit/async_alloc_queries.cpp b/sycl/test-e2e/Graph/AsyncAlloc/Explicit/async_alloc_queries.cpp new file mode 100644 index 0000000000000..b86cd9bf04476 --- /dev/null +++ b/sycl/test-e2e/Graph/AsyncAlloc/Explicit/async_alloc_queries.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/async_alloc_queries.cpp" diff --git a/sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device.cpp b/sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device.cpp new file mode 100644 index 0000000000000..6d4452815f645 --- /dev/null +++ b/sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device.cpp @@ -0,0 +1,147 @@ +// Tests basic adding of async allocation nodes for device allocations + +#include "../../graph_common.hpp" +#include + +using T = int; +void add_nodes_to_graph( + exp_ext::command_graph &Graph, + queue &Queue, size_t Size, T *PtrInput) { + T *AsyncPtrA = nullptr; + // Add alloc node + auto AllocNode = add_node(Graph, Queue, [&](handler &CGH) { + AsyncPtrA = static_cast( + exp_ext::async_malloc(CGH, usm::alloc::device, Size * sizeof(T))); + }); + // Add memcpy to alloc node + auto MemcpyNodeA = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, AllocNode); + CGH.memcpy(AsyncPtrA, PtrInput, Size * sizeof(T)); + }, + AllocNode); + + // add kernel that operates on async memory + auto KernelNodeA = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, MemcpyNodeA); + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + PtrInput[LinID] += AsyncPtrA[LinID]; + }); + }, + MemcpyNodeA); + + // Add free node + + auto FreeNode = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, KernelNodeA); + exp_ext::async_free(CGH, AsyncPtrA); + }, + KernelNodeA); + + // Add alloc node + + T *AsyncPtrB = nullptr; + + auto AllocNodeB = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, FreeNode); + AsyncPtrB = static_cast( + exp_ext::async_malloc(CGH, usm::alloc::device, Size * sizeof(T))); + }, + FreeNode); + + // Add kernels that operates on async memory + auto KernelNodeB = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, AllocNodeB); + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + AsyncPtrB[LinID] = PtrInput[LinID] + LinID; + }); + }, + AllocNodeB); + auto KernelNodeC = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, KernelNodeB); + CGH.parallel_for(range<1>(Size), [=](item<1> Item) { + size_t LinID = Item.get_linear_id(); + AsyncPtrB[LinID] *= 3; + }); + }, + KernelNodeB); + // Add copy back to input USM pointer + auto MemcpyNodeB = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, KernelNodeC); + CGH.memcpy(PtrInput, AsyncPtrB, Size * sizeof(T)); + }, + KernelNodeC); + // Add free node + add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, MemcpyNodeB); + exp_ext::async_free(CGH, AsyncPtrB); + }, + MemcpyNodeB); +} + +void calculate_reference_data(size_t Iterations, size_t Size, + std::vector &ReferenceOutput) { + for (size_t i = 0; i < Iterations; i++) { + for (size_t j = 0; j < Size; j++) { + ReferenceOutput[j] *= 2; + ReferenceOutput[j] += j; + ReferenceOutput[j] *= 3; + } + } +} + +int main() { + queue Queue{}; + + std::vector DataInput(Size); + + std::iota(DataInput.begin(), DataInput.end(), 1); + + std::vector ReferenceOutput(DataInput); + calculate_reference_data(Iterations, Size, ReferenceOutput); + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + T *PtrInput = malloc_device(Size, Queue); + + std::vector OutputData(Size); + + Queue.copy(DataInput.data(), PtrInput, Size); + Queue.wait_and_throw(); + + // Add commands to graph + add_nodes_to_graph(Graph, Queue, Size, PtrInput); + + auto GraphExec = Graph.finalize(); + + for (unsigned n = 0; n < Iterations; n++) { + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + } + + Queue.wait_and_throw(); + Queue.copy(PtrInput, OutputData.data(), Size).wait_and_throw(); + for (size_t i = 0; i < Size; i++) { + assert(check_value(i, ReferenceOutput[i], OutputData[i], "OutputData")); + } + + free(PtrInput, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device_zero_init.cpp b/sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device_zero_init.cpp new file mode 100644 index 0000000000000..32078c8ce660b --- /dev/null +++ b/sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_device_zero_init.cpp @@ -0,0 +1,59 @@ +// Tests zero initializing a graph memory allocation based on memory pool +// properties + +#include "../../graph_common.hpp" +#include +#include +#include + +using T = int; + +int main() { + queue Queue{}; + + std::vector Output(Size); + std::vector ReferenceOutput(Size, T(0)); + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + exp_ext::memory_pool MemPool{ + Queue, usm::alloc::device, {exp_ext::property::memory_pool::zero_init{}}}; + + // Add commands to graph + T *AsyncPtrA = nullptr; + // Add alloc node that is zero initialized + auto AllocNode = add_node(Graph, Queue, [&](handler &CGH) { + AsyncPtrA = static_cast( + exp_ext::async_malloc_from_pool(CGH, Size * sizeof(T), MemPool)); + }); + + // Copy that zero init memory back to host + auto MemcpyNode = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, AllocNode); + CGH.memcpy(Output.data(), AsyncPtrA, Size * sizeof(T)); + }, + AllocNode); + + // Free memory + add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, MemcpyNode); + exp_ext::async_free(CGH, AsyncPtrA); + }, + MemcpyNode); + + auto GraphExec = Graph.finalize(); + + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + + Queue.wait_and_throw(); + + for (size_t i = 0; i < Size; i++) { + assert(check_value(i, ReferenceOutput[i], Output[i], "Output")); + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_queries.cpp b/sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_queries.cpp new file mode 100644 index 0000000000000..08ac34f81f3d0 --- /dev/null +++ b/sycl/test-e2e/Graph/AsyncAlloc/Inputs/async_alloc_queries.cpp @@ -0,0 +1,51 @@ +// Tests queries associated with graph-owned allocations + +#include "../../graph_common.hpp" +#include + +using T = int; + +int main() { + queue Queue{}; + + size_t Size = 2 << 18; + + // Expected size is number of elements * size of data type * iterations + const size_t ExpectedMinSize = Size * sizeof(T) * Iterations; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + // Graph should report 0 memory usage with no allocation nodes + { + auto GraphExec = Graph.finalize(); + assert(GraphExec.get_required_mem_size() == 0); + } + + // Add allocs and frees for each command. Allocs are all root nodes so they + // will require unique memory for each one. + for (size_t i = 0; i < Iterations; i++) { + T *AsyncPtr = nullptr; + // Add alloc node + auto AllocNode = add_node(Graph, Queue, [&](handler &CGH) { + AsyncPtr = static_cast( + exp_ext::async_malloc(CGH, usm::alloc::device, Size * sizeof(T))); + }); + + // Free memory, node depends on only the associated allocation node + add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, AllocNode); + exp_ext::async_free(CGH, AsyncPtr); + }, + AllocNode); + } + + auto GraphExec = Graph.finalize(); + + // Memory allocated might be adjusted for example based on device granularity, + // so it may be more than expected but never less. + assert(GraphExec.get_required_mem_size() >= ExpectedMinSize); + + return 0; +} diff --git a/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device.cpp b/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device.cpp new file mode 100644 index 0000000000000..35776d4bbbd26 --- /dev/null +++ b/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/async_alloc_device.cpp" diff --git a/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_zero_init.cpp b/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_zero_init.cpp new file mode 100644 index 0000000000000..0dd1bcf149f80 --- /dev/null +++ b/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_device_zero_init.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/async_alloc_device_zero_init.cpp" diff --git a/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_queries.cpp b/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_queries.cpp new file mode 100644 index 0000000000000..c2fe4c9b195b7 --- /dev/null +++ b/sycl/test-e2e/Graph/AsyncAlloc/RecordReplay/async_alloc_queries.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/async_alloc_queries.cpp" diff --git a/sycl/test-e2e/Graph/AsyncAlloc/async_alloc_finalize_exception.cpp b/sycl/test-e2e/Graph/AsyncAlloc/async_alloc_finalize_exception.cpp new file mode 100644 index 0000000000000..847e3d0dfbf08 --- /dev/null +++ b/sycl/test-e2e/Graph/AsyncAlloc/async_alloc_finalize_exception.cpp @@ -0,0 +1,93 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// Tests that attempting to finalize and create more than one executable graph +// containing allocations is an error. + +#include "../graph_common.hpp" +#include + +using T = int; + +int main() { + queue Queue{}; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + std::vector OutputData(Size); + std::vector ReferenceData(Size); + + std::iota(ReferenceData.begin(), ReferenceData.end(), 0); + + // Add alloc and free commands + T *AsyncPtr = nullptr; + // Add alloc node + auto AllocNode = Graph.add([&](handler &CGH) { + AsyncPtr = static_cast( + exp_ext::async_malloc(CGH, usm::alloc::device, Size * sizeof(T))); + }); + + auto KernelNode = Graph.add( + [&](handler &CGH) { + CGH.parallel_for(range<1>(Size), [=](item<1> ID) { + size_t LinID = ID.get_linear_id(); + AsyncPtr[LinID] = static_cast(LinID); + }); + }, + {exp_ext::property::node::depends_on{AllocNode}}); + + // Copy data out for verification + auto CopyNode = Graph.add( + [&](handler &CGH) { + CGH.memcpy(OutputData.data(), AsyncPtr, Size * sizeof(T)); + }, + {exp_ext::property::node::depends_on{KernelNode}}); + + // Free memory, node depends on only the associated allocation node + Graph.add([&](handler &CGH) { exp_ext::async_free(CGH, AsyncPtr); }, + {exp_ext::property::node::depends_on{CopyNode}}); + + // Constrain scope of GraphExec + { + auto GraphExec = Graph.finalize(); + // Graphs support CRS so copying here does not create a separate instance + // and should be allowed. + auto GraphExec2 = GraphExec; + + // Check that the graph executes correctly + + Queue.ext_oneapi_graph(GraphExec).wait_and_throw(); + + for (size_t i = 0; i < Size; i++) { + assert(check_value(i, ReferenceData[i], OutputData[i], "OutputData")); + } + + std::error_code ExceptionCode = make_error_code(sycl::errc::success); + try { + // GraphExec is still alive so this should be an error + Graph.finalize(); + } catch (const exception &e) { + ExceptionCode = e.code(); + } + + assert(ExceptionCode == sycl::errc::invalid); + } + + // GraphExec and GraphExec2 are now destroyed, so we should be able to + // finalize again + auto GraphExec = Graph.finalize(); + + // Check that the graph executes correctly again + + Queue.ext_oneapi_graph(GraphExec).wait_and_throw(); + + for (size_t i = 0; i < Size; i++) { + assert(check_value(i, ReferenceData[i], OutputData[i], "OutputData")); + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/AsyncAlloc/async_alloc_free_exception.cpp b/sycl/test-e2e/Graph/AsyncAlloc/async_alloc_free_exception.cpp new file mode 100644 index 0000000000000..aae37b1437174 --- /dev/null +++ b/sycl/test-e2e/Graph/AsyncAlloc/async_alloc_free_exception.cpp @@ -0,0 +1,52 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Tests that attempting to free a pointer in a graph that doesn't have an +// associated allocation node fails in a range of scenarios + +#include "../graph_common.hpp" +#include + +using T = int; + +// Attempts to add a free to the graph for a pointer with no associated +// allocation, and check that the correct exception is returned. +void addInvalidFreeAndCheckForException( + exp_ext::command_graph &Graph) { + void *FakePtr = (void *)1; + std::error_code ExceptionCode = make_error_code(sycl::errc::success); + try { + Graph.add([&](handler &CGH) { exp_ext::async_free(CGH, FakePtr); }); + } catch (const exception &e) { + ExceptionCode = e.code(); + } + + assert(ExceptionCode == sycl::errc::invalid); +} + +int main() { + queue Queue{}; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + // Add an invalid free to the graph + addInvalidFreeAndCheckForException(Graph); + + void *AsyncPtr = nullptr; + // Add a real allocation node + auto AllocNode = Graph.add([&](handler &CGH) { + AsyncPtr = sycl::ext::oneapi::experimental::async_malloc( + CGH, usm::alloc::device, Size); + }); + + // Try the invalid free again + addInvalidFreeAndCheckForException(Graph); + + // Add a real free node + Graph.add([&](handler &CGH) { exp_ext::async_free(CGH, AsyncPtr); }); + + // Try the invalid free again + addInvalidFreeAndCheckForException(Graph); + + return 0; +} diff --git a/sycl/test-e2e/Graph/AsyncAlloc/lit.local.cfg b/sycl/test-e2e/Graph/AsyncAlloc/lit.local.cfg new file mode 100644 index 0000000000000..4b1bd2c979d99 --- /dev/null +++ b/sycl/test-e2e/Graph/AsyncAlloc/lit.local.cfg @@ -0,0 +1,5 @@ +# Async alloc tests require virtual memory for device allocations +# Async alloc extension is required to support creating memory pools +config.required_features += ['aspect-ext_oneapi_virtual_mem', 'aspect-ext_oneapi_limited_graph', 'aspect-ext_oneapi_async_memory_alloc'] +# V2 adapter does not support async alloc api yet +config.unsupported_features += ['level_zero_v2_adapter'] diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 967e13b09d739..4a02e92dae289 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3650,6 +3650,7 @@ _ZNK4sycl3_V13ext6oneapi12experimental21dynamic_command_group16get_active_indexE _ZNK4sycl3_V13ext6oneapi12experimental4node14get_successorsEv _ZNK4sycl3_V13ext6oneapi12experimental4node16get_predecessorsEv _ZNK4sycl3_V13ext6oneapi12experimental4node8get_typeEv +_ZNK4sycl3_V13ext6oneapi12experimental6detail24executable_command_graph21get_required_mem_sizeEv _ZNK4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph11print_graphENS0_6detail11string_viewEb _ZNK4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph11print_graphENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEb _ZNK4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph14get_root_nodesEv diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index 27a22a7044322..b1ef7c89edffa 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -4171,6 +4171,7 @@ ?get_queue@fusion_wrapper@experimental@codeplay@ext@_V1@sycl@@QEBA?AVqueue@56@XZ ?get_range@image_mem@experimental@oneapi@ext@_V1@sycl@@QEBA?AV?$range@$02@56@XZ ?get_range@image_plain@detail@_V1@sycl@@IEBA?AV?$range@$02@34@XZ +?get_required_mem_size@executable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEBA_KXZ ?get_reserved_size_current@memory_pool@experimental@oneapi@ext@_V1@sycl@@QEBA_KXZ ?get_root_nodes@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEBA?AV?$vector@Vnode@experimental@oneapi@ext@_V1@sycl@@V?$allocator@Vnode@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ ?get_size@image_plain@detail@_V1@sycl@@IEBA_KXZ diff --git a/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp b/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp index d6977dd3ddbed..f52e690b171e8 100644 --- a/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp +++ b/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp @@ -221,48 +221,8 @@ void addImagesCopies(experimental::detail::modifiable_command_graph &G, /// @param G Modifiable graph to add commands to. /// @param Q Queue to submit nodes to. /// @param Size Size in bytes to allocate. -/// @param MemPool Memory pool to allocate to. /// @param Ptr Generic pointer to allocated memory. -template -void addAsyncAlloc(experimental::detail::modifiable_command_graph &G, queue &Q, - size_t Size, - sycl::ext::oneapi::experimental::memory_pool &memPool, - [[maybe_unused]] void *Ptr) { - // simple alloc with specified pool - std::error_code ExceptionCode = make_error_code(sycl::errc::success); - try { - if constexpr (PathKind == OperationPath::RecordReplay) { - Q.submit([&](handler &CGH) { - Ptr = sycl::ext::oneapi::experimental::async_malloc_from_pool(CGH, Size, - memPool); - }); - } - if constexpr (PathKind == OperationPath::Shortcut) { - Ptr = sycl::ext::oneapi::experimental::async_malloc_from_pool(Q, Size, - memPool); - } - if constexpr (PathKind == OperationPath::Explicit) { - G.add([&](handler &CGH) { - Ptr = sycl::ext::oneapi::experimental::async_malloc_from_pool(CGH, Size, - memPool); - }); - } - } catch (exception &Exception) { - ExceptionCode = Exception.code(); - } - - ASSERT_EQ(ExceptionCode, sycl::errc::invalid); -} - -/// Tries to add nodes including asynchronous allocation instructions to the -/// graph G. It tests that an invalid exception has been thrown since the -/// sycl_ext_oneapi_async_alloc can not be used along with SYCL Graph. -/// -/// @param G Modifiable graph to add commands to. -/// @param Q Queue to submit nodes to. -/// @param Size Size in bytes to allocate. -/// @param Ptr Generic pointer to allocated memory. -template +template void addAsyncAlloc(experimental::detail::modifiable_command_graph &G, queue &Q, size_t Size, [[maybe_unused]] void *Ptr) { // simple alloc @@ -270,57 +230,24 @@ void addAsyncAlloc(experimental::detail::modifiable_command_graph &G, queue &Q, try { if constexpr (PathKind == OperationPath::RecordReplay) { Q.submit([&](handler &CGH) { - Ptr = sycl::ext::oneapi::experimental::async_malloc( - CGH, sycl::usm::alloc::device, Size); + Ptr = + sycl::ext::oneapi::experimental::async_malloc(CGH, AllocKind, Size); }); } if constexpr (PathKind == OperationPath::Shortcut) { - Ptr = sycl::ext::oneapi::experimental::async_malloc( - Q, sycl::usm::alloc::device, Size); + Ptr = sycl::ext::oneapi::experimental::async_malloc(Q, AllocKind, Size); } if constexpr (PathKind == OperationPath::Explicit) { G.add([&](handler &CGH) { - Ptr = sycl::ext::oneapi::experimental::async_malloc( - CGH, sycl::usm::alloc::device, Size); + Ptr = + sycl::ext::oneapi::experimental::async_malloc(CGH, AllocKind, Size); }); } } catch (exception &Exception) { ExceptionCode = Exception.code(); } - ASSERT_EQ(ExceptionCode, sycl::errc::invalid); -} - -/// Tries to add nodes including asynchronous free instructions to the graph G. -/// It tests that an invalid exception has been thrown since the -/// sycl_ext_oneapi_async_alloc can not be used along with SYCL Graph. -/// -/// @param G Modifiable graph to add commands to. -/// @param Q Queue to submit nodes to. -/// @param Ptr Pointer to asynchronously allocated memory to free. -template -void addAsyncFree(experimental::detail::modifiable_command_graph &G, queue &Q, - void *Ptr) { - // simple free - std::error_code ExceptionCode = make_error_code(sycl::errc::success); - try { - if constexpr (PathKind == OperationPath::RecordReplay) { - Q.submit([&](handler &CGH) { - sycl::ext::oneapi::experimental::async_free(CGH, Ptr); - }); - } - if constexpr (PathKind == OperationPath::Shortcut) { - sycl::ext::oneapi::experimental::async_free(Q, Ptr); - } - if constexpr (PathKind == OperationPath::Explicit) { - G.add([&](handler &CGH) { - sycl::ext::oneapi::experimental::async_free(CGH, Ptr); - }); - } - } catch (exception &Exception) { - ExceptionCode = Exception.code(); - } - ASSERT_EQ(ExceptionCode, sycl::errc::invalid); + ASSERT_EQ(ExceptionCode, sycl::errc::feature_not_supported); } } // anonymous namespace @@ -981,33 +908,32 @@ TEST_F(CommandGraphTest, DynamicCommandGroupMismatchAccessorEdges) { ASSERT_THROW(Graph.add(DynCG), sycl::exception); } -// ext_oneapi_async_alloc isn't currently supported with SYCL graphs -TEST_F(CommandGraphTest, AsyncAllocExceptionCheck) { +// host and shared allocations are not currently supported by graphs, checks for +// correct exception behaviour. +TEST_F(CommandGraphTest, AsyncAllocKindExceptionCheck) { auto Context = Queue.get_context(); auto Device = Queue.get_device(); - // Create pool - sycl::ext::oneapi::experimental::memory_pool MemPool( - Context, Device, sycl::usm::alloc::device); - void *Ptr1 = nullptr; void *Ptr2 = nullptr; Graph.begin_recording(Queue); - addAsyncAlloc(Graph, Queue, 1024, MemPool, Ptr1); - addAsyncAlloc(Graph, Queue, 1024, Ptr2); - - addAsyncFree(Graph, Queue, Ptr1); - addAsyncFree(Graph, Queue, Ptr2); + addAsyncAlloc(Graph, Queue, + 1024, Ptr1); + addAsyncAlloc(Graph, Queue, + 1024, Ptr1); + addAsyncAlloc(Graph, Queue, 1024, + Ptr2); + addAsyncAlloc(Graph, Queue, 1024, + Ptr2); Graph.end_recording(); void *Ptr3 = nullptr; void *Ptr4 = nullptr; - addAsyncAlloc(Graph, Queue, 1024, MemPool, Ptr3); - addAsyncAlloc(Graph, Queue, 1024, Ptr4); - - addAsyncFree(Graph, Queue, Ptr3); - addAsyncFree(Graph, Queue, Ptr4); + addAsyncAlloc(Graph, Queue, 1024, + Ptr3); + addAsyncAlloc(Graph, Queue, 1024, + Ptr4); }