Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions sycl/doc/design/CommandGraph.md
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,28 @@ from the same dynamic command-group object. This allows the SYCL runtime to
access the list of alternative kernels when calling the UR API to append a
kernel command to a command-buffer.

## Graph-Owned Memory Allocations
### Device Allocations

Device allocations for graphs are implemented using virtual memory. Allocation
commands performing a virtual reservation for the provided size, and physical
memory is created and mapped only during graph finalization. This allows valid
device addresses to be returned immediately when building the graph without the
penalty of doing any memory allocations during graph building, which could have
a negative impact on features such as whole-graph update through increased
overhead.

### Behaviour of async_free

`async_free` nodes are treated as hints rather than an actual memory free
operation. This is because deallocating during graph execution is both
undesirable for performance and not feasible with the current
implementation/backends. Instead a free node represents a promise from the user
that the memory is no longer in use. This enables optimizations such as
potentially reusing that memory for subsequent allocation nodes in the graph.
This allows us to reduce the total amount of concurrent memory required by a
single graph.

## Optimizations
### Interactions with Profiling

Expand Down
8 changes: 7 additions & 1 deletion sycl/include/sycl/ext/oneapi/experimental/graph.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,9 @@ enum class node_type {
memadvise = 7,
ext_oneapi_barrier = 8,
host_task = 9,
native_command = 10
native_command = 10,
async_malloc = 11,
async_free = 12
};

/// Class representing a node in the graph, returned by command_graph::add().
Expand Down Expand Up @@ -429,6 +431,10 @@ class __SYCL_EXPORT executable_command_graph
/// @param Nodes The nodes to use for updating the graph.
void update(const std::vector<node> &Nodes);

/// Return the total amount of memory required by this graph for graph-owned
/// memory allocations.
size_t get_required_mem_size() const;

/// Common Reference Semantics
friend bool operator==(const executable_command_graph &LHS,
const executable_command_graph &RHS) {
Expand Down
57 changes: 34 additions & 23 deletions sycl/source/detail/async_alloc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,23 +44,25 @@ void *async_malloc(sycl::handler &h, sycl::usm::alloc kind, size_t size) {
sycl::make_error_code(sycl::errc::feature_not_supported),
"Only device backed asynchronous allocations are supported!");

h.throwIfGraphAssociated<
ext::oneapi::experimental::detail::UnsupportedGraphFeatures::
sycl_ext_oneapi_async_alloc>();

auto &Adapter = h.getContextImplPtr()->getAdapter();
auto &Q = h.MQueue->getHandleRef();

// Get events to wait on.
auto depEvents = getUrEvents(h.impl->CGData.MEvents);
uint32_t numEvents = h.impl->CGData.MEvents.size();

void *alloc = nullptr;
ur_event_handle_t Event;
Adapter->call<sycl::errc::runtime,
sycl::detail::UrApiKind::urEnqueueUSMDeviceAllocExp>(
Q, (ur_usm_pool_handle_t)0, size, nullptr, numEvents, depEvents.data(),
&alloc, &Event);

ur_event_handle_t Event = nullptr;
// If a graph is present do the allocation from the graph memory pool instead.
if (auto Graph = h.getCommandGraph(); Graph) {
alloc = Graph->getMemPool().malloc(size, kind);
} else {
auto &Q = h.MQueue->getHandleRef();
Adapter->call<sycl::errc::runtime,
sycl::detail::UrApiKind::urEnqueueUSMDeviceAllocExp>(
Q, (ur_usm_pool_handle_t)0, size, nullptr, numEvents, depEvents.data(),
&alloc, &Event);
}

// Async malloc must return a void* immediately.
// Set up CommandGroup which is a no-op and pass the
Expand Down Expand Up @@ -90,25 +92,28 @@ __SYCL_EXPORT void *async_malloc(const sycl::queue &q, sycl::usm::alloc kind,
__SYCL_EXPORT void *async_malloc_from_pool(sycl::handler &h, size_t size,
const memory_pool &pool) {

h.throwIfGraphAssociated<
ext::oneapi::experimental::detail::UnsupportedGraphFeatures::
sycl_ext_oneapi_async_alloc>();

auto &Adapter = h.getContextImplPtr()->getAdapter();
auto &Q = h.MQueue->getHandleRef();
auto &memPoolImpl = sycl::detail::getSyclObjImpl(pool);

// Get events to wait on.
auto depEvents = getUrEvents(h.impl->CGData.MEvents);
uint32_t numEvents = h.impl->CGData.MEvents.size();

void *alloc = nullptr;
ur_event_handle_t Event;
Adapter->call<sycl::errc::runtime,
sycl::detail::UrApiKind::urEnqueueUSMDeviceAllocExp>(
Q, memPoolImpl.get()->get_handle(), size, nullptr, numEvents,
depEvents.data(), &alloc, &Event);

ur_event_handle_t Event = nullptr;
// If a graph is present do the allocation from the graph memory pool instead.
if (auto Graph = h.getCommandGraph(); Graph) {
// Memory pool is passed as the graph may use some properties of it.
alloc = Graph->getMemPool().malloc(size, pool.get_alloc_kind(),
sycl::detail::getSyclObjImpl(pool));
} else {
auto &Q = h.MQueue->getHandleRef();
Adapter->call<sycl::errc::runtime,
sycl::detail::UrApiKind::urEnqueueUSMDeviceAllocExp>(
Q, memPoolImpl.get()->get_handle(), size, nullptr, numEvents,
depEvents.data(), &alloc, &Event);
}
// Async malloc must return a void* immediately.
// Set up CommandGroup which is a no-op and pass the event from the alloc.
h.impl->MAsyncAllocEvent = Event;
Expand All @@ -135,9 +140,15 @@ async_malloc_from_pool(const sycl::queue &q, size_t size,
}

__SYCL_EXPORT void async_free(sycl::handler &h, void *ptr) {
h.throwIfGraphAssociated<
ext::oneapi::experimental::detail::UnsupportedGraphFeatures::
sycl_ext_oneapi_async_alloc>();
if (auto Graph = h.getCommandGraph(); Graph) {
// Check if the pointer to be freed has an associated allocation node, and
// error if not
if (!Graph->getMemPool().hasAllocation(ptr)) {
throw sycl::exception(sycl::make_error_code(sycl::errc::invalid),
"Cannot add a free node to a graph for which "
"there is no associated allocation node!");
}
}

h.impl->MFreePtr = ptr;
h.setType(detail::CGType::AsyncFree);
Expand Down
41 changes: 35 additions & 6 deletions sycl/source/detail/graph_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,10 @@ inline const char *nodeTypeToString(node_type NodeType) {
return "host_task";
case node_type::native_command:
return "native_command";
case node_type::async_malloc:
return "async_malloc";
case node_type::async_free:
return "async_free";
}
assert(false && "Unhandled node type");
return {};
Expand Down Expand Up @@ -340,7 +344,7 @@ graph_impl::graph_impl(const sycl::context &SyclContext,
const sycl::device &SyclDevice,
const sycl::property_list &PropList)
: MContext(SyclContext), MDevice(SyclDevice), MRecordingQueues(),
MEventsMap(), MInorderQueueMap(),
MEventsMap(), MInorderQueueMap(), MGraphMemPool(SyclContext, SyclDevice),
MID(NextAvailableID.fetch_add(1, std::memory_order_relaxed)) {
checkGraphPropertiesAndThrow(PropList);
if (PropList.has_property<property::graph::no_cycle_check>()) {
Expand Down Expand Up @@ -750,12 +754,12 @@ void graph_impl::beginRecording(
}
}

// Check if nodes are empty and if so loop back through predecessors until we
// find the real dependency.
// Check if nodes do not require enqueueing and if so loop back through
// predecessors until we find the real dependency.
void exec_graph_impl::findRealDeps(
std::vector<ur_exp_command_buffer_sync_point_t> &Deps,
std::shared_ptr<node_impl> CurrentNode, int ReferencePartitionNum) {
if (CurrentNode->isEmpty()) {
if (!CurrentNode->requiresEnqueue()) {
for (auto &N : CurrentNode->MPredecessors) {
auto NodeImpl = N.lock();
findRealDeps(Deps, NodeImpl, ReferencePartitionNum);
Expand Down Expand Up @@ -875,9 +879,9 @@ void exec_graph_impl::createCommandBuffers(
Partition->MCommandBuffers[Device] = OutCommandBuffer;

for (const auto &Node : Partition->MSchedule) {
// Empty nodes are not processed as other nodes, but only their
// Some nodes are not scheduled like other nodes, and only their
// dependencies are propagated in findRealDeps
if (Node->isEmpty())
if (!Node->requiresEnqueue())
continue;

sycl::detail::CGType type = Node->MCGType;
Expand Down Expand Up @@ -943,6 +947,8 @@ exec_graph_impl::exec_graph_impl(sycl::context Context,

exec_graph_impl::~exec_graph_impl() {
try {
MGraphImpl->markExecGraphDestroyed();

const sycl::detail::AdapterPtr &Adapter =
sycl::detail::getSyclObjImpl(MContext)->getAdapter();
MSchedule.clear();
Expand All @@ -952,6 +958,9 @@ exec_graph_impl::~exec_graph_impl() {
Event->wait(Event);
}

// Clean up any graph-owned allocations that were allocated
MGraphImpl->getMemPool().deallocateAndUnmapAll();

for (const auto &Partition : MPartitions) {
Partition->MSchedule.clear();
for (const auto &Iter : Partition->MCommandBuffers) {
Expand Down Expand Up @@ -1810,6 +1819,14 @@ modifiable_command_graph::finalize(const sycl::property_list &PropList) const {
// Graph is read and written in this scope so we lock
// this graph with full priviledges.
graph_impl::WriteLock Lock(impl->MMutex);
// If the graph uses graph-owned allocations and an executable graph already
// exists we must throw an error.
if (impl->getMemPool().hasAllocations() && impl->getExecGraphCount() > 0) {
throw sycl::exception(sycl::make_error_code(errc::invalid),
"Graphs containing allocations can only have a "
"single executable graph alive at any one time.");
}

return command_graph<graph_state::executable>{
this->impl, this->impl->getContext(), PropList};
}
Expand Down Expand Up @@ -1937,11 +1954,16 @@ executable_command_graph::executable_command_graph(
const property_list &PropList)
: impl(std::make_shared<detail::exec_graph_impl>(Ctx, Graph, PropList)) {
finalizeImpl(); // Create backend representation for executable graph
// Mark that we have created an executable graph from the modifiable graph.
Graph->markExecGraphCreated();
}

void executable_command_graph::finalizeImpl() {
impl->makePartitions();

// Handle any work required for graph-owned memory allocations
impl->finalizeMemoryAllocations();

auto Device = impl->getGraphImpl()->getDevice();
for (auto Partition : impl->getPartitions()) {
if (!Partition->isHostTask()) {
Expand Down Expand Up @@ -1969,6 +1991,13 @@ void executable_command_graph::update(const std::vector<node> &Nodes) {
impl->update(NodeImpls);
}

size_t executable_command_graph::get_required_mem_size() const {
// Since each graph has a unique mem pool, return the current memory usage for
// now. This call my change if we move to being able to share memory between
// unique graphs.
return impl->getGraphImpl()->getMemPool().getMemUseCurrent();
}

dynamic_parameter_base::dynamic_parameter_base(
command_graph<graph_state::modifiable> Graph)
: impl(std::make_shared<dynamic_parameter_impl>(
Expand Down
68 changes: 68 additions & 0 deletions sycl/source/detail/graph_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include <detail/accessor_impl.hpp>
#include <detail/cg.hpp>
#include <detail/event_impl.hpp>
#include <detail/graph_memory_pool.hpp>
#include <detail/host_task.hpp>
#include <detail/kernel_impl.hpp>
#include <detail/sycl_mem_obj_t.hpp>
Expand Down Expand Up @@ -73,6 +74,11 @@ inline node_type getNodeTypeFromCG(sycl::detail::CGType CGType) {
return node_type::subgraph;
case sycl::detail::CGType::EnqueueNativeCommand:
return node_type::native_command;
case sycl::detail::CGType::AsyncAlloc:
return node_type::async_malloc;
case sycl::detail::CGType::AsyncFree:
return node_type::async_free;

default:
assert(false && "Invalid Graph Node Type");
return node_type::empty;
Expand Down Expand Up @@ -473,6 +479,21 @@ class node_impl : public std::enable_shared_from_this<node_impl> {
}
}

/// Returns true if this node should be enqueued to the backend, if not only
/// its dependencies are considered.
bool requiresEnqueue() const {
switch (MNodeType) {
case node_type::empty:
case node_type::ext_oneapi_barrier:
case node_type::async_malloc:
case node_type::async_free:
return false;

default:
return true;
}
}

private:
void rebuildArgStorage(std::vector<sycl::detail::ArgDesc> &Args,
const std::vector<std::vector<char>> &OldArgStorage,
Expand Down Expand Up @@ -919,6 +940,12 @@ class graph_impl : public std::enable_shared_from_this<graph_impl> {
/// @return Context associated with graph.
sycl::context getContext() const { return MContext; }

/// Query for the context impl tied to this graph.
/// @return shared_ptr ref for the context impl associated with graph.
const std::shared_ptr<sycl::detail::context_impl> &getContextImplPtr() const {
return sycl::detail::getSyclObjImpl(MContext);
}

/// Query for the device_impl tied to this graph.
/// @return device_impl shared ptr reference associated with graph.
const DeviceImplPtr &getDeviceImplPtr() const {
Expand Down Expand Up @@ -1139,6 +1166,32 @@ class graph_impl : public std::enable_shared_from_this<graph_impl> {

unsigned long long getID() const { return MID; }

/// Get the memory pool used for graph-owned allocations.
graph_mem_pool &getMemPool() { return MGraphMemPool; }

/// Mark that an executable graph was created from this modifiable graph, used
/// for tracking live graphs for graph-owned allocations.
void markExecGraphCreated() { MExecGraphCount++; }

/// Mark that an executable graph created from this modifiable graph was
/// destroyed, used for tracking live graphs for graph-owned allocations.
void markExecGraphDestroyed() {
while (true) {
size_t CurrentVal = MExecGraphCount;
if (CurrentVal == 0) {
break;
}
if (MExecGraphCount.compare_exchange_strong(CurrentVal, CurrentVal - 1) ==
false) {
continue;
}
}
}

/// Get the number of unique executable graph instances currently alive for
/// this graph.
size_t getExecGraphCount() const { return MExecGraphCount; }

private:
/// Check the graph for cycles by performing a depth-first search of the
/// graph. If a node is visited more than once in a given path through the
Expand Down Expand Up @@ -1206,10 +1259,17 @@ class graph_impl : public std::enable_shared_from_this<graph_impl> {
std::map<std::weak_ptr<sycl::detail::queue_impl>, std::shared_ptr<node_impl>,
std::owner_less<std::weak_ptr<sycl::detail::queue_impl>>>
MBarrierDependencyMap;
/// Graph memory pool for handling graph-owned memory allocations for this
/// graph.
graph_mem_pool MGraphMemPool;

unsigned long long MID;
// Used for std::hash in order to create a unique hash for the instance.
inline static std::atomic<unsigned long long> NextAvailableID = 0;

// The number of live executable graphs that have been created from this
// modifiable graph
std::atomic<size_t> MExecGraphCount = 0;
};

/// Class representing the implementation of command_graph<executable>.
Expand Down Expand Up @@ -1334,6 +1394,14 @@ class exec_graph_impl {

unsigned long long getID() const { return MID; }

/// Do any work required during finalization to finalize graph-owned memory
/// allocations.
void finalizeMemoryAllocations() {
// This call allocates physical memory and maps all virtual device
// allocations
MGraphImpl->getMemPool().allocateAndMapAll();
}

private:
/// Create a command-group for the node and add it to command-buffer by going
/// through the scheduler.
Expand Down
Loading