Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions sycl/doc/design/CommandGraph.md
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,26 @@ safely assumed to be more performant. It is not likely we'll try to allow
in-order execution in more scenarios through a complicated (and imperfect)
heuristic but rather expose this as a hint the user can provide.

### Graph Allocation Memory Reuse

When adding a new allocation node to a graph, memory allocations which have
previously been freed are checked to see if they can be reused. Because we have
to return a pointer to the user immediately when the CGF for a node is
processed, we have to do this eagerly anytime `async_malloc()` is called.
Allocations track the last free node associated with them to represent the most
recent use of that allocation.

To be reused, the two allocations must meet these criteria:

- They must be of the same allocation type (device/host/shared).
- They must have a matching size.
- They must have the same properties (currently only read-only matters).
- There must be a path from the last free node associated with a given
allocation to the new allocation node being added.

If these criteria are met we update the last free node for the allocation and
return the existing pointer to the user.

## Backend Implementation

Implementation of UR command-buffers for each of the supported SYCL 2020
Expand Down
1 change: 1 addition & 0 deletions sycl/source/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,7 @@ set(SYCL_COMMON_SOURCES
"detail/memory_pool_impl.cpp"
"detail/async_alloc.cpp"
"detail/memory_pool.cpp"
"detail/graph_memory_pool.cpp"
"$<$<PLATFORM_ID:Windows>:detail/windows_ur.cpp>"
"$<$<OR:$<PLATFORM_ID:Linux>,$<PLATFORM_ID:Darwin>>:detail/posix_ur.cpp>"
)
Expand Down
53 changes: 41 additions & 12 deletions sycl/source/detail/async_alloc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,10 @@
//
//===----------------------------------------------------------------------===//

#include "sycl/accessor.hpp"
#include <detail/context_impl.hpp>
#include <detail/event_impl.hpp>
#include <detail/graph_impl.hpp>
#include <detail/queue_impl.hpp>
#include <sycl/detail/ur.hpp>
#include <sycl/ext/oneapi/experimental/async_alloc/async_alloc.hpp>
Expand All @@ -29,6 +31,27 @@ getUrEvents(const std::vector<std::shared_ptr<detail::event_impl>> &DepEvents) {
}
return RetUrEvents;
}

std::vector<std::shared_ptr<detail::node_impl>> getDepGraphNodes(
sycl::handler &Handler, const std::shared_ptr<detail::queue_impl> &Queue,
const std::shared_ptr<detail::graph_impl> &Graph,
const std::vector<std::shared_ptr<detail::event_impl>> &DepEvents) {
auto HandlerImpl = detail::getSyclObjImpl(Handler);
// Get dependent graph nodes from any events
auto DepNodes = Graph->getNodesForEvents(DepEvents);
// If this node was added explicitly we may have node deps in the handler as
// well, so add them to the list
DepNodes.insert(DepNodes.end(), HandlerImpl->MNodeDeps.begin(),
HandlerImpl->MNodeDeps.end());
// If this is being recorded from an in-order queue we need to get the last
// in-order node if any, since this will later become a dependency of the
// node being processed here.
if (const auto &LastInOrderNode = Graph->getLastInorderNode(Queue);
LastInOrderNode) {
DepNodes.push_back(LastInOrderNode);
}
return DepNodes;
}
} // namespace

__SYCL_EXPORT
Expand All @@ -46,22 +69,23 @@ void *async_malloc(sycl::handler &h, sycl::usm::alloc kind, size_t size) {

auto &Adapter = h.getContextImplPtr()->getAdapter();

// Get events to wait on.
auto depEvents = getUrEvents(h.impl->CGData.MEvents);
uint32_t numEvents = h.impl->CGData.MEvents.size();
// Get CG event dependencies for this allocation.
const auto &DepEvents = h.impl->CGData.MEvents;
auto UREvents = getUrEvents(DepEvents);

void *alloc = nullptr;

ur_event_handle_t Event = nullptr;
// If a graph is present do the allocation from the graph memory pool instead.
if (auto Graph = h.getCommandGraph(); Graph) {
alloc = Graph->getMemPool().malloc(size, kind);
auto DepNodes = getDepGraphNodes(h, h.MQueue, Graph, DepEvents);
alloc = Graph->getMemPool().malloc(size, kind, DepNodes);
} else {
auto &Q = h.MQueue->getHandleRef();
Adapter->call<sycl::errc::runtime,
sycl::detail::UrApiKind::urEnqueueUSMDeviceAllocExp>(
Q, (ur_usm_pool_handle_t)0, size, nullptr, numEvents, depEvents.data(),
&alloc, &Event);
Q, (ur_usm_pool_handle_t)0, size, nullptr, UREvents.size(),
UREvents.data(), &alloc, &Event);
}

// Async malloc must return a void* immediately.
Expand Down Expand Up @@ -95,24 +119,26 @@ __SYCL_EXPORT void *async_malloc_from_pool(sycl::handler &h, size_t size,
auto &Adapter = h.getContextImplPtr()->getAdapter();
auto &memPoolImpl = sycl::detail::getSyclObjImpl(pool);

// Get events to wait on.
auto depEvents = getUrEvents(h.impl->CGData.MEvents);
uint32_t numEvents = h.impl->CGData.MEvents.size();
// Get CG event dependencies for this allocation.
const auto &DepEvents = h.impl->CGData.MEvents;
auto UREvents = getUrEvents(DepEvents);

void *alloc = nullptr;

ur_event_handle_t Event = nullptr;
// If a graph is present do the allocation from the graph memory pool instead.
if (auto Graph = h.getCommandGraph(); Graph) {
auto DepNodes = getDepGraphNodes(h, h.MQueue, Graph, DepEvents);

// Memory pool is passed as the graph may use some properties of it.
alloc = Graph->getMemPool().malloc(size, pool.get_alloc_kind(),
alloc = Graph->getMemPool().malloc(size, pool.get_alloc_kind(), DepNodes,
sycl::detail::getSyclObjImpl(pool));
} else {
auto &Q = h.MQueue->getHandleRef();
Adapter->call<sycl::errc::runtime,
sycl::detail::UrApiKind::urEnqueueUSMDeviceAllocExp>(
Q, memPoolImpl.get()->get_handle(), size, nullptr, numEvents,
depEvents.data(), &alloc, &Event);
Q, memPoolImpl.get()->get_handle(), size, nullptr, UREvents.size(),
UREvents.data(), &alloc, &Event);
}
// Async malloc must return a void* immediately.
// Set up CommandGroup which is a no-op and pass the event from the alloc.
Expand Down Expand Up @@ -140,6 +166,9 @@ async_malloc_from_pool(const sycl::queue &q, size_t size,
}

__SYCL_EXPORT void async_free(sycl::handler &h, void *ptr) {
// We only check for errors for the graph here because marking the allocation
// as free in the graph memory pool requires a node object which doesn't exist
// at this point.
if (auto Graph = h.getCommandGraph(); Graph) {
// Check if the pointer to be freed has an associated allocation node, and
// error if not
Expand Down
20 changes: 18 additions & 2 deletions sycl/source/detail/graph_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -344,7 +344,8 @@ graph_impl::graph_impl(const sycl::context &SyclContext,
const sycl::device &SyclDevice,
const sycl::property_list &PropList)
: MContext(SyclContext), MDevice(SyclDevice), MRecordingQueues(),
MEventsMap(), MInorderQueueMap(), MGraphMemPool(SyclContext, SyclDevice),
MEventsMap(), MInorderQueueMap(),
MGraphMemPool(*this, SyclContext, SyclDevice),
MID(NextAvailableID.fetch_add(1, std::memory_order_relaxed)) {
checkGraphPropertiesAndThrow(PropList);
if (PropList.has_property<property::graph::no_cycle_check>()) {
Expand Down Expand Up @@ -509,6 +510,10 @@ graph_impl::add(std::function<void(handler &)> CGF,
sycl::handler Handler{shared_from_this()};
#endif

// Pass the node deps to the handler so they are available when processing the
// CGF, need for async_malloc nodes.
Handler.impl->MNodeDeps = Deps;

#if XPTI_ENABLE_INSTRUMENTATION
// Save code location if one was set in TLS.
// Ideally it would be nice to capture user's call code location
Expand All @@ -532,6 +537,10 @@ graph_impl::add(std::function<void(handler &)> CGF,

Handler.finalize();

// In explicit mode the handler processing of the CGF does not need a write
// lock as it does not modify the graph, we extract information from it here
// and modify the graph.
graph_impl::WriteLock Lock(MMutex);
node_type NodeType =
Handler.impl->MUserFacingNodeType !=
ext::oneapi::experimental::node_type::empty
Expand Down Expand Up @@ -602,6 +611,14 @@ graph_impl::add(node_type NodeType,

addDepsToNode(NodeImpl, Deps);

if (NodeType == node_type::async_free) {
auto AsyncFreeCG =
static_cast<CGAsyncFree *>(NodeImpl->MCommandGroup.get());
// If this is an async free node mark that it is now available for reuse,
// and pass the async free node for tracking.
MGraphMemPool.markAllocationAsAvailable(AsyncFreeCG->getPtr(), NodeImpl);
}

return NodeImpl;
}

Expand Down Expand Up @@ -1789,7 +1806,6 @@ node modifiable_command_graph::addImpl(std::function<void(handler &)> CGF,
DepImpls.push_back(sycl::detail::getSyclObjImpl(D));
}

graph_impl::WriteLock Lock(impl->MMutex);
std::shared_ptr<detail::node_impl> NodeImpl = impl->add(CGF, {}, DepImpls);
return sycl::detail::createSyclObjFromImpl<node>(std::move(NodeImpl));
}
Expand Down
38 changes: 38 additions & 0 deletions sycl/source/detail/graph_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -735,6 +735,12 @@ class node_impl : public std::enable_shared_from_this<node_impl> {
case sycl::detail::CGType::EnqueueNativeCommand:
Stream << "CGNativeCommand \\n";
break;
case sycl::detail::CGType::AsyncAlloc:
Stream << "CGAsyncAlloc \\n";
break;
case sycl::detail::CGType::AsyncFree:
Stream << "CGAsyncFree \\n";
break;
default:
Stream << "Other \\n";
break;
Expand Down Expand Up @@ -937,6 +943,31 @@ class graph_impl : public std::enable_shared_from_this<graph_impl> {
"No node in this graph is associated with this event");
}

/// Find the nodes associated with a list of SYCL events. Throws if no node is
/// found for a given event.
/// @param Events Events to find nodes for.
/// @return A list of node counterparts for each event, in the same order.
std::vector<std::shared_ptr<node_impl>> getNodesForEvents(
const std::vector<std::shared_ptr<sycl::detail::event_impl>> &Events) {
std::vector<std::shared_ptr<node_impl>> NodeList{};
NodeList.reserve(Events.size());

ReadLock Lock(MMutex);

for (const auto &Event : Events) {
if (auto NodeFound = MEventsMap.find(Event);
NodeFound != std::end(MEventsMap)) {
NodeList.push_back(NodeFound->second);
} else {
throw sycl::exception(
sycl::make_error_code(errc::invalid),
"No node in this graph is associated with this event");
}
}

return NodeList;
}

/// Query for the context tied to this graph.
/// @return Context associated with graph.
sycl::context getContext() const { return MContext; }
Expand Down Expand Up @@ -1191,6 +1222,13 @@ class graph_impl : public std::enable_shared_from_this<graph_impl> {
/// this graph.
size_t getExecGraphCount() const { return MExecGraphCount; }

/// Resets the visited edges variable across all nodes in the graph to 0.
void resetNodeVisitedEdges() {
for (auto &Node : MNodeStorage) {
Node->MTotalVisitedEdges = 0;
}
}

private:
/// Check the graph for cycles by performing a depth-first search of the
/// graph. If a node is visited more than once in a given path through the
Expand Down
Loading
Loading