Don't touch my records! (refactor Cuda/HIP/SYCL/Threads to not directly mess with `SharedAllocationRecord`) (#6732)

* Do not use SharedAllocationRecord directly

* Purge Cuda/HIP graph implementation from SharedAllocationRecord

* MemorySpace::{free -> allocate} and FIXMEs for size argument

* Properly cast allocated pointer from void* to size_type*

* Fixup Cuda/HIP graph header includes and pointer casting

* Fix the FIXMEs
dalg24 committed Jan 22, 2024
1 parent 407e18d commit 5610068
Showing 6 changed files with 124 additions and 191 deletions.
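The same mechanical substitution runs through every file below: internal buffers that used to be managed through reference-counted `SharedAllocationRecord`s are now allocated and freed directly by the owning memory space, which moves the burden of remembering each allocation's size onto the caller. A minimal before/after sketch of the pattern, distilled from the hunks below (`num_bytes` and the label are placeholders, not lines from the commit):

// Before: a reference-counted record, although these internal buffers
// only ever have a single owner.
using Record = Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>;
Record* record = Record::allocate(Kokkos::CudaSpace{}, "label", num_bytes);
Record::increment(record);
void* ptr = record->data();
// ... use ptr ...
Record::decrement(Record::get_record(ptr));  // frees once the count reaches zero

// After: the space allocates and frees directly; the caller must hand the
// original byte count back to deallocate.
void* ptr2 = Kokkos::CudaSpace().allocate("label", num_bytes);
// ... use ptr2 ...
Kokkos::CudaSpace().deallocate(ptr2, num_bytes);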
20 changes: 4 additions & 16 deletions core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp
@@ -23,8 +23,7 @@

 #include <Kokkos_Graph_fwd.hpp>

-#include <impl/Kokkos_GraphImpl.hpp> // GraphAccess needs to be complete
-#include <impl/Kokkos_SharedAlloc.hpp> // SharedAllocationRecord
+#include <impl/Kokkos_GraphImpl.hpp> // GraphAccess needs to be complete

 #include <Kokkos_Parallel.hpp>
 #include <Kokkos_Parallel_Reduce.hpp>
@@ -50,10 +49,6 @@ class GraphNodeKernelImpl<Kokkos::Cuda, PolicyType, Functor, PatternTag,
   // covers and we're not modifying it
   Kokkos::ObservingRawPtr<const cudaGraph_t> m_graph_ptr = nullptr;
   Kokkos::ObservingRawPtr<cudaGraphNode_t> m_graph_node_ptr = nullptr;
-  // Note: owned pointer to CudaSpace memory (used for global memory launches),
-  // which we're responsible for deallocating, but not responsible for calling
-  // its destructor.
-  using Record = Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>;
   // Basically, we have to make this mutable for the same reasons that the
   // global kernel buffers in the Cuda instance are mutable...
   mutable Kokkos::OwningRawPtr<base_t> m_driver_storage = nullptr;
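Worth noting for the members above: `Kokkos::ObservingRawPtr` and `Kokkos::OwningRawPtr` (from `<Kokkos_PointerOwnership.hpp>`) are annotation-only aliases along the lines of the sketch below, so ownership of `m_driver_storage` is enforced purely by convention, which is why the destructor in the next hunk must pair the deallocation by hand.

// Sketch of the aliases' shape (annotation only, no behavior attached):
template <class T>
using OwningRawPtr = T*;  // this pointer is responsible for the memory
template <class T>
using ObservingRawPtr = T*;  // this pointer just looks, never frees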
@@ -82,9 +77,7 @@ class GraphNodeKernelImpl<Kokkos::Cuda, PolicyType, Functor, PatternTag,

   ~GraphNodeKernelImpl() {
     if (m_driver_storage) {
-      // We should be the only owner, but this is still the easiest way to
-      // allocate and deallocate aligned memory for these sorts of things
-      Record::decrement(Record::get_record(m_driver_storage));
+      Kokkos::CudaSpace().deallocate(m_driver_storage, sizeof(base_t));
     }
   }

@@ -99,13 +92,8 @@ class GraphNodeKernelImpl<Kokkos::Cuda, PolicyType, Functor, PatternTag,

   Kokkos::ObservingRawPtr<base_t> allocate_driver_memory_buffer() const {
     KOKKOS_EXPECTS(m_driver_storage == nullptr)
-
-    auto* record = Record::allocate(
-        Kokkos::CudaSpace{}, "GraphNodeKernel global memory functor storage",
-        sizeof(base_t));
-
-    Record::increment(record);
-    m_driver_storage = reinterpret_cast<base_t*>(record->data());
+    m_driver_storage = static_cast<base_t*>(Kokkos::CudaSpace().allocate(
+        "GraphNodeKernel global memory functor storage", sizeof(base_t)));
     KOKKOS_ENSURES(m_driver_storage != nullptr)
     return m_driver_storage;
   }
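Because `CudaSpace` memory is device-resident, the driver object is staged into this buffer as raw bytes in the launch path rather than constructed in place, and `deallocate` only returns the bytes, so no destructor ever runs on the device copy. A self-contained sketch of that lifecycle under those assumptions (`Functor` stands in for `base_t`; the real copy happens elsewhere, not in this header):

#include <Kokkos_Core.hpp>
#include <cuda_runtime.h>

struct Functor {  // stand-in for base_t; must be trivially copyable
  double scale;
};

void stage_and_release() {
  Functor host_copy{2.0};
  void* dev_bytes =
      Kokkos::CudaSpace().allocate("driver storage", sizeof(Functor));
  // Stage by bytes: no placement new, no device-side constructor call.
  cudaMemcpy(dev_bytes, &host_copy, sizeof(Functor), cudaMemcpyHostToDevice);
  // ... a global-memory launch would read *dev_bytes here ...
  // Return the bytes with the same size; intentionally no destructor call.
  Kokkos::CudaSpace().deallocate(dev_bytes, sizeof(Functor));
}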
99 changes: 44 additions & 55 deletions core/src/Cuda/Kokkos_Cuda_Instance.cpp
@@ -336,22 +336,19 @@ void CudaInternal::initialize(cudaStream_t stream) {
 Cuda::size_type *CudaInternal::scratch_flags(const std::size_t size) const {
   if (verify_is_initialized("scratch_flags") &&
       m_scratchFlagsCount < scratch_count(size)) {
-    m_scratchFlagsCount = scratch_count(size);
+    auto mem_space = Kokkos::CudaSpace::impl_create(m_cudaDev, m_stream);

-    using Record =
-        Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>;
+    if (m_scratchFlags) {
+      mem_space.deallocate(m_scratchFlags,
+                           m_scratchFlagsCount * sizeScratchGrain);
+    }

-    if (m_scratchFlags) Record::decrement(Record::get_record(m_scratchFlags));
+    m_scratchFlagsCount = scratch_count(size);

     std::size_t alloc_size =
         multiply_overflow_abort(m_scratchFlagsCount, sizeScratchGrain);
-    Record *const r =
-        Record::allocate(CudaSpace::impl_create(m_cudaDev, m_stream),
-                         "Kokkos::InternalScratchFlags", alloc_size);
-
-    Record::increment(r);
-
-    m_scratchFlags = reinterpret_cast<size_type *>(r->data());
+    m_scratchFlags = static_cast<size_type *>(
+        mem_space.allocate("Kokkos::InternalScratchFlags", alloc_size));

     KOKKOS_IMPL_CUDA_SAFE_CALL(
         (cuda_memset_wrapper(m_scratchFlags, 0, alloc_size)));
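Every scratch getter in this file, and its HIP twin further down, now follows the same grow-only shape: deallocate with the old extent, update the count, allocate at the overflow-checked new size. A condensed sketch of that shape, reusing this translation unit's `sizeScratchGrain` and `multiply_overflow_abort` helpers (the generic signature is illustrative):

// Grow-only scratch buffer, condensed from the hunk above. The old extent
// goes to deallocate; the new, overflow-checked extent goes to allocate.
template <class MemorySpace, class SizeType>
SizeType* grow_scratch(const MemorySpace& space, SizeType*& buf,
                       std::size_t& count, std::size_t needed_count) {
  if (count < needed_count) {
    if (buf) space.deallocate(buf, count * sizeScratchGrain);
    count = needed_count;
    std::size_t alloc_size = multiply_overflow_abort(count, sizeScratchGrain);
    buf = static_cast<SizeType*>(
        space.allocate("Kokkos::InternalScratch", alloc_size));
  }
  return buf;
}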
@@ -363,22 +360,19 @@ Cuda::size_type *CudaInternal::scratch_flags(const std::size_t size) const {
 Cuda::size_type *CudaInternal::scratch_space(const std::size_t size) const {
   if (verify_is_initialized("scratch_space") &&
       m_scratchSpaceCount < scratch_count(size)) {
-    m_scratchSpaceCount = scratch_count(size);
+    auto mem_space = Kokkos::CudaSpace::impl_create(m_cudaDev, m_stream);

-    using Record =
-        Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>;
+    if (m_scratchSpace) {
+      mem_space.deallocate(m_scratchSpace,
+                           m_scratchSpaceCount * sizeScratchGrain);
+    }

-    if (m_scratchSpace) Record::decrement(Record::get_record(m_scratchSpace));
+    m_scratchSpaceCount = scratch_count(size);

     std::size_t alloc_size =
         multiply_overflow_abort(m_scratchSpaceCount, sizeScratchGrain);
-    Record *const r =
-        Record::allocate(CudaSpace::impl_create(m_cudaDev, m_stream),
-                         "Kokkos::InternalScratchSpace", alloc_size);
-
-    Record::increment(r);
-
-    m_scratchSpace = reinterpret_cast<size_type *>(r->data());
+    m_scratchSpace = static_cast<size_type *>(
+        mem_space.allocate("Kokkos::InternalScratchSpace", alloc_size));
   }

   return m_scratchSpace;
@@ -387,45 +381,37 @@ Cuda::size_type *CudaInternal::scratch_space(const std::size_t size) const {
 Cuda::size_type *CudaInternal::scratch_unified(const std::size_t size) const {
   if (verify_is_initialized("scratch_unified") &&
       m_scratchUnifiedCount < scratch_count(size)) {
-    m_scratchUnifiedCount = scratch_count(size);
+    auto mem_space =
+        Kokkos::CudaHostPinnedSpace::impl_create(m_cudaDev, m_stream);

-    using Record =
-        Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>;
+    if (m_scratchUnified) {
+      mem_space.deallocate(m_scratchUnified,
+                           m_scratchUnifiedCount * sizeScratchGrain);
+    }

-    if (m_scratchUnified)
-      Record::decrement(Record::get_record(m_scratchUnified));
+    m_scratchUnifiedCount = scratch_count(size);

     std::size_t alloc_size =
         multiply_overflow_abort(m_scratchUnifiedCount, sizeScratchGrain);
-    Record *const r =
-        Record::allocate(CudaHostPinnedSpace::impl_create(m_cudaDev, m_stream),
-                         "Kokkos::InternalScratchUnified", alloc_size);
-
-    Record::increment(r);
-
-    m_scratchUnified = reinterpret_cast<size_type *>(r->data());
+    m_scratchUnified = static_cast<size_type *>(
+        mem_space.allocate("Kokkos::InternalScratchUnified", alloc_size));
   }

   return m_scratchUnified;
 }

 Cuda::size_type *CudaInternal::scratch_functor(const std::size_t size) const {
   if (verify_is_initialized("scratch_functor") && m_scratchFunctorSize < size) {
-    m_scratchFunctorSize = size;
-
-    using Record =
-        Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>;
+    auto mem_space = Kokkos::CudaSpace::impl_create(m_cudaDev, m_stream);

-    if (m_scratchFunctor)
-      Record::decrement(Record::get_record(m_scratchFunctor));
+    if (m_scratchFunctor) {
+      mem_space.deallocate(m_scratchFunctor, m_scratchFunctorSize);
+    }

-    Record *const r = Record::allocate(
-        CudaSpace::impl_create(m_cudaDev, m_stream),
-        "Kokkos::InternalScratchFunctor", m_scratchFunctorSize);
-
-    Record::increment(r);
+    m_scratchFunctorSize = size;

-    m_scratchFunctor = reinterpret_cast<size_type *>(r->data());
+    m_scratchFunctor = static_cast<size_type *>(mem_space.allocate(
+        "Kokkos::InternalScratchFunctor", m_scratchFunctorSize));
   }

   return m_scratchFunctor;
@@ -480,15 +466,18 @@ void CudaInternal::finalize() {
   was_finalized = true;

   if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) {
-    using RecordCuda = Kokkos::Impl::SharedAllocationRecord<CudaSpace>;
-    using RecordHost =
-        Kokkos::Impl::SharedAllocationRecord<CudaHostPinnedSpace>;
-
-    RecordCuda::decrement(RecordCuda::get_record(m_scratchFlags));
-    RecordCuda::decrement(RecordCuda::get_record(m_scratchSpace));
-    RecordHost::decrement(RecordHost::get_record(m_scratchUnified));
-    if (m_scratchFunctorSize > 0)
-      RecordCuda::decrement(RecordCuda::get_record(m_scratchFunctor));
+    auto cuda_mem_space = Kokkos::CudaSpace::impl_create(m_cudaDev, m_stream);
+    auto host_mem_space =
+        Kokkos::CudaHostPinnedSpace::impl_create(m_cudaDev, m_stream);
+    cuda_mem_space.deallocate(m_scratchFlags,
+                              m_scratchFlagsCount * sizeScratchGrain);
+    cuda_mem_space.deallocate(m_scratchSpace,
+                              m_scratchSpaceCount * sizeScratchGrain);
+    host_mem_space.deallocate(m_scratchUnified,
+                              m_scratchUnifiedCount * sizeScratchGrain);
+    if (m_scratchFunctorSize > 0) {
+      cuda_mem_space.deallocate(m_scratchFunctor, m_scratchFunctorSize);
+    }
   }

   for (int i = 0; i < m_n_team_scratch; ++i) {
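The trade-off is most visible in `finalize`: `deallocate` now needs the byte count recorded at allocation time, where `Record::get_record` used to recover it from the pointer alone; this is what the "FIXMEs for size argument" bullet in the commit message is about. A hypothetical helper, not part of this commit, showing how pointer and extent can be kept together so a teardown path cannot pass a stale size:

// Hypothetical sketch only: a buffer that remembers its own byte count.
template <class Space>
class SizedBuffer {
  void* m_ptr = nullptr;
  std::size_t m_bytes = 0;

 public:
  void reset(const Space& space, const char* label, std::size_t bytes) {
    if (m_ptr) space.deallocate(m_ptr, m_bytes);  // always the stored size
    m_ptr = bytes ? space.allocate(label, bytes) : nullptr;
    m_bytes = bytes;
  }
  void* data() const { return m_ptr; }
  std::size_t size() const { return m_bytes; }
};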
14 changes: 3 additions & 11 deletions core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp
@@ -26,7 +26,6 @@
 #include <Kokkos_Parallel_Reduce.hpp>
 #include <Kokkos_PointerOwnership.hpp>

-#include <HIP/Kokkos_HIP_SharedAllocationRecord.hpp>
 #include <HIP/Kokkos_HIP_GraphNode_Impl.hpp>

 namespace Kokkos {
@@ -43,7 +42,6 @@ class GraphNodeKernelImpl<Kokkos::HIP, PolicyType, Functor, PatternTag, Args...>
   using base_t =
       typename PatternImplSpecializationFromTag<PatternTag, Functor, Policy,
                                                 Args..., Kokkos::HIP>::type;
-  using Record = Kokkos::Impl::SharedAllocationRecord<Kokkos::HIPSpace, void>;

   // TODO use the name and executionspace
   template <typename PolicyDeduced, typename... ArgsDeduced>
@@ -60,7 +58,7 @@ class GraphNodeKernelImpl<Kokkos::HIP, PolicyType, Functor, PatternTag, Args...>

   ~GraphNodeKernelImpl() {
     if (m_driver_storage) {
-      Record::decrement(Record::get_record(m_driver_storage));
+      Kokkos::HIPSpace().deallocate(m_driver_storage, sizeof(base_t));
     }
   }

@@ -78,15 +76,9 @@ class GraphNodeKernelImpl<Kokkos::HIP, PolicyType, Functor, PatternTag, Args...>

   Kokkos::ObservingRawPtr<base_t> allocate_driver_memory_buffer() const {
     KOKKOS_EXPECTS(m_driver_storage == nullptr);
-
-    auto* record = Record::allocate(
-        Kokkos::HIPSpace{}, "GraphNodeKernel global memory functor storage",
-        sizeof(base_t));
-
-    Record::increment(record);
-    m_driver_storage = reinterpret_cast<base_t*>(record->data());
+    m_driver_storage = static_cast<base_t*>(Kokkos::HIPSpace().allocate(
+        "GraphNodeKernel global memory functor storage", sizeof(base_t)));
     KOKKOS_ENSURES(m_driver_storage != nullptr);
-
     return m_driver_storage;
   }

74 changes: 33 additions & 41 deletions core/src/HIP/Kokkos_HIP_Instance.cpp
@@ -191,20 +191,19 @@ void HIPInternal::initialize(hipStream_t stream) {
 Kokkos::HIP::size_type *HIPInternal::scratch_space(const std::size_t size) {
   if (verify_is_initialized("scratch_space") &&
       m_scratchSpaceCount < scratch_count(size)) {
-    m_scratchSpaceCount = scratch_count(size);
+    Kokkos::HIPSpace mem_space;

-    using Record = Kokkos::Impl::SharedAllocationRecord<Kokkos::HIPSpace, void>;
+    if (m_scratchSpace) {
+      mem_space.deallocate(m_scratchSpace,
+                           m_scratchSpaceCount * sizeScratchGrain);
+    }

-    if (m_scratchSpace) Record::decrement(Record::get_record(m_scratchSpace));
+    m_scratchSpaceCount = scratch_count(size);

     std::size_t alloc_size =
         multiply_overflow_abort(m_scratchSpaceCount, sizeScratchGrain);
-    Record *const r = Record::allocate(
-        Kokkos::HIPSpace(), "Kokkos::InternalScratchSpace", alloc_size);
-
-    Record::increment(r);
-
-    m_scratchSpace = reinterpret_cast<size_type *>(r->data());
+    m_scratchSpace = static_cast<size_type *>(
+        mem_space.allocate("Kokkos::InternalScratchSpace", alloc_size));
   }

   return m_scratchSpace;
@@ -213,20 +212,19 @@ Kokkos::HIP::size_type *HIPInternal::scratch_space(const std::size_t size) {
 Kokkos::HIP::size_type *HIPInternal::scratch_flags(const std::size_t size) {
   if (verify_is_initialized("scratch_flags") &&
       m_scratchFlagsCount < scratch_count(size)) {
-    m_scratchFlagsCount = scratch_count(size);
+    Kokkos::HIPSpace mem_space;

-    using Record = Kokkos::Impl::SharedAllocationRecord<Kokkos::HIPSpace, void>;
+    if (m_scratchFlags) {
+      mem_space.deallocate(m_scratchFlags,
+                           m_scratchFlagsCount * sizeScratchGrain);
+    }

-    if (m_scratchFlags) Record::decrement(Record::get_record(m_scratchFlags));
+    m_scratchFlagsCount = scratch_count(size);

     std::size_t alloc_size =
         multiply_overflow_abort(m_scratchFlagsCount, sizeScratchGrain);
-    Record *const r = Record::allocate(
-        Kokkos::HIPSpace(), "Kokkos::InternalScratchFlags", alloc_size);
-
-    Record::increment(r);
-
-    m_scratchFlags = reinterpret_cast<size_type *>(r->data());
+    m_scratchFlags = static_cast<size_type *>(
+        mem_space.allocate("Kokkos::InternalScratchFlags", alloc_size));

     KOKKOS_IMPL_HIP_SAFE_CALL(hipMemset(m_scratchFlags, 0, alloc_size));
   }
@@ -237,29 +235,20 @@ Kokkos::HIP::size_type *HIPInternal::scratch_flags(const std::size_t size) {
 Kokkos::HIP::size_type *HIPInternal::stage_functor_for_execution(
     void const *driver, std::size_t const size) const {
   if (verify_is_initialized("scratch_functor") && m_scratchFunctorSize < size) {
-    m_scratchFunctorSize = size;
-
-    using Record = Kokkos::Impl::SharedAllocationRecord<Kokkos::HIPSpace, void>;
-    using RecordHost =
-        Kokkos::Impl::SharedAllocationRecord<Kokkos::HIPHostPinnedSpace, void>;
+    Kokkos::HIPSpace device_mem_space;
+    Kokkos::HIPHostPinnedSpace host_mem_space;

     if (m_scratchFunctor) {
-      Record::decrement(Record::get_record(m_scratchFunctor));
-      RecordHost::decrement(RecordHost::get_record(m_scratchFunctorHost));
+      device_mem_space.deallocate(m_scratchFunctor, m_scratchFunctorSize);
+      host_mem_space.deallocate(m_scratchFunctorHost, m_scratchFunctorSize);
     }

-    Record *const r =
-        Record::allocate(Kokkos::HIPSpace(), "Kokkos::InternalScratchFunctor",
-                         m_scratchFunctorSize);
-    RecordHost *const r_host = RecordHost::allocate(
-        Kokkos::HIPHostPinnedSpace(), "Kokkos::InternalScratchFunctorHost",
-        m_scratchFunctorSize);
-
-    Record::increment(r);
-    RecordHost::increment(r_host);
+    m_scratchFunctorSize = size;

-    m_scratchFunctor = reinterpret_cast<size_type *>(r->data());
-    m_scratchFunctorHost = reinterpret_cast<size_type *>(r_host->data());
+    m_scratchFunctor = static_cast<size_type *>(device_mem_space.allocate(
+        "Kokkos::InternalScratchFunctor", m_scratchFunctorSize));
+    m_scratchFunctorHost = static_cast<size_type *>(host_mem_space.allocate(
+        "Kokkos::InternalScratchFunctorHost", m_scratchFunctorSize));
   }

   // When using HSA_XNACK=1, it is necessary to copy the driver to the host to
@@ -323,14 +312,17 @@ void HIPInternal::finalize() {
   was_finalized = true;

   if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) {
-    using RecordHIP = Kokkos::Impl::SharedAllocationRecord<Kokkos::HIPSpace>;
+    Kokkos::HIPSpace device_mem_space;

-    RecordHIP::decrement(RecordHIP::get_record(m_scratchFlags));
-    RecordHIP::decrement(RecordHIP::get_record(m_scratchSpace));
+    device_mem_space.deallocate(m_scratchFlags,
+                                m_scratchFlagsCount * sizeScratchGrain);
+    device_mem_space.deallocate(m_scratchSpace,
+                                m_scratchSpaceCount * sizeScratchGrain);

     if (m_scratchFunctorSize > 0) {
-      RecordHIP::decrement(RecordHIP::get_record(m_scratchFunctor));
-      RecordHIP::decrement(RecordHIP::get_record(m_scratchFunctorHost));
+      device_mem_space.deallocate(m_scratchFunctor, m_scratchFunctorSize);
+      Kokkos::HIPHostPinnedSpace host_mem_space;
+      host_mem_space.deallocate(m_scratchFunctorHost, m_scratchFunctorSize);
     }
   }

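`stage_functor_for_execution` is the one place where two spaces grow in lockstep: a device buffer plus a host-pinned mirror of identical size, so the driver can be staged on the host (needed under HSA_XNACK=1) before being copied to, or read in place of, the device copy. A standalone sketch of that pairing, with an illustrative size and the labels from the diff:

Kokkos::HIPSpace device_space;
Kokkos::HIPHostPinnedSpace host_space;
const std::size_t bytes = 256;  // illustrative size

auto* dev = static_cast<Kokkos::HIP::size_type*>(
    device_space.allocate("Kokkos::InternalScratchFunctor", bytes));
auto* host = static_cast<Kokkos::HIP::size_type*>(
    host_space.allocate("Kokkos::InternalScratchFunctorHost", bytes));
// ... memcpy the driver into host, then copy host -> dev with hipMemcpy ...
// Both sides are released with the same extent they were grown to.
device_space.deallocate(dev, bytes);
host_space.deallocate(host, bytes);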
