Don't touch my records! (refactor Cuda/HIP/SYCL/Threads to not directly mess with `SharedAllocationRecord`) (#6732)

* Do not use SharedAllocationRecord directly

* Purge Cuda/HIP graph implementation from SharedAllocationRecord

* MemorySpace::{free -> allocate} and FIXMEs for size argument

* Properly cast allocated pointer from void* to size_type*

* Fixup Cuda/HIP graph header includes and pointer casting

* Fix the FIXMEs
dalg24 committed Jan 22, 2024
1 parent 407e18d commit 5610068
Showing 6 changed files with 124 additions and 191 deletions.
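The same mechanical substitution runs through every file below: internal buffers that used to be managed through reference-counted `SharedAllocationRecord`s are now allocated and freed directly by the owning memory space, which moves the burden of remembering each allocation's size onto the caller. A minimal before/after sketch of the pattern, distilled from the hunks below (`num_bytes` and the label are placeholders, not lines from the commit):

// Before: a reference-counted record, although these internal buffers
// only ever have a single owner.
using Record = Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>;
Record* record = Record::allocate(Kokkos::CudaSpace{}, "label", num_bytes);
Record::increment(record);
void* ptr = record->data();
// ... use ptr ...
Record::decrement(Record::get_record(ptr));  // frees once the count reaches zero

// After: the space allocates and frees directly; the caller must hand the
// original byte count back to deallocate.
void* ptr2 = Kokkos::CudaSpace().allocate("label", num_bytes);
// ... use ptr2 ...
Kokkos::CudaSpace().deallocate(ptr2, num_bytes);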
20 changes: 4 additions & 16 deletions core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp
@@ -23,8 +23,7 @@

 #include <Kokkos_Graph_fwd.hpp>

-#include <impl/Kokkos_GraphImpl.hpp> // GraphAccess needs to be complete
-#include <impl/Kokkos_SharedAlloc.hpp> // SharedAllocationRecord
+#include <impl/Kokkos_GraphImpl.hpp> // GraphAccess needs to be complete

 #include <Kokkos_Parallel.hpp>
 #include <Kokkos_Parallel_Reduce.hpp>
@@ -50,10 +49,6 @@ class GraphNodeKernelImpl<Kokkos::Cuda, PolicyType, Functor, PatternTag,
   // covers and we're not modifying it
   Kokkos::ObservingRawPtr<const cudaGraph_t> m_graph_ptr = nullptr;
   Kokkos::ObservingRawPtr<cudaGraphNode_t> m_graph_node_ptr = nullptr;
-  // Note: owned pointer to CudaSpace memory (used for global memory launches),
-  // which we're responsible for deallocating, but not responsible for calling
-  // its destructor.
-  using Record = Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>;
   // Basically, we have to make this mutable for the same reasons that the
   // global kernel buffers in the Cuda instance are mutable...
   mutable Kokkos::OwningRawPtr<base_t> m_driver_storage = nullptr;
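Worth noting for the members above: `Kokkos::ObservingRawPtr` and `Kokkos::OwningRawPtr` (from `<Kokkos_PointerOwnership.hpp>`) are annotation-only aliases along the lines of the sketch below, so ownership of `m_driver_storage` is enforced purely by convention, which is why the destructor in the next hunk must pair the deallocation by hand.

// Sketch of the aliases' shape (annotation only, no behavior attached):
template <class T>
using OwningRawPtr = T*;  // this pointer is responsible for the memory
template <class T>
using ObservingRawPtr = T*;  // this pointer just looks, never frees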
@@ -82,9 +77,7 @@ class GraphNodeKernelImpl<Kokkos::Cuda, PolicyType, Functor, PatternTag,

   ~GraphNodeKernelImpl() {
     if (m_driver_storage) {
-      // We should be the only owner, but this is still the easiest way to
-      // allocate and deallocate aligned memory for these sorts of things
-      Record::decrement(Record::get_record(m_driver_storage));
+      Kokkos::CudaSpace().deallocate(m_driver_storage, sizeof(base_t));
     }
   }

@@ -99,13 +92,8 @@ class GraphNodeKernelImpl<Kokkos::Cuda, PolicyType, Functor, PatternTag,

   Kokkos::ObservingRawPtr<base_t> allocate_driver_memory_buffer() const {
     KOKKOS_EXPECTS(m_driver_storage == nullptr)
-
-    auto* record = Record::allocate(
-        Kokkos::CudaSpace{}, "GraphNodeKernel global memory functor storage",
-        sizeof(base_t));
-
-    Record::increment(record);
-    m_driver_storage = reinterpret_cast<base_t*>(record->data());
+    m_driver_storage = static_cast<base_t*>(Kokkos::CudaSpace().allocate(
+        "GraphNodeKernel global memory functor storage", sizeof(base_t)));
     KOKKOS_ENSURES(m_driver_storage != nullptr)
     return m_driver_storage;
   }
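Because `CudaSpace` memory is device-resident, the driver object is staged into this buffer as raw bytes in the launch path rather than constructed in place, and `deallocate` only returns the bytes, so no destructor ever runs on the device copy. A self-contained sketch of that lifecycle under those assumptions (`Functor` stands in for `base_t`; the real copy happens elsewhere, not in this header):

#include <Kokkos_Core.hpp>
#include <cuda_runtime.h>

struct Functor {  // stand-in for base_t; must be trivially copyable
  double scale;
};

void stage_and_release() {
  Functor host_copy{2.0};
  void* dev_bytes =
      Kokkos::CudaSpace().allocate("driver storage", sizeof(Functor));
  // Stage by bytes: no placement new, no device-side constructor call.
  cudaMemcpy(dev_bytes, &host_copy, sizeof(Functor), cudaMemcpyHostToDevice);
  // ... a global-memory launch would read *dev_bytes here ...
  // Return the bytes with the same size; intentionally no destructor call.
  Kokkos::CudaSpace().deallocate(dev_bytes, sizeof(Functor));
}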
99 changes: 44 additions & 55 deletions core/src/Cuda/Kokkos_Cuda_Instance.cpp
@@ -336,22 +336,19 @@ void CudaInternal::initialize(cudaStream_t stream) {
 Cuda::size_type *CudaInternal::scratch_flags(const std::size_t size) const {
   if (verify_is_initialized("scratch_flags") &&
       m_scratchFlagsCount < scratch_count(size)) {
-    m_scratchFlagsCount = scratch_count(size);
+    auto mem_space = Kokkos::CudaSpace::impl_create(m_cudaDev, m_stream);

-    using Record =
-        Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>;
+    if (m_scratchFlags) {
+      mem_space.deallocate(m_scratchFlags,
+                           m_scratchFlagsCount * sizeScratchGrain);
+    }

-    if (m_scratchFlags) Record::decrement(Record::get_record(m_scratchFlags));
+    m_scratchFlagsCount = scratch_count(size);

     std::size_t alloc_size =
         multiply_overflow_abort(m_scratchFlagsCount, sizeScratchGrain);
-    Record *const r =
-        Record::allocate(CudaSpace::impl_create(m_cudaDev, m_stream),
-                         "Kokkos::InternalScratchFlags", alloc_size);
-
-    Record::increment(r);
-
-    m_scratchFlags = reinterpret_cast<size_type *>(r->data());
+    m_scratchFlags = static_cast<size_type *>(
+        mem_space.allocate("Kokkos::InternalScratchFlags", alloc_size));

     KOKKOS_IMPL_CUDA_SAFE_CALL(
         (cuda_memset_wrapper(m_scratchFlags, 0, alloc_size)));
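Every scratch getter in this file, and its HIP twin further down, now follows the same grow-only shape: deallocate with the old extent, update the count, allocate at the overflow-checked new size. A condensed sketch of that shape, reusing this translation unit's `sizeScratchGrain` and `multiply_overflow_abort` helpers (the generic signature is illustrative):

// Grow-only scratch buffer, condensed from the hunk above. The old extent
// goes to deallocate; the new, overflow-checked extent goes to allocate.
template <class MemorySpace, class SizeType>
SizeType* grow_scratch(const MemorySpace& space, SizeType*& buf,
                       std::size_t& count, std::size_t needed_count) {
  if (count < needed_count) {
    if (buf) space.deallocate(buf, count * sizeScratchGrain);
    count = needed_count;
    std::size_t alloc_size = multiply_overflow_abort(count, sizeScratchGrain);
    buf = static_cast<SizeType*>(
        space.allocate("Kokkos::InternalScratch", alloc_size));
  }
  return buf;
}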
@@ -363,22 +360,19 @@ Cuda::size_type *CudaInternal::scratch_flags(const std::size_t size) const {
 Cuda::size_type *CudaInternal::scratch_space(const std::size_t size) const {
   if (verify_is_initialized("scratch_space") &&
       m_scratchSpaceCount < scratch_count(size)) {
-    m_scratchSpaceCount = scratch_count(size);
+    auto mem_space = Kokkos::CudaSpace::impl_create(m_cudaDev, m_stream);

-    using Record =
-        Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>;
+    if (m_scratchSpace) {
+      mem_space.deallocate(m_scratchSpace,
+                           m_scratchSpaceCount * sizeScratchGrain);
+    }

-    if (m_scratchSpace) Record::decrement(Record::get_record(m_scratchSpace));
+    m_scratchSpaceCount = scratch_count(size);

     std::size_t alloc_size =
         multiply_overflow_abort(m_scratchSpaceCount, sizeScratchGrain);
-    Record *const r =
-        Record::allocate(CudaSpace::impl_create(m_cudaDev, m_stream),
-                         "Kokkos::InternalScratchSpace", alloc_size);
-
-    Record::increment(r);
-
-    m_scratchSpace = reinterpret_cast<size_type *>(r->data());
+    m_scratchSpace = static_cast<size_type *>(
+        mem_space.allocate("Kokkos::InternalScratchSpace", alloc_size));
   }

   return m_scratchSpace;
@@ -387,45 +381,37 @@ Cuda::size_type *CudaInternal::scratch_space(const std::size_t size) const {
 Cuda::size_type *CudaInternal::scratch_unified(const std::size_t size) const {
   if (verify_is_initialized("scratch_unified") &&
       m_scratchUnifiedCount < scratch_count(size)) {
-    m_scratchUnifiedCount = scratch_count(size);
+    auto mem_space =
+        Kokkos::CudaHostPinnedSpace::impl_create(m_cudaDev, m_stream);

-    using Record =
-        Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>;
+    if (m_scratchUnified) {
+      mem_space.deallocate(m_scratchUnified,
+                           m_scratchUnifiedCount * sizeScratchGrain);
+    }

-    if (m_scratchUnified)
-      Record::decrement(Record::get_record(m_scratchUnified));
+    m_scratchUnifiedCount = scratch_count(size);

     std::size_t alloc_size =
         multiply_overflow_abort(m_scratchUnifiedCount, sizeScratchGrain);
-    Record *const r =
-        Record::allocate(CudaHostPinnedSpace::impl_create(m_cudaDev, m_stream),
-                         "Kokkos::InternalScratchUnified", alloc_size);
-
-    Record::increment(r);
-
-    m_scratchUnified = reinterpret_cast<size_type *>(r->data());
+    m_scratchUnified = static_cast<size_type *>(
+        mem_space.allocate("Kokkos::InternalScratchUnified", alloc_size));
   }

   return m_scratchUnified;
 }

 Cuda::size_type *CudaInternal::scratch_functor(const std::size_t size) const {
   if (verify_is_initialized("scratch_functor") && m_scratchFunctorSize < size) {
-    m_scratchFunctorSize = size;
-
-    using Record =
-        Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>;
+    auto mem_space = Kokkos::CudaSpace::impl_create(m_cudaDev, m_stream);

-    if (m_scratchFunctor)
-      Record::decrement(Record::get_record(m_scratchFunctor));
+    if (m_scratchFunctor) {
+      mem_space.deallocate(m_scratchFunctor, m_scratchFunctorSize);
+    }

-    Record *const r = Record::allocate(
-        CudaSpace::impl_create(m_cudaDev, m_stream),
-        "Kokkos::InternalScratchFunctor", m_scratchFunctorSize);
-
-    Record::increment(r);
+    m_scratchFunctorSize = size;

-    m_scratchFunctor = reinterpret_cast<size_type *>(r->data());
+    m_scratchFunctor = static_cast<size_type *>(mem_space.allocate(
+        "Kokkos::InternalScratchFunctor", m_scratchFunctorSize));
   }

   return m_scratchFunctor;
@@ -480,15 +466,18 @@ void CudaInternal::finalize() {
   was_finalized = true;

   if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) {
-    using RecordCuda = Kokkos::Impl::SharedAllocationRecord<CudaSpace>;
-    using RecordHost =
-        Kokkos::Impl::SharedAllocationRecord<CudaHostPinnedSpace>;
-
-    RecordCuda::decrement(RecordCuda::get_record(m_scratchFlags));
-    RecordCuda::decrement(RecordCuda::get_record(m_scratchSpace));
-    RecordHost::decrement(RecordHost::get_record(m_scratchUnified));
-    if (m_scratchFunctorSize > 0)
-      RecordCuda::decrement(RecordCuda::get_record(m_scratchFunctor));
+    auto cuda_mem_space = Kokkos::CudaSpace::impl_create(m_cudaDev, m_stream);
+    auto host_mem_space =
+        Kokkos::CudaHostPinnedSpace::impl_create(m_cudaDev, m_stream);
+    cuda_mem_space.deallocate(m_scratchFlags,
+                              m_scratchFlagsCount * sizeScratchGrain);
+    cuda_mem_space.deallocate(m_scratchSpace,
+                              m_scratchSpaceCount * sizeScratchGrain);
+    host_mem_space.deallocate(m_scratchUnified,
+                              m_scratchUnifiedCount * sizeScratchGrain);
+    if (m_scratchFunctorSize > 0) {
+      cuda_mem_space.deallocate(m_scratchFunctor, m_scratchFunctorSize);
+    }
   }

   for (int i = 0; i < m_n_team_scratch; ++i) {
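The trade-off is most visible in `finalize`: `deallocate` now needs the byte count recorded at allocation time, where `Record::get_record` used to recover it from the pointer alone; this is what the "FIXMEs for size argument" bullet in the commit message is about. A hypothetical helper, not part of this commit, showing how pointer and extent can be kept together so a teardown path cannot pass a stale size:

// Hypothetical sketch only: a buffer that remembers its own byte count.
template <class Space>
class SizedBuffer {
  void* m_ptr = nullptr;
  std::size_t m_bytes = 0;

 public:
  void reset(const Space& space, const char* label, std::size_t bytes) {
    if (m_ptr) space.deallocate(m_ptr, m_bytes);  // always the stored size
    m_ptr = bytes ? space.allocate(label, bytes) : nullptr;
    m_bytes = bytes;
  }
  void* data() const { return m_ptr; }
  std::size_t size() const { return m_bytes; }
};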
14 changes: 3 additions & 11 deletions core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp
@@ -26,7 +26,6 @@
 #include <Kokkos_Parallel_Reduce.hpp>
 #include <Kokkos_PointerOwnership.hpp>

-#include <HIP/Kokkos_HIP_SharedAllocationRecord.hpp>
 #include <HIP/Kokkos_HIP_GraphNode_Impl.hpp>

 namespace Kokkos {
@@ -43,7 +42,6 @@ class GraphNodeKernelImpl<Kokkos::HIP, PolicyType, Functor, PatternTag, Args...>
   using base_t =
       typename PatternImplSpecializationFromTag<PatternTag, Functor, Policy,
                                                 Args..., Kokkos::HIP>::type;
-  using Record = Kokkos::Impl::SharedAllocationRecord<Kokkos::HIPSpace, void>;

   // TODO use the name and executionspace
   template <typename PolicyDeduced, typename... ArgsDeduced>
@@ -60,7 +58,7 @@ class GraphNodeKernelImpl<Kokkos::HIP, PolicyType, Functor, PatternTag, Args...>

   ~GraphNodeKernelImpl() {
     if (m_driver_storage) {
-      Record::decrement(Record::get_record(m_driver_storage));
+      Kokkos::HIPSpace().deallocate(m_driver_storage, sizeof(base_t));
     }
   }

@@ -78,15 +76,9 @@ class GraphNodeKernelImpl<Kokkos::HIP, PolicyType, Functor, PatternTag, Args...>

   Kokkos::ObservingRawPtr<base_t> allocate_driver_memory_buffer() const {
     KOKKOS_EXPECTS(m_driver_storage == nullptr);
-
-    auto* record = Record::allocate(
-        Kokkos::HIPSpace{}, "GraphNodeKernel global memory functor storage",
-        sizeof(base_t));
-
-    Record::increment(record);
-    m_driver_storage = reinterpret_cast<base_t*>(record->data());
+    m_driver_storage = static_cast<base_t*>(Kokkos::HIPSpace().allocate(
+        "GraphNodeKernel global memory functor storage", sizeof(base_t)));
     KOKKOS_ENSURES(m_driver_storage != nullptr);
-
     return m_driver_storage;
   }

74 changes: 33 additions & 41 deletions core/src/HIP/Kokkos_HIP_Instance.cpp
@@ -191,20 +191,19 @@ void HIPInternal::initialize(hipStream_t stream) {
 Kokkos::HIP::size_type *HIPInternal::scratch_space(const std::size_t size) {
   if (verify_is_initialized("scratch_space") &&
       m_scratchSpaceCount < scratch_count(size)) {
-    m_scratchSpaceCount = scratch_count(size);
+    Kokkos::HIPSpace mem_space;

-    using Record = Kokkos::Impl::SharedAllocationRecord<Kokkos::HIPSpace, void>;
+    if (m_scratchSpace) {
+      mem_space.deallocate(m_scratchSpace,
+                           m_scratchSpaceCount * sizeScratchGrain);
+    }

-    if (m_scratchSpace) Record::decrement(Record::get_record(m_scratchSpace));
+    m_scratchSpaceCount = scratch_count(size);

     std::size_t alloc_size =
         multiply_overflow_abort(m_scratchSpaceCount, sizeScratchGrain);
-    Record *const r = Record::allocate(
-        Kokkos::HIPSpace(), "Kokkos::InternalScratchSpace", alloc_size);
-
-    Record::increment(r);
-
-    m_scratchSpace = reinterpret_cast<size_type *>(r->data());
+    m_scratchSpace = static_cast<size_type *>(
+        mem_space.allocate("Kokkos::InternalScratchSpace", alloc_size));
   }

   return m_scratchSpace;
@@ -213,20 +212,19 @@ Kokkos::HIP::size_type *HIPInternal::scratch_space(const std::size_t size) {
 Kokkos::HIP::size_type *HIPInternal::scratch_flags(const std::size_t size) {
   if (verify_is_initialized("scratch_flags") &&
       m_scratchFlagsCount < scratch_count(size)) {
-    m_scratchFlagsCount = scratch_count(size);
+    Kokkos::HIPSpace mem_space;

-    using Record = Kokkos::Impl::SharedAllocationRecord<Kokkos::HIPSpace, void>;
+    if (m_scratchFlags) {
+      mem_space.deallocate(m_scratchFlags,
+                           m_scratchFlagsCount * sizeScratchGrain);
+    }

-    if (m_scratchFlags) Record::decrement(Record::get_record(m_scratchFlags));
+    m_scratchFlagsCount = scratch_count(size);

     std::size_t alloc_size =
         multiply_overflow_abort(m_scratchFlagsCount, sizeScratchGrain);
-    Record *const r = Record::allocate(
-        Kokkos::HIPSpace(), "Kokkos::InternalScratchFlags", alloc_size);
-
-    Record::increment(r);
-
-    m_scratchFlags = reinterpret_cast<size_type *>(r->data());
+    m_scratchFlags = static_cast<size_type *>(
+        mem_space.allocate("Kokkos::InternalScratchFlags", alloc_size));

     KOKKOS_IMPL_HIP_SAFE_CALL(hipMemset(m_scratchFlags, 0, alloc_size));
   }
@@ -237,29 +235,20 @@ Kokkos::HIP::size_type *HIPInternal::scratch_flags(const std::size_t size) {
 Kokkos::HIP::size_type *HIPInternal::stage_functor_for_execution(
     void const *driver, std::size_t const size) const {
   if (verify_is_initialized("scratch_functor") && m_scratchFunctorSize < size) {
-    m_scratchFunctorSize = size;
-
-    using Record = Kokkos::Impl::SharedAllocationRecord<Kokkos::HIPSpace, void>;
-    using RecordHost =
-        Kokkos::Impl::SharedAllocationRecord<Kokkos::HIPHostPinnedSpace, void>;
+    Kokkos::HIPSpace device_mem_space;
+    Kokkos::HIPHostPinnedSpace host_mem_space;

     if (m_scratchFunctor) {
-      Record::decrement(Record::get_record(m_scratchFunctor));
-      RecordHost::decrement(RecordHost::get_record(m_scratchFunctorHost));
+      device_mem_space.deallocate(m_scratchFunctor, m_scratchFunctorSize);
+      host_mem_space.deallocate(m_scratchFunctorHost, m_scratchFunctorSize);
     }

-    Record *const r =
-        Record::allocate(Kokkos::HIPSpace(), "Kokkos::InternalScratchFunctor",
-                         m_scratchFunctorSize);
-    RecordHost *const r_host = RecordHost::allocate(
-        Kokkos::HIPHostPinnedSpace(), "Kokkos::InternalScratchFunctorHost",
-        m_scratchFunctorSize);
-
-    Record::increment(r);
-    RecordHost::increment(r_host);
+    m_scratchFunctorSize = size;

-    m_scratchFunctor = reinterpret_cast<size_type *>(r->data());
-    m_scratchFunctorHost = reinterpret_cast<size_type *>(r_host->data());
+    m_scratchFunctor = static_cast<size_type *>(device_mem_space.allocate(
+        "Kokkos::InternalScratchFunctor", m_scratchFunctorSize));
+    m_scratchFunctorHost = static_cast<size_type *>(host_mem_space.allocate(
+        "Kokkos::InternalScratchFunctorHost", m_scratchFunctorSize));
   }

   // When using HSA_XNACK=1, it is necessary to copy the driver to the host to
@@ -323,14 +312,17 @@ void HIPInternal::finalize() {
   was_finalized = true;

   if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) {
-    using RecordHIP = Kokkos::Impl::SharedAllocationRecord<Kokkos::HIPSpace>;
+    Kokkos::HIPSpace device_mem_space;

-    RecordHIP::decrement(RecordHIP::get_record(m_scratchFlags));
-    RecordHIP::decrement(RecordHIP::get_record(m_scratchSpace));
+    device_mem_space.deallocate(m_scratchFlags,
+                                m_scratchFlagsCount * sizeScratchGrain);
+    device_mem_space.deallocate(m_scratchSpace,
+                                m_scratchSpaceCount * sizeScratchGrain);

     if (m_scratchFunctorSize > 0) {
-      RecordHIP::decrement(RecordHIP::get_record(m_scratchFunctor));
-      RecordHIP::decrement(RecordHIP::get_record(m_scratchFunctorHost));
+      device_mem_space.deallocate(m_scratchFunctor, m_scratchFunctorSize);
+      Kokkos::HIPHostPinnedSpace host_mem_space;
+      host_mem_space.deallocate(m_scratchFunctorHost, m_scratchFunctorSize);
     }
   }

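`stage_functor_for_execution` is the one place where two spaces grow in lockstep: a device buffer plus a host-pinned mirror of identical size, so the driver can be staged on the host (needed under HSA_XNACK=1) before being copied to, or read in place of, the device copy. A standalone sketch of that pairing, with an illustrative size and the labels from the diff:

Kokkos::HIPSpace device_space;
Kokkos::HIPHostPinnedSpace host_space;
const std::size_t bytes = 256;  // illustrative size

auto* dev = static_cast<Kokkos::HIP::size_type*>(
    device_space.allocate("Kokkos::InternalScratchFunctor", bytes));
auto* host = static_cast<Kokkos::HIP::size_type*>(
    host_space.allocate("Kokkos::InternalScratchFunctorHost", bytes));
// ... memcpy the driver into host, then copy host -> dev with hipMemcpy ...
// Both sides are released with the same extent they were grown to.
device_space.deallocate(dev, bytes);
host_space.deallocate(host, bytes);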
