From 0794ade1333f2677e3a26746e1943a7fff159ce9 Mon Sep 17 00:00:00 2001
From: Daniel Arndt
Date: Fri, 26 Jan 2024 16:39:51 +0000
Subject: [PATCH 1/5] Kokkos::fence should fence all devices

---
 core/src/Cuda/Kokkos_Cuda_Instance.cpp | 18 +++++++++---------
 core/src/Cuda/Kokkos_Cuda_Instance.hpp | 11 +++++------
 2 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp
index 870284b3723..1e2e9e8c72f 100644
--- a/core/src/Cuda/Kokkos_Cuda_Instance.cpp
+++ b/core/src/Cuda/Kokkos_Cuda_Instance.cpp
@@ -135,7 +135,6 @@ Kokkos::View<uint32_t*, Kokkos::CudaSpace> cuda_global_unique_token_locks(
   return locks;
 }
 
-// FIXME_CUDA_MULTIPLE_DEVICES
 void cuda_device_synchronize(const std::string &name) {
   Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::Cuda>(
       name,
@@ -144,16 +143,16 @@ void cuda_device_synchronize(const std::string &name) {
 #if defined(KOKKOS_COMPILER_CLANG)
       // annotate with __host__ silence a clang warning about using
       // cudaDeviceSynchronize in device code
-      [] __host__() {
-        KOKKOS_IMPL_CUDA_SAFE_CALL(
-            (CudaInternal::singleton().cuda_device_synchronize_wrapper()));
-      });
+      [] __host__()
 #else
-      []() {
-        KOKKOS_IMPL_CUDA_SAFE_CALL(
-            (CudaInternal::singleton().cuda_device_synchronize_wrapper()));
-      });
+      []()
 #endif
+      {
+        for (int cuda_device : CudaInternal::cuda_devices) {
+          KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device));
+          KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
+        }
+      });
 }
 
 void cuda_stream_synchronize(const cudaStream_t stream, const CudaInternal *ptr,
@@ -293,6 +292,7 @@ void CudaInternal::initialize(cudaStream_t stream) {
   KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_cudaDev));
 
   m_stream = stream;
+  CudaInternal::cuda_devices.insert(m_cudaDev);
 
   //----------------------------------
   // Multiblock reduction uses scratch flags for counters
diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/core/src/Cuda/Kokkos_Cuda_Instance.hpp
index 9c452573a51..d6ef38708e1 100644
--- a/core/src/Cuda/Kokkos_Cuda_Instance.hpp
+++ b/core/src/Cuda/Kokkos_Cuda_Instance.hpp
@@ -23,6 +23,10 @@
 #include
 #include
 #include "Kokkos_CudaSpace.hpp"
+
+#include <set>
+#include <map>
+
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 // These functions fulfill the purpose of allowing to work around
@@ -116,6 +120,7 @@ class CudaInternal {
   bool was_initialized = false;
   bool was_finalized = false;
+  inline static std::set<int> cuda_devices = {};
 
   // FIXME_CUDA: these want to be per-device, not per-stream... use of 'static'
   // here will break once there are multiple devices though
   inline static unsigned long* constantMemHostStaging = nullptr;
@@ -221,12 +226,6 @@ class CudaInternal {
     return cudaDeviceSetLimit(limit, value);
   }
 
-  template <bool setCudaDevice = true>
-  cudaError_t cuda_device_synchronize_wrapper() const {
-    if constexpr (setCudaDevice) set_cuda_device();
-    return cudaDeviceSynchronize();
-  }
-
   template <bool setCudaDevice = true>
   cudaError_t cuda_event_create_wrapper(cudaEvent_t* event) const {
     if constexpr (setCudaDevice) set_cuda_device();

From c473b3f57921cba9f73e2ad3bbfe142afdc054a6 Mon Sep 17 00:00:00 2001
From: Daniel Arndt
Date: Fri, 26 Jan 2024 17:33:02 +0000
Subject: [PATCH 2/5] Create a couple more variables per device

---
 core/src/Cuda/Kokkos_Cuda_Instance.cpp     | 38 ++++++++++++----------
 core/src/Cuda/Kokkos_Cuda_Instance.hpp     |  9 +++--
 core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp | 11 ++++---
 3 files changed, 31 insertions(+), 27 deletions(-)

diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp
index 1e2e9e8c72f..f8ee6a86464 100644
--- a/core/src/Cuda/Kokkos_Cuda_Instance.cpp
+++ b/core/src/Cuda/Kokkos_Cuda_Instance.cpp
@@ -148,7 +148,7 @@ void cuda_device_synchronize(const std::string &name) {
       []()
 #endif
       {
-        for (int cuda_device : CudaInternal::cuda_devices) {
+        for (int cuda_device : Kokkos::Impl::CudaInternal::cuda_devices) {
           KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device));
           KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
         }
@@ -313,6 +313,17 @@ void CudaInternal::initialize(cudaStream_t stream) {
     (void)scratch_space(reduce_block_count * 16 * sizeof(size_type));
   }
 
+  // Allocate a staging buffer for constant mem in pinned host memory
+  // and an event to avoid overwriting driver for previous kernel launches
+  if (!constantMemHostStagingPerDevice[m_cudaDev])
+    KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_malloc_host_wrapper(
+        reinterpret_cast<void**>(&constantMemHostStagingPerDevice[m_cudaDev]),
+        CudaTraits::ConstantMemoryUsage)));
+
+  if (!constantMemReusablePerDevice[m_cudaDev])
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        (cuda_event_create_wrapper(&constantMemReusablePerDevice[m_cudaDev])));
+
   for (int i = 0; i < m_n_team_scratch; ++i) {
     m_team_scratch_current_size[i] = 0;
     m_team_scratch_ptr[i] = nullptr;
@@ -612,30 +623,21 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default
   KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device_id));
   KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&singleton_stream));
 
-  // Init the array for used for arbitrarily sized atomics
-  desul::Impl::init_lock_arrays();  // FIXME
-
-  // Allocate a staging buffer for constant mem in pinned host memory and an
-  // event to avoid overwriting driver for previous kernel launches
-  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMallocHost(
-      reinterpret_cast<void**>(&Impl::CudaInternal::constantMemHostStaging),
-      Impl::CudaTraits::ConstantMemoryUsage));
-
-  KOKKOS_IMPL_CUDA_SAFE_CALL(
-      cudaEventCreate(&Impl::CudaInternal::constantMemReusable));
-
   Impl::CudaInternal::singleton().initialize(singleton_stream);
 }
 
 void Cuda::impl_finalize() {
   (void)Impl::cuda_global_unique_token_locks(true);
   desul::Impl::finalize_lock_arrays();  // FIXME
 
-  KOKKOS_IMPL_CUDA_SAFE_CALL(
-      cudaEventDestroy(Impl::CudaInternal::constantMemReusable));
-  KOKKOS_IMPL_CUDA_SAFE_CALL(
-      cudaFreeHost(Impl::CudaInternal::constantMemHostStaging));
+  for (auto &cuda_device : Kokkos::Impl::CudaInternal::cuda_devices) {
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        cudaFreeHost(Kokkos::Impl::CudaInternal::constantMemHostStagingPerDevice
+                         [cuda_device]));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaEventDestroy(
+        Kokkos::Impl::CudaInternal::constantMemReusablePerDevice[cuda_device]));
+  }
 
   auto &deep_copy_space = Impl::cuda_get_deep_copy_space(/*initialize*/ false);
   if (deep_copy_space)
diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/core/src/Cuda/Kokkos_Cuda_Instance.hpp
index d6ef38708e1..15fce92205e 100644
--- a/core/src/Cuda/Kokkos_Cuda_Instance.hpp
+++ b/core/src/Cuda/Kokkos_Cuda_Instance.hpp
@@ -121,11 +121,10 @@ class CudaInternal {
   bool was_finalized = false;
   inline static std::set<int> cuda_devices = {};
 
-  // FIXME_CUDA: these want to be per-device, not per-stream... use of 'static'
-  // here will break once there are multiple devices though
-  inline static unsigned long* constantMemHostStaging = nullptr;
-  inline static cudaEvent_t constantMemReusable = nullptr;
-  inline static std::mutex constantMemMutex;
+  inline static std::map<int, unsigned long*> constantMemHostStagingPerDevice =
+      {};
+  inline static std::map<int, cudaEvent_t> constantMemReusablePerDevice = {};
+  inline static std::map<int, std::mutex> constantMemMutexPerDevice;
 
   static CudaInternal& singleton();
 
diff --git a/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
index b68eec13a01..1920cc7936a 100644
--- a/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
+++ b/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -576,13 +576,16 @@ struct CudaParallelLaunchKernelInvoker<
   static void invoke_kernel(DriverType const& driver, dim3 const& grid,
                             dim3 const& block, int shmem,
                             CudaInternal const* cuda_instance) {
+    int cuda_device = cuda_instance->m_cudaDev;
     // Wait until the previous kernel that uses the constant buffer is done
-    std::lock_guard<std::mutex> lock(CudaInternal::constantMemMutex);
+    std::lock_guard<std::mutex> lock(
+        CudaInternal::constantMemMutexPerDevice[cuda_device]);
     KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_instance->cuda_event_synchronize_wrapper(
-        CudaInternal::constantMemReusable)));
+        CudaInternal::constantMemReusablePerDevice[cuda_device])));
 
     // Copy functor (synchronously) to staging buffer in pinned host memory
-    unsigned long* staging = cuda_instance->constantMemHostStaging;
+    unsigned long* staging =
+        cuda_instance->constantMemHostStagingPerDevice[cuda_device];
     memcpy(staging, &driver, sizeof(DriverType));
 
     // Copy functor asynchronously from there to constant memory on the device
@@ -597,7 +600,7 @@ struct CudaParallelLaunchKernelInvoker<
 
     // Record an event that says when the constant buffer can be reused
     KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_instance->cuda_event_record_wrapper(
-        CudaInternal::constantMemReusable)));
+        CudaInternal::constantMemReusablePerDevice[cuda_device])));
   }
 
   inline static void create_parallel_launch_graph_node(

From 191003f8c4af1dec50d3abe1339a07cca7709cc1 Mon Sep 17 00:00:00 2001
From: Daniel Arndt
Date: Fri, 26 Jan 2024 18:28:53 -0500
Subject: [PATCH 3/5] Don't forget desul::Impl::init_lock_arrays();

---
 core/src/Cuda/Kokkos_Cuda_Instance.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp
index f8ee6a86464..d4aa7eecddd 100644
--- a/core/src/Cuda/Kokkos_Cuda_Instance.cpp
+++ b/core/src/Cuda/Kokkos_Cuda_Instance.cpp
@@ -623,6 +623,9 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default
   KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device_id));
   KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&singleton_stream));
 
+  // Init the array for used for arbitrarily sized atomics
+  desul::Impl::init_lock_arrays();  // FIXME
+
   Impl::CudaInternal::singleton().initialize(singleton_stream);
 }
 

From 1b9118c7b3b72535e58808b6dead7cee5366b992 Mon Sep 17 00:00:00 2001
From: Daniel Arndt
Date: Thu, 1 Feb 2024 15:34:01 -0500
Subject: [PATCH 4/5] Address reviewer comments

---
 core/src/Cuda/Kokkos_Cuda_Instance.cpp | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp
index d4aa7eecddd..e68e3f3e0b0 100644
--- a/core/src/Cuda/Kokkos_Cuda_Instance.cpp
+++ b/core/src/Cuda/Kokkos_Cuda_Instance.cpp
@@ -294,6 +294,17 @@ void CudaInternal::initialize(cudaStream_t stream) {
   m_stream = stream;
   CudaInternal::cuda_devices.insert(m_cudaDev);
 
+  // Allocate a staging buffer for constant mem in pinned host memory
+  // and an event to avoid overwriting driver for previous kernel launches
+  if (!constantMemHostStagingPerDevice[m_cudaDev])
+    KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_malloc_host_wrapper(
+        reinterpret_cast<void**>(&constantMemHostStagingPerDevice[m_cudaDev]),
+        CudaTraits::ConstantMemoryUsage)));
+
+  if (!constantMemReusablePerDevice[m_cudaDev])
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        (cuda_event_create_wrapper(&constantMemReusablePerDevice[m_cudaDev])));
+
   //----------------------------------
   // Multiblock reduction uses scratch flags for counters
   // and scratch space for partial reduction values.
@@ -313,17 +324,6 @@ void CudaInternal::initialize(cudaStream_t stream) {
     (void)scratch_space(reduce_block_count * 16 * sizeof(size_type));
   }
 
-  // Allocate a staging buffer for constant mem in pinned host memory
-  // and an event to avoid overwriting driver for previous kernel launches
-  if (!constantMemHostStagingPerDevice[m_cudaDev])
-    KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_malloc_host_wrapper(
-        reinterpret_cast<void**>(&constantMemHostStagingPerDevice[m_cudaDev]),
-        CudaTraits::ConstantMemoryUsage)));
-
-  if (!constantMemReusablePerDevice[m_cudaDev])
-    KOKKOS_IMPL_CUDA_SAFE_CALL(
-        (cuda_event_create_wrapper(&constantMemReusablePerDevice[m_cudaDev])));
-
   for (int i = 0; i < m_n_team_scratch; ++i) {
     m_team_scratch_current_size[i] = 0;
     m_team_scratch_ptr[i] = nullptr;
@@ -633,7 +633,7 @@ void Cuda::impl_finalize() {
   (void)Impl::cuda_global_unique_token_locks(true);
   desul::Impl::finalize_lock_arrays();  // FIXME
 
-  for (auto &cuda_device : Kokkos::Impl::CudaInternal::cuda_devices) {
+  for (const auto cuda_device : Kokkos::Impl::CudaInternal::cuda_devices) {
     KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device));
     KOKKOS_IMPL_CUDA_SAFE_CALL(
         cudaFreeHost(Kokkos::Impl::CudaInternal::constantMemHostStagingPerDevice

From 871ab8f5859a8d01088ca882c0504864ac32e60d Mon Sep 17 00:00:00 2001
From: Daniel Arndt
Date: Thu, 1 Feb 2024 16:44:15 -0500
Subject: [PATCH 5/5] Add {} for std::map initialization

Co-authored-by: Damien L-G
---
 core/src/Cuda/Kokkos_Cuda_Instance.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/core/src/Cuda/Kokkos_Cuda_Instance.hpp
index 15fce92205e..581817fcb93 100644
--- a/core/src/Cuda/Kokkos_Cuda_Instance.hpp
+++ b/core/src/Cuda/Kokkos_Cuda_Instance.hpp
@@ -124,7 +124,7 @@ class CudaInternal {
   inline static std::map<int, unsigned long*> constantMemHostStagingPerDevice =
      {};
   inline static std::map<int, cudaEvent_t> constantMemReusablePerDevice = {};
-  inline static std::map<int, std::mutex> constantMemMutexPerDevice;
+  inline static std::map<int, std::mutex> constantMemMutexPerDevice = {};
 
   static CudaInternal& singleton();
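Note on the overall pattern (an illustration, not part of the patch series): each CudaInternal instance now records its device id in the static CudaInternal::cuda_devices set during initialize(), and the global fence in cuda_device_synchronize() loops over that set instead of synchronizing only the singleton's device; the new std::map members do the same per-device bookkeeping for the constant-memory staging buffer, its reuse event, and its mutex. The standalone sketch below shows the registry-plus-fence idea with plain CUDA runtime calls. Every name in it (example::g_devices, register_device, fence_all_devices, check) is invented for the sketch and is not a Kokkos API, and error handling is reduced to abort-on-failure.

// Standalone illustration of the per-device registry + global fence pattern
// from the patches above. Not Kokkos code; names are invented for the sketch.
#include <cuda_runtime.h>

#include <cstdio>
#include <cstdlib>
#include <set>

namespace example {

// Stand-in for KOKKOS_IMPL_CUDA_SAFE_CALL: abort on any CUDA error.
inline void check(cudaError_t err, const char* what) {
  if (err != cudaSuccess) {
    std::fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    std::abort();
  }
}

// Registry of every device an instance has been created on
// (the role played by CudaInternal::cuda_devices).
inline std::set<int> g_devices;

// Called when an instance is initialized for a particular device
// (the role of CudaInternal::initialize inserting m_cudaDev).
void register_device(int device_id) { g_devices.insert(device_id); }

// Global fence: synchronize every registered device, not just the one the
// singleton happens to live on (the role of cuda_device_synchronize).
void fence_all_devices() {
  for (int device : g_devices) {
    check(cudaSetDevice(device), "cudaSetDevice");
    check(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
  }
}

}  // namespace example

int main() {
  int count = 0;
  example::check(cudaGetDeviceCount(&count), "cudaGetDeviceCount");
  for (int d = 0; d < count; ++d) example::register_device(d);
  example::fence_all_devices();  // waits for outstanding work on every device
  std::printf("fenced %d device(s)\n", count);
  return 0;
}

In Kokkos itself the registration happens inside CudaInternal::initialize and the loop lives in cuda_device_synchronize, so user code only ever calls Kokkos::fence().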