From 0794ade1333f2677e3a26746e1943a7fff159ce9 Mon Sep 17 00:00:00 2001
From: Daniel Arndt
Date: Fri, 26 Jan 2024 16:39:51 +0000
Subject: [PATCH 1/5] Kokkos::fence should fence all devices

---
 core/src/Cuda/Kokkos_Cuda_Instance.cpp | 18 +++++++++---------
 core/src/Cuda/Kokkos_Cuda_Instance.hpp | 11 +++++------
 2 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp
index 870284b3723..1e2e9e8c72f 100644
--- a/core/src/Cuda/Kokkos_Cuda_Instance.cpp
+++ b/core/src/Cuda/Kokkos_Cuda_Instance.cpp
@@ -135,7 +135,6 @@ Kokkos::View<uint32_t*, Kokkos::CudaSpace> cuda_global_unique_token_locks(
   return locks;
 }
 
-// FIXME_CUDA_MULTIPLE_DEVICES
 void cuda_device_synchronize(const std::string &name) {
   Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::Cuda>(
       name,
@@ -144,16 +143,16 @@ void cuda_device_synchronize(const std::string &name) {
 #if defined(KOKKOS_COMPILER_CLANG)
       // annotate with __host__ silence a clang warning about using
       // cudaDeviceSynchronize in device code
-      [] __host__() {
-        KOKKOS_IMPL_CUDA_SAFE_CALL(
-            (CudaInternal::singleton().cuda_device_synchronize_wrapper()));
-      });
+      [] __host__()
 #else
-      []() {
-        KOKKOS_IMPL_CUDA_SAFE_CALL(
-            (CudaInternal::singleton().cuda_device_synchronize_wrapper()));
-      });
+      []()
 #endif
+      {
+        for (int cuda_device : CudaInternal::cuda_devices) {
+          KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device));
+          KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
+        }
+      });
 }
 
 void cuda_stream_synchronize(const cudaStream_t stream, const CudaInternal *ptr,
@@ -293,6 +292,7 @@ void CudaInternal::initialize(cudaStream_t stream) {
   KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_cudaDev));
 
   m_stream = stream;
+  CudaInternal::cuda_devices.insert(m_cudaDev);
 
   //----------------------------------
   // Multiblock reduction uses scratch flags for counters
diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/core/src/Cuda/Kokkos_Cuda_Instance.hpp
index 9c452573a51..d6ef38708e1 100644
--- a/core/src/Cuda/Kokkos_Cuda_Instance.hpp
+++ b/core/src/Cuda/Kokkos_Cuda_Instance.hpp
@@ -23,6 +23,10 @@
 #include
 #include
 #include "Kokkos_CudaSpace.hpp"
+
+#include <set>
+#include <map>
+
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 // These functions fulfill the purpose of allowing to work around
@@ -116,6 +120,7 @@ class CudaInternal {
   bool was_initialized = false;
   bool was_finalized = false;
+  inline static std::set<int> cuda_devices = {};
 
   // FIXME_CUDA: these want to be per-device, not per-stream... use of 'static'
   // here will break once there are multiple devices though
   inline static unsigned long* constantMemHostStaging = nullptr;
@@ -221,12 +226,6 @@ class CudaInternal {
     return cudaDeviceSetLimit(limit, value);
   }
 
-  template <bool setCudaDevice = true>
-  cudaError_t cuda_device_synchronize_wrapper() const {
-    if constexpr (setCudaDevice) set_cuda_device();
-    return cudaDeviceSynchronize();
-  }
-
   template <bool setCudaDevice = true>
   cudaError_t cuda_event_create_wrapper(cudaEvent_t* event) const {
     if constexpr (setCudaDevice) set_cuda_device();

From c473b3f57921cba9f73e2ad3bbfe142afdc054a6 Mon Sep 17 00:00:00 2001
From: Daniel Arndt
Date: Fri, 26 Jan 2024 17:33:02 +0000
Subject: [PATCH 2/5] Create a couple more variables per device

---
 core/src/Cuda/Kokkos_Cuda_Instance.cpp     | 38 ++++++++++++----------
 core/src/Cuda/Kokkos_Cuda_Instance.hpp     |  9 +++--
 core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp | 11 ++++---
 3 files changed, 31 insertions(+), 27 deletions(-)

diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp
index 1e2e9e8c72f..f8ee6a86464 100644
--- a/core/src/Cuda/Kokkos_Cuda_Instance.cpp
+++ b/core/src/Cuda/Kokkos_Cuda_Instance.cpp
@@ -148,7 +148,7 @@ void cuda_device_synchronize(const std::string &name) {
       []()
 #endif
       {
-        for (int cuda_device : CudaInternal::cuda_devices) {
+        for (int cuda_device : Kokkos::Impl::CudaInternal::cuda_devices) {
           KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device));
           KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
         }
@@ -313,6 +313,17 @@ void CudaInternal::initialize(cudaStream_t stream) {
     (void)scratch_space(reduce_block_count * 16 * sizeof(size_type));
   }
 
+  // Allocate a staging buffer for constant mem in pinned host memory
+  // and an event to avoid overwriting driver for previous kernel launches
+  if (!constantMemHostStagingPerDevice[m_cudaDev])
+    KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_malloc_host_wrapper(
+        reinterpret_cast<void**>(&constantMemHostStagingPerDevice[m_cudaDev]),
+        CudaTraits::ConstantMemoryUsage)));
+
+  if (!constantMemReusablePerDevice[m_cudaDev])
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        (cuda_event_create_wrapper(&constantMemReusablePerDevice[m_cudaDev])));
+
   for (int i = 0; i < m_n_team_scratch; ++i) {
     m_team_scratch_current_size[i] = 0;
     m_team_scratch_ptr[i] = nullptr;
@@ -612,30 +623,21 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default
   KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device_id));
   KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&singleton_stream));
 
-  // Init the array for used for arbitrarily sized atomics
-  desul::Impl::init_lock_arrays();  // FIXME
-
-  // Allocate a staging buffer for constant mem in pinned host memory and an
-  // event to avoid overwriting driver for previous kernel launches
-  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMallocHost(
-      reinterpret_cast<void**>(&Impl::CudaInternal::constantMemHostStaging),
-      Impl::CudaTraits::ConstantMemoryUsage));
-
-  KOKKOS_IMPL_CUDA_SAFE_CALL(
-      cudaEventCreate(&Impl::CudaInternal::constantMemReusable));
-
   Impl::CudaInternal::singleton().initialize(singleton_stream);
 }
 
 void Cuda::impl_finalize() {
   (void)Impl::cuda_global_unique_token_locks(true);
   desul::Impl::finalize_lock_arrays();  // FIXME
 
-  KOKKOS_IMPL_CUDA_SAFE_CALL(
-      cudaEventDestroy(Impl::CudaInternal::constantMemReusable));
-  KOKKOS_IMPL_CUDA_SAFE_CALL(
-      cudaFreeHost(Impl::CudaInternal::constantMemHostStaging));
+  for (auto &cuda_device : Kokkos::Impl::CudaInternal::cuda_devices) {
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        cudaFreeHost(Kokkos::Impl::CudaInternal::constantMemHostStagingPerDevice
+                         [cuda_device]));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaEventDestroy(
+        Kokkos::Impl::CudaInternal::constantMemReusablePerDevice[cuda_device]));
+  }
 
   auto &deep_copy_space = Impl::cuda_get_deep_copy_space(/*initialize*/ false);
   if (deep_copy_space)
diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/core/src/Cuda/Kokkos_Cuda_Instance.hpp
index d6ef38708e1..15fce92205e 100644
--- a/core/src/Cuda/Kokkos_Cuda_Instance.hpp
+++ b/core/src/Cuda/Kokkos_Cuda_Instance.hpp
@@ -121,11 +121,10 @@ class CudaInternal {
   bool was_finalized = false;
   inline static std::set<int> cuda_devices = {};
 
-  // FIXME_CUDA: these want to be per-device, not per-stream... use of 'static'
-  // here will break once there are multiple devices though
-  inline static unsigned long* constantMemHostStaging = nullptr;
-  inline static cudaEvent_t constantMemReusable = nullptr;
-  inline static std::mutex constantMemMutex;
+  inline static std::map<int, unsigned long*> constantMemHostStagingPerDevice =
+      {};
+  inline static std::map<int, cudaEvent_t> constantMemReusablePerDevice = {};
+  inline static std::map<int, std::mutex> constantMemMutexPerDevice;
 
   static CudaInternal& singleton();
 
diff --git a/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
index b68eec13a01..1920cc7936a 100644
--- a/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
+++ b/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -576,13 +576,16 @@ struct CudaParallelLaunchKernelInvoker<
   static void invoke_kernel(DriverType const& driver, dim3 const& grid,
                             dim3 const& block, int shmem,
                             CudaInternal const* cuda_instance) {
+    int cuda_device = cuda_instance->m_cudaDev;
     // Wait until the previous kernel that uses the constant buffer is done
-    std::lock_guard<std::mutex> lock(CudaInternal::constantMemMutex);
+    std::lock_guard<std::mutex> lock(
+        CudaInternal::constantMemMutexPerDevice[cuda_device]);
     KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_instance->cuda_event_synchronize_wrapper(
-        CudaInternal::constantMemReusable)));
+        CudaInternal::constantMemReusablePerDevice[cuda_device])));
 
     // Copy functor (synchronously) to staging buffer in pinned host memory
-    unsigned long* staging = cuda_instance->constantMemHostStaging;
+    unsigned long* staging =
+        cuda_instance->constantMemHostStagingPerDevice[cuda_device];
     memcpy(staging, &driver, sizeof(DriverType));
 
     // Copy functor asynchronously from there to constant memory on the device
@@ -597,7 +600,7 @@ struct CudaParallelLaunchKernelInvoker<
 
     // Record an event that says when the constant buffer can be reused
     KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_instance->cuda_event_record_wrapper(
-        CudaInternal::constantMemReusable)));
+        CudaInternal::constantMemReusablePerDevice[cuda_device])));
   }
 
   inline static void create_parallel_launch_graph_node(

From 191003f8c4af1dec50d3abe1339a07cca7709cc1 Mon Sep 17 00:00:00 2001
From: Daniel Arndt
Date: Fri, 26 Jan 2024 18:28:53 -0500
Subject: [PATCH 3/5] Don't forget desul::Impl::init_lock_arrays();

---
 core/src/Cuda/Kokkos_Cuda_Instance.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp
index f8ee6a86464..d4aa7eecddd 100644
--- a/core/src/Cuda/Kokkos_Cuda_Instance.cpp
+++ b/core/src/Cuda/Kokkos_Cuda_Instance.cpp
@@ -623,6 +623,9 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default
   KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device_id));
   KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&singleton_stream));
 
+  // Init the array for used for arbitrarily sized atomics
+  desul::Impl::init_lock_arrays();  // FIXME
+
   Impl::CudaInternal::singleton().initialize(singleton_stream);
 }
 

From 1b9118c7b3b72535e58808b6dead7cee5366b992 Mon Sep 17 00:00:00 2001
From: Daniel Arndt
Date: Thu, 1 Feb 2024 15:34:01 -0500
Subject: [PATCH 4/5] Address reviewer comments

---
 core/src/Cuda/Kokkos_Cuda_Instance.cpp | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp
index d4aa7eecddd..e68e3f3e0b0 100644
--- a/core/src/Cuda/Kokkos_Cuda_Instance.cpp
+++ b/core/src/Cuda/Kokkos_Cuda_Instance.cpp
@@ -294,6 +294,17 @@ void CudaInternal::initialize(cudaStream_t stream) {
   m_stream = stream;
   CudaInternal::cuda_devices.insert(m_cudaDev);
 
+  // Allocate a staging buffer for constant mem in pinned host memory
+  // and an event to avoid overwriting driver for previous kernel launches
+  if (!constantMemHostStagingPerDevice[m_cudaDev])
+    KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_malloc_host_wrapper(
+        reinterpret_cast<void**>(&constantMemHostStagingPerDevice[m_cudaDev]),
+        CudaTraits::ConstantMemoryUsage)));
+
+  if (!constantMemReusablePerDevice[m_cudaDev])
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        (cuda_event_create_wrapper(&constantMemReusablePerDevice[m_cudaDev])));
+
   //----------------------------------
   // Multiblock reduction uses scratch flags for counters
   // and scratch space for partial reduction values.
@@ -313,17 +324,6 @@ void CudaInternal::initialize(cudaStream_t stream) {
     (void)scratch_space(reduce_block_count * 16 * sizeof(size_type));
   }
 
-  // Allocate a staging buffer for constant mem in pinned host memory
-  // and an event to avoid overwriting driver for previous kernel launches
-  if (!constantMemHostStagingPerDevice[m_cudaDev])
-    KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_malloc_host_wrapper(
-        reinterpret_cast<void**>(&constantMemHostStagingPerDevice[m_cudaDev]),
-        CudaTraits::ConstantMemoryUsage)));
-
-  if (!constantMemReusablePerDevice[m_cudaDev])
-    KOKKOS_IMPL_CUDA_SAFE_CALL(
-        (cuda_event_create_wrapper(&constantMemReusablePerDevice[m_cudaDev])));
-
   for (int i = 0; i < m_n_team_scratch; ++i) {
     m_team_scratch_current_size[i] = 0;
     m_team_scratch_ptr[i] = nullptr;
@@ -633,7 +633,7 @@ void Cuda::impl_finalize() {
   (void)Impl::cuda_global_unique_token_locks(true);
   desul::Impl::finalize_lock_arrays();  // FIXME
 
-  for (auto &cuda_device : Kokkos::Impl::CudaInternal::cuda_devices) {
+  for (const auto cuda_device : Kokkos::Impl::CudaInternal::cuda_devices) {
     KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device));
     KOKKOS_IMPL_CUDA_SAFE_CALL(
         cudaFreeHost(Kokkos::Impl::CudaInternal::constantMemHostStagingPerDevice

From 871ab8f5859a8d01088ca882c0504864ac32e60d Mon Sep 17 00:00:00 2001
From: Daniel Arndt
Date: Thu, 1 Feb 2024 16:44:15 -0500
Subject: [PATCH 5/5] Add {} for std::map initialization

Co-authored-by: Damien L-G
---
 core/src/Cuda/Kokkos_Cuda_Instance.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/core/src/Cuda/Kokkos_Cuda_Instance.hpp
index 15fce92205e..581817fcb93 100644
--- a/core/src/Cuda/Kokkos_Cuda_Instance.hpp
+++ b/core/src/Cuda/Kokkos_Cuda_Instance.hpp
@@ -124,7 +124,7 @@ class CudaInternal {
   inline static std::map<int, unsigned long*> constantMemHostStagingPerDevice =
      {};
   inline static std::map<int, cudaEvent_t> constantMemReusablePerDevice = {};
-  inline static std::map<int, std::mutex> constantMemMutexPerDevice;
+  inline static std::map<int, std::mutex> constantMemMutexPerDevice = {};
 
   static CudaInternal& singleton();
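Note on the overall pattern (an illustration, not part of the patch series): each CudaInternal instance now records its device id in the static CudaInternal::cuda_devices set during initialize(), and the global fence in cuda_device_synchronize() loops over that set instead of synchronizing only the singleton's device; the new std::map members do the same per-device bookkeeping for the constant-memory staging buffer, its reuse event, and its mutex. The standalone sketch below shows the registry-plus-fence idea with plain CUDA runtime calls. Every name in it (example::g_devices, register_device, fence_all_devices, check) is invented for the sketch and is not a Kokkos API, and error handling is reduced to abort-on-failure.

// Standalone illustration of the per-device registry + global fence pattern
// from the patches above. Not Kokkos code; names are invented for the sketch.
#include <cuda_runtime.h>

#include <cstdio>
#include <cstdlib>
#include <set>

namespace example {

// Stand-in for KOKKOS_IMPL_CUDA_SAFE_CALL: abort on any CUDA error.
inline void check(cudaError_t err, const char* what) {
  if (err != cudaSuccess) {
    std::fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    std::abort();
  }
}

// Registry of every device an instance has been created on
// (the role played by CudaInternal::cuda_devices).
inline std::set<int> g_devices;

// Called when an instance is initialized for a particular device
// (the role of CudaInternal::initialize inserting m_cudaDev).
void register_device(int device_id) { g_devices.insert(device_id); }

// Global fence: synchronize every registered device, not just the one the
// singleton happens to live on (the role of cuda_device_synchronize).
void fence_all_devices() {
  for (int device : g_devices) {
    check(cudaSetDevice(device), "cudaSetDevice");
    check(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
  }
}

}  // namespace example

int main() {
  int count = 0;
  example::check(cudaGetDeviceCount(&count), "cudaGetDeviceCount");
  for (int d = 0; d < count; ++d) example::register_device(d);
  example::fence_all_devices();  // waits for outstanding work on every device
  std::printf("fenced %d device(s)\n", count);
  return 0;
}

In Kokkos itself the registration happens inside CudaInternal::initialize and the loop lives in cuda_device_synchronize, so user code only ever calls Kokkos::fence().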