diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp index 870284b3723..e68e3f3e0b0 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -135,7 +135,6 @@ Kokkos::View cuda_global_unique_token_locks( return locks; } -// FIXME_CUDA_MULTIPLE_DEVICES void cuda_device_synchronize(const std::string &name) { Kokkos::Tools::Experimental::Impl::profile_fence_event( name, @@ -144,16 +143,16 @@ void cuda_device_synchronize(const std::string &name) { #if defined(KOKKOS_COMPILER_CLANG) // annotate with __host__ silence a clang warning about using // cudaDeviceSynchronize in device code - [] __host__() { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_synchronize_wrapper())); - }); + [] __host__() #else - []() { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_synchronize_wrapper())); - }); + []() #endif + { + for (int cuda_device : Kokkos::Impl::CudaInternal::cuda_devices) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize()); + } + }); } void cuda_stream_synchronize(const cudaStream_t stream, const CudaInternal *ptr, @@ -293,6 +292,18 @@ void CudaInternal::initialize(cudaStream_t stream) { KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_cudaDev)); m_stream = stream; + CudaInternal::cuda_devices.insert(m_cudaDev); + + // Allocate a staging buffer for constant mem in pinned host memory + // and an event to avoid overwriting driver for previous kernel launches + if (!constantMemHostStagingPerDevice[m_cudaDev]) + KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_malloc_host_wrapper( + reinterpret_cast(&constantMemHostStagingPerDevice[m_cudaDev]), + CudaTraits::ConstantMemoryUsage))); + + if (!constantMemReusablePerDevice[m_cudaDev]) + KOKKOS_IMPL_CUDA_SAFE_CALL( + (cuda_event_create_wrapper(&constantMemReusablePerDevice[m_cudaDev]))); //---------------------------------- // Multiblock reduction uses scratch flags for counters @@ -615,27 +626,21 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default // Init the array for used for arbitrarily sized atomics desul::Impl::init_lock_arrays(); // FIXME - // Allocate a staging buffer for constant mem in pinned host memory and an - // event to avoid overwriting driver for previous kernel launches - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMallocHost( - reinterpret_cast(&Impl::CudaInternal::constantMemHostStaging), - Impl::CudaTraits::ConstantMemoryUsage)); - - KOKKOS_IMPL_CUDA_SAFE_CALL( - cudaEventCreate(&Impl::CudaInternal::constantMemReusable)); - Impl::CudaInternal::singleton().initialize(singleton_stream); } void Cuda::impl_finalize() { (void)Impl::cuda_global_unique_token_locks(true); - desul::Impl::finalize_lock_arrays(); // FIXME - KOKKOS_IMPL_CUDA_SAFE_CALL( - cudaEventDestroy(Impl::CudaInternal::constantMemReusable)); - KOKKOS_IMPL_CUDA_SAFE_CALL( - cudaFreeHost(Impl::CudaInternal::constantMemHostStaging)); + for (const auto cuda_device : Kokkos::Impl::CudaInternal::cuda_devices) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaFreeHost(Kokkos::Impl::CudaInternal::constantMemHostStagingPerDevice + [cuda_device])); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaEventDestroy( + Kokkos::Impl::CudaInternal::constantMemReusablePerDevice[cuda_device])); + } auto &deep_copy_space = Impl::cuda_get_deep_copy_space(/*initialize*/ false); if (deep_copy_space) diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/core/src/Cuda/Kokkos_Cuda_Instance.hpp index 9c452573a51..581817fcb93 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.hpp @@ -23,6 +23,10 @@ #include #include #include "Kokkos_CudaSpace.hpp" + +#include +#include + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- // These functions fulfill the purpose of allowing to work around @@ -116,11 +120,11 @@ class CudaInternal { bool was_initialized = false; bool was_finalized = false; - // FIXME_CUDA: these want to be per-device, not per-stream... use of 'static' - // here will break once there are multiple devices though - inline static unsigned long* constantMemHostStaging = nullptr; - inline static cudaEvent_t constantMemReusable = nullptr; - inline static std::mutex constantMemMutex; + inline static std::set cuda_devices = {}; + inline static std::map constantMemHostStagingPerDevice = + {}; + inline static std::map constantMemReusablePerDevice = {}; + inline static std::map constantMemMutexPerDevice = {}; static CudaInternal& singleton(); @@ -221,12 +225,6 @@ class CudaInternal { return cudaDeviceSetLimit(limit, value); } - template - cudaError_t cuda_device_synchronize_wrapper() const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaDeviceSynchronize(); - } - template cudaError_t cuda_event_create_wrapper(cudaEvent_t* event) const { if constexpr (setCudaDevice) set_cuda_device(); diff --git a/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp index b68eec13a01..1920cc7936a 100644 --- a/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp +++ b/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp @@ -576,13 +576,16 @@ struct CudaParallelLaunchKernelInvoker< static void invoke_kernel(DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem, CudaInternal const* cuda_instance) { + int cuda_device = cuda_instance->m_cudaDev; // Wait until the previous kernel that uses the constant buffer is done - std::lock_guard lock(CudaInternal::constantMemMutex); + std::lock_guard lock( + CudaInternal::constantMemMutexPerDevice[cuda_device]); KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_instance->cuda_event_synchronize_wrapper( - CudaInternal::constantMemReusable))); + CudaInternal::constantMemReusablePerDevice[cuda_device]))); // Copy functor (synchronously) to staging buffer in pinned host memory - unsigned long* staging = cuda_instance->constantMemHostStaging; + unsigned long* staging = + cuda_instance->constantMemHostStagingPerDevice[cuda_device]; memcpy(staging, &driver, sizeof(DriverType)); // Copy functor asynchronously from there to constant memory on the device @@ -597,7 +600,7 @@ struct CudaParallelLaunchKernelInvoker< // Record an event that says when the constant buffer can be reused KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_instance->cuda_event_record_wrapper( - CudaInternal::constantMemReusable))); + CudaInternal::constantMemReusablePerDevice[cuda_device]))); } inline static void create_parallel_launch_graph_node(