-
Notifications
You must be signed in to change notification settings - Fork 408
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add support for CUDA unified memory architectures i.e. Grace Hopper #6823
base: develop
Are you sure you want to change the base?
Changes from all commits
4fb242a
18676d1
50f926e
860839e
8ffa9e7
18666ff
97cf64a
b649924
b802336
4f50e66
b32e29d
13016bb
a38d214
25b7a43
52c8ec1
85803bf
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,7 +31,6 @@ | |
#include <algorithm> | ||
#include <atomic> | ||
|
||
//#include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp> | ||
#include <impl/Kokkos_Error.hpp> | ||
|
||
#include <impl/Kokkos_Tools.hpp> | ||
|
@@ -184,6 +183,36 @@ void *impl_allocate_common(const int device_id, | |
cudaError_t error_code = cudaSuccess; | ||
#ifndef CUDART_VERSION | ||
#error CUDART_VERSION undefined! | ||
#elif defined(KOKKOS_ENABLE_IMPL_CUDA_EMULATE_UNIFIED_MEMORY) | ||
// This is inteded to simulate Grace-Hopper like behavior | ||
cedricchevalier19 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
error_code = cudaMallocManaged(&ptr, arg_alloc_size, cudaMemAttachGlobal); | ||
if (error_code == cudaSuccess) { | ||
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize()); | ||
} | ||
#elif defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY) | ||
// This is intended for Grace-Hopper (and future unified memory architectures) | ||
// The idea is to use host allocator and then advise to keep it in HBM on | ||
// device, but that requires CUDA 12.2 | ||
static_assert(CUDART_VERSION >= 12020, | ||
"CUDA runtime version >=12.2 required when " | ||
"Kokkos_ENABLE_IMPL_CUDA_UNIFIED_MEMORY is set"); | ||
if (arg_alloc_size) { // cudaMemAdvise_v2 does not work with nullptr | ||
ptr = malloc(arg_alloc_size); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Do we care ? |
||
if (ptr != nullptr) { | ||
// One would think cudaMemLocation{device_id, | ||
// cudaMemLocationTypeDeivce} would work but it doesn't. I.e. the order of | ||
// members doesn't seem to be defined. | ||
cudaMemLocation loc; | ||
loc.id = device_id; | ||
loc.type = cudaMemLocationTypeDevice; | ||
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemAdvise_v2( | ||
ptr, arg_alloc_size, cudaMemAdviseSetPreferredLocation, loc)); | ||
} else { | ||
// I think this is the most logical error to return unless we | ||
// want a different mechanism for this code path | ||
error_code = cudaErrorMemoryAllocation; | ||
} | ||
} | ||
#elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020) | ||
if (arg_alloc_size >= memory_threshold_g) { | ||
error_code = cudaMallocAsync(&ptr, arg_alloc_size, stream); | ||
|
@@ -196,9 +225,13 @@ void *impl_allocate_common(const int device_id, | |
"Kokkos::Cuda: backend fence after async malloc"); | ||
} | ||
} | ||
} else | ||
} else { | ||
error_code = cudaMalloc(&ptr, arg_alloc_size); | ||
} | ||
#else | ||
error_code = cudaMalloc(&ptr, arg_alloc_size); | ||
#endif | ||
{ error_code = cudaMalloc(&ptr, arg_alloc_size); } | ||
|
||
if (error_code != cudaSuccess) { // TODO tag as unlikely branch | ||
// This is the only way to clear the last error, which | ||
// we should do here since we're turning it into an | ||
|
@@ -344,6 +377,13 @@ void CudaSpace::impl_deallocate( | |
try { | ||
#ifndef CUDART_VERSION | ||
#error CUDART_VERSION undefined! | ||
#elif defined(KOKKOS_ENABLE_IMPL_CUDA_EMULATE_UNIFIED_MEMORY) | ||
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); | ||
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr)); | ||
#elif defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY) | ||
Impl::cuda_device_synchronize( | ||
"Kokkos::Cuda: backend fence before unified memory free"); | ||
free(arg_alloc_ptr); | ||
#elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020) | ||
if (arg_alloc_size >= memory_threshold_g) { | ||
Impl::cuda_device_synchronize( | ||
|
@@ -463,8 +503,12 @@ void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes, | |
|
||
#include <impl/Kokkos_SharedAlloc_timpl.hpp> | ||
|
||
#if !defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY) | ||
KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( | ||
Kokkos::CudaSpace); | ||
#else | ||
KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(Kokkos::CudaSpace); | ||
#endif | ||
KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( | ||
Kokkos::CudaUVMSpace); | ||
KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -537,6 +537,15 @@ static constexpr bool kokkos_omp_on_host() { return false; } | |
#define KOKKOS_ENABLE_CUDA_LDG_INTRINSIC | ||
#endif | ||
|
||
#if defined(KOKKOS_ENABLE_IMPL_CUDA_EMULATE_UNIFIED_MEMORY) | ||
#define KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY | ||
#endif | ||
|
||
// TODO: enable the following when we are sure it is the right thing to do | ||
//#if defined(KOKKOS_ARCH_ARMV9_GRACE) && defined(KOKKOS_ARCH_HOPPER90) | ||
//#define KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY | ||
//#endif | ||
Comment on lines
+544
to
+547
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So we only care about emulating for now? Or do we want to enable this before merging after testing? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I added a cmake option for this so you can enable it explicitly. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK. I still think we should test this in at least one CI build before merging. |
||
|
||
#define KOKKOS_INVALID_INDEX (~std::size_t(0)) | ||
|
||
#define KOKKOS_IMPL_CTOR_DEFAULT_ARG KOKKOS_INVALID_INDEX | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
neoverse-v2
is not a validgcc-12.2
architecture, do we need to protect this flag?