kokkos · crtrott · Feb 15, 2024 · Feb 15, 2024 · Feb 15, 2024 · Mar 5, 2024
diff --git a/Makefile.kokkos b/Makefile.kokkos
@@ -11,7 +11,7 @@ KOKKOS_DEVICES ?= "Threads"
 # Options:
 # Intel:    KNC,KNL,SNB,HSW,BDW,SKL,SKX,ICL,ICX,SPR
 # NVIDIA:   Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Ada89,Hopper90
-# ARM:      ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX
+# ARM:      ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX,ARMv9-Grace
 # IBM:      Power8,Power9
 # AMD-GPUS: AMD_GFX906,AMD_GFX908,AMD_GFX90A,AMD_GFX940,AMD_GFX942,AMD_GFX1030,AMD_GFX1100
 # AMD-CPUS: AMDAVX,Zen,Zen2,Zen3
@@ -415,7 +415,8 @@ KOKKOS_INTERNAL_USE_ARCH_ARMV81 := $(call kokkos_has_string,$(KOKKOS_ARCH),ARMv8
 KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX := $(call kokkos_has_string,$(KOKKOS_ARCH),ARMv8-ThunderX)
 KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2 := $(call kokkos_has_string,$(KOKKOS_ARCH),ARMv8-TX2)
 KOKKOS_INTERNAL_USE_ARCH_A64FX := $(call kokkos_has_string,$(KOKKOS_ARCH),A64FX)
-KOKKOS_INTERNAL_USE_ARCH_ARM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_ARMV80)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV81)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2)+$(KOKKOS_INTERNAL_USE_ARCH_A64FX) | bc))
+KOKKOS_INTERNAL_USE_ARCH_ARMV9_GRACE := $(call kokkos_has_string,$(KOKKOS_ARCH),ARMv9-Grace)
+KOKKOS_INTERNAL_USE_ARCH_ARM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_ARMV80)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV81)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2)+$(KOKKOS_INTERNAL_USE_ARCH_A64FX)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV9_GRACE) | bc))
 
 # IBM based.
 KOKKOS_INTERNAL_USE_ARCH_POWER8 := $(call kokkos_has_string,$(KOKKOS_ARCH),Power8)
@@ -778,6 +779,14 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_A64FX), 1)
   endif
 endif
 
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV9_GRACE), 1)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV9_GRACE")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARM_NEON")
+
+  KOKKOS_CXXFLAGS += -mcpu=neoverse-v2  -msve-vector-bits=128
+  KOKKOS_LDFLAGS += -mcpu=neoverse-v2  -msve-vector-bits=128
+endif
+
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN), 1)
   tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_ZEN")
   tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX2")

diff --git a/cmake/KokkosCore_config.h.in b/cmake/KokkosCore_config.h.in
@@ -37,6 +37,8 @@
 #cmakedefine KOKKOS_ENABLE_CUDA_LAMBDA  // deprecated
 #cmakedefine KOKKOS_ENABLE_CUDA_CONSTEXPR
 #cmakedefine KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC
+#cmakedefine KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY
+#cmakedefine KOKKOS_ENABLE_IMPL_CUDA_EMULATE_UNIFIED_MEMORY
 #cmakedefine KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE
 #cmakedefine KOKKOS_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS
 #cmakedefine KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY
@@ -65,6 +67,7 @@
 #cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX
 #cmakedefine KOKKOS_ARCH_ARMV81
 #cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX2
+#cmakedefine KOKKOS_ARCH_ARMV9_GRACE
 #cmakedefine KOKKOS_ARCH_A64FX
 #cmakedefine KOKKOS_ARCH_AVX
 #cmakedefine KOKKOS_ARCH_AVX2

diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake
@@ -28,6 +28,7 @@ KOKKOS_CHECK_DEPRECATED_OPTIONS(
 #-------------------------------------------------------------------------------
 SET(KOKKOS_ARCH_LIST)
 
+include(CheckCXXCompilerFlag)
 
 KOKKOS_DEPRECATED_LIST(ARCH ARCH)
 
@@ -49,6 +50,7 @@ DECLARE_AND_CHECK_HOST_ARCH(ARMV81            "ARMv8.1 Compatible CPU")
 DECLARE_AND_CHECK_HOST_ARCH(ARMV8_THUNDERX    "ARMv8 Cavium ThunderX CPU")
 DECLARE_AND_CHECK_HOST_ARCH(ARMV8_THUNDERX2   "ARMv8 Cavium ThunderX2 CPU")
 DECLARE_AND_CHECK_HOST_ARCH(A64FX             "ARMv8.2 with SVE Support")
+DECLARE_AND_CHECK_HOST_ARCH(ARMV9_GRACE       "ARMv9 NVIDIA Grace CPU")
 DECLARE_AND_CHECK_HOST_ARCH(SNB               "Intel Sandy/Ivy Bridge CPUs")
 DECLARE_AND_CHECK_HOST_ARCH(HSW               "Intel Haswell CPUs")
 DECLARE_AND_CHECK_HOST_ARCH(BDW               "Intel Broadwell Xeon E-class CPUs")
@@ -301,6 +303,32 @@ IF (KOKKOS_ARCH_A64FX)
   )
 ENDIF()
 
+IF (KOKKOS_ARCH_ARMV9_GRACE)
+  SET(KOKKOS_ARCH_ARM_NEON ON)
+  # NVIDIA Grace uses Arm Neoverse V2 cores, which is also the target that
+  # Makefile.kokkos passes. Prefer it, and keep neoverse-n2 as the fallback
+  # for compilers that do not know neoverse-v2 yet.
+  check_cxx_compiler_flag("-mcpu=neoverse-v2" COMPILER_SUPPORTS_NEOVERSE_V2)
+  check_cxx_compiler_flag("-mcpu=neoverse-n2" COMPILER_SUPPORTS_NEOVERSE_N2)
+  check_cxx_compiler_flag("-msve-vector-bits=128" COMPILER_SUPPORTS_SVE_VECTOR_BITS)
+  IF (COMPILER_SUPPORTS_NEOVERSE_V2)
+    SET(KOKKOS_IMPL_GRACE_MCPU -mcpu=neoverse-v2)
+  ELSEIF (COMPILER_SUPPORTS_NEOVERSE_N2)
+    SET(KOKKOS_IMPL_GRACE_MCPU -mcpu=neoverse-n2)
+  ELSE()
+    UNSET(KOKKOS_IMPL_GRACE_MCPU)
+  ENDIF()
+  # Both the CPU target and the fixed 128-bit SVE width must be accepted.
+  IF (DEFINED KOKKOS_IMPL_GRACE_MCPU AND COMPILER_SUPPORTS_SVE_VECTOR_BITS)
+    COMPILER_SPECIFIC_FLAGS(
+      COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
+      DEFAULT ${KOKKOS_IMPL_GRACE_MCPU} -msve-vector-bits=128
+    )
+  ELSE()
+    MESSAGE(WARNING "Compiler does not support ARMv9 Grace architecture")
+  ENDIF()
+ENDIF()
+
 IF (KOKKOS_ARCH_ZEN)
   COMPILER_SPECIFIC_FLAGS(
     COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID

diff --git a/cmake/kokkos_enable_devices.cmake b/cmake/kokkos_enable_devices.cmake
@@ -40,7 +40,6 @@ ELSE()
 ENDIF()
 KOKKOS_DEVICE_OPTION(OPENMP ${OMP_DEFAULT} HOST "Whether to build OpenMP backend")
 
-
 # We want this to default to OFF for cache reasons, but if no
 # host space is given, then activate serial
 IF (KOKKOS_HAS_TRILINOS)

diff --git a/cmake/kokkos_enable_options.cmake b/cmake/kokkos_enable_options.cmake
@@ -48,6 +48,9 @@ KOKKOS_ENABLE_OPTION(CUDA_LAMBDA ${CUDA_LAMBDA_DEFAULT} "Whether to allow lambda
 # resolved but we keep the option around a bit longer to be safe.
 KOKKOS_ENABLE_OPTION(IMPL_CUDA_MALLOC_ASYNC ON  "Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2)")
 KOKKOS_ENABLE_OPTION(IMPL_NVHPC_AS_DEVICE_COMPILER OFF "Whether to allow nvc++ as Cuda device compiler")
+KOKKOS_ENABLE_OPTION(IMPL_CUDA_UNIFIED_MEMORY OFF "Whether to leverage unified memory architectures for CUDA")
+KOKKOS_ENABLE_OPTION(IMPL_CUDA_EMULATE_UNIFIED_MEMORY OFF "Whether to emulate unified memory architectures for CUDA")
+
 KOKKOS_ENABLE_OPTION(DEPRECATED_CODE_4    ON "Whether code deprecated in major release 4 is available" )
 KOKKOS_ENABLE_OPTION(DEPRECATION_WARNINGS ON "Whether to emit deprecation warnings" )
 KOKKOS_ENABLE_OPTION(HIP_RELOCATABLE_DEVICE_CODE  OFF "Whether to enable relocatable device code (RDC) for HIP")

diff --git a/containers/unit_tests/TestWithoutInitializing.hpp b/containers/unit_tests/TestWithoutInitializing.hpp
@@ -37,6 +37,18 @@
 #endif
 ///@}
 
+/// Some tests are skipped for unified memory space
+#if defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY) || \
+    defined(KOKKOS_ENABLE_IMPL_CUDA_EMULATE_UNIFIED_MEMORY)
+#define GTEST_SKIP_IF_UNIFIED_MEMORY_SPACE                               \
+  if constexpr (std::is_same_v<typename TEST_EXECSPACE::memory_space,    \
+                               Kokkos::CudaSpace>)                       \
+    GTEST_SKIP() << "skipping since unified memory requires additional " \
+                    "fences";
+#else
+#define GTEST_SKIP_IF_UNIFIED_MEMORY_SPACE
+#endif
+
 TEST(TEST_CATEGORY, resize_realloc_no_init_dualview) {
   using namespace Kokkos::Test::Tools;
   listen_tool_events(Config::DisableAll(), Config::EnableKernels());
@@ -657,6 +669,7 @@ TEST(TEST_CATEGORY, create_mirror_no_init_dynamicview) {
 
 TEST(TEST_CATEGORY, create_mirror_view_and_copy_dynamicview) {
   GTEST_SKIP_IF_CUDAUVM_MEMORY_SPACE
+  GTEST_SKIP_IF_UNIFIED_MEMORY_SPACE
 
   using namespace Kokkos::Test::Tools;
   listen_tool_events(Config::DisableAll(), Config::EnableKernels(),

diff --git a/core/src/Cuda/Kokkos_CudaSpace.cpp b/core/src/Cuda/Kokkos_CudaSpace.cpp
@@ -31,7 +31,6 @@
 #include <algorithm>
 #include <atomic>
 
-//#include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp>
 #include <impl/Kokkos_Error.hpp>
 
 #include <impl/Kokkos_Tools.hpp>
@@ -184,6 +183,36 @@ void *impl_allocate_common(const int device_id,
   cudaError_t error_code = cudaSuccess;
 #ifndef CUDART_VERSION
 #error CUDART_VERSION undefined!
+#elif defined(KOKKOS_ENABLE_IMPL_CUDA_EMULATE_UNIFIED_MEMORY)
+  // This is intended to simulate Grace-Hopper like behavior
+  error_code = cudaMallocManaged(&ptr, arg_alloc_size, cudaMemAttachGlobal);
+  if (error_code == cudaSuccess) {
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
+  }
+#elif defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
+  // This is intended for Grace-Hopper (and future unified memory architectures)
+  // The idea is to use host allocator and then advise to keep it in HBM on
+  // device, but that requires CUDA 12.2
+  static_assert(CUDART_VERSION >= 12020,
+                "CUDA runtime version >=12.2 required when "
+                "Kokkos_ENABLE_IMPL_CUDA_UNIFIED_MEMORY is set");
+  if (arg_alloc_size) {  // cudaMemAdvise_v2 does not work with nullptr
+    ptr = malloc(arg_alloc_size);
+    if (ptr != nullptr) {
+      // One would think cudaMemLocation{device_id,
+      // cudaMemLocationTypeDevice} would work but it doesn't. I.e. the order of
+      // members doesn't seem to be defined.
+      cudaMemLocation loc;
+      loc.id   = device_id;
+      loc.type = cudaMemLocationTypeDevice;
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemAdvise_v2(
+          ptr, arg_alloc_size, cudaMemAdviseSetPreferredLocation, loc));
+    } else {
+      // I think this is the most logical error to return unless we
+      // want a different mechanism for this code path
+      error_code = cudaErrorMemoryAllocation;
+    }
+  }
 #elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020)
   if (arg_alloc_size >= memory_threshold_g) {
     error_code = cudaMallocAsync(&ptr, arg_alloc_size, stream);
@@ -196,9 +225,13 @@ void *impl_allocate_common(const int device_id,
             "Kokkos::Cuda: backend fence after async malloc");
       }
     }
-  } else
+  } else {
+    error_code = cudaMalloc(&ptr, arg_alloc_size);
+  }
+#else
+  error_code = cudaMalloc(&ptr, arg_alloc_size);
 #endif
-  { error_code = cudaMalloc(&ptr, arg_alloc_size); }
+
   if (error_code != cudaSuccess) {  // TODO tag as unlikely branch
     // This is the only way to clear the last error, which
     // we should do here since we're turning it into an
@@ -344,6 +377,13 @@ void CudaSpace::impl_deallocate(
   try {
 #ifndef CUDART_VERSION
 #error CUDART_VERSION undefined!
+#elif defined(KOKKOS_ENABLE_IMPL_CUDA_EMULATE_UNIFIED_MEMORY)
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
+#elif defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
+    Impl::cuda_device_synchronize(
+        "Kokkos::Cuda: backend fence before unified memory free");
+    free(arg_alloc_ptr);
 #elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020)
     if (arg_alloc_size >= memory_threshold_g) {
       Impl::cuda_device_synchronize(
@@ -463,8 +503,12 @@ void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes,
 
 #include <impl/Kokkos_SharedAlloc_timpl.hpp>
 
+#if !defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
 KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(
     Kokkos::CudaSpace);
+#else
+KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(Kokkos::CudaSpace);
+#endif
 KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(
     Kokkos::CudaUVMSpace);
 KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(

diff --git a/core/src/Cuda/Kokkos_CudaSpace.hpp b/core/src/Cuda/Kokkos_CudaSpace.hpp
@@ -88,6 +88,19 @@ class CudaSpace {
   void* allocate(const char* arg_label, const size_t arg_alloc_size,
                  const size_t arg_logical_size = 0) const;
 
+#if defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
+  template <typename ExecutionSpace>
+  void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const {
+    return allocate(arg_alloc_size);
+  }
+  template <typename ExecutionSpace>
+  void* allocate(const ExecutionSpace&, const char* arg_label,
+                 const size_t arg_alloc_size,
+                 const size_t arg_logical_size = 0) const {
+    return allocate(arg_label, arg_alloc_size, arg_logical_size);
+  }
+#endif
+
   /**\brief  Deallocate untracked memory in the cuda space */
   void deallocate(void* const arg_alloc_ptr, const size_t arg_alloc_size) const;
   void deallocate(const char* arg_label, void* const arg_alloc_ptr,
@@ -337,7 +350,11 @@ static_assert(
 template <>
 struct MemorySpaceAccess<Kokkos::HostSpace, Kokkos::CudaSpace> {
   enum : bool { assignable = false };
-  enum : bool { accessible = false };
+#if !defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
+  enum : bool{accessible = false};
+#else
+  enum : bool { accessible = true };
+#endif
   enum : bool { deepcopy = true };
 };
 
@@ -558,8 +575,12 @@ struct DeepCopy<HostSpace, MemSpace, ExecutionSpace,
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
+#if !defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
 KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION(
     Kokkos::CudaSpace);
+#else
+KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::CudaSpace);
+#endif
 KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::CudaUVMSpace);
 KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::CudaHostPinnedSpace);
 

diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp
@@ -607,6 +607,22 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default
 
   //----------------------------------
 
+#ifdef KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY
+  // Abort early unless the device reports concurrent managed access
+  int cuda_result = 0;
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceGetAttribute(
+      &cuda_result, cudaDevAttrConcurrentManagedAccess, cuda_device_id));
+  if (cuda_result == 0) {
+    Kokkos::abort(
+        "Kokkos::Cuda::initialize ERROR: Unified memory is not available on "
+        "this device\n"
+        "Please recompile Kokkos with "
+        "-DKokkos_ENABLE_IMPL_CUDA_UNIFIED_MEMORY=OFF\n");
+  }
+#endif
+
+  //----------------------------------
+
   cudaStream_t singleton_stream;
   KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device_id));
   KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&singleton_stream));
@@ -705,6 +721,14 @@ void Cuda::print_configuration(std::ostream &os, bool /*verbose*/) const {
 #else
   os << "no\n";
 #endif
+#ifdef KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY
+  os << "  KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY: ";
+  os << "yes\n";
+#endif
+#ifdef KOKKOS_ENABLE_IMPL_CUDA_EMULATE_UNIFIED_MEMORY
+  os << "  KOKKOS_ENABLE_IMPL_CUDA_EMULATE_UNIFIED_MEMORY: ";
+  os << "yes\n";
+#endif
 
   os << "\nCuda Runtime Configuration:\n";
 

diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/core/src/Cuda/Kokkos_Cuda_Instance.hpp
@@ -325,7 +325,18 @@ class CudaInternal {
   template <bool setCudaDevice = true>
   cudaError_t cuda_malloc_wrapper(void** devPtr, size_t size) const {
     if constexpr (setCudaDevice) set_cuda_device();
+#if !defined(KOKKOS_ENABLE_IMPL_CUDA_EMULATE_UNIFIED_MEMORY)
     return cudaMalloc(devPtr, size);
+#else
+    // Emulated unified memory: allocate managed instead of device memory.
+    const cudaError_t error_code =
+        cudaMallocManaged(devPtr, size, cudaMemAttachGlobal);
+    // Only fence after a successful allocation, as impl_allocate_common does.
+    if (error_code == cudaSuccess) {
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
+    }
+    return error_code;
+#endif
   }
 
   template <bool setCudaDevice = true>

diff --git a/core/src/Kokkos_Macros.hpp b/core/src/Kokkos_Macros.hpp
@@ -537,6 +537,15 @@ static constexpr bool kokkos_omp_on_host() { return false; }
 #define KOKKOS_ENABLE_CUDA_LDG_INTRINSIC
 #endif
 
+#if defined(KOKKOS_ENABLE_IMPL_CUDA_EMULATE_UNIFIED_MEMORY)
+#define KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY
+#endif
+
+// TODO: enable the following when we are sure it is the right thing to do
+//#if defined(KOKKOS_ARCH_ARMV9_GRACE) && defined(KOKKOS_ARCH_HOPPER90)
+//#define KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY
+//#endif
+
 #define KOKKOS_INVALID_INDEX (~std::size_t(0))
 
 #define KOKKOS_IMPL_CTOR_DEFAULT_ARG KOKKOS_INVALID_INDEX

diff --git a/core/unit_test/cuda/TestCuda_Spaces.cpp b/core/unit_test/cuda/TestCuda_Spaces.cpp
@@ -39,9 +39,14 @@ TEST(cuda, space_access) {
       !Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace,
                                        Kokkos::CudaSpace>::assignable);
 
+#ifndef KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY
   static_assert(
       !Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace,
                                        Kokkos::CudaSpace>::accessible);
+#else
+  static_assert(Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace,
+                                                Kokkos::CudaSpace>::accessible);
+#endif
 
   static_assert(
       !Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace,
@@ -147,8 +152,13 @@ TEST(cuda, space_access) {
       Kokkos::SpaceAccessibility<Kokkos::Cuda,
                                  Kokkos::CudaHostPinnedSpace>::accessible);
 
+#ifndef KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY
   static_assert(!Kokkos::SpaceAccessibility<Kokkos::HostSpace,
                                             Kokkos::CudaSpace>::accessible);
+#else
+  static_assert(Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                           Kokkos::CudaSpace>::accessible);
+#endif
 
   static_assert(Kokkos::SpaceAccessibility<Kokkos::HostSpace,
                                            Kokkos::CudaUVMSpace>::accessible);
@@ -157,8 +167,14 @@ TEST(cuda, space_access) {
       Kokkos::SpaceAccessibility<Kokkos::HostSpace,
                                  Kokkos::CudaHostPinnedSpace>::accessible);
 
+#ifndef KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY
   static_assert(std::is_same<Kokkos::Impl::HostMirror<Kokkos::CudaSpace>::Space,
                              Kokkos::HostSpace>::value);
+#else
+  static_assert(std::is_same<Kokkos::Impl::HostMirror<Kokkos::CudaSpace>::Space,
+                             Kokkos::Device<Kokkos::HostSpace::execution_space,
+                                            Kokkos::CudaSpace>>::value);
+#endif
 
   static_assert(
       std::is_same<Kokkos::Impl::HostMirror<Kokkos::CudaUVMSpace>::Space,