Deprecate {Cuda,HIP}::detect_device_count() and `Cuda::[detect_]dev…

…ice_arch()` (kokkos#6710) * CUDA/HIP: Inline getting device count get_device_count() -> int cannot be generic. Get rid of the extra indirection because it brings no benefit. * Get rid of CudaInternalDevices and cleanup Cuda::print_configuration() * Get rid of cuda_get_device_{count,properties}_wrapper() * Missed a few CudaInternalDevices and get rid of Cuda::detect_device_arch() * Get rid of Cuda::device_arch() * Fixup Co-authored-by: Bruno Turcksin <bruno.turcksin@gmail.com> * Don’t mess with Voodoo * Be more conservative and deprecate before removing * Clang-format for suggestion made on GH * Remove stray const qualifier * Forgot that device_arch() was static * Who Let the Bugs Out?? * The same thing we do every night, Pinky - try to take over the world! * this should fix it --------- Co-authored-by: Bruno Turcksin <bruno.turcksin@gmail.com>
japlews · Jan 12, 2024 · c75d730 · c75d730
1 parent 9393b35
commit c75d730
Show file tree

Hide file tree

Showing 6 changed files with 50 additions and 163 deletions.
diff --git a/core/src/Cuda/Kokkos_Cuda.hpp b/core/src/Cuda/Kokkos_Cuda.hpp
@@ -178,18 +178,37 @@ class Cuda {
   //! Initialize, telling the CUDA run-time library which device to use.
   static void impl_initialize(InitializationSettings const&);
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
   /// \brief Cuda device architecture of the selected device.
   ///
   /// This matches the __CUDA_ARCH__ specification.
-  static size_type device_arch();
+  KOKKOS_DEPRECATED static size_type device_arch() {
+    const cudaDeviceProp& cudaProp = Cuda().cuda_device_prop();
+    return cudaProp.major * 100 + cudaProp.minor;
+  }
 
   //! Query device count.
-  static size_type detect_device_count();
+  KOKKOS_DEPRECATED static size_type detect_device_count() {
+    int count;
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&count));
+    return count;
+  }
 
   /** \brief  Detect the available devices and their architecture
    *          as defined by the __CUDA_ARCH__ specification.
    */
-  static std::vector<unsigned> detect_device_arch();
+  KOKKOS_DEPRECATED static std::vector<unsigned> detect_device_arch() {
+    int count;
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&count));
+    std::vector<unsigned> out;
+    for (int i = 0; i < count; ++i) {
+      cudaDeviceProp prop;
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceProperties(&prop, i));
+      out.push_back(prop.major * 100 + prop.minor);
+    }
+    return out;
+  }
+#endif
 
   cudaStream_t cuda_stream() const;
   int cuda_device() const;

diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp
@@ -208,96 +208,6 @@ void cuda_internal_error_abort(cudaError e, const char *name, const char *file,
   host_abort(out.str().c_str());
 }
 
-//----------------------------------------------------------------------------
-// Some significant cuda device properties:
-//
-// cudaDeviceProp::name                : Text label for device
-// cudaDeviceProp::major               : Device major number
-// cudaDeviceProp::minor               : Device minor number
-// cudaDeviceProp::warpSize            : number of threads per warp
-// cudaDeviceProp::multiProcessorCount : number of multiprocessors
-// cudaDeviceProp::sharedMemPerBlock   : capacity of shared memory per block
-// cudaDeviceProp::totalConstMem       : capacity of constant memory
-// cudaDeviceProp::totalGlobalMem      : capacity of global memory
-// cudaDeviceProp::maxGridSize[3]      : maximum grid size
-
-//
-//  Section 4.4.2.4 of the CUDA Toolkit Reference Manual
-//
-// struct cudaDeviceProp {
-//   char name[256];
-//   size_t totalGlobalMem;
-//   size_t sharedMemPerBlock;
-//   int regsPerBlock;
-//   int warpSize;
-//   size_t memPitch;
-//   int maxThreadsPerBlock;
-//   int maxThreadsDim[3];
-//   int maxGridSize[3];
-//   size_t totalConstMem;
-//   int major;
-//   int minor;
-//   int clockRate;
-//   size_t textureAlignment;
-//   int deviceOverlap;
-//   int multiProcessorCount;
-//   int kernelExecTimeoutEnabled;
-//   int integrated;
-//   int canMapHostMemory;
-//   int computeMode;
-//   int concurrentKernels;
-//   int ECCEnabled;
-//   int pciBusID;
-//   int pciDeviceID;
-//   int tccDriver;
-//   int asyncEngineCount;
-//   int unifiedAddressing;
-//   int memoryClockRate;
-//   int memoryBusWidth;
-//   int l2CacheSize;
-//   int maxThreadsPerMultiProcessor;
-// };
-
-namespace {
-
-class CudaInternalDevices {
- public:
-  enum { MAXIMUM_DEVICE_COUNT = 64 };
-  struct cudaDeviceProp m_cudaProp[MAXIMUM_DEVICE_COUNT];
-  int m_cudaDevCount;
-
-  CudaInternalDevices();
-
-  static const CudaInternalDevices &singleton();
-};
-
-CudaInternalDevices::CudaInternalDevices() {
-  // See 'cudaSetDeviceFlags' for host-device thread interaction
-  // Section 4.4.2.6 of the CUDA Toolkit Reference Manual
-
-  KOKKOS_IMPL_CUDA_SAFE_CALL(
-      (CudaInternal::singleton().cuda_get_device_count_wrapper<false>(
-          &m_cudaDevCount)));
-
-  if (m_cudaDevCount > MAXIMUM_DEVICE_COUNT) {
-    Kokkos::abort(
-        "Sorry, you have more GPUs per node than we thought anybody would ever "
-        "have. Please report this to github.com/kokkos/kokkos.");
-  }
-  for (int i = 0; i < m_cudaDevCount; ++i) {
-    KOKKOS_IMPL_CUDA_SAFE_CALL(
-        (CudaInternal::singleton().cuda_get_device_properties_wrapper<false>(
-            m_cudaProp + i, i)));
-  }
-}
-
-const CudaInternalDevices &CudaInternalDevices::singleton() {
-  static CudaInternalDevices self;
-  return self;
-}
-
-}  // namespace
-
 //----------------------------------------------------------------------------
 
 int Impl::CudaInternal::concurrency() {
@@ -307,8 +217,6 @@ int Impl::CudaInternal::concurrency() {
 }
 
 void CudaInternal::print_configuration(std::ostream &s) const {
-  const CudaInternalDevices &dev_info = CudaInternalDevices::singleton();
-
 #if defined(KOKKOS_ENABLE_CUDA)
   s << "macro  KOKKOS_ENABLE_CUDA      : defined\n";
 #endif
@@ -317,15 +225,19 @@ void CudaInternal::print_configuration(std::ostream &s) const {
     << CUDA_VERSION / 1000 << "." << (CUDA_VERSION % 1000) / 10 << '\n';
 #endif
 
-  for (int i = 0; i < dev_info.m_cudaDevCount; ++i) {
-    s << "Kokkos::Cuda[ " << i << " ] " << dev_info.m_cudaProp[i].name
-      << " capability " << dev_info.m_cudaProp[i].major << "."
-      << dev_info.m_cudaProp[i].minor << ", Total Global Memory: "
-      << human_memory_size(dev_info.m_cudaProp[i].totalGlobalMem)
+  int count;
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&count));
+
+  for (int i = 0; i < count; ++i) {
+    cudaDeviceProp prop;
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceProperties(&prop, i));
+    s << "Kokkos::Cuda[ " << i << " ] " << prop.name << " capability "
+      << prop.major << "." << prop.minor
+      << ", Total Global Memory: " << human_memory_size(prop.totalGlobalMem)
       << ", Shared Memory per Block: "
-      << human_memory_size(dev_info.m_cudaProp[i].sharedMemPerBlock);
+      << human_memory_size(prop.sharedMemPerBlock);
     if (m_cudaDev == i) s << " : Selected";
-    s << std::endl;
+    s << '\n';
   }
 }
 
@@ -666,10 +578,6 @@ Cuda::size_type *cuda_internal_scratch_unified(const Cuda &instance,
 
 namespace Kokkos {
 
-Cuda::size_type Cuda::detect_device_count() {
-  return Impl::CudaInternalDevices::singleton().m_cudaDevCount;
-}
-
 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
 int Cuda::concurrency() {
 #else
@@ -684,11 +592,11 @@ int Cuda::impl_is_initialized() {
 
 void Cuda::impl_initialize(InitializationSettings const &settings) {
   const int cuda_device_id = Impl::get_gpu(settings);
-  const auto &dev_info     = Impl::CudaInternalDevices::singleton();
-
-  const struct cudaDeviceProp &cudaProp = dev_info.m_cudaProp[cuda_device_id];
-  Impl::CudaInternal::m_deviceProp      = cudaProp;
 
+  cudaDeviceProp cudaProp;
+  KOKKOS_IMPL_CUDA_SAFE_CALL(
+      cudaGetDeviceProperties(&cudaProp, cuda_device_id));
+  Impl::CudaInternal::m_deviceProp = cudaProp;
   KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device_id));
   KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
 
@@ -765,33 +673,6 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default
                                              /*manage*/ true);
 }
 
-std::vector<unsigned> Cuda::detect_device_arch() {
-  const Impl::CudaInternalDevices &s = Impl::CudaInternalDevices::singleton();
-
-  std::vector<unsigned> output(s.m_cudaDevCount);
-
-  for (int i = 0; i < s.m_cudaDevCount; ++i) {
-    output[i] = s.m_cudaProp[i].major * 100 + s.m_cudaProp[i].minor;
-  }
-
-  return output;
-}
-
-Cuda::size_type Cuda::device_arch() {
-  const int dev_id = Impl::CudaInternal::singleton().m_cudaDev;
-
-  int dev_arch = 0;
-
-  if (0 <= dev_id) {
-    const struct cudaDeviceProp &cudaProp =
-        Impl::CudaInternalDevices::singleton().m_cudaProp[dev_id];
-
-    dev_arch = cudaProp.major * 100 + cudaProp.minor;
-  }
-
-  return dev_arch;
-}
-
 void Cuda::impl_finalize() { Impl::CudaInternal::singleton().finalize(); }
 
 Cuda::Cuda()

diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/core/src/Cuda/Kokkos_Cuda_Instance.hpp
@@ -265,19 +265,6 @@ class CudaInternal {
     return cudaFreeHost(ptr);
   }
 
-  template <bool setCudaDevice = true>
-  cudaError_t cuda_get_device_count_wrapper(int* count) const {
-    if constexpr (setCudaDevice) set_cuda_device();
-    return cudaGetDeviceCount(count);
-  }
-
-  template <bool setCudaDevice = true>
-  cudaError_t cuda_get_device_properties_wrapper(cudaDeviceProp* prop,
-                                                 int device) const {
-    if constexpr (setCudaDevice) set_cuda_device();
-    return cudaGetDeviceProperties(prop, device);
-  }
-
   template <bool setCudaDevice = true>
   const char* cuda_get_error_name_wrapper(cudaError_t error) const {
     if constexpr (setCudaDevice) set_cuda_device();

diff --git a/core/src/HIP/Kokkos_HIP.hpp b/core/src/HIP/Kokkos_HIP.hpp
@@ -94,9 +94,13 @@ class HIP {
 
   static int impl_is_initialized();
 
-  //  static size_type device_arch();
-
-  static size_type detect_device_count();
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
+  KOKKOS_DEPRECATED static size_type detect_device_count() {
+    int count;
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&count));
+    return count;
+  }
+#endif
 
 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
   static int concurrency();

diff --git a/core/src/HIP/Kokkos_HIP_Instance.cpp b/core/src/HIP/Kokkos_HIP_Instance.cpp
@@ -420,12 +420,4 @@ void Kokkos::Impl::create_HIP_instances(std::vector<HIP> &instances) {
   }
 }
 
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-HIP::size_type HIP::detect_device_count() {
-  int hipDevCount;
-  KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&hipDevCount));
-  return hipDevCount;
-}
 }  // namespace Kokkos
diff --git a/core/src/impl/Kokkos_Core.cpp b/core/src/impl/Kokkos_Core.cpp
@@ -131,9 +131,13 @@ void combine(Kokkos::Tools::InitArguments& out,
 
 int get_device_count() {
 #if defined(KOKKOS_ENABLE_CUDA)
-  return Kokkos::Cuda::detect_device_count();
+  int count;
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&count));
+  return count;
 #elif defined(KOKKOS_ENABLE_HIP)
-  return Kokkos::HIP::detect_device_count();
+  int count;
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&count));
+  return count;
 #elif defined(KOKKOS_ENABLE_SYCL)
   return sycl::device::get_devices(sycl::info::device_type::gpu).size();
 #elif defined(KOKKOS_ENABLE_OPENACC)