Skip to content

Commit

Permalink
Deprecate {Cuda,HIP}::detect_device_count() and `Cuda::[detect_]dev…
Browse files Browse the repository at this point in the history
…ice_arch()` (kokkos#6710)

* CUDA/HIP: Inline getting device count

get_device_count() -> int cannot be generic. Get rid of the extra
indirection because it brings no benefit.

* Get rid of CudaInternalDevices and cleanup Cuda::print_configuration()

* Get rid of cuda_get_device_{count,properties}_wrapper()

* Missed a few CudaInternalDevices and get rid of Cuda::detect_device_arch()

* Get rid of Cuda::device_arch()

* Fixup

Co-authored-by: Bruno Turcksin <bruno.turcksin@gmail.com>

* Don’t mess with Voodoo

* Be more conservative and deprecate before removing

* Clang-format for suggestion made on GH

* Remove stray const qualifier

* Forgot that device_arch() was static

* Who Let the Bugs Out??

* The same thing we do every night, Pinky - try to take over the world!

* this should fix it

---------

Co-authored-by: Bruno Turcksin <bruno.turcksin@gmail.com>
  • Loading branch information
dalg24 and Rombur committed Jan 12, 2024
1 parent 9393b35 commit c75d730
Show file tree
Hide file tree
Showing 6 changed files with 50 additions and 163 deletions.
25 changes: 22 additions & 3 deletions core/src/Cuda/Kokkos_Cuda.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -178,18 +178,37 @@ class Cuda {
//! Initialize, telling the CUDA run-time library which device to use.
static void impl_initialize(InitializationSettings const&);

#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
/// \brief Cuda device architecture of the selected device.
///
/// This matches the __CUDA_ARCH__ specification.
static size_type device_arch();
KOKKOS_DEPRECATED static size_type device_arch() {
const cudaDeviceProp& cudaProp = Cuda().cuda_device_prop();
return cudaProp.major * 100 + cudaProp.minor;
}

//! Query device count.
static size_type detect_device_count();
KOKKOS_DEPRECATED static size_type detect_device_count() {
int count;
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&count));
return count;
}

/** \brief Detect the available devices and their architecture
* as defined by the __CUDA_ARCH__ specification.
*/
static std::vector<unsigned> detect_device_arch();
KOKKOS_DEPRECATED static std::vector<unsigned> detect_device_arch() {
int count;
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&count));
std::vector<unsigned> out;
for (int i = 0; i < count; ++i) {
cudaDeviceProp prop;
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceProperties(&prop, i));
out.push_back(prop.major * 100 + prop.minor);
}
return out;
}
#endif

cudaStream_t cuda_stream() const;
int cuda_device() const;
Expand Down
149 changes: 15 additions & 134 deletions core/src/Cuda/Kokkos_Cuda_Instance.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -208,96 +208,6 @@ void cuda_internal_error_abort(cudaError e, const char *name, const char *file,
host_abort(out.str().c_str());
}

//----------------------------------------------------------------------------
// Some significant cuda device properties:
//
// cudaDeviceProp::name : Text label for device
// cudaDeviceProp::major : Device major number
// cudaDeviceProp::minor : Device minor number
// cudaDeviceProp::warpSize : number of threads per warp
// cudaDeviceProp::multiProcessorCount : number of multiprocessors
// cudaDeviceProp::sharedMemPerBlock : capacity of shared memory per block
// cudaDeviceProp::totalConstMem : capacity of constant memory
// cudaDeviceProp::totalGlobalMem : capacity of global memory
// cudaDeviceProp::maxGridSize[3] : maximum grid size

//
// Section 4.4.2.4 of the CUDA Toolkit Reference Manual
//
// struct cudaDeviceProp {
// char name[256];
// size_t totalGlobalMem;
// size_t sharedMemPerBlock;
// int regsPerBlock;
// int warpSize;
// size_t memPitch;
// int maxThreadsPerBlock;
// int maxThreadsDim[3];
// int maxGridSize[3];
// size_t totalConstMem;
// int major;
// int minor;
// int clockRate;
// size_t textureAlignment;
// int deviceOverlap;
// int multiProcessorCount;
// int kernelExecTimeoutEnabled;
// int integrated;
// int canMapHostMemory;
// int computeMode;
// int concurrentKernels;
// int ECCEnabled;
// int pciBusID;
// int pciDeviceID;
// int tccDriver;
// int asyncEngineCount;
// int unifiedAddressing;
// int memoryClockRate;
// int memoryBusWidth;
// int l2CacheSize;
// int maxThreadsPerMultiProcessor;
// };

namespace {

class CudaInternalDevices {
public:
enum { MAXIMUM_DEVICE_COUNT = 64 };
struct cudaDeviceProp m_cudaProp[MAXIMUM_DEVICE_COUNT];
int m_cudaDevCount;

CudaInternalDevices();

static const CudaInternalDevices &singleton();
};

CudaInternalDevices::CudaInternalDevices() {
// See 'cudaSetDeviceFlags' for host-device thread interaction
// Section 4.4.2.6 of the CUDA Toolkit Reference Manual

KOKKOS_IMPL_CUDA_SAFE_CALL(
(CudaInternal::singleton().cuda_get_device_count_wrapper<false>(
&m_cudaDevCount)));

if (m_cudaDevCount > MAXIMUM_DEVICE_COUNT) {
Kokkos::abort(
"Sorry, you have more GPUs per node than we thought anybody would ever "
"have. Please report this to github.com/kokkos/kokkos.");
}
for (int i = 0; i < m_cudaDevCount; ++i) {
KOKKOS_IMPL_CUDA_SAFE_CALL(
(CudaInternal::singleton().cuda_get_device_properties_wrapper<false>(
m_cudaProp + i, i)));
}
}

const CudaInternalDevices &CudaInternalDevices::singleton() {
static CudaInternalDevices self;
return self;
}

} // namespace

//----------------------------------------------------------------------------

int Impl::CudaInternal::concurrency() {
Expand All @@ -307,8 +217,6 @@ int Impl::CudaInternal::concurrency() {
}

void CudaInternal::print_configuration(std::ostream &s) const {
const CudaInternalDevices &dev_info = CudaInternalDevices::singleton();

#if defined(KOKKOS_ENABLE_CUDA)
s << "macro KOKKOS_ENABLE_CUDA : defined\n";
#endif
Expand All @@ -317,15 +225,19 @@ void CudaInternal::print_configuration(std::ostream &s) const {
<< CUDA_VERSION / 1000 << "." << (CUDA_VERSION % 1000) / 10 << '\n';
#endif

for (int i = 0; i < dev_info.m_cudaDevCount; ++i) {
s << "Kokkos::Cuda[ " << i << " ] " << dev_info.m_cudaProp[i].name
<< " capability " << dev_info.m_cudaProp[i].major << "."
<< dev_info.m_cudaProp[i].minor << ", Total Global Memory: "
<< human_memory_size(dev_info.m_cudaProp[i].totalGlobalMem)
int count;
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&count));

for (int i = 0; i < count; ++i) {
cudaDeviceProp prop;
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceProperties(&prop, i));
s << "Kokkos::Cuda[ " << i << " ] " << prop.name << " capability "
<< prop.major << "." << prop.minor
<< ", Total Global Memory: " << human_memory_size(prop.totalGlobalMem)
<< ", Shared Memory per Block: "
<< human_memory_size(dev_info.m_cudaProp[i].sharedMemPerBlock);
<< human_memory_size(prop.sharedMemPerBlock);
if (m_cudaDev == i) s << " : Selected";
s << std::endl;
s << '\n';
}
}

Expand Down Expand Up @@ -666,10 +578,6 @@ Cuda::size_type *cuda_internal_scratch_unified(const Cuda &instance,

namespace Kokkos {

Cuda::size_type Cuda::detect_device_count() {
return Impl::CudaInternalDevices::singleton().m_cudaDevCount;
}

#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
int Cuda::concurrency() {
#else
Expand All @@ -684,11 +592,11 @@ int Cuda::impl_is_initialized() {

void Cuda::impl_initialize(InitializationSettings const &settings) {
const int cuda_device_id = Impl::get_gpu(settings);
const auto &dev_info = Impl::CudaInternalDevices::singleton();

const struct cudaDeviceProp &cudaProp = dev_info.m_cudaProp[cuda_device_id];
Impl::CudaInternal::m_deviceProp = cudaProp;

cudaDeviceProp cudaProp;
KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaGetDeviceProperties(&cudaProp, cuda_device_id));
Impl::CudaInternal::m_deviceProp = cudaProp;
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device_id));
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());

Expand Down Expand Up @@ -765,33 +673,6 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default
/*manage*/ true);
}

std::vector<unsigned> Cuda::detect_device_arch() {
const Impl::CudaInternalDevices &s = Impl::CudaInternalDevices::singleton();

std::vector<unsigned> output(s.m_cudaDevCount);

for (int i = 0; i < s.m_cudaDevCount; ++i) {
output[i] = s.m_cudaProp[i].major * 100 + s.m_cudaProp[i].minor;
}

return output;
}

Cuda::size_type Cuda::device_arch() {
const int dev_id = Impl::CudaInternal::singleton().m_cudaDev;

int dev_arch = 0;

if (0 <= dev_id) {
const struct cudaDeviceProp &cudaProp =
Impl::CudaInternalDevices::singleton().m_cudaProp[dev_id];

dev_arch = cudaProp.major * 100 + cudaProp.minor;
}

return dev_arch;
}

void Cuda::impl_finalize() { Impl::CudaInternal::singleton().finalize(); }

Cuda::Cuda()
Expand Down
13 changes: 0 additions & 13 deletions core/src/Cuda/Kokkos_Cuda_Instance.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -265,19 +265,6 @@ class CudaInternal {
return cudaFreeHost(ptr);
}

template <bool setCudaDevice = true>
cudaError_t cuda_get_device_count_wrapper(int* count) const {
if constexpr (setCudaDevice) set_cuda_device();
return cudaGetDeviceCount(count);
}

template <bool setCudaDevice = true>
cudaError_t cuda_get_device_properties_wrapper(cudaDeviceProp* prop,
int device) const {
if constexpr (setCudaDevice) set_cuda_device();
return cudaGetDeviceProperties(prop, device);
}

template <bool setCudaDevice = true>
const char* cuda_get_error_name_wrapper(cudaError_t error) const {
if constexpr (setCudaDevice) set_cuda_device();
Expand Down
10 changes: 7 additions & 3 deletions core/src/HIP/Kokkos_HIP.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,9 +94,13 @@ class HIP {

static int impl_is_initialized();

// static size_type device_arch();

static size_type detect_device_count();
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
KOKKOS_DEPRECATED static size_type detect_device_count() {
int count;
KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&count));
return count;
}
#endif

#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
static int concurrency();
Expand Down
8 changes: 0 additions & 8 deletions core/src/HIP/Kokkos_HIP_Instance.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -420,12 +420,4 @@ void Kokkos::Impl::create_HIP_instances(std::vector<HIP> &instances) {
}
}

//----------------------------------------------------------------------------

namespace Kokkos {
HIP::size_type HIP::detect_device_count() {
int hipDevCount;
KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&hipDevCount));
return hipDevCount;
}
} // namespace Kokkos
8 changes: 6 additions & 2 deletions core/src/impl/Kokkos_Core.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -131,9 +131,13 @@ void combine(Kokkos::Tools::InitArguments& out,

int get_device_count() {
#if defined(KOKKOS_ENABLE_CUDA)
return Kokkos::Cuda::detect_device_count();
int count;
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&count));
return count;
#elif defined(KOKKOS_ENABLE_HIP)
return Kokkos::HIP::detect_device_count();
int count;
KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&count));
return count;
#elif defined(KOKKOS_ENABLE_SYCL)
return sycl::device::get_devices(sycl::info::device_type::gpu).size();
#elif defined(KOKKOS_ENABLE_OPENACC)
Expand Down

0 comments on commit c75d730

Please sign in to comment.