fix CUB inf handling
leofang committed Feb 22, 2024
1 parent c1bd065 · commit ff9f09b
Showing 2 changed files with 104 additions and 46 deletions.
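Reading the diff as a whole: CUB's convenience min/max reductions seed themselves with the extreme value reported by the type's traits, which is finite for float16 and the complex types, so inputs containing only infinities could reduce to a finite value. The fix supplies ±infinity explicitly as the initial value (via new std::numeric_limits specializations) and moves the build to C++17 for the if constexpr dispatch.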
96 changes: 86 additions & 10 deletions cupy/cuda/cupy_cub.cu
@@ -56,6 +56,44 @@ struct FpLimits<complex<double>>
 template <> struct NumericTraits<complex<float>> : BaseTraits<FLOATING_POINT, true, false, unsigned int, complex<float>> {};
 template <> struct NumericTraits<complex<double>> : BaseTraits<FLOATING_POINT, true, false, unsigned long long, complex<double>> {};
 
+// need specializations for initial values
+namespace std {
+
+template <>
+class numeric_limits<thrust::complex<float>> {
+  public:
+    static __host__ __device__ thrust::complex<float> infinity() noexcept {
+        return thrust::complex<float>(std::numeric_limits<float>::infinity(), std::numeric_limits<float>::infinity());
+    }
+
+    static constexpr bool has_infinity = true;
+};
+
+template <>
+class numeric_limits<thrust::complex<double>> {
+  public:
+    static __host__ __device__ thrust::complex<double> infinity() noexcept {
+        return thrust::complex<double>(std::numeric_limits<double>::infinity(), std::numeric_limits<double>::infinity());
+    }
+
+    static constexpr bool has_infinity = true;
+};
+
+template <>
+class numeric_limits<__half> {
+  public:
+    static __host__ __device__ constexpr __half infinity() noexcept {
+        unsigned short inf_half = 0x7C00U;
+        __half inf_value = *reinterpret_cast<__half*>(&inf_half);
+        return inf_value;
+    }
+
+    static constexpr bool has_infinity = true;
+};
+
+} // namespace std
+
+
 #else
 
 // hipCUB internally uses std::numeric_limits, so we should provide specializations for the complex numbers.
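A note on why the dispatch below stays well-formed for every instantiated T: the primary std::numeric_limits template already defines has_infinity as false for arbitrary (including integer) types, so only float, double, and the three types specialized above ever take the infinity branch. A minimal host-side illustration (opaque_t is a hypothetical stand-in, not part of this commit):

    #include <cstdio>
    #include <limits>

    struct opaque_t {};  // a type with no numeric_limits specialization

    int main() {
        // The primary template reports has_infinity == false, so code like
        // the reduction functors below can branch on it safely for any T.
        std::printf("float: %d, opaque: %d\n",
                    std::numeric_limits<float>::has_infinity,      // prints 1
                    std::numeric_limits<opaque_t>::has_infinity);  // prints 0
        return 0;
    }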
@@ -669,8 +707,17 @@ struct _cub_reduce_min {
     void operator()(void* workspace, size_t& workspace_size, void* x, void* y,
         int num_items, cudaStream_t s)
     {
-        DeviceReduce::Min(workspace, workspace_size, static_cast<T*>(x),
-            static_cast<T*>(y), num_items, s);
+        if constexpr (std::numeric_limits<T>::has_infinity)
+        {
+            DeviceReduce::Reduce(workspace, workspace_size, static_cast<T*>(x),
+                static_cast<T*>(y), num_items,
+                cub::Min(), std::numeric_limits<T>::infinity(), s);
+        }
+        else
+        {
+            DeviceReduce::Min(workspace, workspace_size, static_cast<T*>(x),
+                static_cast<T*>(y), num_items, s);
+        }
     }
 };
 
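Why route through the generic Reduce overload at all: DeviceReduce::Min's convenience overload seeds the reduction with the type's maximum as reported by CUB's traits, and for __half that maximum is the finite 65504 (the complex specializations are likewise finite), so an input consisting only of +inf would reduce to the finite seed rather than to inf. Seeding explicitly with +infinity restores the true identity of min; the Max variants below symmetrically seed with -infinity. A host-side sketch of the failure mode, with the CUB call replaced by a plain loop:

    #include <algorithm>
    #include <cstdio>
    #include <limits>

    int main() {
        const float inf = std::numeric_limits<float>::infinity();
        const float data[] = {inf, inf};

        // Seeding with a finite maximum (65504 is __half's largest finite
        // value) absorbs the infinities: min(65504, inf) == 65504.
        float bad = 65504.0f;
        for (float x : data) bad = std::min(bad, x);    // -> 65504 (wrong)

        // Seeding with +inf is the true identity: min(inf, x) == x.
        float good = inf;
        for (float x : data) good = std::min(good, x);  // -> inf (correct)

        std::printf("bad=%f good=%f\n", bad, good);
        return 0;
    }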
@@ -679,9 +726,19 @@ struct _cub_segmented_reduce_min {
     void operator()(void* workspace, size_t& workspace_size, void* x, void* y,
         int num_segments, seg_offset_itr offset_start, cudaStream_t s)
     {
-        DeviceSegmentedReduce::Min(workspace, workspace_size,
-            static_cast<T*>(x), static_cast<T*>(y), num_segments,
-            offset_start, offset_start+1, s);
+        if constexpr (std::numeric_limits<T>::has_infinity)
+        {
+            DeviceSegmentedReduce::Reduce(workspace, workspace_size,
+                static_cast<T*>(x), static_cast<T*>(y), num_segments,
+                offset_start, offset_start+1,
+                cub::Min(), std::numeric_limits<T>::infinity(), s);
+        }
+        else
+        {
+            DeviceSegmentedReduce::Min(workspace, workspace_size,
+                static_cast<T*>(x), static_cast<T*>(y), num_segments,
+                offset_start, offset_start+1, s);
+        }
     }
 };
 
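For context on the workspace/workspace_size pair threaded through every functor here: CUB reductions are two-phase. Called with a null workspace they only report the required temporary-storage size; called again with the allocation they actually run. A self-contained sketch of the pattern used in the has_infinity branch (min_reduce is a hypothetical wrapper; cudaMallocAsync/cudaFreeAsync assume CUDA 11.2+, which matches the build floor set in _compiler.py below; error checks omitted):

    #include <cub/cub.cuh>
    #include <limits>

    void min_reduce(const float* d_in, float* d_out, int n, cudaStream_t s) {
        void* d_temp = nullptr;
        size_t temp_bytes = 0;
        // Phase 1: d_temp == nullptr, so CUB only writes the size needed.
        cub::DeviceReduce::Reduce(d_temp, temp_bytes, d_in, d_out, n,
                                  cub::Min(), std::numeric_limits<float>::infinity(), s);
        cudaMallocAsync(&d_temp, temp_bytes, s);
        // Phase 2: with storage in hand, the reduction actually runs.
        cub::DeviceReduce::Reduce(d_temp, temp_bytes, d_in, d_out, n,
                                  cub::Min(), std::numeric_limits<float>::infinity(), s);
        cudaFreeAsync(d_temp, s);
    }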
@@ -693,8 +750,17 @@ struct _cub_reduce_max {
     void operator()(void* workspace, size_t& workspace_size, void* x, void* y,
         int num_items, cudaStream_t s)
     {
-        DeviceReduce::Max(workspace, workspace_size, static_cast<T*>(x),
-            static_cast<T*>(y), num_items, s);
+        if constexpr (std::numeric_limits<T>::has_infinity)
+        {
+            DeviceReduce::Reduce(workspace, workspace_size, static_cast<T*>(x),
+                static_cast<T*>(y), num_items,
+                cub::Max(), -std::numeric_limits<T>::infinity(), s);
+        }
+        else
+        {
+            DeviceReduce::Max(workspace, workspace_size, static_cast<T*>(x),
+                static_cast<T*>(y), num_items, s);
+        }
     }
 };
 
@@ -703,9 +769,19 @@ struct _cub_segmented_reduce_max {
     void operator()(void* workspace, size_t& workspace_size, void* x, void* y,
         int num_segments, seg_offset_itr offset_start, cudaStream_t s)
     {
-        DeviceSegmentedReduce::Max(workspace, workspace_size,
-            static_cast<T*>(x), static_cast<T*>(y), num_segments,
-            offset_start, offset_start+1, s);
+        if constexpr (std::numeric_limits<T>::has_infinity)
+        {
+            DeviceSegmentedReduce::Reduce(workspace, workspace_size,
+                static_cast<T*>(x), static_cast<T*>(y), num_segments,
+                offset_start, offset_start+1,
+                cub::Max(), -std::numeric_limits<T>::infinity(), s);
+        }
+        else
+        {
+            DeviceSegmentedReduce::Max(workspace, workspace_size,
+                static_cast<T*>(x), static_cast<T*>(y), num_segments,
+                offset_start, offset_start+1, s);
+        }
     }
 };
 
54 changes: 18 additions & 36 deletions install/cupy_builder/_compiler.py
@@ -133,7 +133,7 @@ def _nvcc_gencode_options(cuda_version: int) -> List[str]:
             ('compute_72', 'sm_72'),  # Jetson (Xavier)
             ('compute_87', 'sm_87'),  # Jetson (Orin)
         ]
-    elif cuda_version >= 11010:
+    elif cuda_version >= 11020:
         arch_list = ['compute_35',
                      'compute_50',
                      ('compute_60', 'sm_60'),
@@ -143,23 +143,6 @@ def _nvcc_gencode_options(cuda_version: int) -> List[str]:
                      ('compute_80', 'sm_80'),
                      ('compute_86', 'sm_86'),
                      'compute_86']
-    elif cuda_version >= 11000:
-        arch_list = ['compute_35',
-                     'compute_50',
-                     ('compute_60', 'sm_60'),
-                     ('compute_61', 'sm_61'),
-                     ('compute_70', 'sm_70'),
-                     ('compute_75', 'sm_75'),
-                     ('compute_80', 'sm_80'),
-                     'compute_80']
-    elif cuda_version >= 10000:
-        arch_list = ['compute_30',
-                     'compute_50',
-                     ('compute_60', 'sm_60'),
-                     ('compute_61', 'sm_61'),
-                     ('compute_70', 'sm_70'),
-                     ('compute_75', 'sm_75'),
-                     'compute_70']
     else:
         # This should not happen.
         assert False
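For readers skimming the arch lists: if memory serves, _nvcc_gencode_options expands each (compute_XX, sm_XX) tuple into --generate-code=arch=compute_XX,code=sm_XX (prebuilt SASS for that GPU) and each bare 'compute_XX' string into --generate-code=arch=compute_XX,code=compute_XX (PTX, JIT-compiled on newer GPUs). The pre-11.2 branches can be dropped because, per the notes below, CuPy v13 only supports CUDA 11.2+.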
@@ -215,14 +198,14 @@ def _compile_unix_nvcc(self, obj: str, src: str, ext: Extension) -> None:
 
         cuda_version = self._context.features['cuda'].get_version()
         postargs = _nvcc_gencode_options(cuda_version) + [
-            '-Xfatbin=-compress-all', '-O2', '--compiler-options="-fPIC"']
-        if cuda_version >= 11020:
-            postargs += ['--std=c++14']
-            num_threads = int(os.environ.get('CUPY_NUM_NVCC_THREADS', '2'))
-            postargs += [f'-t{num_threads}']
-        else:
-            postargs += ['--std=c++11']
-        postargs += ['-Xcompiler=-fno-gnu-unique']
+            '-Xfatbin=-compress-all', '-O2', '--compiler-options="-fPIC"',
+            '--expt-relaxed-constexpr']
+        num_threads = int(os.environ.get('CUPY_NUM_NVCC_THREADS', '2'))
+        # Note: we only support CUDA 11.2+ since CuPy v13.0.0.
+        # Bumping C++ standard from C++14 to C++17 for "if constexpr"
+        postargs += ['--std=c++17',
+                     f'-t{num_threads}',
+                     '-Xcompiler=-fno-gnu-unique']
         print('NVCC options:', postargs)
         self.spawn(compiler_so + base_opts + cc_args + [src, '-o', obj] +
                    postargs)
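Both new flags tie directly to the cupy_cub.cu changes above: --std=c++17 is what makes the if constexpr dispatch legal, and --expt-relaxed-constexpr is what lets device code call host constexpr functions such as std::numeric_limits<float>::infinity(). A minimal illustration (fill_identity is a hypothetical kernel, not part of this commit):

    #include <limits>

    // Compiles with: nvcc --std=c++17 --expt-relaxed-constexpr demo.cu
    template <typename T>
    __global__ void fill_identity(T* out) {
        if constexpr (std::numeric_limits<T>::has_infinity) {  // C++17
            // Calling a host constexpr function from device code requires
            // --expt-relaxed-constexpr.
            *out = std::numeric_limits<T>::infinity();
        } else {
            *out = std::numeric_limits<T>::max();
        }
    }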
@@ -257,17 +240,16 @@ def compile(self, obj: str, src: str, ext: Extension) -> None:
         cuda_version = self._context.features['cuda'].get_version()
         postargs = _nvcc_gencode_options(cuda_version) + [
             '-Xfatbin=-compress-all', '-O2']
-        if cuda_version >= 11020:
-            # MSVC 14.0 (2015) is deprecated for CUDA 11.2 but we need it
-            # to build CuPy because some Python versions were built using it.
-            # REF: https://wiki.python.org/moin/WindowsCompilers
-            postargs += ['-allow-unsupported-compiler']
+        # Note: we only support CUDA 11.2+ since CuPy v13.0.0.
+        # MSVC 14.0 (2015) is deprecated for CUDA 11.2 but we need it
+        # to build CuPy because some Python versions were built using it.
+        # REF: https://wiki.python.org/moin/WindowsCompilers
+        postargs += ['-allow-unsupported-compiler']
         postargs += ['-Xcompiler', '/MD', '-D_USE_MATH_DEFINES']
-        # This is to compile thrust with MSVC2015
-        if cuda_version >= 11020:
-            postargs += ['--std=c++14']
-        num_threads = int(os.environ.get('CUPY_NUM_NVCC_THREADS', '2'))
-        postargs += [f'-t{num_threads}']
+        # Bumping C++ standard from C++14 to C++17 for "if constexpr"
+        num_threads = int(os.environ.get('CUPY_NUM_NVCC_THREADS', '2'))
+        postargs += ['--std=c++17',
+                     f'-t{num_threads}']
         cl_exe_path = self._find_host_compiler_path()
         if cl_exe_path is not None:
             print(f'Using host compiler at {cl_exe_path}')
