fix CUB inf handling
leofang committed Feb 22, 2024
1 parent c1bd065 · commit ff9f09b
Showing 2 changed files with 104 additions and 46 deletions.
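Reading the diff as a whole: CUB's convenience min/max reductions seed themselves with the extreme value reported by the type's traits, which is finite for float16 and the complex types, so inputs containing only infinities could reduce to a finite value. The fix supplies ±infinity explicitly as the initial value (via new std::numeric_limits specializations) and moves the build to C++17 for the if constexpr dispatch.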
96 changes: 86 additions & 10 deletions cupy/cuda/cupy_cub.cu
@@ -56,6 +56,44 @@ struct FpLimits<complex<double>>
 template <> struct NumericTraits<complex<float>> : BaseTraits<FLOATING_POINT, true, false, unsigned int, complex<float>> {};
 template <> struct NumericTraits<complex<double>> : BaseTraits<FLOATING_POINT, true, false, unsigned long long, complex<double>> {};
 
+// need specializations for initial values
+namespace std {
+
+template <>
+class numeric_limits<thrust::complex<float>> {
+  public:
+    static __host__ __device__ thrust::complex<float> infinity() noexcept {
+        return thrust::complex<float>(std::numeric_limits<float>::infinity(), std::numeric_limits<float>::infinity());
+    }
+
+    static constexpr bool has_infinity = true;
+};
+
+template <>
+class numeric_limits<thrust::complex<double>> {
+  public:
+    static __host__ __device__ thrust::complex<double> infinity() noexcept {
+        return thrust::complex<double>(std::numeric_limits<double>::infinity(), std::numeric_limits<double>::infinity());
+    }
+
+    static constexpr bool has_infinity = true;
+};
+
+template <>
+class numeric_limits<__half> {
+  public:
+    static __host__ __device__ constexpr __half infinity() noexcept {
+        unsigned short inf_half = 0x7C00U;
+        __half inf_value = *reinterpret_cast<__half*>(&inf_half);
+        return inf_value;
+    }
+
+    static constexpr bool has_infinity = true;
+};
+
+} // namespace std
+
+
 #else
 
 // hipCUB internally uses std::numeric_limits, so we should provide specializations for the complex numbers.
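A note on why the dispatch below stays well-formed for every instantiated T: the primary std::numeric_limits template already defines has_infinity as false for arbitrary (including integer) types, so only float, double, and the three types specialized above ever take the infinity branch. A minimal host-side illustration (opaque_t is a hypothetical stand-in, not part of this commit):

    #include <cstdio>
    #include <limits>

    struct opaque_t {};  // a type with no numeric_limits specialization

    int main() {
        // The primary template reports has_infinity == false, so code like
        // the reduction functors below can branch on it safely for any T.
        std::printf("float: %d, opaque: %d\n",
                    std::numeric_limits<float>::has_infinity,      // prints 1
                    std::numeric_limits<opaque_t>::has_infinity);  // prints 0
        return 0;
    }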
@@ -669,8 +707,17 @@ struct _cub_reduce_min {
     void operator()(void* workspace, size_t& workspace_size, void* x, void* y,
         int num_items, cudaStream_t s)
     {
-        DeviceReduce::Min(workspace, workspace_size, static_cast<T*>(x),
-            static_cast<T*>(y), num_items, s);
+        if constexpr (std::numeric_limits<T>::has_infinity)
+        {
+            DeviceReduce::Reduce(workspace, workspace_size, static_cast<T*>(x),
+                static_cast<T*>(y), num_items,
+                cub::Min(), std::numeric_limits<T>::infinity(), s);
+        }
+        else
+        {
+            DeviceReduce::Min(workspace, workspace_size, static_cast<T*>(x),
+                static_cast<T*>(y), num_items, s);
+        }
     }
 };
 
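Why route through the generic Reduce overload at all: DeviceReduce::Min's convenience overload seeds the reduction with the type's maximum as reported by CUB's traits, and for __half that maximum is the finite 65504 (the complex specializations are likewise finite), so an input consisting only of +inf would reduce to the finite seed rather than to inf. Seeding explicitly with +infinity restores the true identity of min; the Max variants below symmetrically seed with -infinity. A host-side sketch of the failure mode, with the CUB call replaced by a plain loop:

    #include <algorithm>
    #include <cstdio>
    #include <limits>

    int main() {
        const float inf = std::numeric_limits<float>::infinity();
        const float data[] = {inf, inf};

        // Seeding with a finite maximum (65504 is __half's largest finite
        // value) absorbs the infinities: min(65504, inf) == 65504.
        float bad = 65504.0f;
        for (float x : data) bad = std::min(bad, x);    // -> 65504 (wrong)

        // Seeding with +inf is the true identity: min(inf, x) == x.
        float good = inf;
        for (float x : data) good = std::min(good, x);  // -> inf (correct)

        std::printf("bad=%f good=%f\n", bad, good);
        return 0;
    }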
@@ -679,9 +726,19 @@ struct _cub_segmented_reduce_min {
     void operator()(void* workspace, size_t& workspace_size, void* x, void* y,
         int num_segments, seg_offset_itr offset_start, cudaStream_t s)
     {
-        DeviceSegmentedReduce::Min(workspace, workspace_size,
-            static_cast<T*>(x), static_cast<T*>(y), num_segments,
-            offset_start, offset_start+1, s);
+        if constexpr (std::numeric_limits<T>::has_infinity)
+        {
+            DeviceSegmentedReduce::Reduce(workspace, workspace_size,
+                static_cast<T*>(x), static_cast<T*>(y), num_segments,
+                offset_start, offset_start+1,
+                cub::Min(), std::numeric_limits<T>::infinity(), s);
+        }
+        else
+        {
+            DeviceSegmentedReduce::Min(workspace, workspace_size,
+                static_cast<T*>(x), static_cast<T*>(y), num_segments,
+                offset_start, offset_start+1, s);
+        }
     }
 };
 
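For context on the workspace/workspace_size pair threaded through every functor here: CUB reductions are two-phase. Called with a null workspace they only report the required temporary-storage size; called again with the allocation they actually run. A self-contained sketch of the pattern used in the has_infinity branch (min_reduce is a hypothetical wrapper; cudaMallocAsync/cudaFreeAsync assume CUDA 11.2+, which matches the build floor set in _compiler.py below; error checks omitted):

    #include <cub/cub.cuh>
    #include <limits>

    void min_reduce(const float* d_in, float* d_out, int n, cudaStream_t s) {
        void* d_temp = nullptr;
        size_t temp_bytes = 0;
        // Phase 1: d_temp == nullptr, so CUB only writes the size needed.
        cub::DeviceReduce::Reduce(d_temp, temp_bytes, d_in, d_out, n,
                                  cub::Min(), std::numeric_limits<float>::infinity(), s);
        cudaMallocAsync(&d_temp, temp_bytes, s);
        // Phase 2: with storage in hand, the reduction actually runs.
        cub::DeviceReduce::Reduce(d_temp, temp_bytes, d_in, d_out, n,
                                  cub::Min(), std::numeric_limits<float>::infinity(), s);
        cudaFreeAsync(d_temp, s);
    }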
@@ -693,8 +750,17 @@ struct _cub_reduce_max {
     void operator()(void* workspace, size_t& workspace_size, void* x, void* y,
         int num_items, cudaStream_t s)
     {
-        DeviceReduce::Max(workspace, workspace_size, static_cast<T*>(x),
-            static_cast<T*>(y), num_items, s);
+        if constexpr (std::numeric_limits<T>::has_infinity)
+        {
+            DeviceReduce::Reduce(workspace, workspace_size, static_cast<T*>(x),
+                static_cast<T*>(y), num_items,
+                cub::Max(), -std::numeric_limits<T>::infinity(), s);
+        }
+        else
+        {
+            DeviceReduce::Max(workspace, workspace_size, static_cast<T*>(x),
+                static_cast<T*>(y), num_items, s);
+        }
     }
 };
 
@@ -703,9 +769,19 @@ struct _cub_segmented_reduce_max {
     void operator()(void* workspace, size_t& workspace_size, void* x, void* y,
         int num_segments, seg_offset_itr offset_start, cudaStream_t s)
     {
-        DeviceSegmentedReduce::Max(workspace, workspace_size,
-            static_cast<T*>(x), static_cast<T*>(y), num_segments,
-            offset_start, offset_start+1, s);
+        if constexpr (std::numeric_limits<T>::has_infinity)
+        {
+            DeviceSegmentedReduce::Reduce(workspace, workspace_size,
+                static_cast<T*>(x), static_cast<T*>(y), num_segments,
+                offset_start, offset_start+1,
+                cub::Max(), -std::numeric_limits<T>::infinity(), s);
+        }
+        else
+        {
+            DeviceSegmentedReduce::Max(workspace, workspace_size,
+                static_cast<T*>(x), static_cast<T*>(y), num_segments,
+                offset_start, offset_start+1, s);
+        }
     }
 };
 
54 changes: 18 additions & 36 deletions install/cupy_builder/_compiler.py
@@ -133,7 +133,7 @@ def _nvcc_gencode_options(cuda_version: int) -> List[str]:
             ('compute_72', 'sm_72'),  # Jetson (Xavier)
             ('compute_87', 'sm_87'),  # Jetson (Orin)
         ]
-    elif cuda_version >= 11010:
+    elif cuda_version >= 11020:
         arch_list = ['compute_35',
                      'compute_50',
                      ('compute_60', 'sm_60'),
@@ -143,23 +143,6 @@ def _nvcc_gencode_options(cuda_version: int) -> List[str]:
                      ('compute_80', 'sm_80'),
                      ('compute_86', 'sm_86'),
                      'compute_86']
-    elif cuda_version >= 11000:
-        arch_list = ['compute_35',
-                     'compute_50',
-                     ('compute_60', 'sm_60'),
-                     ('compute_61', 'sm_61'),
-                     ('compute_70', 'sm_70'),
-                     ('compute_75', 'sm_75'),
-                     ('compute_80', 'sm_80'),
-                     'compute_80']
-    elif cuda_version >= 10000:
-        arch_list = ['compute_30',
-                     'compute_50',
-                     ('compute_60', 'sm_60'),
-                     ('compute_61', 'sm_61'),
-                     ('compute_70', 'sm_70'),
-                     ('compute_75', 'sm_75'),
-                     'compute_70']
     else:
         # This should not happen.
         assert False
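For readers skimming the arch lists: if memory serves, _nvcc_gencode_options expands each (compute_XX, sm_XX) tuple into --generate-code=arch=compute_XX,code=sm_XX (prebuilt SASS for that GPU) and each bare 'compute_XX' string into --generate-code=arch=compute_XX,code=compute_XX (PTX, JIT-compiled on newer GPUs). The pre-11.2 branches can be dropped because, per the notes below, CuPy v13 only supports CUDA 11.2+.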
@@ -215,14 +198,14 @@ def _compile_unix_nvcc(self, obj: str, src: str, ext: Extension) -> None:
 
         cuda_version = self._context.features['cuda'].get_version()
         postargs = _nvcc_gencode_options(cuda_version) + [
-            '-Xfatbin=-compress-all', '-O2', '--compiler-options="-fPIC"']
-        if cuda_version >= 11020:
-            postargs += ['--std=c++14']
-            num_threads = int(os.environ.get('CUPY_NUM_NVCC_THREADS', '2'))
-            postargs += [f'-t{num_threads}']
-        else:
-            postargs += ['--std=c++11']
-        postargs += ['-Xcompiler=-fno-gnu-unique']
+            '-Xfatbin=-compress-all', '-O2', '--compiler-options="-fPIC"',
+            '--expt-relaxed-constexpr']
+        num_threads = int(os.environ.get('CUPY_NUM_NVCC_THREADS', '2'))
+        # Note: we only support CUDA 11.2+ since CuPy v13.0.0.
+        # Bumping C++ standard from C++14 to C++17 for "if constexpr"
+        postargs += ['--std=c++17',
+                     f'-t{num_threads}',
+                     '-Xcompiler=-fno-gnu-unique']
         print('NVCC options:', postargs)
         self.spawn(compiler_so + base_opts + cc_args + [src, '-o', obj] +
                    postargs)
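Both new flags tie directly to the cupy_cub.cu changes above: --std=c++17 is what makes the if constexpr dispatch legal, and --expt-relaxed-constexpr is what lets device code call host constexpr functions such as std::numeric_limits<float>::infinity(). A minimal illustration (fill_identity is a hypothetical kernel, not part of this commit):

    #include <limits>

    // Compiles with: nvcc --std=c++17 --expt-relaxed-constexpr demo.cu
    template <typename T>
    __global__ void fill_identity(T* out) {
        if constexpr (std::numeric_limits<T>::has_infinity) {  // C++17
            // Calling a host constexpr function from device code requires
            // --expt-relaxed-constexpr.
            *out = std::numeric_limits<T>::infinity();
        } else {
            *out = std::numeric_limits<T>::max();
        }
    }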
@@ -257,17 +240,16 @@ def compile(self, obj: str, src: str, ext: Extension) -> None:
         cuda_version = self._context.features['cuda'].get_version()
         postargs = _nvcc_gencode_options(cuda_version) + [
             '-Xfatbin=-compress-all', '-O2']
-        if cuda_version >= 11020:
-            # MSVC 14.0 (2015) is deprecated for CUDA 11.2 but we need it
-            # to build CuPy because some Python versions were built using it.
-            # REF: https://wiki.python.org/moin/WindowsCompilers
-            postargs += ['-allow-unsupported-compiler']
+        # Note: we only support CUDA 11.2+ since CuPy v13.0.0.
+        # MSVC 14.0 (2015) is deprecated for CUDA 11.2 but we need it
+        # to build CuPy because some Python versions were built using it.
+        # REF: https://wiki.python.org/moin/WindowsCompilers
+        postargs += ['-allow-unsupported-compiler']
         postargs += ['-Xcompiler', '/MD', '-D_USE_MATH_DEFINES']
-        # This is to compile thrust with MSVC2015
-        if cuda_version >= 11020:
-            postargs += ['--std=c++14']
-        num_threads = int(os.environ.get('CUPY_NUM_NVCC_THREADS', '2'))
-        postargs += [f'-t{num_threads}']
+        # Bumping C++ standard from C++14 to C++17 for "if constexpr"
+        num_threads = int(os.environ.get('CUPY_NUM_NVCC_THREADS', '2'))
+        postargs += ['--std=c++17',
+                     f'-t{num_threads}']
         cl_exe_path = self._find_host_compiler_path()
         if cl_exe_path is not None:
             print(f'Using host compiler at {cl_exe_path}')
