#5641: Fix HIP & CUDA MDRange reduce for sizeof(value_type) < sizeof(int) #5745

Merged: 10 commits, Mar 11, 2023
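As a minimal sketch of the kind of reduction this PR fixes (illustrative only; the bound 7000 and the index 42 are arbitrary choices for this example, not values taken from the PR):

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::ScopeGuard guard(argc, argv);

  // Logical-OR reduction over a 2D range with a 1-byte value type (bool).
  // Before this fix, the CUDA and HIP backends sized and indexed their
  // reduction scratch buffers in 4-byte words, which could corrupt results
  // whenever sizeof(value_type) < sizeof(int).
  bool found = false;
  Kokkos::parallel_reduce(
      Kokkos::MDRangePolicy<Kokkos::Rank<2>>({0, 0}, {7000, 2}),
      KOKKOS_LAMBDA(const int i, const int, bool& res) {
        res = res || (i == 42);
      },
      Kokkos::LOr<bool>(found));
  // Expected: found == true, since i == 42 occurs in the iteration range.
  return found ? 0 : 1;
}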
56 changes: 39 additions & 17 deletions core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
@@ -212,16 +212,33 @@ class ParallelReduce<CombinedFunctorReducerType,
using size_type = Cuda::size_type;
using reducer_type = ReducerType;

+  // Conditionally set word_size_type to int16_t or int8_t if value_type is
+  // smaller than int32_t (Kokkos::Cuda::size_type).
+  // word_size_type determines the word count and the shared- and
+  // global-memory buffer sizes before the reduction is performed. Within
+  // the reduction, the word count is recomputed from word_size_type, which
+  // is also used to compute the indices into the shared/global memory
+  // buffers.
+  // For scalars > 4 bytes in size, indexing into shared/global memory
+  // relies on the block and grid dimensions to ensure that we index at the
+  // correct offset rather than at every 4-byte word, so that when the join
+  // is performed we have the correct data that was copied over in 4-byte
+  // chunks.
+  static_assert(sizeof(size_type) == 4);
+  using word_size_type = std::conditional_t<
+      sizeof(value_type) < 4,
+      std::conditional_t<sizeof(value_type) == 2, int16_t, int8_t>, size_type>;

// Algorithmic constraints: blockSize is a power of two AND blockDim.y ==
// blockDim.z == 1

const CombinedFunctorReducerType m_functor_reducer;
const Policy m_policy; // used for workrange and nwork
const pointer_type m_result_ptr;
const bool m_result_ptr_device_accessible;
-  size_type* m_scratch_space;
+  word_size_type* m_scratch_space;
   size_type* m_scratch_flags;
-  size_type* m_unified_space;
+  word_size_type* m_unified_space;

using DeviceIteratePattern = typename Kokkos::Impl::Reduce::DeviceIterateTile<
Policy::rank, Policy, FunctorType, typename Policy::work_tag,
@@ -248,21 +265,22 @@ class ParallelReduce<CombinedFunctorReducerType,
}

inline __device__ void operator()() const {
-    const integral_nonzero_constant<
-        size_type, ReducerType::static_value_size() / sizeof(size_type)>
+    const integral_nonzero_constant<word_size_type,
+                                    ReducerType::static_value_size() /
+                                        sizeof(word_size_type)>
         word_count(m_functor_reducer.get_reducer().value_size() /
-                   sizeof(size_type));
+                   sizeof(word_size_type));

{
reference_type value =
m_functor_reducer.get_reducer().init(reinterpret_cast<pointer_type>(
-          kokkos_impl_cuda_shared_memory<size_type>() +
+          kokkos_impl_cuda_shared_memory<word_size_type>() +
threadIdx.y * word_count.value));

// Number of blocks is bounded so that the reduction can be limited to two
// passes. Each thread block is given an approximately equal amount of
// work to perform. Accumulate the values for this block. The accumulation
-      // ordering does not match the final pass, but is arithmatically
+      // ordering does not match the final pass, but is arithmetically
// equivalent.

this->exec_range(value);
@@ -272,15 +290,16 @@ class ParallelReduce<CombinedFunctorReducerType,
// Problem: non power-of-two blockDim
if (cuda_single_inter_block_reduce_scan<false>(
m_functor_reducer.get_reducer(), blockIdx.x, gridDim.x,
-            kokkos_impl_cuda_shared_memory<size_type>(), m_scratch_space,
+            kokkos_impl_cuda_shared_memory<word_size_type>(), m_scratch_space,
m_scratch_flags)) {
// This is the final block with the final result at the final threads'
// location
-      size_type* const shared = kokkos_impl_cuda_shared_memory<size_type>() +
-                                (blockDim.y - 1) * word_count.value;
-      size_type* const global =
+      word_size_type* const shared =
+          kokkos_impl_cuda_shared_memory<word_size_type>() +
+          (blockDim.y - 1) * word_count.value;
+      word_size_type* const global =
           m_result_ptr_device_accessible
-              ? reinterpret_cast<size_type*>(m_result_ptr)
+              ? reinterpret_cast<word_size_type*>(m_result_ptr)
: (m_unified_space ? m_unified_space : m_scratch_space);

if (threadIdx.y == 0) {
@@ -342,13 +361,16 @@ class ParallelReduce<CombinedFunctorReducerType,
: suggested_blocksize; // Note: block_size must be less
// than or equal to 512

-    m_scratch_space = cuda_internal_scratch_space(
-        m_policy.space(), m_functor_reducer.get_reducer().value_size() *
-                              block_size /* block_size == max block_count */);
+    m_scratch_space =
+        reinterpret_cast<word_size_type*>(cuda_internal_scratch_space(
+            m_policy.space(),
+            m_functor_reducer.get_reducer().value_size() *
+                block_size /* block_size == max block_count */));
     m_scratch_flags =
         cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type));
-    m_unified_space = cuda_internal_scratch_unified(
-        m_policy.space(), m_functor_reducer.get_reducer().value_size());
+    m_unified_space =
+        reinterpret_cast<word_size_type*>(cuda_internal_scratch_unified(
+            m_policy.space(), m_functor_reducer.get_reducer().value_size()));

// REQUIRED ( 1 , N , 1 )
const dim3 block(1, block_size, 1);
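As a standalone illustration of the word_size_type selection introduced above (a sketch assuming size_type is a 4-byte unsigned integer, as the static_assert requires; all names below are local to this sketch, not Kokkos identifiers):

#include <cstdint>
#include <type_traits>

using size_type = unsigned int;  // stand-in for Kokkos::Cuda::size_type
static_assert(sizeof(size_type) == 4);

template <class value_type>
using word_size_type = std::conditional_t<
    sizeof(value_type) < 4,
    std::conditional_t<sizeof(value_type) == 2, std::int16_t, std::int8_t>,
    size_type>;

// 1-byte scalars now reduce through 1-byte words, 2-byte scalars through
// 2-byte words; anything >= 4 bytes keeps the original 4-byte size_type.
static_assert(std::is_same_v<word_size_type<bool>, std::int8_t>);
static_assert(std::is_same_v<word_size_type<std::int16_t>, std::int16_t>);
static_assert(std::is_same_v<word_size_type<std::int32_t>, size_type>);
static_assert(std::is_same_v<word_size_type<double>, size_type>);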
6 changes: 3 additions & 3 deletions core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
@@ -203,7 +203,7 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>,
// Number of blocks is bounded so that the reduction can be limited to two
// passes. Each thread block is given an approximately equal amount of
// work to perform. Accumulate the values for this block. The accumulation
-    // ordering does not match the final pass, but is arithmatically
+    // ordering does not match the final pass, but is arithmetically
// equivalent.

const WorkRange range(m_policy, blockIdx.x, gridDim.x);
@@ -463,7 +463,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
// Number of blocks is bounded so that the reduction can be limited to two
// passes. Each thread block is given an approximately equal amount of work
// to perform. Accumulate the values for this block. The accumulation
-  // ordering does not match the final pass, but is arithmatically equivalent.
+  // ordering does not match the final pass, but is arithmetically equivalent.

const WorkRange range(m_policy, blockIdx.x, gridDim.x);

@@ -780,7 +780,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
// Number of blocks is bounded so that the reduction can be limited to two
// passes. Each thread block is given an approximately equal amount of work
// to perform. Accumulate the values for this block. The accumulation
-  // ordering does not match the final pass, but is arithmatically equivalent.
+  // ordering does not match the final pass, but is arithmetically equivalent.

const WorkRange range(m_policy, blockIdx.x, gridDim.x);

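The "two passes" these comments refer to can be sketched with a simplified CUDA kernel (a sketch of the general pattern only, not the Kokkos implementation; it assumes a power-of-two blockDim.x, a plain int sum, and a per_block buffer with one slot per block):

#include <cuda_runtime.h>

__device__ unsigned int blocks_done = 0;

__global__ void reduce_two_pass(const int* data, int n, int* per_block,
                                int* result) {
  extern __shared__ int shared[];

  // Pass 1: grid-stride accumulation gives each block an approximately
  // equal share of the work; the ordering differs from a serial sum but
  // is arithmetically equivalent for an associative join.
  int local = 0;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x)
    local += data[i];
  shared[threadIdx.x] = local;
  __syncthreads();

  // In-block tree reduction (blockDim.x assumed to be a power of two).
  for (int s = blockDim.x / 2; s > 0; s >>= 1) {
    if (threadIdx.x < s) shared[threadIdx.x] += shared[threadIdx.x + s];
    __syncthreads();
  }

  if (threadIdx.x == 0) {
    per_block[blockIdx.x] = shared[0];
    __threadfence();  // publish the partial before signaling completion
    // Pass 2: the last block to finish combines the per-block partials.
    if (atomicAdd(&blocks_done, 1u) == gridDim.x - 1) {
      int total = 0;
      for (unsigned int b = 0; b < gridDim.x; ++b) total += per_block[b];
      *result = total;
    }
  }
}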
52 changes: 37 additions & 15 deletions core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp
@@ -188,14 +188,31 @@ class ParallelReduce<CombinedFunctorReducerType,
using functor_type = FunctorType;
using size_type = HIP::size_type;

+  // Conditionally set word_size_type to int16_t or int8_t if value_type is
+  // smaller than int32_t (Kokkos::HIP::size_type).
+  // word_size_type determines the word count and the shared- and
+  // global-memory buffer sizes before the reduction is performed. Within
+  // the reduction, the word count is recomputed from word_size_type, which
+  // is also used to compute the indices into the shared/global memory
+  // buffers.
+  // For scalars > 4 bytes in size, indexing into shared/global memory
+  // relies on the block and grid dimensions to ensure that we index at the
+  // correct offset rather than at every 4-byte word, so that when the join
+  // is performed we have the correct data that was copied over in 4-byte
+  // chunks.
+  static_assert(sizeof(size_type) == 4);
+  using word_size_type = std::conditional_t<
+      sizeof(value_type) < 4,
+      std::conditional_t<sizeof(value_type) == 2, int16_t, int8_t>, size_type>;

// Algorithmic constraints: blockSize is a power of two AND blockDim.y ==
// blockDim.z == 1

const CombinedFunctorReducerType m_functor_reducer;
const Policy m_policy; // used for workrange and nwork
const pointer_type m_result_ptr;
const bool m_result_ptr_device_accessible;
-  size_type* m_scratch_space;
+  word_size_type* m_scratch_space;
size_type* m_scratch_flags;

using DeviceIteratePattern = typename Kokkos::Impl::Reduce::DeviceIterateTile<
@@ -210,19 +227,20 @@
inline __device__ void operator()() const {
const ReducerType& reducer = m_functor_reducer.get_reducer();

-    const integral_nonzero_constant<
-        size_type, ReducerType::static_value_size() / sizeof(size_type)>
-        word_count(reducer.value_size() / sizeof(size_type));
+    const integral_nonzero_constant<word_size_type,
+                                    ReducerType::static_value_size() /
+                                        sizeof(word_size_type)>
+        word_count(reducer.value_size() / sizeof(word_size_type));

{
reference_type value = reducer.init(reinterpret_cast<pointer_type>(
-          kokkos_impl_hip_shared_memory<size_type>() +
+          kokkos_impl_hip_shared_memory<word_size_type>() +
threadIdx.y * word_count.value));

// Number of blocks is bounded so that the reduction can be limited to two
// passes. Each thread block is given an approximately equal amount of
// work to perform. Accumulate the values for this block. The accumulation
-      // ordering does not match the final pass, but is arithmatically
+      // ordering does not match the final pass, but is arithmetically
// equivalent.

this->exec_range(value);
@@ -232,15 +250,17 @@
// Problem: non power-of-two blockDim
if (::Kokkos::Impl::hip_single_inter_block_reduce_scan<false>(
reducer, blockIdx.x, gridDim.x,
-            kokkos_impl_hip_shared_memory<size_type>(), m_scratch_space,
+            kokkos_impl_hip_shared_memory<word_size_type>(), m_scratch_space,
m_scratch_flags)) {
// This is the final block with the final result at the final threads'
// location
-      size_type* const shared = kokkos_impl_hip_shared_memory<size_type>() +
-                                (blockDim.y - 1) * word_count.value;
-      size_type* const global = m_result_ptr_device_accessible
-                                    ? reinterpret_cast<size_type*>(m_result_ptr)
-                                    : m_scratch_space;
+      word_size_type* const shared =
+          kokkos_impl_hip_shared_memory<word_size_type>() +
+          (blockDim.y - 1) * word_count.value;
+      word_size_type* const global =
+          m_result_ptr_device_accessible
+              ? reinterpret_cast<word_size_type*>(m_result_ptr)
+              : m_scratch_space;

if (threadIdx.y == 0) {
reducer.final(reinterpret_cast<value_type*>(shared));
@@ -294,9 +314,11 @@
: suggested_blocksize; // Note: block_size must be less
// than or equal to 512

-    m_scratch_space = hip_internal_scratch_space(
-        m_policy.space(), reducer.value_size() *
-                              block_size /* block_size == max block_count */);
+    m_scratch_space =
+        reinterpret_cast<word_size_type*>(hip_internal_scratch_space(
+            m_policy.space(),
+            reducer.value_size() *
+                block_size /* block_size == max block_count */));
m_scratch_flags =
hip_internal_scratch_flags(m_policy.space(), sizeof(size_type));

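To see why the old 4-byte word arithmetic failed for small scalars, a tiny word-count illustration in plain C++ (independent of Kokkos; the 4 and 1 below are the old and new word sizes for a bool):

#include <cstdio>

int main() {
  // Old scheme: reduction words were always 4-byte size_type, so for a
  // 1-byte value type each thread got value_size / 4 == 0 scratch words.
  std::printf("old word_count for bool: %zu\n", sizeof(bool) / 4);  // 0
  // New scheme: the word type shrinks to match the value type, so a
  // 1-byte value occupies exactly one 1-byte word.
  std::printf("new word_count for bool: %zu\n", sizeof(bool) / 1);  // 1
}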
1 change: 1 addition & 0 deletions core/unit_test/CMakeLists.txt
@@ -169,6 +169,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL)
MDRange_f
MDRange_g
MDRangePolicyConstructors
+    MDRangeReduce
MDSpan
MinMaxClamp
NumericTraits
68 changes: 68 additions & 0 deletions core/unit_test/TestMDRangeReduce.hpp
@@ -0,0 +1,68 @@
//@HEADER
// ************************************************************************
//
// Kokkos v. 4.0
// Copyright (2022) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
// See https://kokkos.org/LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//@HEADER

#include <gtest/gtest.h>

#include <Kokkos_Core.hpp>

namespace {

template <typename T>
void MDRangeReduceTester([[maybe_unused]] int bound, int k) {
const auto policy_MD = Kokkos::MDRangePolicy<Kokkos::Rank<2>, TEST_EXECSPACE>(
{0, 0}, {bound, 2});

// No explicit fence() calls needed because result is in HostSpace
{
    // Stick just a few true values into the Logical-OR reduction space,
    // to try to make sure every value is being captured
    T lor_MD = 0;
Kokkos::parallel_reduce(
policy_MD,
KOKKOS_LAMBDA(const int i, const int, T& res) { res = res || i == k; },
Kokkos::LOr<T>(lor_MD));
EXPECT_EQ(lor_MD, 1);
    // Review note from a maintainer on this assertion: consider putting the
    // type and the argument values into the stream from the assertion to
    // make failure messages more useful.
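    // One way to act on that note (illustrative, not part of the patch):
    //   EXPECT_EQ(lor_MD, 1) << "T=" << typeid(T).name()
    //                        << " bound=" << bound << " k=" << k;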

}
{
    // Every iterate writes true, so the Logical-AND reduction only stays
    // true if no contribution was dropped
    T land_MD = 0;
Kokkos::parallel_reduce(
policy_MD, KOKKOS_LAMBDA(const int, const int, T& res) { res = 1; },
Kokkos::LAnd<T>(land_MD));
EXPECT_EQ(land_MD, 1);
}
}

TEST(TEST_CATEGORY, mdrange_parallel_reduce_primitive_types) {
#if defined(KOKKOS_ENABLE_OPENMPTARGET)
GTEST_SKIP() << "FIXME OPENMPTARGET Tests of MDRange reduce over values "
"smaller than int would fail";
#elif defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_CUDA_LAMBDA)
GTEST_SKIP() << "Skipped ENABLE_CUDA_LAMBDA";
#else
for (int bound : {0, 1, 7, 32, 65, 7000}) {
for (int k = 0; k < bound; ++k) {
MDRangeReduceTester<bool>(bound, k);
MDRangeReduceTester<signed char>(bound, k);
MDRangeReduceTester<int8_t>(bound, k);
MDRangeReduceTester<int16_t>(bound, k);
MDRangeReduceTester<int32_t>(bound, k);
MDRangeReduceTester<int64_t>(bound, k);
}
}
#endif
}

} // namespace