#5641: Fix HIP & CUDA MDRange reduce for sizeof(value_type) < sizeof(int) #5745

Merged: 10 commits, Mar 11, 2023
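As a minimal sketch of the kind of reduction this PR fixes (illustrative only; the bound 7000 and the index 42 are arbitrary choices for this example, not values taken from the PR):

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::ScopeGuard guard(argc, argv);

  // Logical-OR reduction over a 2D range with a 1-byte value type (bool).
  // Before this fix, the CUDA and HIP backends sized and indexed their
  // reduction scratch buffers in 4-byte words, which could corrupt results
  // whenever sizeof(value_type) < sizeof(int).
  bool found = false;
  Kokkos::parallel_reduce(
      Kokkos::MDRangePolicy<Kokkos::Rank<2>>({0, 0}, {7000, 2}),
      KOKKOS_LAMBDA(const int i, const int, bool& res) {
        res = res || (i == 42);
      },
      Kokkos::LOr<bool>(found));
  // Expected: found == true, since i == 42 occurs in the iteration range.
  return found ? 0 : 1;
}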
56 changes: 39 additions & 17 deletions core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
@@ -212,16 +212,33 @@ class ParallelReduce<CombinedFunctorReducerType,
using size_type = Cuda::size_type;
using reducer_type = ReducerType;

+  // Conditionally set word_size_type to int16_t or int8_t if value_type is
+  // smaller than int32_t (Kokkos::Cuda::size_type).
+  // word_size_type determines the word count and the shared- and
+  // global-memory buffer sizes before the reduction is performed. Within
+  // the reduction, the word count is recomputed from word_size_type, which
+  // is also used to compute the indices into the shared/global memory
+  // buffers.
+  // For scalars > 4 bytes in size, indexing into shared/global memory
+  // relies on the block and grid dimensions to ensure that we index at the
+  // correct offset rather than at every 4-byte word, so that when the join
+  // is performed we have the correct data that was copied over in 4-byte
+  // chunks.
+  static_assert(sizeof(size_type) == 4);
+  using word_size_type = std::conditional_t<
+      sizeof(value_type) < 4,
+      std::conditional_t<sizeof(value_type) == 2, int16_t, int8_t>, size_type>;

// Algorithmic constraints: blockSize is a power of two AND blockDim.y ==
// blockDim.z == 1

const CombinedFunctorReducerType m_functor_reducer;
const Policy m_policy; // used for workrange and nwork
const pointer_type m_result_ptr;
const bool m_result_ptr_device_accessible;
-  size_type* m_scratch_space;
+  word_size_type* m_scratch_space;
   size_type* m_scratch_flags;
-  size_type* m_unified_space;
+  word_size_type* m_unified_space;

using DeviceIteratePattern = typename Kokkos::Impl::Reduce::DeviceIterateTile<
Policy::rank, Policy, FunctorType, typename Policy::work_tag,
@@ -248,21 +265,22 @@ class ParallelReduce<CombinedFunctorReducerType,
}

inline __device__ void operator()() const {
-    const integral_nonzero_constant<
-        size_type, ReducerType::static_value_size() / sizeof(size_type)>
+    const integral_nonzero_constant<word_size_type,
+                                    ReducerType::static_value_size() /
+                                        sizeof(word_size_type)>
         word_count(m_functor_reducer.get_reducer().value_size() /
-                   sizeof(size_type));
+                   sizeof(word_size_type));

{
reference_type value =
m_functor_reducer.get_reducer().init(reinterpret_cast<pointer_type>(
-          kokkos_impl_cuda_shared_memory<size_type>() +
+          kokkos_impl_cuda_shared_memory<word_size_type>() +
threadIdx.y * word_count.value));

// Number of blocks is bounded so that the reduction can be limited to two
// passes. Each thread block is given an approximately equal amount of
// work to perform. Accumulate the values for this block. The accumulation
-      // ordering does not match the final pass, but is arithmatically
+      // ordering does not match the final pass, but is arithmetically
// equivalent.

this->exec_range(value);
@@ -272,15 +290,16 @@ class ParallelReduce<CombinedFunctorReducerType,
// Problem: non power-of-two blockDim
if (cuda_single_inter_block_reduce_scan<false>(
m_functor_reducer.get_reducer(), blockIdx.x, gridDim.x,
-            kokkos_impl_cuda_shared_memory<size_type>(), m_scratch_space,
+            kokkos_impl_cuda_shared_memory<word_size_type>(), m_scratch_space,
m_scratch_flags)) {
// This is the final block with the final result at the final threads'
// location
-      size_type* const shared = kokkos_impl_cuda_shared_memory<size_type>() +
-                                (blockDim.y - 1) * word_count.value;
-      size_type* const global =
+      word_size_type* const shared =
+          kokkos_impl_cuda_shared_memory<word_size_type>() +
+          (blockDim.y - 1) * word_count.value;
+      word_size_type* const global =
           m_result_ptr_device_accessible
-              ? reinterpret_cast<size_type*>(m_result_ptr)
+              ? reinterpret_cast<word_size_type*>(m_result_ptr)
: (m_unified_space ? m_unified_space : m_scratch_space);

if (threadIdx.y == 0) {
@@ -342,13 +361,16 @@ class ParallelReduce<CombinedFunctorReducerType,
: suggested_blocksize; // Note: block_size must be less
// than or equal to 512

-    m_scratch_space = cuda_internal_scratch_space(
-        m_policy.space(), m_functor_reducer.get_reducer().value_size() *
-                              block_size /* block_size == max block_count */);
+    m_scratch_space =
+        reinterpret_cast<word_size_type*>(cuda_internal_scratch_space(
+            m_policy.space(),
+            m_functor_reducer.get_reducer().value_size() *
+                block_size /* block_size == max block_count */));
     m_scratch_flags =
         cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type));
-    m_unified_space = cuda_internal_scratch_unified(
-        m_policy.space(), m_functor_reducer.get_reducer().value_size());
+    m_unified_space =
+        reinterpret_cast<word_size_type*>(cuda_internal_scratch_unified(
+            m_policy.space(), m_functor_reducer.get_reducer().value_size()));

// REQUIRED ( 1 , N , 1 )
const dim3 block(1, block_size, 1);
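As a standalone illustration of the word_size_type selection introduced above (a sketch assuming size_type is a 4-byte unsigned integer, as the static_assert requires; all names below are local to this sketch, not Kokkos identifiers):

#include <cstdint>
#include <type_traits>

using size_type = unsigned int;  // stand-in for Kokkos::Cuda::size_type
static_assert(sizeof(size_type) == 4);

template <class value_type>
using word_size_type = std::conditional_t<
    sizeof(value_type) < 4,
    std::conditional_t<sizeof(value_type) == 2, std::int16_t, std::int8_t>,
    size_type>;

// 1-byte scalars now reduce through 1-byte words, 2-byte scalars through
// 2-byte words; anything >= 4 bytes keeps the original 4-byte size_type.
static_assert(std::is_same_v<word_size_type<bool>, std::int8_t>);
static_assert(std::is_same_v<word_size_type<std::int16_t>, std::int16_t>);
static_assert(std::is_same_v<word_size_type<std::int32_t>, size_type>);
static_assert(std::is_same_v<word_size_type<double>, size_type>);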
6 changes: 3 additions & 3 deletions core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
@@ -203,7 +203,7 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>,
// Number of blocks is bounded so that the reduction can be limited to two
// passes. Each thread block is given an approximately equal amount of
// work to perform. Accumulate the values for this block. The accumulation
-    // ordering does not match the final pass, but is arithmatically
+    // ordering does not match the final pass, but is arithmetically
// equivalent.

const WorkRange range(m_policy, blockIdx.x, gridDim.x);
@@ -463,7 +463,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
// Number of blocks is bounded so that the reduction can be limited to two
// passes. Each thread block is given an approximately equal amount of work
// to perform. Accumulate the values for this block. The accumulation
-  // ordering does not match the final pass, but is arithmatically equivalent.
+  // ordering does not match the final pass, but is arithmetically equivalent.

const WorkRange range(m_policy, blockIdx.x, gridDim.x);

@@ -780,7 +780,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
// Number of blocks is bounded so that the reduction can be limited to two
// passes. Each thread block is given an approximately equal amount of work
// to perform. Accumulate the values for this block. The accumulation
-  // ordering does not match the final pass, but is arithmatically equivalent.
+  // ordering does not match the final pass, but is arithmetically equivalent.

const WorkRange range(m_policy, blockIdx.x, gridDim.x);

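The "two passes" these comments refer to can be sketched with a simplified CUDA kernel (a sketch of the general pattern only, not the Kokkos implementation; it assumes a power-of-two blockDim.x, a plain int sum, and a per_block buffer with one slot per block):

#include <cuda_runtime.h>

__device__ unsigned int blocks_done = 0;

__global__ void reduce_two_pass(const int* data, int n, int* per_block,
                                int* result) {
  extern __shared__ int shared[];

  // Pass 1: grid-stride accumulation gives each block an approximately
  // equal share of the work; the ordering differs from a serial sum but
  // is arithmetically equivalent for an associative join.
  int local = 0;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x)
    local += data[i];
  shared[threadIdx.x] = local;
  __syncthreads();

  // In-block tree reduction (blockDim.x assumed to be a power of two).
  for (int s = blockDim.x / 2; s > 0; s >>= 1) {
    if (threadIdx.x < s) shared[threadIdx.x] += shared[threadIdx.x + s];
    __syncthreads();
  }

  if (threadIdx.x == 0) {
    per_block[blockIdx.x] = shared[0];
    __threadfence();  // publish the partial before signaling completion
    // Pass 2: the last block to finish combines the per-block partials.
    if (atomicAdd(&blocks_done, 1u) == gridDim.x - 1) {
      int total = 0;
      for (unsigned int b = 0; b < gridDim.x; ++b) total += per_block[b];
      *result = total;
    }
  }
}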
52 changes: 37 additions & 15 deletions core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp
@@ -188,14 +188,31 @@ class ParallelReduce<CombinedFunctorReducerType,
using functor_type = FunctorType;
using size_type = HIP::size_type;

+  // Conditionally set word_size_type to int16_t or int8_t if value_type is
+  // smaller than int32_t (Kokkos::HIP::size_type).
+  // word_size_type determines the word count and the shared- and
+  // global-memory buffer sizes before the reduction is performed. Within
+  // the reduction, the word count is recomputed from word_size_type, which
+  // is also used to compute the indices into the shared/global memory
+  // buffers.
+  // For scalars > 4 bytes in size, indexing into shared/global memory
+  // relies on the block and grid dimensions to ensure that we index at the
+  // correct offset rather than at every 4-byte word, so that when the join
+  // is performed we have the correct data that was copied over in 4-byte
+  // chunks.
+  static_assert(sizeof(size_type) == 4);
+  using word_size_type = std::conditional_t<
+      sizeof(value_type) < 4,
+      std::conditional_t<sizeof(value_type) == 2, int16_t, int8_t>, size_type>;

// Algorithmic constraints: blockSize is a power of two AND blockDim.y ==
// blockDim.z == 1

const CombinedFunctorReducerType m_functor_reducer;
const Policy m_policy; // used for workrange and nwork
const pointer_type m_result_ptr;
const bool m_result_ptr_device_accessible;
-  size_type* m_scratch_space;
+  word_size_type* m_scratch_space;
size_type* m_scratch_flags;

using DeviceIteratePattern = typename Kokkos::Impl::Reduce::DeviceIterateTile<
@@ -210,19 +227,20 @@
inline __device__ void operator()() const {
const ReducerType& reducer = m_functor_reducer.get_reducer();

-    const integral_nonzero_constant<
-        size_type, ReducerType::static_value_size() / sizeof(size_type)>
-        word_count(reducer.value_size() / sizeof(size_type));
+    const integral_nonzero_constant<word_size_type,
+                                    ReducerType::static_value_size() /
+                                        sizeof(word_size_type)>
+        word_count(reducer.value_size() / sizeof(word_size_type));

{
reference_type value = reducer.init(reinterpret_cast<pointer_type>(
-          kokkos_impl_hip_shared_memory<size_type>() +
+          kokkos_impl_hip_shared_memory<word_size_type>() +
threadIdx.y * word_count.value));

// Number of blocks is bounded so that the reduction can be limited to two
// passes. Each thread block is given an approximately equal amount of
// work to perform. Accumulate the values for this block. The accumulation
-      // ordering does not match the final pass, but is arithmatically
+      // ordering does not match the final pass, but is arithmetically
// equivalent.

this->exec_range(value);
@@ -232,15 +250,17 @@
// Problem: non power-of-two blockDim
if (::Kokkos::Impl::hip_single_inter_block_reduce_scan<false>(
reducer, blockIdx.x, gridDim.x,
-            kokkos_impl_hip_shared_memory<size_type>(), m_scratch_space,
+            kokkos_impl_hip_shared_memory<word_size_type>(), m_scratch_space,
m_scratch_flags)) {
// This is the final block with the final result at the final threads'
// location
-      size_type* const shared = kokkos_impl_hip_shared_memory<size_type>() +
-                                (blockDim.y - 1) * word_count.value;
-      size_type* const global = m_result_ptr_device_accessible
-                                    ? reinterpret_cast<size_type*>(m_result_ptr)
-                                    : m_scratch_space;
+      word_size_type* const shared =
+          kokkos_impl_hip_shared_memory<word_size_type>() +
+          (blockDim.y - 1) * word_count.value;
+      word_size_type* const global =
+          m_result_ptr_device_accessible
+              ? reinterpret_cast<word_size_type*>(m_result_ptr)
+              : m_scratch_space;

if (threadIdx.y == 0) {
reducer.final(reinterpret_cast<value_type*>(shared));
@@ -294,9 +314,11 @@
: suggested_blocksize; // Note: block_size must be less
// than or equal to 512

-    m_scratch_space = hip_internal_scratch_space(
-        m_policy.space(), reducer.value_size() *
-                              block_size /* block_size == max block_count */);
+    m_scratch_space =
+        reinterpret_cast<word_size_type*>(hip_internal_scratch_space(
+            m_policy.space(),
+            reducer.value_size() *
+                block_size /* block_size == max block_count */));
m_scratch_flags =
hip_internal_scratch_flags(m_policy.space(), sizeof(size_type));

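To see why the old 4-byte word arithmetic failed for small scalars, a tiny word-count illustration in plain C++ (independent of Kokkos; the 4 and 1 below are the old and new word sizes for a bool):

#include <cstdio>

int main() {
  // Old scheme: reduction words were always 4-byte size_type, so for a
  // 1-byte value type each thread got value_size / 4 == 0 scratch words.
  std::printf("old word_count for bool: %zu\n", sizeof(bool) / 4);  // 0
  // New scheme: the word type shrinks to match the value type, so a
  // 1-byte value occupies exactly one 1-byte word.
  std::printf("new word_count for bool: %zu\n", sizeof(bool) / 1);  // 1
}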
1 change: 1 addition & 0 deletions core/unit_test/CMakeLists.txt
@@ -169,6 +169,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL)
MDRange_f
MDRange_g
MDRangePolicyConstructors
+    MDRangeReduce
MDSpan
MinMaxClamp
NumericTraits
68 changes: 68 additions & 0 deletions core/unit_test/TestMDRangeReduce.hpp
@@ -0,0 +1,68 @@
//@HEADER
// ************************************************************************
//
// Kokkos v. 4.0
// Copyright (2022) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
// See https://kokkos.org/LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//@HEADER

#include <gtest/gtest.h>

#include <Kokkos_Core.hpp>

namespace {

template <typename T>
void MDRangeReduceTester([[maybe_unused]] int bound, int k) {
const auto policy_MD = Kokkos::MDRangePolicy<Kokkos::Rank<2>, TEST_EXECSPACE>(
{0, 0}, {bound, 2});

// No explicit fence() calls needed because result is in HostSpace
{
    // Stick just a few true values into the Logical-OR reduction space,
    // to try to make sure every value is being captured
    T lor_MD = 0;
Kokkos::parallel_reduce(
policy_MD,
KOKKOS_LAMBDA(const int i, const int, T& res) { res = res || i == k; },
Kokkos::LOr<T>(lor_MD));
EXPECT_EQ(lor_MD, 1);
    // Review note from a maintainer on this assertion: consider putting the
    // type and the argument values into the stream from the assertion to
    // make failure messages more useful.
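    // One way to act on that note (illustrative, not part of the patch):
    //   EXPECT_EQ(lor_MD, 1) << "T=" << typeid(T).name()
    //                        << " bound=" << bound << " k=" << k;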

}
{
    // Every iterate writes true, so the Logical-AND reduction only stays
    // true if no contribution was dropped
    T land_MD = 0;
Kokkos::parallel_reduce(
policy_MD, KOKKOS_LAMBDA(const int, const int, T& res) { res = 1; },
Kokkos::LAnd<T>(land_MD));
EXPECT_EQ(land_MD, 1);
}
}

TEST(TEST_CATEGORY, mdrange_parallel_reduce_primitive_types) {
#if defined(KOKKOS_ENABLE_OPENMPTARGET)
GTEST_SKIP() << "FIXME OPENMPTARGET Tests of MDRange reduce over values "
"smaller than int would fail";
#elif defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_CUDA_LAMBDA)
GTEST_SKIP() << "Skipped ENABLE_CUDA_LAMBDA";
#else
for (int bound : {0, 1, 7, 32, 65, 7000}) {
for (int k = 0; k < bound; ++k) {
MDRangeReduceTester<bool>(bound, k);
MDRangeReduceTester<signed char>(bound, k);
MDRangeReduceTester<int8_t>(bound, k);
MDRangeReduceTester<int16_t>(bound, k);
MDRangeReduceTester<int32_t>(bound, k);
MDRangeReduceTester<int64_t>(bound, k);
}
}
#endif
}

} // namespace