#5641: Fix HIP & CUDA MDRange reduce for sizeof(value_type) < sizeof(int) (#5745)

* Fix comment typo noticed in early analysis

* Add test case

* #5641: HIP: Fix MDRange parallel_reduce over values smaller than int

* #5641 Cuda: Fix MDRange parallel_reduce over values smaller than int

* Try to appease icpc's idiocy

* Skip the test for OpenMPTarget backend, since it's broken

* Sample bound values to test, rather than sweeping

* Shrink largest bound value to avoid timeout

* Report skipped in disabled CUDA extended lambda case

* Fix skipping condition
PhilMiller committed Mar 11, 2023
1 parent 9786d57 commit ee75763
Showing 5 changed files with 148 additions and 35 deletions.
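
The change in a nutshell: an MDRange `parallel_reduce` whose result type is smaller than `int` (e.g. `bool`, `int8_t`, `int16_t`) could silently produce wrong answers on the CUDA and HIP backends. Below is a minimal sketch of the failing pattern, modeled on the unit test added in this commit; the `main`/`ScopeGuard` scaffolding and the bound `1000`/index `137` are illustrative, not part of the commit.

```cpp
// Sketch of the pattern fixed here: a 2D MDRange logical-OR reduction into a
// bool. Requires device-lambda support (cf. the CUDA_LAMBDA skip in the test).
#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::ScopeGuard guard(argc, argv);

  const auto policy =
      Kokkos::MDRangePolicy<Kokkos::Rank<2>>({0, 0}, {1000, 2});

  bool found = false;  // sizeof(bool) == 1 < sizeof(int): the broken case
  Kokkos::parallel_reduce(
      policy,
      KOKKOS_LAMBDA(const int i, const int, bool& res) {
        res = res || (i == 137);  // exactly one hit in the iteration space
      },
      Kokkos::LOr<bool>(found));

  // Before this fix, the reduction could come back false on CUDA/HIP.
  return found ? 0 : 1;
}
```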
56 changes: 39 additions & 17 deletions core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
@@ -212,16 +212,33 @@ class ParallelReduce<CombinedFunctorReducerType,
using size_type = Cuda::size_type;
using reducer_type = ReducerType;

+  // Conditionally set word_size_type to int16_t or int8_t if value_type is
+  // smaller than int32_t (Kokkos::Cuda::size_type)
+  // word_size_type is used to determine the word count, shared memory buffer
+  // size, and global memory buffer size before the reduction is performed.
+  // Within the reduction, the word count is recomputed based on word_size_type
+  // and when calculating indexes into the shared/global memory buffers for
+  // performing the reduction, word_size_type is used again.
+  // For scalars > 4 bytes in size, indexing into shared/global memory relies
+  // on the block and grid dimensions to ensure that we index at the correct
+  // offset rather than at every 4 byte word; such that, when the join is
+  // performed, we have the correct data that was copied over in chunks of 4
+  // bytes.
+  static_assert(sizeof(size_type) == 4);
+  using word_size_type = std::conditional_t<
+      sizeof(value_type) < 4,
+      std::conditional_t<sizeof(value_type) == 2, int16_t, int8_t>, size_type>;

// Algorithmic constraints: blockSize is a power of two AND blockDim.y ==
// blockDim.z == 1

const CombinedFunctorReducerType m_functor_reducer;
const Policy m_policy; // used for workrange and nwork
const pointer_type m_result_ptr;
const bool m_result_ptr_device_accessible;
-  size_type* m_scratch_space;
+  word_size_type* m_scratch_space;
   size_type* m_scratch_flags;
-  size_type* m_unified_space;
+  word_size_type* m_unified_space;

using DeviceIteratePattern = typename Kokkos::Impl::Reduce::DeviceIterateTile<
Policy::rank, Policy, FunctorType, typename Policy::work_tag,
@@ -248,21 +265,22 @@ class ParallelReduce<CombinedFunctorReducerType,
}

inline __device__ void operator()() const {
-    const integral_nonzero_constant<
-        size_type, ReducerType::static_value_size() / sizeof(size_type)>
+    const integral_nonzero_constant<word_size_type,
+                                    ReducerType::static_value_size() /
+                                        sizeof(word_size_type)>
        word_count(m_functor_reducer.get_reducer().value_size() /
-                   sizeof(size_type));
+                   sizeof(word_size_type));

{
reference_type value =
m_functor_reducer.get_reducer().init(reinterpret_cast<pointer_type>(
-              kokkos_impl_cuda_shared_memory<size_type>() +
+              kokkos_impl_cuda_shared_memory<word_size_type>() +
threadIdx.y * word_count.value));

// Number of blocks is bounded so that the reduction can be limited to two
// passes. Each thread block is given an approximately equal amount of
// work to perform. Accumulate the values for this block. The accumulation
-      // ordering does not match the final pass, but is arithmatically
+      // ordering does not match the final pass, but is arithmetically
// equivalent.

this->exec_range(value);
@@ -272,15 +290,16 @@ class ParallelReduce<CombinedFunctorReducerType,
// Problem: non power-of-two blockDim
if (cuda_single_inter_block_reduce_scan<false>(
m_functor_reducer.get_reducer(), blockIdx.x, gridDim.x,
-            kokkos_impl_cuda_shared_memory<size_type>(), m_scratch_space,
+            kokkos_impl_cuda_shared_memory<word_size_type>(), m_scratch_space,
m_scratch_flags)) {
// This is the final block with the final result at the final threads'
// location
-      size_type* const shared = kokkos_impl_cuda_shared_memory<size_type>() +
-                                (blockDim.y - 1) * word_count.value;
-      size_type* const global =
+      word_size_type* const shared =
+          kokkos_impl_cuda_shared_memory<word_size_type>() +
+          (blockDim.y - 1) * word_count.value;
+      word_size_type* const global =
          m_result_ptr_device_accessible
-              ? reinterpret_cast<size_type*>(m_result_ptr)
+              ? reinterpret_cast<word_size_type*>(m_result_ptr)
: (m_unified_space ? m_unified_space : m_scratch_space);

if (threadIdx.y == 0) {
@@ -342,13 +361,16 @@ class ParallelReduce<CombinedFunctorReducerType,
: suggested_blocksize; // Note: block_size must be less
// than or equal to 512

-    m_scratch_space = cuda_internal_scratch_space(
-        m_policy.space(), m_functor_reducer.get_reducer().value_size() *
-                              block_size /* block_size == max block_count */);
+    m_scratch_space =
+        reinterpret_cast<word_size_type*>(cuda_internal_scratch_space(
+            m_policy.space(),
+            m_functor_reducer.get_reducer().value_size() *
+                block_size /* block_size == max block_count */));
m_scratch_flags =
cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type));
-    m_unified_space = cuda_internal_scratch_unified(
-        m_policy.space(), m_functor_reducer.get_reducer().value_size());
+    m_unified_space =
+        reinterpret_cast<word_size_type*>(cuda_internal_scratch_unified(
+            m_policy.space(), m_functor_reducer.get_reducer().value_size()));

// REQUIRED ( 1 , N , 1 )
const dim3 block(1, block_size, 1);
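The heart of the fix is the `word_size_type` alias added above: the reduction's scratch words now shrink to match sub-`int` scalars instead of being fixed at 4 bytes. Here is a standalone mirror of that selection logic; the `select_word_size_t` helper name and the `uint32_t` stand-in for `Cuda::size_type` are ours, while the conditional itself is copied from the diff.

```cpp
// Standalone mirror of the word_size_type selection added in the hunk above.
#include <cstdint>
#include <type_traits>

using size_type = std::uint32_t;  // stand-in for Cuda/HIP size_type (4 bytes)

template <class value_type>
using select_word_size_t = std::conditional_t<
    sizeof(value_type) < 4,
    std::conditional_t<sizeof(value_type) == 2, std::int16_t, std::int8_t>,
    size_type>;

// 1- and 2-byte scalars now get a matching word size...
static_assert(std::is_same_v<select_word_size_t<bool>, std::int8_t>);
static_assert(std::is_same_v<select_word_size_t<std::int8_t>, std::int8_t>);
static_assert(std::is_same_v<select_word_size_t<std::int16_t>, std::int16_t>);
// ...while 4-byte-and-larger scalars keep the original 4-byte word.
static_assert(std::is_same_v<select_word_size_t<std::int32_t>, size_type>);
static_assert(std::is_same_v<select_word_size_t<double>, size_type>);
```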
6 changes: 3 additions & 3 deletions core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
@@ -203,7 +203,7 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>,
// Number of blocks is bounded so that the reduction can be limited to two
// passes. Each thread block is given an approximately equal amount of
// work to perform. Accumulate the values for this block. The accumulation
-    // ordering does not match the final pass, but is arithmatically
+    // ordering does not match the final pass, but is arithmetically
// equivalent.

const WorkRange range(m_policy, blockIdx.x, gridDim.x);
@@ -463,7 +463,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
// Number of blocks is bounded so that the reduction can be limited to two
// passes. Each thread block is given an approximately equal amount of work
// to perform. Accumulate the values for this block. The accumulation
-    // ordering does not match the final pass, but is arithmatically equivalent.
+    // ordering does not match the final pass, but is arithmetically equivalent.

const WorkRange range(m_policy, blockIdx.x, gridDim.x);

@@ -780,7 +780,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
// Number of blocks is bounded so that the reduction can be limited to two
// passes. Each thread block is given an approximately equal amount of work
// to perform. Accumulate the values for this block. The accumulation
-    // ordering does not match the final pass, but is arithmatically equivalent.
+    // ordering does not match the final pass, but is arithmetically equivalent.

const WorkRange range(m_policy, blockIdx.x, gridDim.x);

52 changes: 37 additions & 15 deletions core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp
@@ -188,14 +188,31 @@ class ParallelReduce<CombinedFunctorReducerType,
using functor_type = FunctorType;
using size_type = HIP::size_type;

+  // Conditionally set word_size_type to int16_t or int8_t if value_type is
+  // smaller than int32_t (Kokkos::HIP::size_type)
+  // word_size_type is used to determine the word count, shared memory buffer
+  // size, and global memory buffer size before the reduction is performed.
+  // Within the reduction, the word count is recomputed based on word_size_type
+  // and when calculating indexes into the shared/global memory buffers for
+  // performing the reduction, word_size_type is used again.
+  // For scalars > 4 bytes in size, indexing into shared/global memory relies
+  // on the block and grid dimensions to ensure that we index at the correct
+  // offset rather than at every 4 byte word; such that, when the join is
+  // performed, we have the correct data that was copied over in chunks of 4
+  // bytes.
+  static_assert(sizeof(size_type) == 4);
+  using word_size_type = std::conditional_t<
+      sizeof(value_type) < 4,
+      std::conditional_t<sizeof(value_type) == 2, int16_t, int8_t>, size_type>;

// Algorithmic constraints: blockSize is a power of two AND blockDim.y ==
// blockDim.z == 1

const CombinedFunctorReducerType m_functor_reducer;
const Policy m_policy; // used for workrange and nwork
const pointer_type m_result_ptr;
const bool m_result_ptr_device_accessible;
-  size_type* m_scratch_space;
+  word_size_type* m_scratch_space;
size_type* m_scratch_flags;

using DeviceIteratePattern = typename Kokkos::Impl::Reduce::DeviceIterateTile<
@@ -210,19 +227,20 @@
inline __device__ void operator()() const {
const ReducerType& reducer = m_functor_reducer.get_reducer();

-    const integral_nonzero_constant<
-        size_type, ReducerType::static_value_size() / sizeof(size_type)>
-        word_count(reducer.value_size() / sizeof(size_type));
+    const integral_nonzero_constant<word_size_type,
+                                    ReducerType::static_value_size() /
+                                        sizeof(word_size_type)>
+        word_count(reducer.value_size() / sizeof(word_size_type));

{
reference_type value = reducer.init(reinterpret_cast<pointer_type>(
-        kokkos_impl_hip_shared_memory<size_type>() +
+        kokkos_impl_hip_shared_memory<word_size_type>() +
threadIdx.y * word_count.value));

// Number of blocks is bounded so that the reduction can be limited to two
// passes. Each thread block is given an approximately equal amount of
// work to perform. Accumulate the values for this block. The accumulation
-      // ordering does not match the final pass, but is arithmatically
+      // ordering does not match the final pass, but is arithmetically
// equivalent.

this->exec_range(value);
@@ -232,15 +250,17 @@ class ParallelReduce<CombinedFunctorReducerType,
// Problem: non power-of-two blockDim
if (::Kokkos::Impl::hip_single_inter_block_reduce_scan<false>(
reducer, blockIdx.x, gridDim.x,
-            kokkos_impl_hip_shared_memory<size_type>(), m_scratch_space,
+            kokkos_impl_hip_shared_memory<word_size_type>(), m_scratch_space,
m_scratch_flags)) {
// This is the final block with the final result at the final threads'
// location
-      size_type* const shared = kokkos_impl_hip_shared_memory<size_type>() +
-                                (blockDim.y - 1) * word_count.value;
-      size_type* const global = m_result_ptr_device_accessible
-                                    ? reinterpret_cast<size_type*>(m_result_ptr)
-                                    : m_scratch_space;
+      word_size_type* const shared =
+          kokkos_impl_hip_shared_memory<word_size_type>() +
+          (blockDim.y - 1) * word_count.value;
+      word_size_type* const global =
+          m_result_ptr_device_accessible
+              ? reinterpret_cast<word_size_type*>(m_result_ptr)
+              : m_scratch_space;

if (threadIdx.y == 0) {
reducer.final(reinterpret_cast<value_type*>(shared));
@@ -294,9 +314,11 @@ class ParallelReduce<CombinedFunctorReducerType,
: suggested_blocksize; // Note: block_size must be less
// than or equal to 512

-    m_scratch_space = hip_internal_scratch_space(
-        m_policy.space(), reducer.value_size() *
-                              block_size /* block_size == max block_count */);
+    m_scratch_space =
+        reinterpret_cast<word_size_type*>(hip_internal_scratch_space(
+            m_policy.space(),
+            reducer.value_size() *
+                block_size /* block_size == max block_count */));
m_scratch_flags =
hip_internal_scratch_flags(m_policy.space(), sizeof(size_type));

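Why the smaller word type matters: both backends size their reduction buffers by integer division, `value_size / sizeof(word)`. With the word fixed at the 4-byte `size_type`, a 1-byte scalar truncated to a word count of zero, so the shared/global scratch held no payload and the inter-block reduce moved no data. A small sketch of that arithmetic, assuming `value_size` equals `sizeof(value_type)` for a simple `LOr<bool>` reduction (`value_size` here is a hypothetical stand-in for `reducer.value_size()`):

```cpp
// Old vs. new word-count arithmetic from the hunks above, in isolation.
#include <cstddef>
#include <cstdint>

constexpr std::size_t value_size = sizeof(bool);  // 1 byte, assumed

// Old: words counted in units of the fixed 4-byte size_type.
constexpr std::size_t old_word_count = value_size / sizeof(std::uint32_t);
static_assert(old_word_count == 0, "sub-int scalars lost their only word");

// New: words counted in units of word_size_type (int8_t for 1-byte scalars).
constexpr std::size_t new_word_count = value_size / sizeof(std::int8_t);
static_assert(new_word_count == 1, "one word per value, as intended");
```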
1 change: 1 addition & 0 deletions core/unit_test/CMakeLists.txt
@@ -169,6 +169,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL)
MDRange_f
MDRange_g
MDRangePolicyConstructors
+    MDRangeReduce
MDSpan
MinMaxClamp
NumericTraits
68 changes: 68 additions & 0 deletions core/unit_test/TestMDRangeReduce.hpp
@@ -0,0 +1,68 @@
//@HEADER
// ************************************************************************
//
// Kokkos v. 4.0
// Copyright (2022) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
// See https://kokkos.org/LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//@HEADER

#include <gtest/gtest.h>

#include <Kokkos_Core.hpp>

namespace {

template <typename T>
void MDRangeReduceTester([[maybe_unused]] int bound, int k) {
const auto policy_MD = Kokkos::MDRangePolicy<Kokkos::Rank<2>, TEST_EXECSPACE>(
{0, 0}, {bound, 2});

// No explicit fence() calls needed because result is in HostSpace
  {
    // Stick just a few true values in the Logical-OR reduction space,
    // to try to make sure every value is being captured
    T lor_MD = 0;
Kokkos::parallel_reduce(
policy_MD,
KOKKOS_LAMBDA(const int i, const int, T& res) { res = res || i == k; },
Kokkos::LOr<T>(lor_MD));
EXPECT_EQ(lor_MD, 1);
}
  {
    T land_MD = 0;
Kokkos::parallel_reduce(
policy_MD, KOKKOS_LAMBDA(const int, const int, T& res) { res = 1; },
Kokkos::LAnd<T>(land_MD));
EXPECT_EQ(land_MD, 1);
}
}

TEST(TEST_CATEGORY, mdrange_parallel_reduce_primitive_types) {
#if defined(KOKKOS_ENABLE_OPENMPTARGET)
GTEST_SKIP() << "FIXME OPENMPTARGET Tests of MDRange reduce over values "
"smaller than int would fail";
#elif defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_CUDA_LAMBDA)
GTEST_SKIP() << "Skipped ENABLE_CUDA_LAMBDA";
#else
for (int bound : {0, 1, 7, 32, 65, 7000}) {
for (int k = 0; k < bound; ++k) {
MDRangeReduceTester<bool>(bound, k);
MDRangeReduceTester<signed char>(bound, k);
MDRangeReduceTester<int8_t>(bound, k);
MDRangeReduceTester<int16_t>(bound, k);
MDRangeReduceTester<int32_t>(bound, k);
MDRangeReduceTester<int64_t>(bound, k);
}
}
#endif
}

} // namespace
