Skip to content

Commit

Permalink
Use recommended/max team size functions in Cuda ParallelFor and Reduce constructors (kokkos#6891)
Browse files Browse the repository at this point in the history

* Use team_size_recommended in cuda ParallelFor constructor

The previous computation was intended to match team_size_recommended, but
it was missing the extra scratch space allocation.

* Same change for cuda ParallelReduce()

* Remove unused attr variable

* Use team_size_max() in the ParallelFor constructor instead of recomputing the maximum team size
  • Loading branch information
tcclevenger committed Apr 8, 2024
1 parent e52cda3 commit 55c5757
Showing 1 changed file with 9 additions and 29 deletions.
38 changes: 9 additions & 29 deletions core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -539,17 +539,9 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
m_vector_size(arg_policy.impl_vector_length()) {
auto internal_space_instance =
m_policy.space().impl_internal_space_instance();
cudaFuncAttributes attr =
CudaParallelLaunch<ParallelFor, LaunchBounds>::get_cuda_func_attributes(
internal_space_instance->m_cudaDev);
m_team_size =
m_team_size >= 0
? m_team_size
: Kokkos::Impl::cuda_get_opt_block_size<FunctorType, LaunchBounds>(
internal_space_instance, attr, m_functor, m_vector_size,
m_policy.team_scratch_size(0),
m_policy.thread_scratch_size(0)) /
m_vector_size;
m_team_size = m_team_size >= 0 ? m_team_size
: arg_policy.team_size_recommended(
arg_functor, ParallelForTag());

m_shmem_begin = (sizeof(double) * (m_team_size + 2));
m_shmem_size =
Expand Down Expand Up @@ -585,13 +577,7 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
"Kokkos::Impl::ParallelFor< Cuda > insufficient shared memory"));
}

if (int(m_team_size) >
int(Kokkos::Impl::cuda_get_max_block_size<FunctorType, LaunchBounds>(
internal_space_instance, attr, arg_functor,
arg_policy.impl_vector_length(),
arg_policy.team_scratch_size(0),
arg_policy.thread_scratch_size(0)) /
arg_policy.impl_vector_length())) {
if (m_team_size > arg_policy.team_size_max(arg_functor, ParallelForTag())) {
Kokkos::Impl::throw_runtime_exception(std::string(
"Kokkos::Impl::ParallelFor< Cuda > requested too large team size."));
}
Expand Down Expand Up @@ -909,17 +895,11 @@ class ParallelReduce<CombinedFunctorReducerType,
m_vector_size(arg_policy.impl_vector_length()) {
auto internal_space_instance =
m_policy.space().impl_internal_space_instance();
cudaFuncAttributes attr = CudaParallelLaunch<ParallelReduce, LaunchBounds>::
get_cuda_func_attributes(internal_space_instance->m_cudaDev);
m_team_size =
m_team_size >= 0
? m_team_size
: Kokkos::Impl::cuda_get_opt_block_size<FunctorType, LaunchBounds>(
internal_space_instance, attr,
m_functor_reducer.get_functor(), m_vector_size,
m_policy.team_scratch_size(0),
m_policy.thread_scratch_size(0)) /
m_vector_size;
m_team_size = m_team_size >= 0 ? m_team_size
: arg_policy.team_size_recommended(
arg_functor_reducer.get_functor(),
arg_functor_reducer.get_reducer(),
ParallelReduceTag());

m_team_begin =
UseShflReduction
Expand Down

0 comments on commit 55c5757

Please sign in to comment.