Use recommended/max team size functions in Cuda ParallelFor and Reduce constructors (kokkos#6891)

* Use team_size_recommended in cuda ParallelFor constructor. The previous computation was intended to match team_size_recommended, but was missing the extra scratch space allocation.
* Same change for cuda ParallelReduce()
* Remove unused attr variable
* Use team_size_max() in pfor constructor instead of recomputing
ldh4 · Apr 8, 2024 · 55c5757 · 55c5757
1 parent e52cda3
commit 55c5757
Showing 1 changed file with 9 additions and 29 deletions.
diff --git a/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
@@ -539,17 +539,9 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
         m_vector_size(arg_policy.impl_vector_length()) {
     auto internal_space_instance =
         m_policy.space().impl_internal_space_instance();
-    cudaFuncAttributes attr =
-        CudaParallelLaunch<ParallelFor, LaunchBounds>::get_cuda_func_attributes(
-            internal_space_instance->m_cudaDev);
-    m_team_size =
-        m_team_size >= 0
-            ? m_team_size
-            : Kokkos::Impl::cuda_get_opt_block_size<FunctorType, LaunchBounds>(
-                  internal_space_instance, attr, m_functor, m_vector_size,
-                  m_policy.team_scratch_size(0),
-                  m_policy.thread_scratch_size(0)) /
-                  m_vector_size;
+    m_team_size = m_team_size >= 0 ? m_team_size
+                                   : arg_policy.team_size_recommended(
+                                         arg_functor, ParallelForTag());
 
     m_shmem_begin = (sizeof(double) * (m_team_size + 2));
     m_shmem_size =
@@ -585,13 +577,7 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
           "Kokkos::Impl::ParallelFor< Cuda > insufficient shared memory"));
     }
 
-    if (int(m_team_size) >
-        int(Kokkos::Impl::cuda_get_max_block_size<FunctorType, LaunchBounds>(
-                internal_space_instance, attr, arg_functor,
-                arg_policy.impl_vector_length(),
-                arg_policy.team_scratch_size(0),
-                arg_policy.thread_scratch_size(0)) /
-            arg_policy.impl_vector_length())) {
+    if (m_team_size > arg_policy.team_size_max(arg_functor, ParallelForTag())) {
       Kokkos::Impl::throw_runtime_exception(std::string(
           "Kokkos::Impl::ParallelFor< Cuda > requested too large team size."));
     }
@@ -909,17 +895,11 @@ class ParallelReduce<CombinedFunctorReducerType,
         m_vector_size(arg_policy.impl_vector_length()) {
     auto internal_space_instance =
         m_policy.space().impl_internal_space_instance();
-    cudaFuncAttributes attr = CudaParallelLaunch<ParallelReduce, LaunchBounds>::
-        get_cuda_func_attributes(internal_space_instance->m_cudaDev);
-    m_team_size =
-        m_team_size >= 0
-            ? m_team_size
-            : Kokkos::Impl::cuda_get_opt_block_size<FunctorType, LaunchBounds>(
-                  internal_space_instance, attr,
-                  m_functor_reducer.get_functor(), m_vector_size,
-                  m_policy.team_scratch_size(0),
-                  m_policy.thread_scratch_size(0)) /
-                  m_vector_size;
+    m_team_size = m_team_size >= 0 ? m_team_size
+                                   : arg_policy.team_size_recommended(
+                                         arg_functor_reducer.get_functor(),
+                                         arg_functor_reducer.get_reducer(),
+                                         ParallelReduceTag());
 
     m_team_begin =
         UseShflReduction