Skip to content

Commit

Permalink
OpenMPTarget: clang extensions for dynamic shared memory. (#6380)
Browse files Browse the repository at this point in the history
* OpenMPTarget: clang extensions for dynamic shared memory.

* OpenMPTarget: Rebase and use macros for pragma.

* OpenMPTarget: Fix bug. Add Macros file.

* OpenMPTarget: Fix resize_scratch.

* OpenMPTarget: restore map clause.

* OpenMPTarget: Fix comment.

* OpenMPTarget: change L0 values in resize_scratch.

* OpenMPTarget: Fix comment and spaces.

---------

Co-authored-by: Rahulkumar Gayatri <rgayatri@lbl.gov>
  • Loading branch information
rgayatri23 and Rahulkumar Gayatri committed Dec 8, 2023
1 parent 24b6484 commit 843fca3
Show file tree
Hide file tree
Showing 5 changed files with 102 additions and 25 deletions.
5 changes: 5 additions & 0 deletions core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp
Expand Up @@ -99,6 +99,11 @@ void OpenMPTargetExec::resize_scratch(int64_t team_size, int64_t shmem_size_L0,
int64_t shmem_size_L1,
int64_t league_size) {
Kokkos::Experimental::OpenMPTargetSpace space;
// When compiling with clang 17 or higher, Level-0 scratch is provided by the
// LLVM OpenMP extension `ompx_dyn_cgroup_mem`, so L0 is excluded from the
// device-memory allocation computed below.
#if defined(KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS)
shmem_size_L0 = 0;
#endif
const int64_t shmem_size =
shmem_size_L0 + shmem_size_L1; // L0 + L1 scratch memory per team.
const int64_t padding = shmem_size * 10 / 100; // Padding per team.
Expand Down
46 changes: 46 additions & 0 deletions core/src/OpenMPTarget/Kokkos_OpenMPTarget_Macros.hpp
@@ -0,0 +1,46 @@
//@HEADER
// ************************************************************************
//
// Kokkos v. 4.0
// Copyright (2022) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
// See https://kokkos.org/LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//@HEADER

// Configuration macros shared by the OpenMPTarget backend: architecture
// selection and (where available) LLVM/clang OpenMP extensions for
// dynamic shared (Level-0 scratch) memory.
#ifndef KOKKOS_OPENMPTARGET_MACROS_HPP
#define KOKKOS_OPENMPTARGET_MACROS_HPP

// Intel architectures prefer the classical hierarchical parallelism that relies
// on OpenMP.
#if defined(KOKKOS_ARCH_INTEL_GPU)
#define KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU
#endif

// Define a macro when compiling with clang version 17 or higher
// (KOKKOS_COMPILER_CLANG >= 1700) and targeting NVIDIA or AMD GPUs. This is
// useful in cases where non-OpenMP-standard llvm extensions can be used
// (e.g. `ompx_dyn_cgroup_mem` below).
// NOTE(review): the NVIDIA guard is spelled KOKKOS_IMPL_ARCH_NVIDIA_GPU while
// the AMD one is KOKKOS_ARCH_AMD_GPU — presumably matching how each macro is
// defined elsewhere in Kokkos; confirm both spellings against the build setup.
#if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1700) && \
(defined(KOKKOS_ARCH_AMD_GPU) || defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU))
#define KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS
#endif

// Helper that turns its (macro-expanded) argument into a _Pragma directive.
// The indirection is required so that `x` is expanded before stringization.
#define KOKKOS_IMPL_OPENMPTARGET_PRAGMA_HELPER(x) _Pragma(#x)
// Emits `#pragma omp target x` — lets call sites splice optional,
// compiler-dependent clauses (such as the one below) into a target pragma.
#define KOKKOS_IMPL_OMPTARGET_PRAGMA(x) \
KOKKOS_IMPL_OPENMPTARGET_PRAGMA_HELPER(omp target x)

// Use scratch memory extensions to request dynamic shared memory for the
// right compiler/architecture combination.
#ifdef KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS
#define KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(N) ompx_dyn_cgroup_mem(N)
#else
// Expands to nothing so call sites compile unchanged when the clang
// extension is unavailable.
#define KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(N)
#endif

#endif // KOKKOS_OPENMPTARGET_MACROS_HPP
35 changes: 26 additions & 9 deletions core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp
Expand Up @@ -24,12 +24,7 @@

#include <Kokkos_Atomic.hpp>
#include "Kokkos_OpenMPTarget_Abort.hpp"

// Intel architectures prefer the classical hierarchical parallelism that relies
// on OpenMP.
#if defined(KOKKOS_ARCH_INTEL_GPU)
#define KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU
#endif
#include <OpenMPTarget/Kokkos_OpenMPTarget_Macros.hpp>

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
Expand Down Expand Up @@ -248,15 +243,37 @@ class OpenMPTargetExecTeamMember {
// and L1 shmem size. TEAM_REDUCE_SIZE = 512 bytes saved per team for
// hierarchical reduction. There is an additional 10% of the requested
// scratch memory allocated per team as padding. Hence the product with 0.1.
//
// Use llvm extensions for dynamic shared memory with compilers/architecture
// combinations where it is supported.
//
// Size allocated in HBM will now change based on whether we use llvm
// extensions.
#if defined(KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS)
const int total_shmem = shmem_size_L1 + shmem_size_L1 * 0.1;
#else
const int total_shmem =
shmem_size_L0 + shmem_size_L1 + (shmem_size_L0 + shmem_size_L1) * 0.1;
#endif

// Per team offset for buffer in HBM.
const int reduce_offset =
m_shmem_block_index *
(shmem_size_L0 + shmem_size_L1 +
((shmem_size_L0 + shmem_size_L1) * 0.1) + TEAM_REDUCE_SIZE);
m_shmem_block_index * (total_shmem + TEAM_REDUCE_SIZE);

#if defined(KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS)
const int l1_offset = reduce_offset + TEAM_REDUCE_SIZE;
char* l0_scratch =
static_cast<char*>(llvm_omp_target_dynamic_shared_alloc());
m_team_shared = scratch_memory_space(
l0_scratch, shmem_size_L0, static_cast<char*>(glb_scratch) + l1_offset,
shmem_size_L1);
#else
const int l0_offset = reduce_offset + TEAM_REDUCE_SIZE;
const int l1_offset = l0_offset + shmem_size_L0;
m_team_shared = scratch_memory_space(
(static_cast<char*>(glb_scratch) + l0_offset), shmem_size_L0,
static_cast<char*>(glb_scratch) + l1_offset, shmem_size_L1);
#endif
m_reduce_scratch = static_cast<char*>(glb_scratch) + reduce_offset;
m_league_rank = league_rank;
m_team_rank = omp_tid;
Expand Down
Expand Up @@ -19,6 +19,7 @@

#include <omp.h>
#include <sstream>
#include <OpenMPTarget/Kokkos_OpenMPTarget_Macros.hpp>
#include <Kokkos_Parallel.hpp>
#include <OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp>

Expand Down Expand Up @@ -140,8 +141,10 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
// guarantees that the number of teams specified in the `num_teams` clause is
// always less than or equal to the maximum concurrently running teams.
#if !defined(KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU)
#pragma omp target teams thread_limit(team_size) firstprivate(a_functor) \
num_teams(max_active_teams) is_device_ptr(scratch_ptr)
KOKKOS_IMPL_OMPTARGET_PRAGMA(
teams thread_limit(team_size) firstprivate(a_functor)
num_teams(max_active_teams) is_device_ptr(scratch_ptr)
KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(shmem_size_L0))
#pragma omp parallel
{
if (omp_get_num_teams() > max_active_teams)
Expand Down
34 changes: 20 additions & 14 deletions core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp
Expand Up @@ -21,6 +21,7 @@
#include <sstream>
#include <Kokkos_Parallel.hpp>
#include <OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp>
#include <OpenMPTarget/Kokkos_OpenMPTarget_Macros.hpp>

namespace Kokkos {
namespace Impl {
Expand Down Expand Up @@ -394,9 +395,11 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,
initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))

#if !defined(KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU)
#pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) \
firstprivate(f) is_device_ptr(scratch_ptr) reduction(custom \
: result)
KOKKOS_IMPL_OMPTARGET_PRAGMA(
teams num_teams(max_active_teams) thread_limit(team_size)
firstprivate(f) is_device_ptr(scratch_ptr) reduction(custom
: result)
KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(shmem_size_L0))
#pragma omp parallel reduction(custom : result)
{
if (omp_get_num_teams() > max_active_teams)
Expand Down Expand Up @@ -482,9 +485,11 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,

// Case where reduction is on a native data type.
if constexpr (std::is_arithmetic<ValueType>::value) {
#pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) map(to \
: f) \
is_device_ptr(scratch_ptr) reduction(+: result)
// Use scratch memory extensions to request dynamic shared memory for
// the right compiler/architecture combination.
KOKKOS_IMPL_OMPTARGET_PRAGMA(teams num_teams(max_active_teams) thread_limit(team_size) map(to: f) \
is_device_ptr(scratch_ptr) reduction(+: result) \
KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(shmem_size_L0))
#pragma omp parallel reduction(+ : result)
{
if (omp_get_num_teams() > max_active_teams)
Expand Down Expand Up @@ -636,11 +641,13 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,

return;
}

#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to \
: f) \
is_device_ptr(scratch_ptr)
{
// Use scratch memory extensions to request dynamic shared memory for the
// right compiler/architecture combination.
KOKKOS_IMPL_OMPTARGET_PRAGMA(
teams num_teams(nteams) thread_limit(team_size) map(to
: f)
is_device_ptr(scratch_ptr)
KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(shmem_size_L0)) {
#pragma omp parallel
{
const int team_num = omp_get_team_num();
Expand All @@ -665,9 +672,8 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,

int tree_neighbor_offset = 1;
do {
#pragma omp target teams distribute parallel for simd map(to \
: final_reducer) \
is_device_ptr(scratch_ptr)
#pragma omp target teams distribute parallel for simd firstprivate( \
final_reducer) is_device_ptr(scratch_ptr)
for (int i = 0; i < nteams - tree_neighbor_offset;
i += 2 * tree_neighbor_offset) {
ValueType* team_scratch = static_cast<ValueType*>(scratch_ptr);
Expand Down

0 comments on commit 843fca3

Please sign in to comment.