Skip to content

Commit

Permalink
OpenMPTarget: clang extensions for dynamic shared memory. (#6380)
Browse files Browse the repository at this point in the history
* OpenMPTarget: clang extensions for dynamic shared memory.

* OpenMPTarget: Rebase and use macros for pragma.

* OpenMPTarget: Fix bug. Add Macros file.

* OpenMPTarget: Fix resize_scratch.

* OpenMPTarget: restore map clause.

* OpenMPTarget: Fix comment.

* OpenMPTarget: change L0 values in resize_scratch.

* OpenMPTarget: Fix comment and spaces.

---------

Co-authored-by: Rahulkumar Gayatri <rgayatri@lbl.gov>
  • Loading branch information
rgayatri23 and Rahulkumar Gayatri committed Dec 8, 2023
1 parent 24b6484 commit 843fca3
Show file tree
Hide file tree
Showing 5 changed files with 102 additions and 25 deletions.
5 changes: 5 additions & 0 deletions core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp
Expand Up @@ -99,6 +99,11 @@ void OpenMPTargetExec::resize_scratch(int64_t team_size, int64_t shmem_size_L0,
int64_t shmem_size_L1,
int64_t league_size) {
Kokkos::Experimental::OpenMPTargetSpace space;
// When compiling with clang 17 or higher, Level-0 scratch is provided by the
// LLVM OpenMP extension `ompx_dyn_cgroup_mem`, so L0 is excluded from the
// device-memory allocation computed below.
#if defined(KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS)
shmem_size_L0 = 0;
#endif
const int64_t shmem_size =
shmem_size_L0 + shmem_size_L1; // L0 + L1 scratch memory per team.
const int64_t padding = shmem_size * 10 / 100; // Padding per team.
Expand Down
46 changes: 46 additions & 0 deletions core/src/OpenMPTarget/Kokkos_OpenMPTarget_Macros.hpp
@@ -0,0 +1,46 @@
//@HEADER
// ************************************************************************
//
// Kokkos v. 4.0
// Copyright (2022) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
// See https://kokkos.org/LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//@HEADER

// Configuration macros shared by the OpenMPTarget backend: architecture
// selection and (where available) LLVM/clang OpenMP extensions for
// dynamic shared (Level-0 scratch) memory.
#ifndef KOKKOS_OPENMPTARGET_MACROS_HPP
#define KOKKOS_OPENMPTARGET_MACROS_HPP

// Intel architectures prefer the classical hierarchical parallelism that relies
// on OpenMP.
#if defined(KOKKOS_ARCH_INTEL_GPU)
#define KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU
#endif

// Define a macro when compiling with clang version 17 or higher
// (KOKKOS_COMPILER_CLANG >= 1700) and targeting NVIDIA or AMD GPUs. This is
// useful in cases where non-OpenMP-standard llvm extensions can be used
// (e.g. `ompx_dyn_cgroup_mem` below).
// NOTE(review): the NVIDIA guard is spelled KOKKOS_IMPL_ARCH_NVIDIA_GPU while
// the AMD one is KOKKOS_ARCH_AMD_GPU — presumably matching how each macro is
// defined elsewhere in Kokkos; confirm both spellings against the build setup.
#if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1700) && \
(defined(KOKKOS_ARCH_AMD_GPU) || defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU))
#define KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS
#endif

// Helper that turns its (macro-expanded) argument into a _Pragma directive.
// The indirection is required so that `x` is expanded before stringization.
#define KOKKOS_IMPL_OPENMPTARGET_PRAGMA_HELPER(x) _Pragma(#x)
// Emits `#pragma omp target x` — lets call sites splice optional,
// compiler-dependent clauses (such as the one below) into a target pragma.
#define KOKKOS_IMPL_OMPTARGET_PRAGMA(x) \
KOKKOS_IMPL_OPENMPTARGET_PRAGMA_HELPER(omp target x)

// Use scratch memory extensions to request dynamic shared memory for the
// right compiler/architecture combination.
#ifdef KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS
#define KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(N) ompx_dyn_cgroup_mem(N)
#else
// Expands to nothing so call sites compile unchanged when the clang
// extension is unavailable.
#define KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(N)
#endif

#endif // KOKKOS_OPENMPTARGET_MACROS_HPP
35 changes: 26 additions & 9 deletions core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp
Expand Up @@ -24,12 +24,7 @@

#include <Kokkos_Atomic.hpp>
#include "Kokkos_OpenMPTarget_Abort.hpp"

// Intel architectures prefer the classical hierarchical parallelism that relies
// on OpenMP.
#if defined(KOKKOS_ARCH_INTEL_GPU)
#define KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU
#endif
#include <OpenMPTarget/Kokkos_OpenMPTarget_Macros.hpp>

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
Expand Down Expand Up @@ -248,15 +243,37 @@ class OpenMPTargetExecTeamMember {
// and L1 shmem size. TEAM_REDUCE_SIZE = 512 bytes saved per team for
// hierarchical reduction. There is an additional 10% of the requested
// scratch memory allocated per team as padding. Hence the product with 0.1.
//
// Use llvm extensions for dynamic shared memory with compilers/architecture
// combinations where it is supported.
//
// Size allocated in HBM will now change based on whether we use llvm
// extensions.
#if defined(KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS)
const int total_shmem = shmem_size_L1 + shmem_size_L1 * 0.1;
#else
const int total_shmem =
shmem_size_L0 + shmem_size_L1 + (shmem_size_L0 + shmem_size_L1) * 0.1;
#endif

// Per team offset for buffer in HBM.
const int reduce_offset =
m_shmem_block_index *
(shmem_size_L0 + shmem_size_L1 +
((shmem_size_L0 + shmem_size_L1) * 0.1) + TEAM_REDUCE_SIZE);
m_shmem_block_index * (total_shmem + TEAM_REDUCE_SIZE);

#if defined(KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS)
const int l1_offset = reduce_offset + TEAM_REDUCE_SIZE;
char* l0_scratch =
static_cast<char*>(llvm_omp_target_dynamic_shared_alloc());
m_team_shared = scratch_memory_space(
l0_scratch, shmem_size_L0, static_cast<char*>(glb_scratch) + l1_offset,
shmem_size_L1);
#else
const int l0_offset = reduce_offset + TEAM_REDUCE_SIZE;
const int l1_offset = l0_offset + shmem_size_L0;
m_team_shared = scratch_memory_space(
(static_cast<char*>(glb_scratch) + l0_offset), shmem_size_L0,
static_cast<char*>(glb_scratch) + l1_offset, shmem_size_L1);
#endif
m_reduce_scratch = static_cast<char*>(glb_scratch) + reduce_offset;
m_league_rank = league_rank;
m_team_rank = omp_tid;
Expand Down
Expand Up @@ -19,6 +19,7 @@

#include <omp.h>
#include <sstream>
#include <OpenMPTarget/Kokkos_OpenMPTarget_Macros.hpp>
#include <Kokkos_Parallel.hpp>
#include <OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp>

Expand Down Expand Up @@ -140,8 +141,10 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
// guarantees that the number of teams specified in the `num_teams` clause is
// always less than or equal to the maximum concurrently running teams.
#if !defined(KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU)
#pragma omp target teams thread_limit(team_size) firstprivate(a_functor) \
num_teams(max_active_teams) is_device_ptr(scratch_ptr)
KOKKOS_IMPL_OMPTARGET_PRAGMA(
teams thread_limit(team_size) firstprivate(a_functor)
num_teams(max_active_teams) is_device_ptr(scratch_ptr)
KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(shmem_size_L0))
#pragma omp parallel
{
if (omp_get_num_teams() > max_active_teams)
Expand Down
34 changes: 20 additions & 14 deletions core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp
Expand Up @@ -21,6 +21,7 @@
#include <sstream>
#include <Kokkos_Parallel.hpp>
#include <OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp>
#include <OpenMPTarget/Kokkos_OpenMPTarget_Macros.hpp>

namespace Kokkos {
namespace Impl {
Expand Down Expand Up @@ -394,9 +395,11 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,
initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))

#if !defined(KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU)
#pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) \
firstprivate(f) is_device_ptr(scratch_ptr) reduction(custom \
: result)
KOKKOS_IMPL_OMPTARGET_PRAGMA(
teams num_teams(max_active_teams) thread_limit(team_size)
firstprivate(f) is_device_ptr(scratch_ptr) reduction(custom
: result)
KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(shmem_size_L0))
#pragma omp parallel reduction(custom : result)
{
if (omp_get_num_teams() > max_active_teams)
Expand Down Expand Up @@ -482,9 +485,11 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,

// Case where reduction is on a native data type.
if constexpr (std::is_arithmetic<ValueType>::value) {
#pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) map(to \
: f) \
is_device_ptr(scratch_ptr) reduction(+: result)
// Use scratch memory extensions to request dynamic shared memory for
// the right compiler/architecture combination.
KOKKOS_IMPL_OMPTARGET_PRAGMA(teams num_teams(max_active_teams) thread_limit(team_size) map(to: f) \
is_device_ptr(scratch_ptr) reduction(+: result) \
KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(shmem_size_L0))
#pragma omp parallel reduction(+ : result)
{
if (omp_get_num_teams() > max_active_teams)
Expand Down Expand Up @@ -636,11 +641,13 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,

return;
}

#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to \
: f) \
is_device_ptr(scratch_ptr)
{
// Use scratch memory extensions to request dynamic shared memory for the
// right compiler/architecture combination.
KOKKOS_IMPL_OMPTARGET_PRAGMA(
teams num_teams(nteams) thread_limit(team_size) map(to
: f)
is_device_ptr(scratch_ptr)
KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(shmem_size_L0)) {
#pragma omp parallel
{
const int team_num = omp_get_team_num();
Expand All @@ -665,9 +672,8 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,

int tree_neighbor_offset = 1;
do {
#pragma omp target teams distribute parallel for simd map(to \
: final_reducer) \
is_device_ptr(scratch_ptr)
#pragma omp target teams distribute parallel for simd firstprivate( \
final_reducer) is_device_ptr(scratch_ptr)
for (int i = 0; i < nteams - tree_neighbor_offset;
i += 2 * tree_neighbor_offset) {
ValueType* team_scratch = static_cast<ValueType*>(scratch_ptr);
Expand Down

0 comments on commit 843fca3

Please sign in to comment.