Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions libclc/ptx-nvidiacl/libspirv/synchronization/barrier.cl
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,54 @@ _CLC_OVERLOAD _CLC_DEF void __spirv_MemoryBarrier(unsigned int memory,
// Convergent control barrier. Dispatches on the requested synchronization
// scope: sub-group -> warp sync, device -> software device-wide barrier
// (sm_70+ only), anything else -> CTA-wide __syncthreads().
_CLC_OVERLOAD _CLC_DEF _CLC_CONVERGENT void
__spirv_ControlBarrier(unsigned int scope, unsigned int memory,
                       unsigned int semantics) {
  // The low 5 bits of the SPIR-V memory-semantics operand encode the memory
  // order (Acquire / Release / AcquireRelease / SequentiallyConsistent).
  unsigned int order = semantics & 0x1F;
  if (scope == Subgroup) {
    // use a full mask as barriers are required to be convergent and exited
    // threads can safely be in the mask
    __nvvm_bar_warp_sync(0xFFFFFFFF);
  } else if (scope == Device && memory == Device &&
             order == SequentiallyConsistent &&
             __clc_nvvm_reflect_arch() >= 700) {
    // Device-wide (root-group) barrier. Only taken on sm_70+, where the
    // required ld.acquire/atom.release PTX instructions exist.
    // %envreg1/%envreg2 are read-only launch-environment registers; they are
    // assumed to carry the two 32-bit halves of a pointer to a device-global
    // scratch word set up by the runtime for cross-CTA synchronization.
    // (assumes the runtime populates these registers — TODO confirm against
    // the launching runtime's contract)
    unsigned int env1, env2;
    __asm__ __volatile__("mov.u32 %0, %%envreg1;" : "=r"(env1));
    __asm__ __volatile__("mov.u32 %0, %%envreg2;" : "=r"(env2));
    long long envreg1 = env1;
    long long envreg2 = env2;
    // Bit field insert operation. Place 32 bits of envreg2 next to 32 bits of
    // envreg1: s64[envreg2][envreg1]. The resulting value is the address in
    // device global memory region, where atomic operations can be performed.
    long long atomicAddr;
    __asm__ __volatile__("bfi.b64 %0, %1, %2, 32, 32;"
                         : "=l"(atomicAddr)
                         : "l"(envreg1), "l"(envreg2));
    if (!atomicAddr) {
      // The runtime did not provide the scratch buffer: a device-scope
      // barrier cannot be performed, so trap instead of silently returning
      // without synchronizing.
      __builtin_trap();
    } else {
      unsigned int tidX = __nvvm_read_ptx_sreg_tid_x();
      unsigned int tidY = __nvvm_read_ptx_sreg_tid_y();
      unsigned int tidZ = __nvvm_read_ptx_sreg_tid_z();
      // Exactly one leader thread per CTA (local id (0,0,0)) takes part in
      // the cross-CTA counter protocol below.
      if (tidX + tidY + tidZ == 0) {
        // Increment address by 4 to get the precise region initialized to 0.
        atomicAddr += 4;
        unsigned int nctaidX = __nvvm_read_ptx_sreg_nctaid_x();
        unsigned int nctaidY = __nvvm_read_ptx_sreg_nctaid_y();
        unsigned int nctaidZ = __nvvm_read_ptx_sreg_nctaid_z();
        unsigned int totalNctaid = nctaidX * nctaidY * nctaidZ;

        // Do atomic.add(1) for each CTA and spin ld.acquire in a loop until all
        // CTAs have performed the addition
        // The counter is never reset: testing `current % totalNctaid == 0`
        // (rather than `== totalNctaid`) lets the same word be reused for
        // successive barriers, at the cost of wrap-around assumptions.
        unsigned int prev, current;
        __asm__ __volatile__("atom.add.release.gpu.u32 %0,[%1],1;"
                             : "=r"(prev)
                             : "l"(atomicAddr));
        do {
          __asm__ __volatile__("ld.acquire.gpu.u32 %0,[%1];"
                               : "=r"(current)
                               : "l"(atomicAddr));
        } while (current % totalNctaid != 0);
      }
      // NOTE(review): the leader signals arrival before any CTA-local
      // barrier, so sibling threads of this CTA may not yet have reached the
      // barrier (and their prior writes may not be ordered) when other CTAs
      // observe the counter — confirm whether a __nvvm_barrier_sync(0)
      // is also needed before the atomic add.
      // All threads of the CTA wait here until the leader has observed that
      // every CTA arrived.
      __nvvm_barrier_sync(0);
    }
  } else {
    // Default: work-group scope barrier within the CTA.
    __syncthreads();
  }
Expand Down
12 changes: 10 additions & 2 deletions sycl/test-e2e/GroupAlgorithm/root_group.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// Fails with opencl non-cpu, enable when fixed.
// XFAIL: (opencl && !cpu && !accelerator)
// RUN: %{build} -I . -o %t.out
// RUN: %{build} -I . -o %t.out %if any-device-is-cuda %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70 %}
// RUN: %{run} %t.out

// Disabled temporarily while investigation into the failure is ongoing.
Expand All @@ -10,6 +10,7 @@
#include <cstdlib>
#include <type_traits>

#include <sycl/builtins.hpp>
#include <sycl/detail/core.hpp>
#include <sycl/ext/oneapi/experimental/root_group.hpp>
#include <sycl/group_barrier.hpp>
Expand Down Expand Up @@ -53,10 +54,17 @@ void testRootGroup() {
sycl::accessor data{dataBuf, h};
h.parallel_for<
class RootGroupKernel>(range, props, [=](sycl::nd_item<1> it) {
volatile float X = 1.0f;
volatile float Y = 1.0f;
auto root = it.ext_oneapi_get_root_group();
data[root.get_local_id()] = root.get_local_id();
sycl::group_barrier(root);

// Delay half of the workgroups with extra work to check that the barrier
// synchronizes the whole device.
if (it.get_group(0) % 2 == 0) {
X += sycl::sin(X);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see that we ever check either X or Y. Should we? Otherwise, why do we need this? How do we make sure it ran?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wanted to explicitly delay some of the work-groups by giving them more work to do, because I've seen this test pass when an insufficient barrier was used. For instance, on the CUDA backend a work-group-wide barrier would be enough for it to pass, and that is not correct. I think this test should introduce some work-group divergence to verify that we actually perform a GPU-wide barrier.

How do we make sure it ran?

X and Y are declared as volatile, and my understanding was that this would prevent the compiler from optimizing them away.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, can we have a comment explaining it? Otherwise we risk this code being removed in the future by someone who thinks it's not required.

Y += sycl::cos(Y);
}
root =
sycl::ext::oneapi::experimental::this_work_item::get_root_group<1>();
int sum = data[root.get_local_id()] +
Expand Down