From 62a159a2c92ae56e63301e79c15f719c815b2939 Mon Sep 17 00:00:00 2001
From: Konrad Kusiak <konrad.kusiak@codeplay.com>
Date: Mon, 29 Jul 2024 16:07:37 +0100
Subject: [PATCH 1/5] Implement root group wide barrier for CUDA backend

---
 .../libspirv/synchronization/barrier.cl       | 44 +++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/libclc/ptx-nvidiacl/libspirv/synchronization/barrier.cl b/libclc/ptx-nvidiacl/libspirv/synchronization/barrier.cl
index 1cf3fb8750c2e..eb011986659eb 100644
--- a/libclc/ptx-nvidiacl/libspirv/synchronization/barrier.cl
+++ b/libclc/ptx-nvidiacl/libspirv/synchronization/barrier.cl
@@ -45,10 +45,54 @@ _CLC_OVERLOAD _CLC_DEF void __spirv_MemoryBarrier(unsigned int memory,
 _CLC_OVERLOAD _CLC_DEF _CLC_CONVERGENT void
 __spirv_ControlBarrier(unsigned int scope, unsigned int memory,
                        unsigned int semantics) {
+  unsigned int order = semantics & 0x1F;
   if (scope == Subgroup) {
     // use a full mask as barriers are required to be convergent and exited
     // threads can safely be in the mask
     __nvvm_bar_warp_sync(0xFFFFFFFF);
+  } else if (scope == Device && memory == Device &&
+             order == SequentiallyConsistent &&
+             __clc_nvvm_reflect_arch() >= 700) {
+    unsigned int env1, env2;
+    __asm__ __volatile__("mov.u32 %0, %%envreg1;" : "=r"(env1));
+    __asm__ __volatile__("mov.u32 %0, %%envreg2;" : "=r"(env2));
+    long long envreg1 = env1;
+    long long envreg2 = env2;
+    // Bit field insert operation. Place 32 bits of envreg2 next to 32 bits of
+    // envreg1: s64[envreg2][envreg1]. The resulting value is the address in
+    // device global memory region, where atomic operations can be performed.
+    long long atomicAddr;
+    __asm__ __volatile__("bfi.b64 %0, %1, %2, 32, 32;"
+                         : "=l"(atomicAddr)
+                         : "l"(envreg1), "l"(envreg2));
+    if (!atomicAddr) {
+      __builtin_trap();
+    } else {
+      unsigned int tidX = __nvvm_read_ptx_sreg_tid_x();
+      unsigned int tidY = __nvvm_read_ptx_sreg_tid_y();
+      unsigned int tidZ = __nvvm_read_ptx_sreg_tid_z();
+      if (tidX + tidY + tidZ == 0) {
+        // Increment address by 4 to get the precise region initialized to 0.
+        atomicAddr += 4;
+        unsigned int nctaidX = __nvvm_read_ptx_sreg_nctaid_x();
+        unsigned int nctaidY = __nvvm_read_ptx_sreg_nctaid_y();
+        unsigned int nctaidZ = __nvvm_read_ptx_sreg_nctaid_z();
+        unsigned int totalNctaid = nctaidX * nctaidY * nctaidZ;
+
+        // Do atomic.add(1) for each CTA and spin ld.acquire in a loop until all
+        // CTAs have performed the addition
+        unsigned int prev, current;
+        __asm__ __volatile__("atom.add.release.gpu.u32 %0,[%1],1;"
+                             : "=r"(prev)
+                             : "l"(atomicAddr));
+        do {
+          __asm__ __volatile__("ld.acquire.gpu.u32 %0,[%1];"
+                               : "=r"(current)
+                               : "l"(atomicAddr));
+        } while (current % totalNctaid != 0);
+      }
+      __nvvm_barrier_sync(0);
+    }
   } else {
     __syncthreads();
   }

From 79aa19eae86c38742c9fb9f6d3b3a9e4760557ee Mon Sep 17 00:00:00 2001
From: Konrad Kusiak <konrad.kusiak@codeplay.com>
Date: Tue, 30 Jul 2024 09:27:01 +0100
Subject: [PATCH 2/5] Added divergence to a root group test

---
 sycl/test-e2e/GroupAlgorithm/root_group.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/sycl/test-e2e/GroupAlgorithm/root_group.cpp b/sycl/test-e2e/GroupAlgorithm/root_group.cpp
index 983f8e7ca003a..806b9970200a7 100644
--- a/sycl/test-e2e/GroupAlgorithm/root_group.cpp
+++ b/sycl/test-e2e/GroupAlgorithm/root_group.cpp
@@ -53,10 +53,15 @@ void testRootGroup() {
     sycl::accessor data{dataBuf, h};
     h.parallel_for<
         class RootGroupKernel>(range, props, [=](sycl::nd_item<1> it) {
+      volatile float X = 1.0f;
+      volatile float Y = 1.0f;
       auto root = it.ext_oneapi_get_root_group();
       data[root.get_local_id()] = root.get_local_id();
       sycl::group_barrier(root);
-
+      if (it.get_group(0) % 2 == 0) {
+        X += sycl::sin(X);
+        Y += sycl::cos(Y);
+      }
       root =
           sycl::ext::oneapi::experimental::this_work_item::get_root_group<1>();
       int sum = data[root.get_local_id()] +

From 35b66482d91fd6de68041d2132368058bbe599b4 Mon Sep 17 00:00:00 2001
From: Konrad Kusiak <konrad.kusiak@codeplay.com>
Date: Tue, 30 Jul 2024 09:37:17 +0100
Subject: [PATCH 3/5] Increased number of workgroups in the test

---
 sycl/test-e2e/GroupAlgorithm/root_group.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/sycl/test-e2e/GroupAlgorithm/root_group.cpp b/sycl/test-e2e/GroupAlgorithm/root_group.cpp
index 806b9970200a7..ebcb2c9013a00 100644
--- a/sycl/test-e2e/GroupAlgorithm/root_group.cpp
+++ b/sycl/test-e2e/GroupAlgorithm/root_group.cpp
@@ -42,9 +42,12 @@ void testRootGroup() {
   const auto bundle =
       sycl::get_kernel_bundle<sycl::bundle_state::executable>(q.get_context());
   const auto kernel = bundle.get_kernel<class RootGroupKernel>();
-  const auto maxWGs = kernel.ext_oneapi_get_info<
-      sycl::ext::oneapi::experimental::info::kernel_queue_specific::
-          max_num_work_group_sync>(q);
+  // TODO: Uncomment following lines after
+  // https://github.com/intel/llvm/pull/14333 is merged.
+  // const auto maxWGs = kernel.ext_oneapi_get_info<
+  //     sycl::ext::oneapi::experimental::info::kernel_queue_specific::
+  //         max_num_work_group_sync>(q);
+  const auto maxWGs = 4;
   const auto props = sycl::ext::oneapi::experimental::properties{
       sycl::ext::oneapi::experimental::use_root_sync};
   sycl::buffer<int> dataBuf{sycl::range{maxWGs * WorkGroupSize}};

From caa7df4c49156c07d30057def937b19a81d38b34 Mon Sep 17 00:00:00 2001
From: Konrad Kusiak <konrad.kusiak@codeplay.com>
Date: Tue, 30 Jul 2024 11:57:26 +0100
Subject: [PATCH 4/5] Added missing header to the test

---
 sycl/test-e2e/GroupAlgorithm/root_group.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sycl/test-e2e/GroupAlgorithm/root_group.cpp b/sycl/test-e2e/GroupAlgorithm/root_group.cpp
index ebcb2c9013a00..3aa8d662f486e 100644
--- a/sycl/test-e2e/GroupAlgorithm/root_group.cpp
+++ b/sycl/test-e2e/GroupAlgorithm/root_group.cpp
@@ -10,6 +10,7 @@
 #include <cstdlib>
 #include <type_traits>
 
+#include <sycl/builtins.hpp>
 #include <sycl/detail/core.hpp>
 #include <sycl/ext/oneapi/experimental/root_group.hpp>
 #include <sycl/group_barrier.hpp>

From 2f7ad3040f4bee06a756ff0fc4a35318faaf4d68 Mon Sep 17 00:00:00 2001
From: Konrad Kusiak <konrad.kusiak@codeplay.com>
Date: Tue, 30 Jul 2024 14:51:31 +0100
Subject: [PATCH 5/5] Added comment to the test

---
 sycl/test-e2e/GroupAlgorithm/root_group.cpp | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/sycl/test-e2e/GroupAlgorithm/root_group.cpp b/sycl/test-e2e/GroupAlgorithm/root_group.cpp
index 3aa8d662f486e..d8393f35c6253 100644
--- a/sycl/test-e2e/GroupAlgorithm/root_group.cpp
+++ b/sycl/test-e2e/GroupAlgorithm/root_group.cpp
@@ -1,6 +1,6 @@
 // Fails with opencl non-cpu, enable when fixed.
 // XFAIL: (opencl && !cpu && !accelerator)
-// RUN: %{build} -I . -o %t.out
+// RUN: %{build} -I . -o %t.out %if any-device-is-cuda %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70 %}
 // RUN: %{run} %t.out
 
 // Disabled temporarily while investigation into the failure is ongoing.
@@ -43,12 +43,9 @@ void testRootGroup() {
   const auto bundle =
       sycl::get_kernel_bundle<sycl::bundle_state::executable>(q.get_context());
   const auto kernel = bundle.get_kernel<class RootGroupKernel>();
-  // TODO: Uncomment following lines after
-  // https://github.com/intel/llvm/pull/14333 is merged.
-  // const auto maxWGs = kernel.ext_oneapi_get_info<
-  //     sycl::ext::oneapi::experimental::info::kernel_queue_specific::
-  //         max_num_work_group_sync>(q);
-  const auto maxWGs = 4;
+  const auto maxWGs = kernel.ext_oneapi_get_info<
+      sycl::ext::oneapi::experimental::info::kernel_queue_specific::
+          max_num_work_group_sync>(q);
   const auto props = sycl::ext::oneapi::experimental::properties{
       sycl::ext::oneapi::experimental::use_root_sync};
   sycl::buffer<int> dataBuf{sycl::range{maxWGs * WorkGroupSize}};
@@ -62,6 +59,8 @@ void testRootGroup() {
       auto root = it.ext_oneapi_get_root_group();
       data[root.get_local_id()] = root.get_local_id();
       sycl::group_barrier(root);
+      // Delay half of the workgroups with extra work to check that the barrier
+      // synchronizes the whole device.
       if (it.get_group(0) % 2 == 0) {
         X += sycl::sin(X);
         Y += sycl::cos(Y);