From 62a159a2c92ae56e63301e79c15f719c815b2939 Mon Sep 17 00:00:00 2001 From: Konrad Kusiak Date: Mon, 29 Jul 2024 16:07:37 +0100 Subject: [PATCH 1/5] Implement root group wide barrier for CUDA backend --- .../libspirv/synchronization/barrier.cl | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/libclc/ptx-nvidiacl/libspirv/synchronization/barrier.cl b/libclc/ptx-nvidiacl/libspirv/synchronization/barrier.cl index 1cf3fb8750c2e..eb011986659eb 100644 --- a/libclc/ptx-nvidiacl/libspirv/synchronization/barrier.cl +++ b/libclc/ptx-nvidiacl/libspirv/synchronization/barrier.cl @@ -45,10 +45,54 @@ _CLC_OVERLOAD _CLC_DEF void __spirv_MemoryBarrier(unsigned int memory, _CLC_OVERLOAD _CLC_DEF _CLC_CONVERGENT void __spirv_ControlBarrier(unsigned int scope, unsigned int memory, unsigned int semantics) { + unsigned int order = semantics & 0x1F; if (scope == Subgroup) { // use a full mask as barriers are required to be convergent and exited // threads can safely be in the mask __nvvm_bar_warp_sync(0xFFFFFFFF); + } else if (scope == Device && memory == Device && + order == SequentiallyConsistent && + __clc_nvvm_reflect_arch() >= 700) { + unsigned int env1, env2; + __asm__ __volatile__("mov.u32 %0, %%envreg1;" : "=r"(env1)); + __asm__ __volatile__("mov.u32 %0, %%envreg2;" : "=r"(env2)); + long long envreg1 = env1; + long long envreg2 = env2; + // Bit field insert operation. Place 32 bits of envreg2 next to 32 bits of + // envreg1: s64[envreg2][envreg1]. The resulting value is the address in + // device global memory region, where atomic operations can be performed. + long long atomicAddr; + __asm__ __volatile__("bfi.b64 %0, %1, %2, 32, 32;" + : "=l"(atomicAddr) + : "l"(envreg1), "l"(envreg2)); + if (!atomicAddr) { + __builtin_trap(); + } else { + unsigned int tidX = __nvvm_read_ptx_sreg_tid_x(); + unsigned int tidY = __nvvm_read_ptx_sreg_tid_y(); + unsigned int tidZ = __nvvm_read_ptx_sreg_tid_z(); + if (tidX + tidY + tidZ == 0) { + // Increment address by 4 to get the precise region initialized to 0. + atomicAddr += 4; + unsigned int nctaidX = __nvvm_read_ptx_sreg_nctaid_x(); + unsigned int nctaidY = __nvvm_read_ptx_sreg_nctaid_y(); + unsigned int nctaidZ = __nvvm_read_ptx_sreg_nctaid_z(); + unsigned int totalNctaid = nctaidX * nctaidY * nctaidZ; + + // Do atomic.add(1) for each CTA and spin ld.acquire in a loop until all + // CTAs have performed the addition + unsigned int prev, current; + __asm__ __volatile__("atom.add.release.gpu.u32 %0,[%1],1;" + : "=r"(prev) + : "l"(atomicAddr)); + do { + __asm__ __volatile__("ld.acquire.gpu.u32 %0,[%1];" + : "=r"(current) + : "l"(atomicAddr)); + } while (current % totalNctaid != 0); + } + __nvvm_barrier_sync(0); + } } else { __syncthreads(); } From 79aa19eae86c38742c9fb9f6d3b3a9e4760557ee Mon Sep 17 00:00:00 2001 From: Konrad Kusiak Date: Tue, 30 Jul 2024 09:27:01 +0100 Subject: [PATCH 2/5] Added divergence to a root group test --- sycl/test-e2e/GroupAlgorithm/root_group.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sycl/test-e2e/GroupAlgorithm/root_group.cpp b/sycl/test-e2e/GroupAlgorithm/root_group.cpp index 983f8e7ca003a..806b9970200a7 100644 --- a/sycl/test-e2e/GroupAlgorithm/root_group.cpp +++ b/sycl/test-e2e/GroupAlgorithm/root_group.cpp @@ -53,10 +53,15 @@ void testRootGroup() { sycl::accessor data{dataBuf, h}; h.parallel_for< class RootGroupKernel>(range, props, [=](sycl::nd_item<1> it) { + volatile float X = 1.0f; + volatile float Y = 1.0f; auto root = it.ext_oneapi_get_root_group(); data[root.get_local_id()] = root.get_local_id(); sycl::group_barrier(root); - + if (it.get_group(0) % 2 == 0) { + X += sycl::sin(X); + Y += sycl::cos(Y); + } root = sycl::ext::oneapi::experimental::this_work_item::get_root_group<1>(); int sum = data[root.get_local_id()] + From 35b66482d91fd6de68041d2132368058bbe599b4 Mon Sep 17 00:00:00 2001 From: Konrad Kusiak Date: Tue, 30 Jul 2024 09:37:17 +0100 Subject: [PATCH 3/5] Increased number of workgroups in the test --- sycl/test-e2e/GroupAlgorithm/root_group.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sycl/test-e2e/GroupAlgorithm/root_group.cpp b/sycl/test-e2e/GroupAlgorithm/root_group.cpp index 806b9970200a7..ebcb2c9013a00 100644 --- a/sycl/test-e2e/GroupAlgorithm/root_group.cpp +++ b/sycl/test-e2e/GroupAlgorithm/root_group.cpp @@ -42,9 +42,12 @@ void testRootGroup() { const auto bundle = sycl::get_kernel_bundle(q.get_context()); const auto kernel = bundle.get_kernel(); - const auto maxWGs = kernel.ext_oneapi_get_info< - sycl::ext::oneapi::experimental::info::kernel_queue_specific:: - max_num_work_group_sync>(q); + // TODO: Uncomment following lines after + // https://github.com/intel/llvm/pull/14333 is merged. + // const auto maxWGs = kernel.ext_oneapi_get_info< + // sycl::ext::oneapi::experimental::info::kernel_queue_specific:: + // max_num_work_group_sync>(q); + const auto maxWGs = 4; const auto props = sycl::ext::oneapi::experimental::properties{ sycl::ext::oneapi::experimental::use_root_sync}; sycl::buffer dataBuf{sycl::range{maxWGs * WorkGroupSize}}; From caa7df4c49156c07d30057def937b19a81d38b34 Mon Sep 17 00:00:00 2001 From: Konrad Kusiak Date: Tue, 30 Jul 2024 11:57:26 +0100 Subject: [PATCH 4/5] Added missing header to the test --- sycl/test-e2e/GroupAlgorithm/root_group.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/sycl/test-e2e/GroupAlgorithm/root_group.cpp b/sycl/test-e2e/GroupAlgorithm/root_group.cpp index ebcb2c9013a00..3aa8d662f486e 100644 --- a/sycl/test-e2e/GroupAlgorithm/root_group.cpp +++ b/sycl/test-e2e/GroupAlgorithm/root_group.cpp @@ -10,6 +10,7 @@ #include #include +#include #include #include #include From 2f7ad3040f4bee06a756ff0fc4a35318faaf4d68 Mon Sep 17 00:00:00 2001 From: Konrad Kusiak Date: Tue, 30 Jul 2024 14:51:31 +0100 Subject: [PATCH 5/5] Added comment to the test --- sycl/test-e2e/GroupAlgorithm/root_group.cpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/sycl/test-e2e/GroupAlgorithm/root_group.cpp b/sycl/test-e2e/GroupAlgorithm/root_group.cpp index 3aa8d662f486e..d8393f35c6253 100644 --- a/sycl/test-e2e/GroupAlgorithm/root_group.cpp +++ b/sycl/test-e2e/GroupAlgorithm/root_group.cpp @@ -1,6 +1,6 @@ // Fails with opencl non-cpu, enable when fixed. // XFAIL: (opencl && !cpu && !accelerator) -// RUN: %{build} -I . -o %t.out +// RUN: %{build} -I . -o %t.out %if any-device-is-cuda %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70 %} // RUN: %{run} %t.out // Disabled temporarily while investigation into the failure is ongoing. @@ -43,12 +43,9 @@ void testRootGroup() { const auto bundle = sycl::get_kernel_bundle(q.get_context()); const auto kernel = bundle.get_kernel(); - // TODO: Uncomment following lines after - // https://github.com/intel/llvm/pull/14333 is merged. - // const auto maxWGs = kernel.ext_oneapi_get_info< - // sycl::ext::oneapi::experimental::info::kernel_queue_specific:: - // max_num_work_group_sync>(q); - const auto maxWGs = 4; + const auto maxWGs = kernel.ext_oneapi_get_info< + sycl::ext::oneapi::experimental::info::kernel_queue_specific:: + max_num_work_group_sync>(q); const auto props = sycl::ext::oneapi::experimental::properties{ sycl::ext::oneapi::experimental::use_root_sync}; sycl::buffer dataBuf{sycl::range{maxWGs * WorkGroupSize}}; @@ -62,6 +59,8 @@ void testRootGroup() { auto root = it.ext_oneapi_get_root_group(); data[root.get_local_id()] = root.get_local_id(); sycl::group_barrier(root); + // Delay half of the workgroups with extra work to check that the barrier + // synchronizes the whole device. if (it.get_group(0) % 2 == 0) { X += sycl::sin(X); Y += sycl::cos(Y);