From 26604d5cb47cae4df477dec81bfdb746603325de Mon Sep 17 00:00:00 2001 From: Nicolas Miller Date: Thu, 20 Jan 2022 16:38:29 +0000 Subject: [PATCH 1/3] [SYCL][AMDGCN] Fix up and down shuffles and reductions This patch fixes the group collective implementation for AMDGCN, which had two main issues, in one place it was calling a regular `shuffle` instead of a `shuffleUp` which ended up breaking the reduction algorithm. In addition it was also not using the correct interface for the SPIR-V `shuffleUp` function. Which leads to the second part of this patch which fixes the `shuffleUp` and `shuffleDown` functions, mostly for the AMDGCN built-ins but also in the SYCL header, as the SYCL built-ins were not implemented properly on top of the SPIR-V built-ins. At the SYCL level, the `shuffleUp` and `shuffleDown` built-ins take a value to participate in the shuffle and a delta. The delta is used to compute which thread to take the value from during the shuffle operation. For `shuffleUp` it will be subtracted from the thread id, and for `shuffleDown` it will be added. And so in SYCL this delta must be defined such that `subgroup_local_id - delta` falls within `[0, subgroup_local_size[` for `shuffleUp`, and `subgroup_local_id + delta` falls within `[0, subgroup_local_size[` for `shuffleDown`. However in SPIR-V, these built-ins are a bit more complicated and take two values to participate in the shuffle and support twice the delta range as the SYCL built-ins. For example for `shuffleUp` the valid range for `subgroup_local_id - delta` is `[-subgroup_local_size, subgroup_local_size[` and in this instance if it falls within `[-subgroup_local_size, 0[` the first value will be used to participate in the shuffle, and if it falls within `[0, subgroup_local_size[` the second value will be used to participate in the shuffle. And it works in a similar way for `shuffleDown`. 
And so when implementing the SYCL built-ins using the SPIR-V built-ins, only half of the range can be used in a properly defined way, which means only one of the value parameters of the SPIR-V built-ins actually matters. Therefore the SYCL built-ins are implemented passing in the same value to both value parameters of the SPIR-V built-ins. The complete definition of the SPIR-V built-ins can be found here: * https://github.com/KhronosGroup/SPIRV-Registry/blob/main/extensions/INTEL/SPV_INTEL_subgroups.asciidoc#instructions --- .../libspirv/group/collectives.cl | 11 +- .../libspirv/misc/sub_group_shuffle.cl | 110 +++++++++++------- sycl/include/CL/sycl/detail/spirv.hpp | 45 ++++--- 3 files changed, 97 insertions(+), 69 deletions(-) diff --git a/libclc/amdgcn-amdhsa/libspirv/group/collectives.cl b/libclc/amdgcn-amdhsa/libspirv/group/collectives.cl index 1cce52f2944e4..72d9e1971ec58 100644 --- a/libclc/amdgcn-amdhsa/libspirv/group/collectives.cl +++ b/libclc/amdgcn-amdhsa/libspirv/group/collectives.cl @@ -42,8 +42,8 @@ __clc__get_group_scratch_double() __asm("__clc__get_group_scratch_double"); _CLC_DECL TYPE _Z28__spirv_SubgroupShuffleINTELI##TYPE_MANGLED##ET_S0_j( \ TYPE, int); \ _CLC_DECL TYPE \ - _Z30__spirv_SubgroupShuffleUpINTELI##TYPE_MANGLED##ET_S0_S0_j(TYPE, \ - int); + _Z30__spirv_SubgroupShuffleUpINTELI##TYPE_MANGLED##ET_S0_S0_j( \ + TYPE, TYPE, unsigned int); __CLC_DECLARE_SHUFFLES(char, a); __CLC_DECLARE_SHUFFLES(unsigned char, h); @@ -72,7 +72,8 @@ __CLC_DECLARE_SHUFFLES(double, d); /* Can't use XOR/butterfly shuffles; some lanes may be inactive */ \ for (int o = 1; o < __spirv_SubgroupMaxSize(); o *= 2) { \ TYPE contribution = \ - _Z28__spirv_SubgroupShuffleINTELI##TYPE_MANGLED##ET_S0_j(x, o); \ + _Z30__spirv_SubgroupShuffleUpINTELI##TYPE_MANGLED##ET_S0_S0_j(x, x, \ + o); \ bool inactive = (sg_lid < o); \ contribution = (inactive) ? 
IDENTITY : contribution; \ x = OP(x, contribution); \ @@ -90,8 +91,8 @@ __CLC_DECLARE_SHUFFLES(double, d); } /* For ExclusiveScan, shift and prepend identity */ \ else if (op == ExclusiveScan) { \ *carry = x; \ - result = \ - _Z30__spirv_SubgroupShuffleUpINTELI##TYPE_MANGLED##ET_S0_S0_j(x, 1); \ + result = _Z30__spirv_SubgroupShuffleUpINTELI##TYPE_MANGLED##ET_S0_S0_j( \ + x, x, 1); \ if (sg_lid == 0) { \ result = IDENTITY; \ } \ diff --git a/libclc/amdgcn-amdhsa/libspirv/misc/sub_group_shuffle.cl b/libclc/amdgcn-amdhsa/libspirv/misc/sub_group_shuffle.cl index 255a3e7564520..faa0dae2db469 100644 --- a/libclc/amdgcn-amdhsa/libspirv/misc/sub_group_shuffle.cl +++ b/libclc/amdgcn-amdhsa/libspirv/misc/sub_group_shuffle.cl @@ -256,12 +256,26 @@ __AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(double16, d, 16) // Shuffle Up // int __spirv_SubgroupShuffleUpINTEL(int, int, unsigned int) _CLC_DEF int -_Z30__spirv_SubgroupShuffleUpINTELIiET_S0_S0_j(int var, int lane_delta, - unsigned int width) { +_Z30__spirv_SubgroupShuffleUpINTELIiET_S0_S0_j(int previous, int current, + unsigned int delta) { int self = SELF; - int index = self - lane_delta; - index = (index < (self & ~(width - 1))) ? index : self; - return __builtin_amdgcn_ds_bpermute(index << 2, var); + int size = SUBGROUP_SIZE; + + int index = self - delta; + + int val; + if (index >= 0 && index < size) { + val = current; + } else if (index < 0 && index > -size) { + val = previous; + index = index + size; + } else { + // index out of bounds so return arbitrary data + val = current; + index = self; + } + + return __builtin_amdgcn_ds_bpermute(index << 2, val); } // Sub 32-bit types. 
@@ -272,9 +286,9 @@ _Z30__spirv_SubgroupShuffleUpINTELIiET_S0_S0_j(int var, int lane_delta, #define __AMDGCN_CLC_SUBGROUP_UP_SUB_I32(TYPE, MANGLED_TYPE_NAME) \ _CLC_DEF TYPE \ _Z30__spirv_SubgroupShuffleUpINTELI##MANGLED_TYPE_NAME##ET_S0_S0_j( \ - TYPE var, TYPE lane_delta, unsigned int width) { \ - return _Z30__spirv_SubgroupShuffleUpINTELIiET_S0_S0_j(var, lane_delta, \ - width); \ + TYPE previous, TYPE current, unsigned int delta) { \ + return _Z30__spirv_SubgroupShuffleUpINTELIiET_S0_S0_j(previous, current, \ + delta); \ } __AMDGCN_CLC_SUBGROUP_UP_SUB_I32(char, a); __AMDGCN_CLC_SUBGROUP_UP_SUB_I32(unsigned char, h); @@ -288,9 +302,9 @@ __AMDGCN_CLC_SUBGROUP_UP_SUB_I32(unsigned short, t); #define __AMDGCN_CLC_SUBGROUP_UP_I32(TYPE, CAST_TYPE, MANGLED_TYPE_NAME) \ _CLC_DEF TYPE \ _Z30__spirv_SubgroupShuffleUpINTELI##MANGLED_TYPE_NAME##ET_S0_S0_j( \ - TYPE var, TYPE lane_delta, unsigned int width) { \ + TYPE previous, TYPE current, unsigned int delta) { \ return __builtin_astype(_Z30__spirv_SubgroupShuffleUpINTELIiET_S0_S0_j( \ - as_int(var), as_int(lane_delta), width), \ + as_int(previous), as_int(current), delta), \ CAST_TYPE); \ } __AMDGCN_CLC_SUBGROUP_UP_I32(unsigned int, uint, j); @@ -304,13 +318,15 @@ __AMDGCN_CLC_SUBGROUP_UP_I32(float, float, f); #define __AMDGCN_CLC_SUBGROUP_UP_I64(TYPE, CAST_TYPE, MANGLED_TYPE_NAME) \ _CLC_DEF TYPE \ _Z30__spirv_SubgroupShuffleUpINTELI##MANGLED_TYPE_NAME##ET_S0_S0_j( \ - TYPE var, TYPE lane_delta, unsigned int width) { \ - int2 tmp = as_int2(var); \ - tmp.lo = _Z30__spirv_SubgroupShuffleUpINTELIiET_S0_S0_j( \ - tmp.lo, (int)lane_delta, width); \ - tmp.hi = _Z30__spirv_SubgroupShuffleUpINTELIiET_S0_S0_j( \ - tmp.hi, (int)lane_delta, width); \ - return __builtin_astype(tmp, CAST_TYPE); \ + TYPE previous, TYPE current, unsigned int delta) { \ + int2 tmp_previous = as_int2(previous); \ + int2 tmp_current = as_int2(current); \ + int2 ret; \ + ret.lo = _Z30__spirv_SubgroupShuffleUpINTELIiET_S0_S0_j( \ + tmp_previous.lo, 
tmp_current.lo, delta); \ + ret.hi = _Z30__spirv_SubgroupShuffleUpINTELIiET_S0_S0_j( \ + tmp_previous.hi, tmp_current.hi, delta); \ + return __builtin_astype(ret, CAST_TYPE); \ } __AMDGCN_CLC_SUBGROUP_UP_I64(long, long, l); __AMDGCN_CLC_SUBGROUP_UP_I64(unsigned long, ulong, m); @@ -321,12 +337,12 @@ __AMDGCN_CLC_SUBGROUP_UP_I64(double, double, d); #define __AMDGCN_CLC_SUBGROUP_UP_TO_VEC(TYPE, MANGLED_SCALAR_TY, NUM_ELEMS) \ _CLC_DEF TYPE \ _Z30__spirv_SubgroupShuffleUpINTELIDv##NUM_ELEMS##_##MANGLED_SCALAR_TY##ET_S1_S1_j( \ - TYPE var, TYPE lane_delta, unsigned int width) { \ + TYPE previous, TYPE current, unsigned int delta) { \ TYPE res; \ for (int i = 0; i < NUM_ELEMS; ++i) { \ res[i] = \ _Z30__spirv_SubgroupShuffleUpINTELI##MANGLED_SCALAR_TY##ET_S0_S0_j( \ - var[i], (unsigned int)lane_delta[0], width); \ + previous[i], current[i], delta); \ } \ return res; \ } @@ -381,12 +397,26 @@ __AMDGCN_CLC_SUBGROUP_UP_TO_VEC(double16, d, 16) // Shuffle Down // int __spirv_SubgroupShuffleDownINTEL(int, int, unsigned int) _CLC_DEF int -_Z32__spirv_SubgroupShuffleDownINTELIiET_S0_S0_j(int var, int lane_delta, - unsigned int width) { - unsigned int self = SELF; - unsigned int index = self + lane_delta; - index = as_uint(((self & (width - 1)) + lane_delta)) >= width ? self : index; - return __builtin_amdgcn_ds_bpermute(index << 2, var); +_Z32__spirv_SubgroupShuffleDownINTELIiET_S0_S0_j(int current, int next, + unsigned int delta) { + int self = SELF; + int size = SUBGROUP_SIZE; + + int index = self + delta; + + int val; + if (index < size) { + val = current; + } else if (index < 2 * size) { + val = next; + index = index - size; + } else { + // index out of bounds so return arbitrary data + val = current; + index = self; + } + + return __builtin_amdgcn_ds_bpermute(index << 2, val); } // Sub 32-bit types. 
@@ -397,9 +427,9 @@ _Z32__spirv_SubgroupShuffleDownINTELIiET_S0_S0_j(int var, int lane_delta, #define __AMDGCN_CLC_SUBGROUP_DOWN_TO_I32(TYPE, MANGLED_TYPE_NAME) \ _CLC_DEF TYPE \ _Z32__spirv_SubgroupShuffleDownINTELI##MANGLED_TYPE_NAME##ET_S0_S0_j( \ - TYPE var, TYPE lane_delta, unsigned int width) { \ - return _Z32__spirv_SubgroupShuffleDownINTELIiET_S0_S0_j(var, lane_delta, \ - width); \ + TYPE current, TYPE next, unsigned int delta) { \ + return _Z32__spirv_SubgroupShuffleDownINTELIiET_S0_S0_j(current, next, \ + delta); \ } __AMDGCN_CLC_SUBGROUP_DOWN_TO_I32(char, a); __AMDGCN_CLC_SUBGROUP_DOWN_TO_I32(unsigned char, h); @@ -413,9 +443,9 @@ __AMDGCN_CLC_SUBGROUP_DOWN_TO_I32(unsigned short, t); #define __AMDGCN_CLC_SUBGROUP_DOWN_I32(TYPE, CAST_TYPE, MANGLED_TYPE_NAME) \ _CLC_DEF TYPE \ _Z32__spirv_SubgroupShuffleDownINTELI##MANGLED_TYPE_NAME##ET_S0_S0_j( \ - TYPE var, TYPE lane_delta, unsigned int width) { \ + TYPE current, TYPE next, unsigned int delta) { \ return __builtin_astype(_Z32__spirv_SubgroupShuffleDownINTELIiET_S0_S0_j( \ - as_int(var), as_int(lane_delta), width), \ + as_int(current), as_int(next), delta), \ CAST_TYPE); \ } __AMDGCN_CLC_SUBGROUP_DOWN_I32(unsigned int, uint, j); @@ -427,13 +457,15 @@ __AMDGCN_CLC_SUBGROUP_DOWN_I32(float, float, f); #define __AMDGCN_CLC_SUBGROUP_DOWN_I64(TYPE, CAST_TYPE, MANGLED_TYPE_NAME) \ _CLC_DEF TYPE \ _Z32__spirv_SubgroupShuffleDownINTELI##MANGLED_TYPE_NAME##ET_S0_S0_j( \ - TYPE var, TYPE lane_delta, unsigned int width) { \ - int2 tmp = as_int2(var); \ - tmp.lo = _Z32__spirv_SubgroupShuffleDownINTELIiET_S0_S0_j( \ - tmp.lo, (int)lane_delta, width); \ - tmp.hi = _Z32__spirv_SubgroupShuffleDownINTELIiET_S0_S0_j( \ - tmp.hi, (int)lane_delta, width); \ - return __builtin_astype(tmp, CAST_TYPE); \ + TYPE current, TYPE next, unsigned int delta) { \ + int2 tmp_current = as_int2(current); \ + int2 tmp_next = as_int2(next); \ + int2 ret; \ + ret.lo = _Z32__spirv_SubgroupShuffleDownINTELIiET_S0_S0_j( \ + tmp_current.lo, 
tmp_next.lo, delta); \ + ret.hi = _Z32__spirv_SubgroupShuffleDownINTELIiET_S0_S0_j( \ + tmp_current.hi, tmp_next.hi, delta); \ + return __builtin_astype(ret, CAST_TYPE); \ } __AMDGCN_CLC_SUBGROUP_DOWN_I64(long, long, l); __AMDGCN_CLC_SUBGROUP_DOWN_I64(unsigned long, ulong, m); @@ -444,12 +476,12 @@ __AMDGCN_CLC_SUBGROUP_DOWN_I64(double, double, d); #define __AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(TYPE, MANGLED_SCALAR_TY, NUM_ELEMS) \ _CLC_DEF TYPE \ _Z32__spirv_SubgroupShuffleDownINTELIDv##NUM_ELEMS##_##MANGLED_SCALAR_TY##ET_S1_S1_j( \ - TYPE var, TYPE lane_delta, unsigned int width) { \ + TYPE current, TYPE next, unsigned int delta) { \ TYPE res; \ for (int i = 0; i < NUM_ELEMS; ++i) { \ res[i] = \ _Z32__spirv_SubgroupShuffleDownINTELI##MANGLED_SCALAR_TY##ET_S0_S0_j( \ - var[i], (unsigned int)lane_delta[0], width); \ + current[i], next[i], delta); \ } \ return res; \ } diff --git a/sycl/include/CL/sycl/detail/spirv.hpp b/sycl/include/CL/sycl/detail/spirv.hpp index 40077923bfd8f..4b6244695f29b 100644 --- a/sycl/include/CL/sycl/detail/spirv.hpp +++ b/sycl/include/CL/sycl/detail/spirv.hpp @@ -515,24 +515,22 @@ EnableIfNativeShuffle SubgroupShuffleXor(T x, id<1> local_id) { } template -EnableIfNativeShuffle SubgroupShuffleDown(T x, id<1> local_id) { +EnableIfNativeShuffle SubgroupShuffleDown(T x, uint32_t delta) { #ifndef __NVPTX__ using OCLT = detail::ConvertToOpenCLType_t; - return __spirv_SubgroupShuffleDownINTEL( - OCLT(x), OCLT(x), static_cast(local_id.get(0))); + return __spirv_SubgroupShuffleDownINTEL(OCLT(x), OCLT(x), delta); #else - return __nvvm_shfl_sync_down_i32(membermask(), x, local_id.get(0), 0x1f); + return __nvvm_shfl_sync_down_i32(membermask(), x, delta, 0x1f); #endif } template -EnableIfNativeShuffle SubgroupShuffleUp(T x, id<1> local_id) { +EnableIfNativeShuffle SubgroupShuffleUp(T x, uint32_t delta) { #ifndef __NVPTX__ using OCLT = detail::ConvertToOpenCLType_t; - return __spirv_SubgroupShuffleUpINTEL(OCLT(x), OCLT(x), - static_cast(local_id.get(0))); + 
return __spirv_SubgroupShuffleUpINTEL(OCLT(x), OCLT(x), delta); #else - return __nvvm_shfl_sync_up_i32(membermask(), x, local_id.get(0), 0); + return __nvvm_shfl_sync_up_i32(membermask(), x, delta, 0); #endif } @@ -556,19 +554,19 @@ EnableIfVectorShuffle SubgroupShuffleXor(T x, id<1> local_id) { } template -EnableIfVectorShuffle SubgroupShuffleDown(T x, id<1> local_id) { +EnableIfVectorShuffle SubgroupShuffleDown(T x, uint32_t delta) { T result; for (int s = 0; s < x.get_size(); ++s) { - result[s] = SubgroupShuffleDown(x[s], local_id); + result[s] = SubgroupShuffleDown(x[s], delta); } return result; } template -EnableIfVectorShuffle SubgroupShuffleUp(T x, id<1> local_id) { +EnableIfVectorShuffle SubgroupShuffleUp(T x, uint32_t delta) { T result; for (int s = 0; s < x.get_size(); ++s) { - result[s] = SubgroupShuffleUp(x[s], local_id); + result[s] = SubgroupShuffleUp(x[s], delta); } return result; } @@ -626,29 +624,26 @@ EnableIfBitcastShuffle SubgroupShuffleXor(T x, id<1> local_id) { } template -EnableIfBitcastShuffle SubgroupShuffleDown(T x, id<1> local_id) { +EnableIfBitcastShuffle SubgroupShuffleDown(T x, uint32_t delta) { using ShuffleT = ConvertToNativeShuffleType_t; auto ShuffleX = bit_cast(x); #ifndef __NVPTX__ - ShuffleT Result = __spirv_SubgroupShuffleDownINTEL( - ShuffleX, ShuffleX, static_cast(local_id.get(0))); + ShuffleT Result = __spirv_SubgroupShuffleDownINTEL(ShuffleX, ShuffleX, delta); #else ShuffleT Result = - __nvvm_shfl_sync_down_i32(membermask(), ShuffleX, local_id.get(0), 0x1f); + __nvvm_shfl_sync_down_i32(membermask(), ShuffleX, delta, 0x1f); #endif return bit_cast(Result); } template -EnableIfBitcastShuffle SubgroupShuffleUp(T x, id<1> local_id) { +EnableIfBitcastShuffle SubgroupShuffleUp(T x, uint32_t delta) { using ShuffleT = ConvertToNativeShuffleType_t; auto ShuffleX = bit_cast(x); #ifndef __NVPTX__ - ShuffleT Result = __spirv_SubgroupShuffleUpINTEL( - ShuffleX, ShuffleX, static_cast(local_id.get(0))); + ShuffleT Result = 
__spirv_SubgroupShuffleUpINTEL(ShuffleX, ShuffleX, delta); #else - ShuffleT Result = - __nvvm_shfl_sync_up_i32(membermask(), ShuffleX, local_id.get(0), 0); + ShuffleT Result = __nvvm_shfl_sync_up_i32(membermask(), ShuffleX, delta, 0); #endif return bit_cast(Result); } @@ -706,14 +701,14 @@ EnableIfGenericShuffle SubgroupShuffleXor(T x, id<1> local_id) { } template -EnableIfGenericShuffle SubgroupShuffleDown(T x, id<1> local_id) { +EnableIfGenericShuffle SubgroupShuffleDown(T x, uint32_t delta) { T Result; char *XBytes = reinterpret_cast(&x); char *ResultBytes = reinterpret_cast(&Result); auto ShuffleBytes = [=](size_t Offset, size_t Size) { ShuffleChunkT ShuffleX, ShuffleResult; std::memcpy(&ShuffleX, XBytes + Offset, Size); - ShuffleResult = SubgroupShuffleDown(ShuffleX, local_id); + ShuffleResult = SubgroupShuffleDown(ShuffleX, delta); std::memcpy(ResultBytes + Offset, &ShuffleResult, Size); }; GenericCall(ShuffleBytes); @@ -721,14 +716,14 @@ EnableIfGenericShuffle SubgroupShuffleDown(T x, id<1> local_id) { } template -EnableIfGenericShuffle SubgroupShuffleUp(T x, id<1> local_id) { +EnableIfGenericShuffle SubgroupShuffleUp(T x, uint32_t delta) { T Result; char *XBytes = reinterpret_cast(&x); char *ResultBytes = reinterpret_cast(&Result); auto ShuffleBytes = [=](size_t Offset, size_t Size) { ShuffleChunkT ShuffleX, ShuffleResult; std::memcpy(&ShuffleX, XBytes + Offset, Size); - ShuffleResult = SubgroupShuffleUp(ShuffleX, local_id); + ShuffleResult = SubgroupShuffleUp(ShuffleX, delta); std::memcpy(ResultBytes + Offset, &ShuffleResult, Size); }; GenericCall(ShuffleBytes); From 1f7f1ee99ca6f0b34d122b6bb86a1f95c35b0eb3 Mon Sep 17 00:00:00 2001 From: Nicolas Miller Date: Tue, 25 Jan 2022 11:39:38 +0000 Subject: [PATCH 2/3] [LIBCLC][AMDGCN] Fix get_max_sub_group_size Using defines to figure out the wavefront size there is incorrect because libclc is not built for a specific amdgcn version, so it will always default to `64`. 
Instead use the `__oclc_wavefrontsize64` global variable provided by ROCm, which will be set to a different value depending on the architecture. --- .../workitem/get_max_sub_group_size.cl | 25 ++++++++----------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/libclc/amdgcn-amdhsa/libspirv/workitem/get_max_sub_group_size.cl b/libclc/amdgcn-amdhsa/libspirv/workitem/get_max_sub_group_size.cl index 394405a0c1931..ba9b88652a890 100644 --- a/libclc/amdgcn-amdhsa/libspirv/workitem/get_max_sub_group_size.cl +++ b/libclc/amdgcn-amdhsa/libspirv/workitem/get_max_sub_group_size.cl @@ -8,21 +8,16 @@ #include -// FIXME: Remove the following workaround once the clang change is released. -// This is for backward compatibility with older clang which does not define -// __AMDGCN_WAVEFRONT_SIZE. It does not consider -mwavefrontsize64. -// See: -// https://github.com/intel/llvm/blob/sycl/clang/lib/Basic/Targets/AMDGPU.h#L414 -// and: -// https://github.com/intel/llvm/blob/sycl/clang/lib/Basic/Targets/AMDGPU.cpp#L421 -#ifndef __AMDGCN_WAVEFRONT_SIZE -#if __gfx1010__ || __gfx1011__ || __gfx1012__ || __gfx1030__ || __gfx1031__ -#define __AMDGCN_WAVEFRONT_SIZE 32 -#else -#define __AMDGCN_WAVEFRONT_SIZE 64 -#endif -#endif +// The clang driver will define this variable depending on the architecture and +// compile flags by linking in ROCm bitcode defining it to true or false. If +// it's 1 the wavefront size used is 64, if it's 0 the wavefront size used is +// 32. 
+extern constant unsigned char __oclc_wavefrontsize64; _CLC_DEF _CLC_OVERLOAD uint __spirv_SubgroupMaxSize() { - return __AMDGCN_WAVEFRONT_SIZE; + if (__oclc_wavefrontsize64 == 1) { + return 64; + } else { + return 32; + } } From ffc0867480829daae0ef65f54519b0e2ede66148 Mon Sep 17 00:00:00 2001 From: Nicolas Miller Date: Tue, 25 Jan 2022 12:25:13 +0000 Subject: [PATCH 3/3] Update libclc/amdgcn-amdhsa/libspirv/workitem/get_max_sub_group_size.cl Co-authored-by: Alexey Bader --- .../amdgcn-amdhsa/libspirv/workitem/get_max_sub_group_size.cl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/libclc/amdgcn-amdhsa/libspirv/workitem/get_max_sub_group_size.cl b/libclc/amdgcn-amdhsa/libspirv/workitem/get_max_sub_group_size.cl index ba9b88652a890..55fc70e5225bc 100644 --- a/libclc/amdgcn-amdhsa/libspirv/workitem/get_max_sub_group_size.cl +++ b/libclc/amdgcn-amdhsa/libspirv/workitem/get_max_sub_group_size.cl @@ -17,7 +17,6 @@ extern constant unsigned char __oclc_wavefrontsize64; _CLC_DEF _CLC_OVERLOAD uint __spirv_SubgroupMaxSize() { if (__oclc_wavefrontsize64 == 1) { return 64; - } else { - return 32; } + return 32; }