From 26604d5cb47cae4df477dec81bfdb746603325de Mon Sep 17 00:00:00 2001 From: Nicolas Miller Date: Thu, 20 Jan 2022 16:38:29 +0000 Subject: [PATCH 1/3] [SYCL][AMDGCN] Fix up and down shuffles and reductions This patch fixes the group collective implementation for AMDGCN, which had two main issues, in one place it was calling a regular `shuffle` instead of a `shuffleUp` which ended up breaking the reduction algorithm. In addition it was also not using the correct interface for the SPIR-V `shuffleUp` function. Which leads to the second part of this patch which fixes the `shuffleUp` and `shuffleDown` functions, mostly for the AMDGCN built-ins but also in the SYCL header, as the SYCL built-ins were not implemented properly on top of the SPIR-V built-ins. At the SYCL level, the `shuffleUp` and `shuffleDown` built-ins take a value to participate in the shuffle and a delta. The delta is used to compute which thread to take the value from during the shuffle operation. For `shuffleUp` it will be subtracted from the thread id, and for `shuffleDown` it will be added. And so in SYCL this delta must be defined such that `subgroup_local_id - delta` falls within `[0, subgroup_local_size[` for `shuffleUp`, and `subgroup_local_id + delta` falls within `[0, subgroup_local_size[` for `shuffleDown`. However in SPIR-V, these built-ins are a bit more complicated and take two values to participate in the shuffle and support twice the delta range as the SYCL built-ins. For example for `shuffleUp` the valid range for `subgroup_local_id - delta` is `[-subgroup_local_size, subgroup_local_size[` and in this instance if it falls within `[-subgroup_local_size, 0[` the first value will be used to participate in the shuffle, and if it falls within `[0, subgroup_local_size[` the second value will be used to participate in the shuffle. And it works in a similar way for `shuffleDown`. 
And so when implementing the SYCL built-ins using the SPIR-V built-ins, only half of the range can be used in a properly defined way, which means only one of the value parameters of the SPIR-V built-ins actually matters. Therefore the SYCL built-ins are implemented passing in the same value to both value parameters of the SPIR-V built-ins. The complete definition of the SPIR-V built-ins can be found here: * https://github.com/KhronosGroup/SPIRV-Registry/blob/main/extensions/INTEL/SPV_INTEL_subgroups.asciidoc#instructions --- .../libspirv/group/collectives.cl | 11 +- .../libspirv/misc/sub_group_shuffle.cl | 110 +++++++++++------- sycl/include/CL/sycl/detail/spirv.hpp | 45 ++++--- 3 files changed, 97 insertions(+), 69 deletions(-) diff --git a/libclc/amdgcn-amdhsa/libspirv/group/collectives.cl b/libclc/amdgcn-amdhsa/libspirv/group/collectives.cl index 1cce52f2944e4..72d9e1971ec58 100644 --- a/libclc/amdgcn-amdhsa/libspirv/group/collectives.cl +++ b/libclc/amdgcn-amdhsa/libspirv/group/collectives.cl @@ -42,8 +42,8 @@ __clc__get_group_scratch_double() __asm("__clc__get_group_scratch_double"); _CLC_DECL TYPE _Z28__spirv_SubgroupShuffleINTELI##TYPE_MANGLED##ET_S0_j( \ TYPE, int); \ _CLC_DECL TYPE \ - _Z30__spirv_SubgroupShuffleUpINTELI##TYPE_MANGLED##ET_S0_S0_j(TYPE, \ - int); + _Z30__spirv_SubgroupShuffleUpINTELI##TYPE_MANGLED##ET_S0_S0_j( \ + TYPE, TYPE, unsigned int); __CLC_DECLARE_SHUFFLES(char, a); __CLC_DECLARE_SHUFFLES(unsigned char, h); @@ -72,7 +72,8 @@ __CLC_DECLARE_SHUFFLES(double, d); /* Can't use XOR/butterfly shuffles; some lanes may be inactive */ \ for (int o = 1; o < __spirv_SubgroupMaxSize(); o *= 2) { \ TYPE contribution = \ - _Z28__spirv_SubgroupShuffleINTELI##TYPE_MANGLED##ET_S0_j(x, o); \ + _Z30__spirv_SubgroupShuffleUpINTELI##TYPE_MANGLED##ET_S0_S0_j(x, x, \ + o); \ bool inactive = (sg_lid < o); \ contribution = (inactive) ? 
IDENTITY : contribution; \ x = OP(x, contribution); \ @@ -90,8 +91,8 @@ __CLC_DECLARE_SHUFFLES(double, d); } /* For ExclusiveScan, shift and prepend identity */ \ else if (op == ExclusiveScan) { \ *carry = x; \ - result = \ - _Z30__spirv_SubgroupShuffleUpINTELI##TYPE_MANGLED##ET_S0_S0_j(x, 1); \ + result = _Z30__spirv_SubgroupShuffleUpINTELI##TYPE_MANGLED##ET_S0_S0_j( \ + x, x, 1); \ if (sg_lid == 0) { \ result = IDENTITY; \ } \ diff --git a/libclc/amdgcn-amdhsa/libspirv/misc/sub_group_shuffle.cl b/libclc/amdgcn-amdhsa/libspirv/misc/sub_group_shuffle.cl index 255a3e7564520..faa0dae2db469 100644 --- a/libclc/amdgcn-amdhsa/libspirv/misc/sub_group_shuffle.cl +++ b/libclc/amdgcn-amdhsa/libspirv/misc/sub_group_shuffle.cl @@ -256,12 +256,26 @@ __AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(double16, d, 16) // Shuffle Up // int __spirv_SubgroupShuffleUpINTEL(int, int, unsigned int) _CLC_DEF int -_Z30__spirv_SubgroupShuffleUpINTELIiET_S0_S0_j(int var, int lane_delta, - unsigned int width) { +_Z30__spirv_SubgroupShuffleUpINTELIiET_S0_S0_j(int previous, int current, + unsigned int delta) { int self = SELF; - int index = self - lane_delta; - index = (index < (self & ~(width - 1))) ? index : self; - return __builtin_amdgcn_ds_bpermute(index << 2, var); + int size = SUBGROUP_SIZE; + + int index = self - delta; + + int val; + if (index >= 0 && index < size) { + val = current; + } else if (index < 0 && index > -size) { + val = previous; + index = index + size; + } else { + // index out of bounds so return arbitrary data + val = current; + index = self; + } + + return __builtin_amdgcn_ds_bpermute(index << 2, val); } // Sub 32-bit types. 
@@ -272,9 +286,9 @@ _Z30__spirv_SubgroupShuffleUpINTELIiET_S0_S0_j(int var, int lane_delta, #define __AMDGCN_CLC_SUBGROUP_UP_SUB_I32(TYPE, MANGLED_TYPE_NAME) \ _CLC_DEF TYPE \ _Z30__spirv_SubgroupShuffleUpINTELI##MANGLED_TYPE_NAME##ET_S0_S0_j( \ - TYPE var, TYPE lane_delta, unsigned int width) { \ - return _Z30__spirv_SubgroupShuffleUpINTELIiET_S0_S0_j(var, lane_delta, \ - width); \ + TYPE previous, TYPE current, unsigned int delta) { \ + return _Z30__spirv_SubgroupShuffleUpINTELIiET_S0_S0_j(previous, current, \ + delta); \ } __AMDGCN_CLC_SUBGROUP_UP_SUB_I32(char, a); __AMDGCN_CLC_SUBGROUP_UP_SUB_I32(unsigned char, h); @@ -288,9 +302,9 @@ __AMDGCN_CLC_SUBGROUP_UP_SUB_I32(unsigned short, t); #define __AMDGCN_CLC_SUBGROUP_UP_I32(TYPE, CAST_TYPE, MANGLED_TYPE_NAME) \ _CLC_DEF TYPE \ _Z30__spirv_SubgroupShuffleUpINTELI##MANGLED_TYPE_NAME##ET_S0_S0_j( \ - TYPE var, TYPE lane_delta, unsigned int width) { \ + TYPE previous, TYPE current, unsigned int delta) { \ return __builtin_astype(_Z30__spirv_SubgroupShuffleUpINTELIiET_S0_S0_j( \ - as_int(var), as_int(lane_delta), width), \ + as_int(previous), as_int(current), delta), \ CAST_TYPE); \ } __AMDGCN_CLC_SUBGROUP_UP_I32(unsigned int, uint, j); @@ -304,13 +318,15 @@ __AMDGCN_CLC_SUBGROUP_UP_I32(float, float, f); #define __AMDGCN_CLC_SUBGROUP_UP_I64(TYPE, CAST_TYPE, MANGLED_TYPE_NAME) \ _CLC_DEF TYPE \ _Z30__spirv_SubgroupShuffleUpINTELI##MANGLED_TYPE_NAME##ET_S0_S0_j( \ - TYPE var, TYPE lane_delta, unsigned int width) { \ - int2 tmp = as_int2(var); \ - tmp.lo = _Z30__spirv_SubgroupShuffleUpINTELIiET_S0_S0_j( \ - tmp.lo, (int)lane_delta, width); \ - tmp.hi = _Z30__spirv_SubgroupShuffleUpINTELIiET_S0_S0_j( \ - tmp.hi, (int)lane_delta, width); \ - return __builtin_astype(tmp, CAST_TYPE); \ + TYPE previous, TYPE current, unsigned int delta) { \ + int2 tmp_previous = as_int2(previous); \ + int2 tmp_current = as_int2(current); \ + int2 ret; \ + ret.lo = _Z30__spirv_SubgroupShuffleUpINTELIiET_S0_S0_j( \ + tmp_previous.lo, 
tmp_current.lo, delta); \ + ret.hi = _Z30__spirv_SubgroupShuffleUpINTELIiET_S0_S0_j( \ + tmp_previous.hi, tmp_current.hi, delta); \ + return __builtin_astype(ret, CAST_TYPE); \ } __AMDGCN_CLC_SUBGROUP_UP_I64(long, long, l); __AMDGCN_CLC_SUBGROUP_UP_I64(unsigned long, ulong, m); @@ -321,12 +337,12 @@ __AMDGCN_CLC_SUBGROUP_UP_I64(double, double, d); #define __AMDGCN_CLC_SUBGROUP_UP_TO_VEC(TYPE, MANGLED_SCALAR_TY, NUM_ELEMS) \ _CLC_DEF TYPE \ _Z30__spirv_SubgroupShuffleUpINTELIDv##NUM_ELEMS##_##MANGLED_SCALAR_TY##ET_S1_S1_j( \ - TYPE var, TYPE lane_delta, unsigned int width) { \ + TYPE previous, TYPE current, unsigned int delta) { \ TYPE res; \ for (int i = 0; i < NUM_ELEMS; ++i) { \ res[i] = \ _Z30__spirv_SubgroupShuffleUpINTELI##MANGLED_SCALAR_TY##ET_S0_S0_j( \ - var[i], (unsigned int)lane_delta[0], width); \ + previous[i], current[i], delta); \ } \ return res; \ } @@ -381,12 +397,26 @@ __AMDGCN_CLC_SUBGROUP_UP_TO_VEC(double16, d, 16) // Shuffle Down // int __spirv_SubgroupShuffleDownINTEL(int, int, unsigned int) _CLC_DEF int -_Z32__spirv_SubgroupShuffleDownINTELIiET_S0_S0_j(int var, int lane_delta, - unsigned int width) { - unsigned int self = SELF; - unsigned int index = self + lane_delta; - index = as_uint(((self & (width - 1)) + lane_delta)) >= width ? self : index; - return __builtin_amdgcn_ds_bpermute(index << 2, var); +_Z32__spirv_SubgroupShuffleDownINTELIiET_S0_S0_j(int current, int next, + unsigned int delta) { + int self = SELF; + int size = SUBGROUP_SIZE; + + int index = self + delta; + + int val; + if (index < size) { + val = current; + } else if (index < 2 * size) { + val = next; + index = index - size; + } else { + // index out of bounds so return arbitrary data + val = current; + index = self; + } + + return __builtin_amdgcn_ds_bpermute(index << 2, val); } // Sub 32-bit types. 
@@ -397,9 +427,9 @@ _Z32__spirv_SubgroupShuffleDownINTELIiET_S0_S0_j(int var, int lane_delta, #define __AMDGCN_CLC_SUBGROUP_DOWN_TO_I32(TYPE, MANGLED_TYPE_NAME) \ _CLC_DEF TYPE \ _Z32__spirv_SubgroupShuffleDownINTELI##MANGLED_TYPE_NAME##ET_S0_S0_j( \ - TYPE var, TYPE lane_delta, unsigned int width) { \ - return _Z32__spirv_SubgroupShuffleDownINTELIiET_S0_S0_j(var, lane_delta, \ - width); \ + TYPE current, TYPE next, unsigned int delta) { \ + return _Z32__spirv_SubgroupShuffleDownINTELIiET_S0_S0_j(current, next, \ + delta); \ } __AMDGCN_CLC_SUBGROUP_DOWN_TO_I32(char, a); __AMDGCN_CLC_SUBGROUP_DOWN_TO_I32(unsigned char, h); @@ -413,9 +443,9 @@ __AMDGCN_CLC_SUBGROUP_DOWN_TO_I32(unsigned short, t); #define __AMDGCN_CLC_SUBGROUP_DOWN_I32(TYPE, CAST_TYPE, MANGLED_TYPE_NAME) \ _CLC_DEF TYPE \ _Z32__spirv_SubgroupShuffleDownINTELI##MANGLED_TYPE_NAME##ET_S0_S0_j( \ - TYPE var, TYPE lane_delta, unsigned int width) { \ + TYPE current, TYPE next, unsigned int delta) { \ return __builtin_astype(_Z32__spirv_SubgroupShuffleDownINTELIiET_S0_S0_j( \ - as_int(var), as_int(lane_delta), width), \ + as_int(current), as_int(next), delta), \ CAST_TYPE); \ } __AMDGCN_CLC_SUBGROUP_DOWN_I32(unsigned int, uint, j); @@ -427,13 +457,15 @@ __AMDGCN_CLC_SUBGROUP_DOWN_I32(float, float, f); #define __AMDGCN_CLC_SUBGROUP_DOWN_I64(TYPE, CAST_TYPE, MANGLED_TYPE_NAME) \ _CLC_DEF TYPE \ _Z32__spirv_SubgroupShuffleDownINTELI##MANGLED_TYPE_NAME##ET_S0_S0_j( \ - TYPE var, TYPE lane_delta, unsigned int width) { \ - int2 tmp = as_int2(var); \ - tmp.lo = _Z32__spirv_SubgroupShuffleDownINTELIiET_S0_S0_j( \ - tmp.lo, (int)lane_delta, width); \ - tmp.hi = _Z32__spirv_SubgroupShuffleDownINTELIiET_S0_S0_j( \ - tmp.hi, (int)lane_delta, width); \ - return __builtin_astype(tmp, CAST_TYPE); \ + TYPE current, TYPE next, unsigned int delta) { \ + int2 tmp_current = as_int2(current); \ + int2 tmp_next = as_int2(next); \ + int2 ret; \ + ret.lo = _Z32__spirv_SubgroupShuffleDownINTELIiET_S0_S0_j( \ + tmp_current.lo, 
tmp_next.lo, delta); \ + ret.hi = _Z32__spirv_SubgroupShuffleDownINTELIiET_S0_S0_j( \ + tmp_current.hi, tmp_next.hi, delta); \ + return __builtin_astype(ret, CAST_TYPE); \ } __AMDGCN_CLC_SUBGROUP_DOWN_I64(long, long, l); __AMDGCN_CLC_SUBGROUP_DOWN_I64(unsigned long, ulong, m); @@ -444,12 +476,12 @@ __AMDGCN_CLC_SUBGROUP_DOWN_I64(double, double, d); #define __AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(TYPE, MANGLED_SCALAR_TY, NUM_ELEMS) \ _CLC_DEF TYPE \ _Z32__spirv_SubgroupShuffleDownINTELIDv##NUM_ELEMS##_##MANGLED_SCALAR_TY##ET_S1_S1_j( \ - TYPE var, TYPE lane_delta, unsigned int width) { \ + TYPE current, TYPE next, unsigned int delta) { \ TYPE res; \ for (int i = 0; i < NUM_ELEMS; ++i) { \ res[i] = \ _Z32__spirv_SubgroupShuffleDownINTELI##MANGLED_SCALAR_TY##ET_S0_S0_j( \ - var[i], (unsigned int)lane_delta[0], width); \ + current[i], next[i], delta); \ } \ return res; \ } diff --git a/sycl/include/CL/sycl/detail/spirv.hpp b/sycl/include/CL/sycl/detail/spirv.hpp index 40077923bfd8f..4b6244695f29b 100644 --- a/sycl/include/CL/sycl/detail/spirv.hpp +++ b/sycl/include/CL/sycl/detail/spirv.hpp @@ -515,24 +515,22 @@ EnableIfNativeShuffle SubgroupShuffleXor(T x, id<1> local_id) { } template -EnableIfNativeShuffle SubgroupShuffleDown(T x, id<1> local_id) { +EnableIfNativeShuffle SubgroupShuffleDown(T x, uint32_t delta) { #ifndef __NVPTX__ using OCLT = detail::ConvertToOpenCLType_t; - return __spirv_SubgroupShuffleDownINTEL( - OCLT(x), OCLT(x), static_cast(local_id.get(0))); + return __spirv_SubgroupShuffleDownINTEL(OCLT(x), OCLT(x), delta); #else - return __nvvm_shfl_sync_down_i32(membermask(), x, local_id.get(0), 0x1f); + return __nvvm_shfl_sync_down_i32(membermask(), x, delta, 0x1f); #endif } template -EnableIfNativeShuffle SubgroupShuffleUp(T x, id<1> local_id) { +EnableIfNativeShuffle SubgroupShuffleUp(T x, uint32_t delta) { #ifndef __NVPTX__ using OCLT = detail::ConvertToOpenCLType_t; - return __spirv_SubgroupShuffleUpINTEL(OCLT(x), OCLT(x), - static_cast(local_id.get(0))); + 
return __spirv_SubgroupShuffleUpINTEL(OCLT(x), OCLT(x), delta); #else - return __nvvm_shfl_sync_up_i32(membermask(), x, local_id.get(0), 0); + return __nvvm_shfl_sync_up_i32(membermask(), x, delta, 0); #endif } @@ -556,19 +554,19 @@ EnableIfVectorShuffle SubgroupShuffleXor(T x, id<1> local_id) { } template -EnableIfVectorShuffle SubgroupShuffleDown(T x, id<1> local_id) { +EnableIfVectorShuffle SubgroupShuffleDown(T x, uint32_t delta) { T result; for (int s = 0; s < x.get_size(); ++s) { - result[s] = SubgroupShuffleDown(x[s], local_id); + result[s] = SubgroupShuffleDown(x[s], delta); } return result; } template -EnableIfVectorShuffle SubgroupShuffleUp(T x, id<1> local_id) { +EnableIfVectorShuffle SubgroupShuffleUp(T x, uint32_t delta) { T result; for (int s = 0; s < x.get_size(); ++s) { - result[s] = SubgroupShuffleUp(x[s], local_id); + result[s] = SubgroupShuffleUp(x[s], delta); } return result; } @@ -626,29 +624,26 @@ EnableIfBitcastShuffle SubgroupShuffleXor(T x, id<1> local_id) { } template -EnableIfBitcastShuffle SubgroupShuffleDown(T x, id<1> local_id) { +EnableIfBitcastShuffle SubgroupShuffleDown(T x, uint32_t delta) { using ShuffleT = ConvertToNativeShuffleType_t; auto ShuffleX = bit_cast(x); #ifndef __NVPTX__ - ShuffleT Result = __spirv_SubgroupShuffleDownINTEL( - ShuffleX, ShuffleX, static_cast(local_id.get(0))); + ShuffleT Result = __spirv_SubgroupShuffleDownINTEL(ShuffleX, ShuffleX, delta); #else ShuffleT Result = - __nvvm_shfl_sync_down_i32(membermask(), ShuffleX, local_id.get(0), 0x1f); + __nvvm_shfl_sync_down_i32(membermask(), ShuffleX, delta, 0x1f); #endif return bit_cast(Result); } template -EnableIfBitcastShuffle SubgroupShuffleUp(T x, id<1> local_id) { +EnableIfBitcastShuffle SubgroupShuffleUp(T x, uint32_t delta) { using ShuffleT = ConvertToNativeShuffleType_t; auto ShuffleX = bit_cast(x); #ifndef __NVPTX__ - ShuffleT Result = __spirv_SubgroupShuffleUpINTEL( - ShuffleX, ShuffleX, static_cast(local_id.get(0))); + ShuffleT Result = 
__spirv_SubgroupShuffleUpINTEL(ShuffleX, ShuffleX, delta); #else - ShuffleT Result = - __nvvm_shfl_sync_up_i32(membermask(), ShuffleX, local_id.get(0), 0); + ShuffleT Result = __nvvm_shfl_sync_up_i32(membermask(), ShuffleX, delta, 0); #endif return bit_cast(Result); } @@ -706,14 +701,14 @@ EnableIfGenericShuffle SubgroupShuffleXor(T x, id<1> local_id) { } template -EnableIfGenericShuffle SubgroupShuffleDown(T x, id<1> local_id) { +EnableIfGenericShuffle SubgroupShuffleDown(T x, uint32_t delta) { T Result; char *XBytes = reinterpret_cast(&x); char *ResultBytes = reinterpret_cast(&Result); auto ShuffleBytes = [=](size_t Offset, size_t Size) { ShuffleChunkT ShuffleX, ShuffleResult; std::memcpy(&ShuffleX, XBytes + Offset, Size); - ShuffleResult = SubgroupShuffleDown(ShuffleX, local_id); + ShuffleResult = SubgroupShuffleDown(ShuffleX, delta); std::memcpy(ResultBytes + Offset, &ShuffleResult, Size); }; GenericCall(ShuffleBytes); @@ -721,14 +716,14 @@ EnableIfGenericShuffle SubgroupShuffleDown(T x, id<1> local_id) { } template -EnableIfGenericShuffle SubgroupShuffleUp(T x, id<1> local_id) { +EnableIfGenericShuffle SubgroupShuffleUp(T x, uint32_t delta) { T Result; char *XBytes = reinterpret_cast(&x); char *ResultBytes = reinterpret_cast(&Result); auto ShuffleBytes = [=](size_t Offset, size_t Size) { ShuffleChunkT ShuffleX, ShuffleResult; std::memcpy(&ShuffleX, XBytes + Offset, Size); - ShuffleResult = SubgroupShuffleUp(ShuffleX, local_id); + ShuffleResult = SubgroupShuffleUp(ShuffleX, delta); std::memcpy(ResultBytes + Offset, &ShuffleResult, Size); }; GenericCall(ShuffleBytes); From 1f7f1ee99ca6f0b34d122b6bb86a1f95c35b0eb3 Mon Sep 17 00:00:00 2001 From: Nicolas Miller Date: Tue, 25 Jan 2022 11:39:38 +0000 Subject: [PATCH 2/3] [LIBCLC][AMDGCN] Fix get_max_sub_group_size Using defines to figure out the wavefront size there is incorrect because libclc is not built for a specific amdgcn version, so it will always default to `64`. 
Instead use the `__oclc_wavefrontsize64` global variable provided by ROCm, which will be set to a different value depending on the architecture. --- .../workitem/get_max_sub_group_size.cl | 25 ++++++++----------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/libclc/amdgcn-amdhsa/libspirv/workitem/get_max_sub_group_size.cl b/libclc/amdgcn-amdhsa/libspirv/workitem/get_max_sub_group_size.cl index 394405a0c1931..ba9b88652a890 100644 --- a/libclc/amdgcn-amdhsa/libspirv/workitem/get_max_sub_group_size.cl +++ b/libclc/amdgcn-amdhsa/libspirv/workitem/get_max_sub_group_size.cl @@ -8,21 +8,16 @@ #include -// FIXME: Remove the following workaround once the clang change is released. -// This is for backward compatibility with older clang which does not define -// __AMDGCN_WAVEFRONT_SIZE. It does not consider -mwavefrontsize64. -// See: -// https://github.com/intel/llvm/blob/sycl/clang/lib/Basic/Targets/AMDGPU.h#L414 -// and: -// https://github.com/intel/llvm/blob/sycl/clang/lib/Basic/Targets/AMDGPU.cpp#L421 -#ifndef __AMDGCN_WAVEFRONT_SIZE -#if __gfx1010__ || __gfx1011__ || __gfx1012__ || __gfx1030__ || __gfx1031__ -#define __AMDGCN_WAVEFRONT_SIZE 32 -#else -#define __AMDGCN_WAVEFRONT_SIZE 64 -#endif -#endif +// The clang driver will define this variable depending on the architecture and +// compile flags by linking in ROCm bitcode defining it to true or false. If +// it's 1 the wavefront size used is 64, if it's 0 the wavefront size used is +// 32. 
+extern constant unsigned char __oclc_wavefrontsize64; _CLC_DEF _CLC_OVERLOAD uint __spirv_SubgroupMaxSize() { - return __AMDGCN_WAVEFRONT_SIZE; + if (__oclc_wavefrontsize64 == 1) { + return 64; + } else { + return 32; + } } From ffc0867480829daae0ef65f54519b0e2ede66148 Mon Sep 17 00:00:00 2001 From: Nicolas Miller Date: Tue, 25 Jan 2022 12:25:13 +0000 Subject: [PATCH 3/3] Update libclc/amdgcn-amdhsa/libspirv/workitem/get_max_sub_group_size.cl Co-authored-by: Alexey Bader --- .../amdgcn-amdhsa/libspirv/workitem/get_max_sub_group_size.cl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/libclc/amdgcn-amdhsa/libspirv/workitem/get_max_sub_group_size.cl b/libclc/amdgcn-amdhsa/libspirv/workitem/get_max_sub_group_size.cl index ba9b88652a890..55fc70e5225bc 100644 --- a/libclc/amdgcn-amdhsa/libspirv/workitem/get_max_sub_group_size.cl +++ b/libclc/amdgcn-amdhsa/libspirv/workitem/get_max_sub_group_size.cl @@ -17,7 +17,6 @@ extern constant unsigned char __oclc_wavefrontsize64; _CLC_DEF _CLC_OVERLOAD uint __spirv_SubgroupMaxSize() { if (__oclc_wavefrontsize64 == 1) { return 64; - } else { - return 32; } + return 32; }