Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions libclc/amdgcn-amdhsa/libspirv/group/collectives.cl
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ __clc__get_group_scratch_double() __asm("__clc__get_group_scratch_double");
_CLC_DECL TYPE _Z28__spirv_SubgroupShuffleINTELI##TYPE_MANGLED##ET_S0_j( \
TYPE, int); \
_CLC_DECL TYPE \
_Z30__spirv_SubgroupShuffleUpINTELI##TYPE_MANGLED##ET_S0_S0_j(TYPE, \
int);
_Z30__spirv_SubgroupShuffleUpINTELI##TYPE_MANGLED##ET_S0_S0_j( \
TYPE, TYPE, unsigned int);

__CLC_DECLARE_SHUFFLES(char, a);
__CLC_DECLARE_SHUFFLES(unsigned char, h);
Expand Down Expand Up @@ -72,7 +72,8 @@ __CLC_DECLARE_SHUFFLES(double, d);
/* Can't use XOR/butterfly shuffles; some lanes may be inactive */ \
for (int o = 1; o < __spirv_SubgroupMaxSize(); o *= 2) { \
TYPE contribution = \
_Z28__spirv_SubgroupShuffleINTELI##TYPE_MANGLED##ET_S0_j(x, o); \
_Z30__spirv_SubgroupShuffleUpINTELI##TYPE_MANGLED##ET_S0_S0_j(x, x, \
o); \
bool inactive = (sg_lid < o); \
contribution = (inactive) ? IDENTITY : contribution; \
x = OP(x, contribution); \
Expand All @@ -90,8 +91,8 @@ __CLC_DECLARE_SHUFFLES(double, d);
} /* For ExclusiveScan, shift and prepend identity */ \
else if (op == ExclusiveScan) { \
*carry = x; \
result = \
_Z30__spirv_SubgroupShuffleUpINTELI##TYPE_MANGLED##ET_S0_S0_j(x, 1); \
result = _Z30__spirv_SubgroupShuffleUpINTELI##TYPE_MANGLED##ET_S0_S0_j( \
x, x, 1); \
if (sg_lid == 0) { \
result = IDENTITY; \
} \
Expand Down
110 changes: 71 additions & 39 deletions libclc/amdgcn-amdhsa/libspirv/misc/sub_group_shuffle.cl
Original file line number Diff line number Diff line change
Expand Up @@ -256,12 +256,26 @@ __AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(double16, d, 16)
// Shuffle Up
// int __spirv_SubgroupShuffleUpINTEL<int>(int, int, unsigned int)
_CLC_DEF int
_Z30__spirv_SubgroupShuffleUpINTELIiET_S0_S0_j(int var, int lane_delta,
unsigned int width) {
_Z30__spirv_SubgroupShuffleUpINTELIiET_S0_S0_j(int previous, int current,
unsigned int delta) {
int self = SELF;
int index = self - lane_delta;
index = (index < (self & ~(width - 1))) ? index : self;
return __builtin_amdgcn_ds_bpermute(index << 2, var);
int size = SUBGROUP_SIZE;

int index = self - delta;

int val;
if (index >= 0 && index < size) {
val = current;
} else if (index < 0 && index > -size) {
val = previous;
index = index + size;
} else {
// index out of bounds so return arbitrary data
val = current;
index = self;
}
Comment on lines +266 to +276
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks like a correct implementation of ShuffleUpINTEL to me, but I'm concerned about the extra complexity it introduces. We know that previous and current are always the same variable (if this intrinsic is called from the SYCL headers).

The SPIR-V non-uniform shuffle instructions (e.g. OpGroupNonUniformShuffleUp) are much closer semantically to the functionality required by SYCL, and this PR has me wondering whether it would be a good idea to switch over to those. It might be worth opening an issue or something to track this -- somebody would have to run some experiments to see which backends actually support these instructions, etc.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed, we should definitely investigate the performance of this at some point.

Also for what it's worth the CUDA backend implements this directly in the header using NVPTX built-ins rather than going through the SPIR-V interface. So that mostly would affect the HIP backend and maybe others as well.

I've filed a ticket to keep track of this:


return __builtin_amdgcn_ds_bpermute(index << 2, val);
}

// Sub 32-bit types.
Expand All @@ -272,9 +286,9 @@ _Z30__spirv_SubgroupShuffleUpINTELIiET_S0_S0_j(int var, int lane_delta,
#define __AMDGCN_CLC_SUBGROUP_UP_SUB_I32(TYPE, MANGLED_TYPE_NAME) \
_CLC_DEF TYPE \
_Z30__spirv_SubgroupShuffleUpINTELI##MANGLED_TYPE_NAME##ET_S0_S0_j( \
TYPE var, TYPE lane_delta, unsigned int width) { \
return _Z30__spirv_SubgroupShuffleUpINTELIiET_S0_S0_j(var, lane_delta, \
width); \
TYPE previous, TYPE current, unsigned int delta) { \
return _Z30__spirv_SubgroupShuffleUpINTELIiET_S0_S0_j(previous, current, \
delta); \
}
__AMDGCN_CLC_SUBGROUP_UP_SUB_I32(char, a);
__AMDGCN_CLC_SUBGROUP_UP_SUB_I32(unsigned char, h);
Expand All @@ -288,9 +302,9 @@ __AMDGCN_CLC_SUBGROUP_UP_SUB_I32(unsigned short, t);
#define __AMDGCN_CLC_SUBGROUP_UP_I32(TYPE, CAST_TYPE, MANGLED_TYPE_NAME) \
_CLC_DEF TYPE \
_Z30__spirv_SubgroupShuffleUpINTELI##MANGLED_TYPE_NAME##ET_S0_S0_j( \
TYPE var, TYPE lane_delta, unsigned int width) { \
TYPE previous, TYPE current, unsigned int delta) { \
return __builtin_astype(_Z30__spirv_SubgroupShuffleUpINTELIiET_S0_S0_j( \
as_int(var), as_int(lane_delta), width), \
as_int(previous), as_int(current), delta), \
CAST_TYPE); \
}
__AMDGCN_CLC_SUBGROUP_UP_I32(unsigned int, uint, j);
Expand All @@ -304,13 +318,15 @@ __AMDGCN_CLC_SUBGROUP_UP_I32(float, float, f);
#define __AMDGCN_CLC_SUBGROUP_UP_I64(TYPE, CAST_TYPE, MANGLED_TYPE_NAME) \
_CLC_DEF TYPE \
_Z30__spirv_SubgroupShuffleUpINTELI##MANGLED_TYPE_NAME##ET_S0_S0_j( \
TYPE var, TYPE lane_delta, unsigned int width) { \
int2 tmp = as_int2(var); \
tmp.lo = _Z30__spirv_SubgroupShuffleUpINTELIiET_S0_S0_j( \
tmp.lo, (int)lane_delta, width); \
tmp.hi = _Z30__spirv_SubgroupShuffleUpINTELIiET_S0_S0_j( \
tmp.hi, (int)lane_delta, width); \
return __builtin_astype(tmp, CAST_TYPE); \
TYPE previous, TYPE current, unsigned int delta) { \
int2 tmp_previous = as_int2(previous); \
int2 tmp_current = as_int2(current); \
int2 ret; \
ret.lo = _Z30__spirv_SubgroupShuffleUpINTELIiET_S0_S0_j( \
tmp_previous.lo, tmp_current.lo, delta); \
ret.hi = _Z30__spirv_SubgroupShuffleUpINTELIiET_S0_S0_j( \
tmp_previous.hi, tmp_current.hi, delta); \
return __builtin_astype(ret, CAST_TYPE); \
}
__AMDGCN_CLC_SUBGROUP_UP_I64(long, long, l);
__AMDGCN_CLC_SUBGROUP_UP_I64(unsigned long, ulong, m);
Expand All @@ -321,12 +337,12 @@ __AMDGCN_CLC_SUBGROUP_UP_I64(double, double, d);
#define __AMDGCN_CLC_SUBGROUP_UP_TO_VEC(TYPE, MANGLED_SCALAR_TY, NUM_ELEMS) \
_CLC_DEF TYPE \
_Z30__spirv_SubgroupShuffleUpINTELIDv##NUM_ELEMS##_##MANGLED_SCALAR_TY##ET_S1_S1_j( \
TYPE var, TYPE lane_delta, unsigned int width) { \
TYPE previous, TYPE current, unsigned int delta) { \
TYPE res; \
for (int i = 0; i < NUM_ELEMS; ++i) { \
res[i] = \
_Z30__spirv_SubgroupShuffleUpINTELI##MANGLED_SCALAR_TY##ET_S0_S0_j( \
var[i], (unsigned int)lane_delta[0], width); \
previous[i], current[i], delta); \
} \
return res; \
}
Expand Down Expand Up @@ -381,12 +397,26 @@ __AMDGCN_CLC_SUBGROUP_UP_TO_VEC(double16, d, 16)
// Shuffle Down
// int __spirv_SubgroupShuffleDownINTEL<int>(int, int, unsigned int)
_CLC_DEF int
_Z32__spirv_SubgroupShuffleDownINTELIiET_S0_S0_j(int var, int lane_delta,
unsigned int width) {
unsigned int self = SELF;
unsigned int index = self + lane_delta;
index = as_uint(((self & (width - 1)) + lane_delta)) >= width ? self : index;
return __builtin_amdgcn_ds_bpermute(index << 2, var);
_Z32__spirv_SubgroupShuffleDownINTELIiET_S0_S0_j(int current, int next,
unsigned int delta) {
int self = SELF;
int size = SUBGROUP_SIZE;

int index = self + delta;

int val;
if (index < size) {
val = current;
} else if (index < 2 * size) {
val = next;
index = index - size;
} else {
// index out of bounds so return arbitrary data
val = current;
index = self;
}

return __builtin_amdgcn_ds_bpermute(index << 2, val);
}

// Sub 32-bit types.
Expand All @@ -397,9 +427,9 @@ _Z32__spirv_SubgroupShuffleDownINTELIiET_S0_S0_j(int var, int lane_delta,
#define __AMDGCN_CLC_SUBGROUP_DOWN_TO_I32(TYPE, MANGLED_TYPE_NAME) \
_CLC_DEF TYPE \
_Z32__spirv_SubgroupShuffleDownINTELI##MANGLED_TYPE_NAME##ET_S0_S0_j( \
TYPE var, TYPE lane_delta, unsigned int width) { \
return _Z32__spirv_SubgroupShuffleDownINTELIiET_S0_S0_j(var, lane_delta, \
width); \
TYPE current, TYPE next, unsigned int delta) { \
return _Z32__spirv_SubgroupShuffleDownINTELIiET_S0_S0_j(current, next, \
delta); \
}
__AMDGCN_CLC_SUBGROUP_DOWN_TO_I32(char, a);
__AMDGCN_CLC_SUBGROUP_DOWN_TO_I32(unsigned char, h);
Expand All @@ -413,9 +443,9 @@ __AMDGCN_CLC_SUBGROUP_DOWN_TO_I32(unsigned short, t);
#define __AMDGCN_CLC_SUBGROUP_DOWN_I32(TYPE, CAST_TYPE, MANGLED_TYPE_NAME) \
_CLC_DEF TYPE \
_Z32__spirv_SubgroupShuffleDownINTELI##MANGLED_TYPE_NAME##ET_S0_S0_j( \
TYPE var, TYPE lane_delta, unsigned int width) { \
TYPE current, TYPE next, unsigned int delta) { \
return __builtin_astype(_Z32__spirv_SubgroupShuffleDownINTELIiET_S0_S0_j( \
as_int(var), as_int(lane_delta), width), \
as_int(current), as_int(next), delta), \
CAST_TYPE); \
}
__AMDGCN_CLC_SUBGROUP_DOWN_I32(unsigned int, uint, j);
Expand All @@ -427,13 +457,15 @@ __AMDGCN_CLC_SUBGROUP_DOWN_I32(float, float, f);
#define __AMDGCN_CLC_SUBGROUP_DOWN_I64(TYPE, CAST_TYPE, MANGLED_TYPE_NAME) \
_CLC_DEF TYPE \
_Z32__spirv_SubgroupShuffleDownINTELI##MANGLED_TYPE_NAME##ET_S0_S0_j( \
TYPE var, TYPE lane_delta, unsigned int width) { \
int2 tmp = as_int2(var); \
tmp.lo = _Z32__spirv_SubgroupShuffleDownINTELIiET_S0_S0_j( \
tmp.lo, (int)lane_delta, width); \
tmp.hi = _Z32__spirv_SubgroupShuffleDownINTELIiET_S0_S0_j( \
tmp.hi, (int)lane_delta, width); \
return __builtin_astype(tmp, CAST_TYPE); \
TYPE current, TYPE next, unsigned int delta) { \
int2 tmp_current = as_int2(current); \
int2 tmp_next = as_int2(next); \
int2 ret; \
ret.lo = _Z32__spirv_SubgroupShuffleDownINTELIiET_S0_S0_j( \
tmp_current.lo, tmp_next.lo, delta); \
ret.hi = _Z32__spirv_SubgroupShuffleDownINTELIiET_S0_S0_j( \
tmp_current.hi, tmp_next.hi, delta); \
return __builtin_astype(ret, CAST_TYPE); \
}
__AMDGCN_CLC_SUBGROUP_DOWN_I64(long, long, l);
__AMDGCN_CLC_SUBGROUP_DOWN_I64(unsigned long, ulong, m);
Expand All @@ -444,12 +476,12 @@ __AMDGCN_CLC_SUBGROUP_DOWN_I64(double, double, d);
#define __AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(TYPE, MANGLED_SCALAR_TY, NUM_ELEMS) \
_CLC_DEF TYPE \
_Z32__spirv_SubgroupShuffleDownINTELIDv##NUM_ELEMS##_##MANGLED_SCALAR_TY##ET_S1_S1_j( \
TYPE var, TYPE lane_delta, unsigned int width) { \
TYPE current, TYPE next, unsigned int delta) { \
TYPE res; \
for (int i = 0; i < NUM_ELEMS; ++i) { \
res[i] = \
_Z32__spirv_SubgroupShuffleDownINTELI##MANGLED_SCALAR_TY##ET_S0_S0_j( \
var[i], (unsigned int)lane_delta[0], width); \
current[i], next[i], delta); \
} \
return res; \
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,15 @@

#include <spirv/spirv.h>

// FIXME: Remove the following workaround once the clang change is released.
// This is for backward compatibility with older clang which does not define
// __AMDGCN_WAVEFRONT_SIZE. It does not consider -mwavefrontsize64.
// See:
// https://github.com/intel/llvm/blob/sycl/clang/lib/Basic/Targets/AMDGPU.h#L414
// and:
// https://github.com/intel/llvm/blob/sycl/clang/lib/Basic/Targets/AMDGPU.cpp#L421
#ifndef __AMDGCN_WAVEFRONT_SIZE
#if __gfx1010__ || __gfx1011__ || __gfx1012__ || __gfx1030__ || __gfx1031__
#define __AMDGCN_WAVEFRONT_SIZE 32
#else
#define __AMDGCN_WAVEFRONT_SIZE 64
#endif
#endif
// This variable is defined by ROCm bitcode that the clang driver links in,
// with its value chosen according to the target architecture and compile
// flags. If it is 1 the wavefront size in use is 64; if it is 0 the
// wavefront size in use is 32.
extern constant unsigned char __oclc_wavefrontsize64;

_CLC_DEF _CLC_OVERLOAD uint __spirv_SubgroupMaxSize() {
return __AMDGCN_WAVEFRONT_SIZE;
if (__oclc_wavefrontsize64 == 1) {
return 64;
}
return 32;
}
45 changes: 20 additions & 25 deletions sycl/include/CL/sycl/detail/spirv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -515,24 +515,22 @@ EnableIfNativeShuffle<T> SubgroupShuffleXor(T x, id<1> local_id) {
}

template <typename T>
EnableIfNativeShuffle<T> SubgroupShuffleDown(T x, id<1> local_id) {
EnableIfNativeShuffle<T> SubgroupShuffleDown(T x, uint32_t delta) {
#ifndef __NVPTX__
using OCLT = detail::ConvertToOpenCLType_t<T>;
return __spirv_SubgroupShuffleDownINTEL(
OCLT(x), OCLT(x), static_cast<uint32_t>(local_id.get(0)));
return __spirv_SubgroupShuffleDownINTEL(OCLT(x), OCLT(x), delta);
#else
return __nvvm_shfl_sync_down_i32(membermask(), x, local_id.get(0), 0x1f);
return __nvvm_shfl_sync_down_i32(membermask(), x, delta, 0x1f);
#endif
}

template <typename T>
EnableIfNativeShuffle<T> SubgroupShuffleUp(T x, id<1> local_id) {
EnableIfNativeShuffle<T> SubgroupShuffleUp(T x, uint32_t delta) {
#ifndef __NVPTX__
using OCLT = detail::ConvertToOpenCLType_t<T>;
return __spirv_SubgroupShuffleUpINTEL(OCLT(x), OCLT(x),
static_cast<uint32_t>(local_id.get(0)));
return __spirv_SubgroupShuffleUpINTEL(OCLT(x), OCLT(x), delta);
#else
return __nvvm_shfl_sync_up_i32(membermask(), x, local_id.get(0), 0);
return __nvvm_shfl_sync_up_i32(membermask(), x, delta, 0);
#endif
}

Expand All @@ -556,19 +554,19 @@ EnableIfVectorShuffle<T> SubgroupShuffleXor(T x, id<1> local_id) {
}

template <typename T>
EnableIfVectorShuffle<T> SubgroupShuffleDown(T x, id<1> local_id) {
EnableIfVectorShuffle<T> SubgroupShuffleDown(T x, uint32_t delta) {
T result;
for (int s = 0; s < x.get_size(); ++s) {
result[s] = SubgroupShuffleDown(x[s], local_id);
result[s] = SubgroupShuffleDown(x[s], delta);
}
return result;
}

template <typename T>
EnableIfVectorShuffle<T> SubgroupShuffleUp(T x, id<1> local_id) {
EnableIfVectorShuffle<T> SubgroupShuffleUp(T x, uint32_t delta) {
T result;
for (int s = 0; s < x.get_size(); ++s) {
result[s] = SubgroupShuffleUp(x[s], local_id);
result[s] = SubgroupShuffleUp(x[s], delta);
}
return result;
}
Expand Down Expand Up @@ -626,29 +624,26 @@ EnableIfBitcastShuffle<T> SubgroupShuffleXor(T x, id<1> local_id) {
}

template <typename T>
EnableIfBitcastShuffle<T> SubgroupShuffleDown(T x, id<1> local_id) {
EnableIfBitcastShuffle<T> SubgroupShuffleDown(T x, uint32_t delta) {
using ShuffleT = ConvertToNativeShuffleType_t<T>;
auto ShuffleX = bit_cast<ShuffleT>(x);
#ifndef __NVPTX__
ShuffleT Result = __spirv_SubgroupShuffleDownINTEL(
ShuffleX, ShuffleX, static_cast<uint32_t>(local_id.get(0)));
ShuffleT Result = __spirv_SubgroupShuffleDownINTEL(ShuffleX, ShuffleX, delta);
#else
ShuffleT Result =
__nvvm_shfl_sync_down_i32(membermask(), ShuffleX, local_id.get(0), 0x1f);
__nvvm_shfl_sync_down_i32(membermask(), ShuffleX, delta, 0x1f);
#endif
return bit_cast<T>(Result);
}

template <typename T>
EnableIfBitcastShuffle<T> SubgroupShuffleUp(T x, id<1> local_id) {
EnableIfBitcastShuffle<T> SubgroupShuffleUp(T x, uint32_t delta) {
using ShuffleT = ConvertToNativeShuffleType_t<T>;
auto ShuffleX = bit_cast<ShuffleT>(x);
#ifndef __NVPTX__
ShuffleT Result = __spirv_SubgroupShuffleUpINTEL(
ShuffleX, ShuffleX, static_cast<uint32_t>(local_id.get(0)));
ShuffleT Result = __spirv_SubgroupShuffleUpINTEL(ShuffleX, ShuffleX, delta);
#else
ShuffleT Result =
__nvvm_shfl_sync_up_i32(membermask(), ShuffleX, local_id.get(0), 0);
ShuffleT Result = __nvvm_shfl_sync_up_i32(membermask(), ShuffleX, delta, 0);
#endif
return bit_cast<T>(Result);
}
Expand Down Expand Up @@ -706,29 +701,29 @@ EnableIfGenericShuffle<T> SubgroupShuffleXor(T x, id<1> local_id) {
}

template <typename T>
EnableIfGenericShuffle<T> SubgroupShuffleDown(T x, id<1> local_id) {
EnableIfGenericShuffle<T> SubgroupShuffleDown(T x, uint32_t delta) {
T Result;
char *XBytes = reinterpret_cast<char *>(&x);
char *ResultBytes = reinterpret_cast<char *>(&Result);
auto ShuffleBytes = [=](size_t Offset, size_t Size) {
ShuffleChunkT ShuffleX, ShuffleResult;
std::memcpy(&ShuffleX, XBytes + Offset, Size);
ShuffleResult = SubgroupShuffleDown(ShuffleX, local_id);
ShuffleResult = SubgroupShuffleDown(ShuffleX, delta);
std::memcpy(ResultBytes + Offset, &ShuffleResult, Size);
};
GenericCall<T>(ShuffleBytes);
return Result;
}

template <typename T>
EnableIfGenericShuffle<T> SubgroupShuffleUp(T x, id<1> local_id) {
EnableIfGenericShuffle<T> SubgroupShuffleUp(T x, uint32_t delta) {
T Result;
char *XBytes = reinterpret_cast<char *>(&x);
char *ResultBytes = reinterpret_cast<char *>(&Result);
auto ShuffleBytes = [=](size_t Offset, size_t Size) {
ShuffleChunkT ShuffleX, ShuffleResult;
std::memcpy(&ShuffleX, XBytes + Offset, Size);
ShuffleResult = SubgroupShuffleUp(ShuffleX, local_id);
ShuffleResult = SubgroupShuffleUp(ShuffleX, delta);
std::memcpy(ResultBytes + Offset, &ShuffleResult, Size);
};
GenericCall<T>(ShuffleBytes);
Expand Down