100 changes: 100 additions & 0 deletions sycl/include/sycl/detail/spirv.hpp
@@ -27,6 +27,8 @@ struct sub_group;
namespace experimental {
template <typename ParentGroup> class ballot_group;
template <size_t PartitionSize, typename ParentGroup> class fixed_size_group;
template <typename ParentGroup> class tangle_group;
class opportunistic_group;
} // namespace experimental
} // namespace oneapi
} // namespace ext
@@ -72,6 +74,16 @@ struct group_scope<sycl::ext::oneapi::experimental::fixed_size_group<
static constexpr __spv::Scope::Flag value = group_scope<ParentGroup>::value;
};

template <typename ParentGroup>
struct group_scope<sycl::ext::oneapi::experimental::tangle_group<ParentGroup>> {
static constexpr __spv::Scope::Flag value = group_scope<ParentGroup>::value;
};

template <>
struct group_scope<::sycl::ext::oneapi::experimental::opportunistic_group> {
static constexpr __spv::Scope::Flag value = __spv::Scope::Flag::Subgroup;
};

// Generic shuffles and broadcasts may require multiple calls to
// intrinsics, and should use the fewest broadcasts possible
// - Loop over chunks until remaining bytes < chunk size
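As an illustration of the chunking strategy this comment describes, here is a minimal sketch. `BroadcastChunk` is an assumed per-chunk helper standing in for the underlying intrinsic, not a real SPIR-V call:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>

// Sketch: broadcast an arbitrary-size value as 8-byte chunks, then mop up
// the tail one byte at a time, minimizing the number of intrinsic calls.
template <typename Group>
void GenericBroadcastSketch(Group g, const char *Src, char *Dst, size_t Bytes,
                            uint32_t Root) {
  size_t Offset = 0;
  while (Bytes - Offset >= sizeof(uint64_t)) { // full 8-byte chunks
    uint64_t Chunk;
    std::memcpy(&Chunk, Src + Offset, sizeof(Chunk));
    Chunk = BroadcastChunk(g, Chunk, Root); // assumed helper
    std::memcpy(Dst + Offset, &Chunk, sizeof(Chunk));
    Offset += sizeof(Chunk);
  }
  while (Offset < Bytes) { // remaining bytes < chunk size
    uint8_t Byte = static_cast<uint8_t>(Src[Offset]);
    Byte = BroadcastChunk(g, Byte, Root); // assumed helper
    Dst[Offset] = static_cast<char>(Byte);
    ++Offset;
  }
}
```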
@@ -135,6 +147,16 @@ bool GroupAll(
static_cast<uint32_t>(__spv::GroupOperation::ClusteredReduce),
static_cast<uint32_t>(pred), PartitionSize);
}
template <typename ParentGroup>
bool GroupAll(ext::oneapi::experimental::tangle_group<ParentGroup>, bool pred) {
return __spirv_GroupNonUniformAll(group_scope<ParentGroup>::value, pred);
}
bool GroupAll(const ext::oneapi::experimental::opportunistic_group &,
              bool pred) {
return __spirv_GroupNonUniformAll(
group_scope<ext::oneapi::experimental::opportunistic_group>::value, pred);
}

template <typename Group> bool GroupAny(Group, bool pred) {
return __spirv_GroupAny(group_scope<Group>::value, pred);
@@ -161,6 +183,15 @@ bool GroupAny(
static_cast<uint32_t>(__spv::GroupOperation::ClusteredReduce),
static_cast<uint32_t>(pred), PartitionSize);
}
template <typename ParentGroup>
bool GroupAny(ext::oneapi::experimental::tangle_group<ParentGroup>, bool pred) {
return __spirv_GroupNonUniformAny(group_scope<ParentGroup>::value, pred);
}
bool GroupAny(const ext::oneapi::experimental::opportunistic_group &,
bool pred) {
return __spirv_GroupNonUniformAny(
group_scope<ext::oneapi::experimental::opportunistic_group>::value, pred);
}
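For context, a hypothetical kernel fragment showing the user-facing calls that resolve to these overloads. It assumes `syclex` aliases `sycl::ext::oneapi::experimental` and `item` is the enclosing `nd_item`:

```cpp
// Inside a kernel body (sketch):
auto Tangle = syclex::get_tangle_group(item.get_sub_group());
auto OG = syclex::this_kernel::get_opportunistic_group();
bool Pred = (item.get_global_linear_id() % 2 == 0);
bool AllSet = sycl::all_of_group(Tangle, Pred); // dispatches to GroupAll above
bool AnySet = sycl::any_of_group(OG, Pred);     // dispatches to GroupAny above
```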

// Native broadcasts map directly to a SPIR-V GroupBroadcast intrinsic
// FIXME: Do not special-case for half or vec once all backends support all data
@@ -281,6 +312,45 @@ EnableIfNativeBroadcast<T, IdT> GroupBroadcast(
return __spirv_GroupNonUniformShuffle(group_scope<ParentGroup>::value, OCLX,
OCLId);
}
template <typename ParentGroup, typename T, typename IdT>
EnableIfNativeBroadcast<T, IdT>
GroupBroadcast(ext::oneapi::experimental::tangle_group<ParentGroup> g, T x,
IdT local_id) {
// Remap local_id to its original numbering in ParentGroup.
auto LocalId = detail::IdToMaskPosition(g, local_id);

// TODO: Refactor to avoid duplication after design settles.
using GroupIdT = typename GroupId<ParentGroup>::type;
GroupIdT GroupLocalId = static_cast<GroupIdT>(LocalId);
using OCLT = detail::ConvertToOpenCLType_t<T>;
using WidenedT = WidenOpenCLTypeTo32_t<OCLT>;
using OCLIdT = detail::ConvertToOpenCLType_t<GroupIdT>;
WidenedT OCLX = detail::convertDataToType<T, OCLT>(x);
OCLIdT OCLId = detail::convertDataToType<GroupIdT, OCLIdT>(GroupLocalId);

return __spirv_GroupNonUniformBroadcast(group_scope<ParentGroup>::value, OCLX,
OCLId);
}
template <typename T, typename IdT>
EnableIfNativeBroadcast<T, IdT>
GroupBroadcast(const ext::oneapi::experimental::opportunistic_group &g, T x,
IdT local_id) {
// Remap local_id to its original numbering in sub-group
auto LocalId = detail::IdToMaskPosition(g, local_id);

// TODO: Refactor to avoid duplication after design settles.
using GroupIdT = typename GroupId<sycl::ext::oneapi::sub_group>::type;
GroupIdT GroupLocalId = static_cast<GroupIdT>(LocalId);
using OCLT = detail::ConvertToOpenCLType_t<T>;
using WidenedT = WidenOpenCLTypeTo32_t<OCLT>;
using OCLIdT = detail::ConvertToOpenCLType_t<GroupIdT>;
WidenedT OCLX = detail::convertDataToType<T, OCLT>(x);
OCLIdT OCLId = detail::convertDataToType<GroupIdT, OCLIdT>(GroupLocalId);

return __spirv_GroupNonUniformBroadcast(
group_scope<ext::oneapi::experimental::opportunistic_group>::value, OCLX,
OCLId);
}
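The `IdToMaskPosition` remapping used in both overloads conceptually returns the sub-group lane holding the `local_id`-th member of the group, i.e. the position of the `local_id`-th set bit in the membership mask. A standalone sketch of that idea (the real helper lives in SYCL's detail headers and operates on `sub_group_mask`):

```cpp
#include <cstdint>

// Sketch: map a group-local index to its lane in a 32-bit membership mask
// (assumes sub-groups of at most 32 lanes).
inline uint32_t IdToMaskPositionSketch(uint32_t MaskBits, uint32_t Id) {
  uint32_t Seen = 0;
  for (uint32_t Lane = 0; Lane < 32; ++Lane) {
    if (MaskBits & (1u << Lane)) {
      if (Seen == Id)
        return Lane; // lane holding the Id-th member
      ++Seen;
    }
  }
  return 32; // Id exceeds the group's population count
}
```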

template <typename Group, typename T, typename IdT>
EnableIfBitcastBroadcast<T, IdT> GroupBroadcast(Group g, T x, IdT local_id) {
@@ -956,6 +1026,18 @@ ControlBarrier(Group, memory_scope FenceScope, memory_order Order) {
#endif
}

template <typename Group>
struct is_tangle_or_opportunistic_group : std::false_type {};

template <typename ParentGroup>
struct is_tangle_or_opportunistic_group<
sycl::ext::oneapi::experimental::tangle_group<ParentGroup>>
: std::true_type {};

template <>
struct is_tangle_or_opportunistic_group<
sycl::ext::oneapi::experimental::opportunistic_group> : std::true_type {};
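This trait gates the collective overloads added to the macro below. For illustration, how it evaluates, assuming `tangle_group` instantiated over `sycl::sub_group`:

```cpp
// Illustration only:
static_assert(is_tangle_or_opportunistic_group<
              sycl::ext::oneapi::experimental::opportunistic_group>::value);
static_assert(is_tangle_or_opportunistic_group<
              sycl::ext::oneapi::experimental::tangle_group<
                  sycl::sub_group>>::value);
static_assert(!is_tangle_or_opportunistic_group<sycl::sub_group>::value);
```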

// TODO: Refactor to avoid duplication after design settles
#define __SYCL_GROUP_COLLECTIVE_OVERLOAD(Instruction) \
template <__spv::GroupOperation Op, typename Group, typename T> \
@@ -1037,6 +1119,24 @@ ControlBarrier(Group, memory_scope FenceScope, memory_order Order) {
} \
return tmp; \
} \
} \
template <__spv::GroupOperation Op, typename Group, typename T> \
inline typename std::enable_if_t< \
is_tangle_or_opportunistic_group<Group>::value, T> \
Group##Instruction(Group, T x) { \
using ConvertedT = detail::ConvertToOpenCLType_t<T>; \
\
using OCLT = std::conditional_t< \
std::is_same<ConvertedT, cl_char>() || \
std::is_same<ConvertedT, cl_short>(), \
cl_int, \
std::conditional_t<std::is_same<ConvertedT, cl_uchar>() || \
std::is_same<ConvertedT, cl_ushort>(), \
cl_uint, ConvertedT>>; \
OCLT Arg = x; \
OCLT Ret = __spirv_GroupNonUniform##Instruction( \
group_scope<Group>::value, static_cast<unsigned int>(Op), Arg); \
return Ret; \
}
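For reference, this is roughly what the new arm expands to for the `__SYCL_GROUP_COLLECTIVE_OVERLOAD(SMin)` invocation below (a sketch of the preprocessor output, not additional source):

```cpp
template <__spv::GroupOperation Op, typename Group, typename T>
inline typename std::enable_if_t<
    is_tangle_or_opportunistic_group<Group>::value, T>
GroupSMin(Group, T x) {
  using ConvertedT = detail::ConvertToOpenCLType_t<T>;
  // 8- and 16-bit integers are widened to 32 bits before the intrinsic call.
  using OCLT = std::conditional_t<
      std::is_same<ConvertedT, cl_char>() ||
          std::is_same<ConvertedT, cl_short>(),
      cl_int,
      std::conditional_t<std::is_same<ConvertedT, cl_uchar>() ||
                             std::is_same<ConvertedT, cl_ushort>(),
                         cl_uint, ConvertedT>>;
  OCLT Arg = x;
  OCLT Ret = __spirv_GroupNonUniformSMin(
      group_scope<Group>::value, static_cast<unsigned int>(Op), Arg);
  return Ret;
}
```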

__SYCL_GROUP_COLLECTIVE_OVERLOAD(SMin)
@@ -64,6 +64,8 @@ namespace ext::oneapi::experimental {
// Forward declarations of non-uniform group types for algorithm definitions
template <typename ParentGroup> class ballot_group;
template <size_t PartitionSize, typename ParentGroup> class fixed_size_group;
template <typename ParentGroup> class tangle_group;
class opportunistic_group;

} // namespace ext::oneapi::experimental

sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp
@@ -111,13 +111,16 @@ class opportunistic_group {
#endif
}

-private:
+protected:
  sub_group_mask Mask;

-protected:
  opportunistic_group(sub_group_mask m) : Mask(m) {}

friend opportunistic_group this_kernel::get_opportunistic_group();

friend uint32_t
sycl::detail::IdToMaskPosition<opportunistic_group>(opportunistic_group Group,
uint32_t Id);
};

namespace this_kernel {
@@ -144,5 +147,10 @@ template <>
struct is_user_constructed_group<opportunistic_group> : std::true_type {};

} // namespace ext::oneapi::experimental

template <>
struct is_group<ext::oneapi::experimental::opportunistic_group>
: std::true_type {};

} // __SYCL_INLINE_VER_NAMESPACE(_V1)
} // namespace sycl
11 changes: 9 additions & 2 deletions sycl/include/sycl/ext/oneapi/experimental/tangle_group.hpp
@@ -112,13 +112,15 @@ template <typename ParentGroup> class tangle_group {
#endif
}

-private:
+protected:
  sub_group_mask Mask;

-protected:
  tangle_group(sub_group_mask m) : Mask(m) {}

friend tangle_group<ParentGroup> get_tangle_group<ParentGroup>(ParentGroup);

friend uint32_t sycl::detail::IdToMaskPosition<tangle_group<ParentGroup>>(
tangle_group<ParentGroup> Group, uint32_t Id);
};

template <typename Group>
@@ -149,5 +151,10 @@ template <typename ParentGroup>
struct is_user_constructed_group<tangle_group<ParentGroup>> : std::true_type {};

} // namespace ext::oneapi::experimental

template <typename ParentGroup>
struct is_group<ext::oneapi::experimental::tangle_group<ParentGroup>>
: std::true_type {};

} // __SYCL_INLINE_VER_NAMESPACE(_V1)
} // namespace sycl
131 changes: 131 additions & 0 deletions sycl/test-e2e/NonUniformGroups/opportunistic_group_algorithms.cpp
@@ -0,0 +1,131 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
//
// UNSUPPORTED: cpu || cuda || hip

#include <sycl/sycl.hpp>
#include <vector>
namespace syclex = sycl::ext::oneapi::experimental;

class TestKernel;

constexpr uint32_t SGSize = 32;
constexpr uint32_t ArbitraryItem = 5;

int main() {
sycl::queue Q;

auto SGSizes = Q.get_device().get_info<sycl::info::device::sub_group_sizes>();
if (std::find(SGSizes.begin(), SGSizes.end(), SGSize) == SGSizes.end()) {
std::cout << "Test skipped due to missing support for sub-group size 32."
<< std::endl;
return 0;
}

sycl::buffer<size_t, 1> TmpBuf{sycl::range{SGSize}};
sycl::buffer<bool, 1> BarrierBuf{sycl::range{SGSize}};
sycl::buffer<bool, 1> BroadcastBuf{sycl::range{SGSize}};
sycl::buffer<bool, 1> AnyBuf{sycl::range{SGSize}};
sycl::buffer<bool, 1> AllBuf{sycl::range{SGSize}};
sycl::buffer<bool, 1> NoneBuf{sycl::range{SGSize}};
sycl::buffer<bool, 1> ReduceBuf{sycl::range{SGSize}};
sycl::buffer<bool, 1> ExScanBuf{sycl::range{SGSize}};
sycl::buffer<bool, 1> IncScanBuf{sycl::range{SGSize}};

const auto NDR = sycl::nd_range<1>{SGSize, SGSize};
Q.submit([&](sycl::handler &CGH) {
sycl::accessor TmpAcc{TmpBuf, CGH, sycl::write_only};
sycl::accessor BarrierAcc{BarrierBuf, CGH, sycl::write_only};
sycl::accessor BroadcastAcc{BroadcastBuf, CGH, sycl::write_only};
sycl::accessor AnyAcc{AnyBuf, CGH, sycl::write_only};
sycl::accessor AllAcc{AllBuf, CGH, sycl::write_only};
sycl::accessor NoneAcc{NoneBuf, CGH, sycl::write_only};
sycl::accessor ReduceAcc{ReduceBuf, CGH, sycl::write_only};
sycl::accessor ExScanAcc{ExScanBuf, CGH, sycl::write_only};
sycl::accessor IncScanAcc{IncScanBuf, CGH, sycl::write_only};
const auto KernelFunc =
[=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(SGSize)]] {
auto WI = item.get_global_id();
auto SG = item.get_sub_group();

uint32_t OriginalLID = SG.get_local_linear_id();

// Given the dynamic nature of opportunistic groups, the simplest
// case we can reason about is a single work-item. This isn't a very
// robust test, but choosing an arbitrary work-item (i.e. rather
// than the leader) should test an implementation's ability to handle
// arbitrary group membership.
if (OriginalLID == ArbitraryItem) {
Review thread on the line above:

@JackAKirk (Contributor), Apr 27, 2023:
Since you only have a single work-item per group, is this going to properly test the group implementations for the Intel case? In the CUDA backend it wouldn't, for the reduce_over_group case. Also, in the CUDA implementation the reduce algorithm behaves differently depending on whether OpportunisticGroup.get_local_range() equals 2^n (where n is a positive, non-zero integer), does not equal a power of two, equals 1 (as in this test currently, the most trivial case), or equals 32 (a full warp), making four different cases in total.

But I could add these cases later if need be.

Contributor Author:
This isn't going to test every path, but I couldn't think of a good way to do that reliably. The semantics of opportunistic groups are (deliberately) really weird.

Even if we added a case where we picked a power of 2 (say, 8) work-items and had them all take the same branch, the specification doesn't require all 8 of those work-items to end up in the same opportunistic group. The specification only requires that all the work-items that encounter the constructor "together" (furious hand waving) form an opportunistic group. There's no way to query which work-items end up in which group, or how many groups are formed. A single work-item executing the branch was the only case I could think of with predictable, portable behavior.

Ideally, we'd probably want to somehow work out which work-items were split into which opportunistic groups, and then dynamically determine what the algorithm results should be given the partitioning that actually happened at runtime. But I couldn't think of a good way to do that. If we can figure out a good way to write that test, we should definitely add it. (A sketch of this dynamic-validation idea appears after the test, below.)

I agree that adding some backend-specific tests would be a good idea, too.
auto OpportunisticGroup =
syclex::this_kernel::get_opportunistic_group();

// This is trivial, but does test that group_barrier can be called.
TmpAcc[WI] = 1;
sycl::group_barrier(OpportunisticGroup);
size_t Visible = TmpAcc[WI];
BarrierAcc[WI] = (Visible == 1);

// Simple check of group algorithms.
uint32_t LID = OpportunisticGroup.get_local_linear_id();

uint32_t BroadcastResult =
sycl::group_broadcast(OpportunisticGroup, OriginalLID, 0);
BroadcastAcc[WI] = (BroadcastResult == OriginalLID);

bool AnyResult = sycl::any_of_group(OpportunisticGroup, (LID == 0));
AnyAcc[WI] = (AnyResult == true);

bool AllResult = sycl::all_of_group(OpportunisticGroup, (LID == 0));
AllAcc[WI] = (AllResult == true);

bool NoneResult =
sycl::none_of_group(OpportunisticGroup, (LID != 0));
NoneAcc[WI] = (NoneResult == true);

uint32_t ReduceResult =
sycl::reduce_over_group(OpportunisticGroup, 1, sycl::plus<>());
ReduceAcc[WI] =
(ReduceResult == OpportunisticGroup.get_local_linear_range());

uint32_t ExScanResult = sycl::exclusive_scan_over_group(
OpportunisticGroup, 1, sycl::plus<>());
ExScanAcc[WI] = (ExScanResult == LID);

uint32_t IncScanResult = sycl::inclusive_scan_over_group(
OpportunisticGroup, 1, sycl::plus<>());
IncScanAcc[WI] = (IncScanResult == LID + 1);
} else {
BarrierAcc[WI] = false;
BroadcastAcc[WI] = false;
AnyAcc[WI] = false;
AllAcc[WI] = false;
NoneAcc[WI] = false;
ReduceAcc[WI] = false;
ExScanAcc[WI] = false;
IncScanAcc[WI] = false;
}
};
CGH.parallel_for<TestKernel>(NDR, KernelFunc);
});

sycl::host_accessor BarrierAcc{BarrierBuf, sycl::read_only};
sycl::host_accessor BroadcastAcc{BroadcastBuf, sycl::read_only};
sycl::host_accessor AnyAcc{AnyBuf, sycl::read_only};
sycl::host_accessor AllAcc{AllBuf, sycl::read_only};
sycl::host_accessor NoneAcc{NoneBuf, sycl::read_only};
sycl::host_accessor ReduceAcc{ReduceBuf, sycl::read_only};
sycl::host_accessor ExScanAcc{ExScanBuf, sycl::read_only};
sycl::host_accessor IncScanAcc{IncScanBuf, sycl::read_only};
for (uint32_t WI = 0; WI < 32; ++WI) {
bool ExpectedResult = (WI == ArbitraryItem);
assert(BarrierAcc[WI] == ExpectedResult);
assert(BroadcastAcc[WI] == ExpectedResult);
assert(AnyAcc[WI] == ExpectedResult);
assert(AllAcc[WI] == ExpectedResult);
assert(NoneAcc[WI] == ExpectedResult);
assert(ReduceAcc[WI] == ExpectedResult);
assert(ExScanAcc[WI] == ExpectedResult);
assert(IncScanAcc[WI] == ExpectedResult);
}
return 0;
}
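Following up on the review thread above, a hypothetical sketch (not part of this PR) of a multi-work-item variant that validates against whatever partitioning actually occurred at runtime; the `OriginalLID % 4` subset is an arbitrary assumption:

```cpp
// Hypothetical kernel fragment: several work-items take the branch, and each
// group's reduction is checked against that group's own observed size, so
// the check holds for any partitioning the implementation chose to form.
if (OriginalLID % 4 == 0) {
  auto OG = syclex::this_kernel::get_opportunistic_group();
  uint32_t Sum = sycl::reduce_over_group(OG, 1u, sycl::plus<>());
  ReduceAcc[WI] = (Sum == OG.get_local_linear_range());
}
```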