Adds SYCL/GroupLocalMemory/no_early_opt.cpp: a lit test that is built with
-fno-sycl-early-optimizations and asserts, for every work-item, that two
calls to group_local_memory_for_overwrite made from the same kernel hand
back different pointers.

NOTE(review): the header names in the #include lines and all template
argument lists in the hunk below were absent from the flattened text this
patch was recovered from. They have been re-derived from how the code uses
them (KernelA is otherwise unused, the buffers store what multi_ptr converts
to, assert/std::vector need their headers) -- please confirm against the
upstream commit before applying.

diff --git a/SYCL/GroupLocalMemory/no_early_opt.cpp b/SYCL/GroupLocalMemory/no_early_opt.cpp
new file mode 100644
index 0000000000..00680e7c23
--- /dev/null
+++ b/SYCL/GroupLocalMemory/no_early_opt.cpp
@@ -0,0 +1,53 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fno-sycl-early-optimizations %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// The test checks that multiple calls to the same template instantiation of a
+// group local memory function result in separate allocations, even with device
+// code optimizations disabled (the implementation relies on inlining these
+// functions regardless of device code optimization settings).
+
+#include <CL/sycl.hpp>
+
+#include <cassert>
+#include <vector>
+
+constexpr size_t WgSize = 32;
+constexpr size_t WgCount = 4;
+constexpr size_t Size = WgSize * WgCount;
+
+class KernelA;
+
+using namespace sycl;
+
+int main() {
+  queue Q;
+  {
+    std::vector<int *> VecA(Size, 0);
+    std::vector<int *> VecB(Size, 0);
+    buffer<int *, 1> BufA{VecA.data(), range<1>(Size)};
+    buffer<int *, 1> BufB{VecB.data(), range<1>(Size)};
+
+    Q.submit([&](handler &Cgh) {
+      auto AccA = BufA.get_access<access::mode::read_write>(Cgh);
+      auto AccB = BufB.get_access<access::mode::read_write>(Cgh);
+      Cgh.parallel_for<KernelA>(
+          nd_range<1>(range<1>(Size), range<1>(WgSize)), [=](nd_item<1> Item) {
+            multi_ptr<int, access::address_space::local_space> PtrA =
+                group_local_memory_for_overwrite<int>(Item.get_group());
+            multi_ptr<int, access::address_space::local_space> PtrB =
+                group_local_memory_for_overwrite<int>(Item.get_group());
+
+            size_t GlobalId = Item.get_global_linear_id();
+            AccA[GlobalId] = PtrA;
+            AccB[GlobalId] = PtrB;
+          });
+    });
+
+    auto AccA = BufA.get_access<access::mode::read>();
+    auto AccB = BufB.get_access<access::mode::read>();
+    for (size_t I = 0; I < Size; ++I)
+      assert(AccA[I] != AccB[I]);
+  }
+}