From c811f3c748c0bd5d25f2bcf568c49f13facd258a Mon Sep 17 00:00:00 2001
From: Vyacheslav N Klochkov
Date: Fri, 23 Oct 2020 22:25:37 -0700
Subject: [PATCH] [SYCL] Implement queue::parallel_for() accepting reduction

Signed-off-by: Vyacheslav N Klochkov
---
 sycl/include/CL/sycl/queue.hpp                | 21 +++++++++
 .../reduction_queue_parallel_for.cpp          | 44 +++++++++++++++++++
 2 files changed, 65 insertions(+)
 create mode 100644 sycl/test/reduction/reduction_queue_parallel_for.cpp

diff --git a/sycl/include/CL/sycl/queue.hpp b/sycl/include/CL/sycl/queue.hpp
index d3b7d3d718cdc..5cca7be19772b 100644
--- a/sycl/include/CL/sycl/queue.hpp
+++ b/sycl/include/CL/sycl/queue.hpp
@@ -719,6 +719,27 @@ class __SYCL_EXPORT queue {
         CodeLoc);
   }
 
+  /// parallel_for version with a kernel represented as a lambda + nd_range that
+  /// specifies global, local sizes and offset.
+  ///
+  /// \param ExecutionRange is a range that specifies the work space of the
+  /// kernel
+  /// \param Redu is a reduction operation
+  /// \param KernelFunc is the Kernel functor or lambda
+  /// \param CodeLoc contains the code location of user code
+  template <typename KernelName = detail::auto_name, typename KernelType,
+            int Dims, typename Reduction>
+  event parallel_for(nd_range<Dims> ExecutionRange, Reduction Redu,
+                     _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) {
+    _CODELOCARG(&CodeLoc);
+    return submit(
+        [&](handler &CGH) {
+          CGH.template parallel_for<KernelName, KernelType, Dims, Reduction>(
+              ExecutionRange, Redu, KernelFunc);
+        },
+        CodeLoc);
+  }
+
   // Clean up CODELOC and KERNELFUNC macros.
#undef _CODELOCPARAM
 #undef _CODELOCARG
diff --git a/sycl/test/reduction/reduction_queue_parallel_for.cpp b/sycl/test/reduction/reduction_queue_parallel_for.cpp
new file mode 100644
index 0000000000000..415e87dd506c0
--- /dev/null
+++ b/sycl/test/reduction/reduction_queue_parallel_for.cpp
@@ -0,0 +1,44 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+
+// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out
+// TODO: Enable the test for HOST when it supports ONEAPI::reduce() and
+// barrier()
+
+// This test only checks that the method queue::parallel_for() accepting
+// reduction, can be properly translated into queue::submit + parallel_for().
+
+#include <CL/sycl.hpp>
+using namespace sycl;
+
+int main() {
+  const size_t NElems = 1024;
+  const size_t WGSize = 256;
+
+  queue Q;
+  int *Data = malloc_shared<int>(NElems, Q);
+  for (int I = 0; I < NElems; I++)
+    Data[I] = I;
+
+  int *Sum = malloc_shared<int>(1, Q);
+  *Sum = 0;
+
+  Q.parallel_for(
+      nd_range<1>{NElems, WGSize}, ONEAPI::reduction(Sum, ONEAPI::plus<>()),
+      [=](nd_item<1> It, auto &Sum) { Sum += Data[It.get_global_id(0)]; })
+      .wait();
+
+  int ExpectedSum = (NElems - 1) * NElems / 2;
+  int Error = 0;
+  if (*Sum != ExpectedSum) {
+    std::cerr << "Error: Expected = " << ExpectedSum << ", Computed = " << *Sum
+              << std::endl;
+    Error = 1;
+  }
+
+  free(Data, Q);
+  free(Sum, Q);
+  return Error;
+}