From c811f3c748c0bd5d25f2bcf568c49f13facd258a Mon Sep 17 00:00:00 2001
From: Vyacheslav N Klochkov <vyacheslav.n.klochkov@intel.com>
Date: Fri, 23 Oct 2020 22:25:37 -0700
Subject: [PATCH] [SYCL] Implement queue::parallel_for() accepting reduction

Signed-off-by: Vyacheslav N Klochkov <vyacheslav.n.klochkov@intel.com>
---
 sycl/include/CL/sycl/queue.hpp                | 21 +++++++++
 .../reduction_queue_parallel_for.cpp          | 44 +++++++++++++++++++
 2 files changed, 65 insertions(+)
 create mode 100644 sycl/test/reduction/reduction_queue_parallel_for.cpp
diff --git a/sycl/include/CL/sycl/queue.hpp b/sycl/include/CL/sycl/queue.hpp
index d3b7d3d718cdc..5cca7be19772b 100644
--- a/sycl/include/CL/sycl/queue.hpp
+++ b/sycl/include/CL/sycl/queue.hpp
@@ -719,6 +719,27 @@ class __SYCL_EXPORT queue {
         CodeLoc);
   }
 
+  /// Submits a command group that calls handler::parallel_for with the
+  /// nd_range, reduction and kernel given here.
+  ///
+  /// \param ExecutionRange is the nd_range describing the kernel's work space
+  /// \param Redu is the reduction object forwarded to handler::parallel_for
+  /// \param KernelFunc is the Kernel functor or lambda
+  /// \param CodeLoc contains the code location of user code
+  /// \return the event returned by submit() for the command group
+  template <typename KernelName = detail::auto_name, typename KernelType,
+            int Dims, typename Reduction>
+  event parallel_for(nd_range<Dims> ExecutionRange, Reduction Redu,
+                     _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) {
+    _CODELOCARG(&CodeLoc);
+    // Build the command group separately, then hand it to submit().
+    auto CommandGroup = [&](handler &CGH) {
+      CGH.template parallel_for<KernelName, KernelType, Dims, Reduction>(
+          ExecutionRange, Redu, KernelFunc);
+    };
+    return submit(CommandGroup, CodeLoc);
+  }
+
 // Clean up CODELOC and KERNELFUNC macros.
 #undef _CODELOCPARAM
 #undef _CODELOCARG
diff --git a/sycl/test/reduction/reduction_queue_parallel_for.cpp b/sycl/test/reduction/reduction_queue_parallel_for.cpp
new file mode 100644
index 0000000000000..415e87dd506c0
--- /dev/null
+++ b/sycl/test/reduction/reduction_queue_parallel_for.cpp
@@ -0,0 +1,44 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+
+// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out
+// TODO: Enable the test for HOST when it supports ONEAPI::reduce() and
+// barrier()
+
+// This test only checks that the queue::parallel_for() overload accepting a
+// reduction is properly translated into queue::submit + parallel_for().
+
+#include <CL/sycl.hpp>
+using namespace sycl;
+
+int main() {
+  const size_t NElems = 1024;
+  const size_t WGSize = 256;
+  // Fill Data with 0..NElems-1; the index is size_t to match NElems' type.
+  queue Q;
+  int *Data = malloc_shared<int>(NElems, Q);
+  for (size_t I = 0; I < NElems; ++I)
+    Data[I] = static_cast<int>(I);
+  // Accumulator that is handed to ONEAPI::reduction below.
+  int *Sum = malloc_shared<int>(1, Q);
+  *Sum = 0;
+  // Call the queue shortcut under test and wait on the returned event.
+  Q.parallel_for<class XYZ>(
+       nd_range<1>{NElems, WGSize}, ONEAPI::reduction(Sum, ONEAPI::plus<>()),
+       [=](nd_item<1> It, auto &Sum) { Sum += Data[It.get_global_id(0)]; })
+      .wait();
+  // Sum of 0..NElems-1; the value (523776) fits in int.
+  const int ExpectedSum = static_cast<int>((NElems - 1) * NElems / 2);
+  int Error = 0;
+  if (*Sum != ExpectedSum) {
+    std::cerr << "Error: Expected = " << ExpectedSum << ", Computed = " << *Sum
+              << std::endl;
+    Error = 1;
+  }
+
+  free(Data, Q);
+  free(Sum, Q);
+  return Error;
+}