From 5ad7a9cad9828522b25f97e25ca98d397a21be6c Mon Sep 17 00:00:00 2001
From: Sergey Semenov <sergey.semenov@intel.com>
Date: Mon, 5 Apr 2021 14:24:18 +0300
Subject: [PATCH 1/2] [SYCL] Add a test for group local memory w/o device code
 opt

---
 SYCL/GroupLocalMemory/no_early_opt.cpp | 62 ++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)
 create mode 100644 SYCL/GroupLocalMemory/no_early_opt.cpp
diff --git a/SYCL/GroupLocalMemory/no_early_opt.cpp b/SYCL/GroupLocalMemory/no_early_opt.cpp
new file mode 100644
index 0000000000..76c608135c
--- /dev/null
+++ b/SYCL/GroupLocalMemory/no_early_opt.cpp
@@ -0,0 +1,62 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fno-sycl-early-optimizations %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// The test checks that multiple calls to the same template instantiation of a
+// group local memory function result in separate allocations, even with device
+// code optimizations disabled (the implementation relies on inlining these
+// functions regardless of device code optimization settings).
+
+#include <CL/sycl.hpp>
+
+#include <cassert>
+#include <vector>
+
+constexpr size_t WgSize = 32;
+constexpr size_t WgCount = 4;
+constexpr size_t Size = WgSize * WgCount;
+
+class KernelA;
+
+using namespace sycl;
+
+int main() {
+  queue Q;
+  {
+    std::vector<int> VecA(Size, 0);
+    std::vector<int> VecB(Size, 0);
+    buffer<int, 1> BufA{VecA.data(), range<1>(Size)};
+    buffer<int, 1> BufB{VecB.data(), range<1>(Size)};
+
+    Q.submit([&](handler &Cgh) {
+      auto AccA = BufA.get_access<access::mode::read_write>(Cgh);
+      auto AccB = BufB.get_access<access::mode::read_write>(Cgh);
+      Cgh.parallel_for<KernelA>(
+          nd_range<1>(range<1>(Size), range<1>(WgSize)), [=](nd_item<1> Item) {
+            multi_ptr<int[WgSize], access::address_space::local_space> PtrA =
+                group_local_memory_for_overwrite<int[WgSize]>(Item.get_group());
+            multi_ptr<int[WgSize], access::address_space::local_space> PtrB =
+                group_local_memory_for_overwrite<int[WgSize]>(Item.get_group());
+            size_t LocalId = Item.get_local_linear_id();
+            (*PtrA)[LocalId] = 1;
+            (*PtrB)[LocalId] = 1;
+            (*PtrA)[LocalId] *= LocalId;
+            (*PtrB)[LocalId] *= LocalId;
+            Item.barrier();
+
+            size_t LocalIdx = LocalId ^ 1;
+            size_t GlobalIdx = Item.get_global_linear_id() ^ 1;
+            AccA[GlobalIdx] = (*PtrA)[LocalIdx];
+            AccB[GlobalIdx] = (*PtrB)[LocalIdx];
+          });
+    });
+
+    auto AccA = BufA.get_access<access::mode::read>();
+    auto AccB = BufB.get_access<access::mode::read>();
+    for (size_t I = 0; I < Size; ++I) {
+      assert(AccA[I] == I % WgSize);
+      assert(AccB[I] == I % WgSize);
+    }
+  }
+}

From f12fcd8db12979d1a6926eadfa2ad4194d2bab36 Mon Sep 17 00:00:00 2001
From: Sergey Semenov <sergey.semenov@intel.com>
Date: Mon, 5 Apr 2021 17:53:13 +0300
Subject: [PATCH 2/2] Simplify the test

---
 SYCL/GroupLocalMemory/no_early_opt.cpp | 37 ++++++++++----------------
 1 file changed, 14 insertions(+), 23 deletions(-)

diff --git a/SYCL/GroupLocalMemory/no_early_opt.cpp b/SYCL/GroupLocalMemory/no_early_opt.cpp
index 76c608135c..00680e7c23 100644
--- a/SYCL/GroupLocalMemory/no_early_opt.cpp
+++ b/SYCL/GroupLocalMemory/no_early_opt.cpp
@@ -24,39 +24,30 @@ using namespace sycl;
 int main() {
   queue Q;
   {
-    std::vector<int> VecA(Size, 0);
-    std::vector<int> VecB(Size, 0);
-    buffer<int, 1> BufA{VecA.data(), range<1>(Size)};
-    buffer<int, 1> BufB{VecB.data(), range<1>(Size)};
+    std::vector<int *> VecA(Size, 0);
+    std::vector<int *> VecB(Size, 0);
+    buffer<int *, 1> BufA{VecA.data(), range<1>(Size)};
+    buffer<int *, 1> BufB{VecB.data(), range<1>(Size)};
 
     Q.submit([&](handler &Cgh) {
       auto AccA = BufA.get_access<access::mode::read_write>(Cgh);
       auto AccB = BufB.get_access<access::mode::read_write>(Cgh);
       Cgh.parallel_for<KernelA>(
           nd_range<1>(range<1>(Size), range<1>(WgSize)), [=](nd_item<1> Item) {
-            multi_ptr<int[WgSize], access::address_space::local_space> PtrA =
-                group_local_memory_for_overwrite<int[WgSize]>(Item.get_group());
-            multi_ptr<int[WgSize], access::address_space::local_space> PtrB =
-                group_local_memory_for_overwrite<int[WgSize]>(Item.get_group());
-            size_t LocalId = Item.get_local_linear_id();
-            (*PtrA)[LocalId] = 1;
-            (*PtrB)[LocalId] = 1;
-            (*PtrA)[LocalId] *= LocalId;
-            (*PtrB)[LocalId] *= LocalId;
-            Item.barrier();
-
-            size_t LocalIdx = LocalId ^ 1;
-            size_t GlobalIdx = Item.get_global_linear_id() ^ 1;
-            AccA[GlobalIdx] = (*PtrA)[LocalIdx];
-            AccB[GlobalIdx] = (*PtrB)[LocalIdx];
+            multi_ptr<int, access::address_space::local_space> PtrA =
+                group_local_memory_for_overwrite<int>(Item.get_group());
+            multi_ptr<int, access::address_space::local_space> PtrB =
+                group_local_memory_for_overwrite<int>(Item.get_group());
+
+            size_t GlobalId = Item.get_global_linear_id();
+            AccA[GlobalId] = PtrA;
+            AccB[GlobalId] = PtrB;
           });
     });
 
     auto AccA = BufA.get_access<access::mode::read>();
     auto AccB = BufB.get_access<access::mode::read>();
-    for (size_t I = 0; I < Size; ++I) {
-      assert(AccA[I] == I % WgSize);
-      assert(AccB[I] == I % WgSize);
-    }
+    for (size_t I = 0; I < Size; ++I)
+      assert(AccA[I] != AccB[I]);
   }
 }