From 87158a295cd1c23f58af45f15ab797760e9b19ad Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Wed, 7 Dec 2022 14:52:14 +0000 Subject: [PATCH] [SYCL][Fusion] Test kernel fusion and optimization Signed-off-by: Lukas Sommer --- SYCL/KernelFusion/abort_fusion.cpp | 105 +++++++++++ SYCL/KernelFusion/abort_internalization.cpp | 174 ++++++++++++++++++ .../barrier_local_internalization.cpp | 83 +++++++++ SYCL/KernelFusion/buffer_internalization.cpp | 72 ++++++++ SYCL/KernelFusion/cancel_fusion.cpp | 1 + SYCL/KernelFusion/complete_fusion.cpp | 67 +++++++ SYCL/KernelFusion/diamond_shape.cpp | 106 +++++++++++ SYCL/KernelFusion/event_wait_cancel.cpp | 6 + SYCL/KernelFusion/event_wait_complete.cpp | 88 +++++++++ .../internal_explicit_dependency.cpp | 80 ++++++++ .../internalize_array_wrapper.cpp | 139 ++++++++++++++ SYCL/KernelFusion/internalize_deep.cpp | 106 +++++++++++ SYCL/KernelFusion/internalize_multi_ptr.cpp | 81 ++++++++ SYCL/KernelFusion/internalize_vec.cpp | 75 ++++++++ SYCL/KernelFusion/internalize_vfunc.cpp | 88 +++++++++ SYCL/KernelFusion/local_internalization.cpp | 73 ++++++++ SYCL/KernelFusion/non_unit_local_size.cpp | 82 +++++++++ SYCL/KernelFusion/pointer_arg_function.cpp | 82 +++++++++ SYCL/KernelFusion/private_internalization.cpp | 71 +++++++ SYCL/KernelFusion/ranged_offset_accessor.cpp | 80 ++++++++ SYCL/KernelFusion/struct_with_array.cpp | 82 +++++++++ .../sync_two_queues_event_dep.cpp | 6 +- .../sync_two_queues_requirement.cpp | 6 +- SYCL/KernelFusion/three_dimensional.cpp | 75 ++++++++ SYCL/KernelFusion/two_dimensional.cpp | 74 ++++++++ SYCL/KernelFusion/usm_no_dependencies.cpp | 70 +++++++ SYCL/KernelFusion/work_group_barrier.cpp | 79 ++++++++ SYCL/KernelFusion/wrapped_usm.cpp | 80 ++++++++ SYCL/lit.cfg.py | 17 ++ 29 files changed, 2140 insertions(+), 8 deletions(-) create mode 100644 SYCL/KernelFusion/abort_fusion.cpp create mode 100644 SYCL/KernelFusion/abort_internalization.cpp create mode 100644 SYCL/KernelFusion/barrier_local_internalization.cpp create mode 100644 SYCL/KernelFusion/buffer_internalization.cpp create mode 100644 SYCL/KernelFusion/complete_fusion.cpp create mode 100644 SYCL/KernelFusion/diamond_shape.cpp create mode 100644 SYCL/KernelFusion/event_wait_complete.cpp create mode 100644 SYCL/KernelFusion/internal_explicit_dependency.cpp create mode 100644 SYCL/KernelFusion/internalize_array_wrapper.cpp create mode 100644 SYCL/KernelFusion/internalize_deep.cpp create mode 100644 SYCL/KernelFusion/internalize_multi_ptr.cpp create mode 100644 SYCL/KernelFusion/internalize_vec.cpp create mode 100644 SYCL/KernelFusion/internalize_vfunc.cpp create mode 100644 SYCL/KernelFusion/local_internalization.cpp create mode 100644 SYCL/KernelFusion/non_unit_local_size.cpp create mode 100644 SYCL/KernelFusion/pointer_arg_function.cpp create mode 100644 SYCL/KernelFusion/private_internalization.cpp create mode 100644 SYCL/KernelFusion/ranged_offset_accessor.cpp create mode 100644 SYCL/KernelFusion/struct_with_array.cpp create mode 100644 SYCL/KernelFusion/three_dimensional.cpp create mode 100644 SYCL/KernelFusion/two_dimensional.cpp create mode 100644 SYCL/KernelFusion/usm_no_dependencies.cpp create mode 100644 SYCL/KernelFusion/work_group_barrier.cpp create mode 100644 SYCL/KernelFusion/wrapped_usm.cpp diff --git a/SYCL/KernelFusion/abort_fusion.cpp b/SYCL/KernelFusion/abort_fusion.cpp new file mode 100644 index 0000000000..cdc2e67caf --- /dev/null +++ b/SYCL/KernelFusion/abort_fusion.cpp @@ -0,0 +1,105 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// 
RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
+// RUN: %CPU_CHECK_PLACEHOLDER
+// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
+// RUN: %GPU_CHECK_PLACEHOLDER
+// UNSUPPORTED: cuda || hip
+// REQUIRES: fusion
+
+// Test fusion being aborted: Different scenarios causing the JIT compiler
+// to abort fusion due to constraint violations for fusion. Also check that
+// warnings are printed when SYCL_RT_WARNING_LEVEL=1.
+
+#include <iostream>
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+constexpr size_t dataSize = 512;
+
+enum class Internalization { None, Local, Private };
+
+template <int Kernel1Dim>
+void performFusion(queue &q, range<Kernel1Dim> k1Global,
+                   range<Kernel1Dim> k1Local) {
+  int in[dataSize], tmp[dataSize], out[dataSize];
+
+  for (size_t i = 0; i < dataSize; ++i) {
+    in[i] = i;
+    tmp[i] = -1;
+    out[i] = -1;
+  }
+  {
+    buffer bIn{in, range{dataSize}};
+    buffer bTmp{tmp, range{dataSize}};
+    buffer bOut{out, range{dataSize}};
+
+    ext::codeplay::experimental::fusion_wrapper fw(q);
+    fw.start_fusion();
+
+    assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+    q.submit([&](handler &cgh) {
+      auto accIn = bIn.get_access(cgh);
+      auto accTmp = bTmp.get_access(cgh);
+      cgh.parallel_for(nd_range<Kernel1Dim>{k1Global, k1Local},
+                       [=](item<Kernel1Dim> i) {
+                         auto LID = i.get_linear_id();
+                         accTmp[LID] = accIn[LID] + 5;
+                       });
+    });
+
+    q.submit([&](handler &cgh) {
+      auto accTmp = bTmp.get_access(cgh);
+      auto accOut = bOut.get_access(cgh);
+      cgh.parallel_for(nd_range<1>{{dataSize}, {8}},
+                       [=](id<1> i) { accOut[i] = accTmp[i] * 2; });
+    });
+
+    fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
+
+    assert(!fw.is_in_fusion_mode() &&
+           "Queue should not be in fusion mode anymore");
+  }
+
+  // Check the results
+  size_t numErrors = 0;
+  for (size_t i = 0; i < k1Global.size(); ++i) {
+    if (out[i] != ((i + 5) * 2)) {
+      ++numErrors;
+    }
+  }
+  if (numErrors) {
+    std::cout << "COMPUTATION ERROR\n";
+  } else {
+    std::cout << "COMPUTATION OK\n";
+  }
+}
+
+int main() {
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  // Scenario: Fusing two kernels with different dimensionality should lead to
+  // fusion being aborted.
+  performFusion(q, range<2>{32, 16}, range<2>{1, 8});
+  // CHECK: WARNING: Cannot fuse kernels with different dimensionality
+  // CHECK-NEXT: COMPUTATION OK
+
+  // Scenario: Fusing two kernels with different global size should lead to
+  // fusion being aborted.
+  performFusion(q, range<1>{256}, range<1>{8});
+  // CHECK-NEXT: WARNING: Cannot fuse kernels with different global size
+  // CHECK-NEXT: COMPUTATION OK
+
+  // Scenario: Fusing two kernels with different local size should lead to
+  // fusion being aborted.
+ performFusion(q, range<1>{dataSize}, + range<1>{16}); + // CHECK-NEXT: WARNING: Cannot fuse kernels with different local size + // CHECK-NEXT: COMPUTATION OK + + return 0; +} diff --git a/SYCL/KernelFusion/abort_internalization.cpp b/SYCL/KernelFusion/abort_internalization.cpp new file mode 100644 index 0000000000..23a74dd00c --- /dev/null +++ b/SYCL/KernelFusion/abort_internalization.cpp @@ -0,0 +1,174 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: env SYCL_ENABLE_FUSION_CACHING=0 SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\ +// RUN: %CPU_CHECK_PLACEHOLDER +// RUN: env SYCL_ENABLE_FUSION_CACHING=0 SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\ +// RUN: %GPU_CHECK_PLACEHOLDER +// UNSUPPORTED: cuda || hip +// REQUIRES: fusion + +// Test incomplete internalization: Different scenarios causing the JIT compiler +// to abort internalization due to target or parameter mismatch. Also check that +// warnings are printed when SYCL_RT_WARNING_LEVEL=1. + +#include + +using namespace sycl; + +constexpr size_t dataSize = 512; + +enum class Internalization { None, Local, Private }; + +void performFusion(queue &q, Internalization intKernel1, + size_t localSizeKernel1, Internalization intKernel2, + size_t localSizeKernel2, + bool expectInternalization = false) { + int in[dataSize], tmp[dataSize], out[dataSize]; + for (size_t i = 0; i < dataSize; ++i) { + in[i] = i; + tmp[i] = -1; + out[i] = -1; + } + { + buffer bIn{in, range{dataSize}}; + buffer bTmp{tmp, range{dataSize}}; + buffer bOut{out, range{dataSize}}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q.submit([&](handler &cgh) { + auto accIn = bIn.get_access(cgh); + property_list properties{}; + if (intKernel1 == Internalization::Private) { + properties = { + sycl::ext::codeplay::experimental::property::promote_private{}}; + } else if (intKernel1 == Internalization::Local) { + properties = { + sycl::ext::codeplay::experimental::property::promote_local{}}; + } + accessor accTmp = bTmp.get_access(cgh, properties); + + if (localSizeKernel1 > 0) { + cgh.parallel_for( + nd_range<1>{{dataSize}, {localSizeKernel1}}, + [=](id<1> i) { accTmp[i] = accIn[i] + 5; }); + } else { + cgh.parallel_for( + dataSize, [=](id<1> i) { accTmp[i] = accIn[i] + 5; }); + } + }); + + q.submit([&](handler &cgh) { + property_list properties{}; + if (intKernel2 == Internalization::Private) { + properties = { + sycl::ext::codeplay::experimental::property::promote_private{}}; + } else if (intKernel2 == Internalization::Local) { + properties = { + sycl::ext::codeplay::experimental::property::promote_local{}}; + } + accessor accTmp = bTmp.get_access(cgh, properties); + auto accOut = bOut.get_access(cgh); + if (localSizeKernel2 > 0) { + cgh.parallel_for( + nd_range<1>{{dataSize}, {localSizeKernel2}}, + [=](id<1> i) { accOut[i] = accTmp[i] * 2; }); + } else { + cgh.parallel_for( + dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * 2; }); + } + }); + + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + } + + // Check the results + size_t numErrors = 0; + size_t numInternalized = 0; + for (size_t i = 0; i < dataSize; ++i) { + if (out[i] != ((i + 5) * 2)) { + ++numErrors; + } + if (tmp[i] == -1) { + ++numInternalized; + } + } + if (numErrors) { + std::cout << "COMPUTATION ERROR\n"; + return; + } + if 
(!expectInternalization && numInternalized) {
+    std::cout << "WRONG INTERNALIZATION\n";
+    return;
+  }
+  std::cout << "COMPUTATION OK\n";
+}
+
+int main() {
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  // Scenario: One accessor without internalization, one with local
+  // internalization. Should fall back to no internalization and print a
+  // warning.
+  std::cout << "None, Local(0)\n";
+  performFusion(q, Internalization::None, 0, Internalization::Local, 0);
+  // CHECK: None, Local(0)
+  // CHECK-NEXT: WARNING: Not performing specified local promotion, due to previous mismatch or because previous accessor specified no promotion
+  // CHECK-NEXT: COMPUTATION OK
+
+  // Scenario: One accessor without internalization, one with private
+  // internalization. Should fall back to no internalization and print a
+  // warning.
+  std::cout << "None, Private\n";
+  performFusion(q, Internalization::None, 0, Internalization::Private, 0);
+  // CHECK-NEXT: None, Private
+  // CHECK-NEXT: WARNING: Not performing specified private promotion, due to previous mismatch or because previous accessor specified no promotion
+  // CHECK-NEXT: COMPUTATION OK
+
+  // Scenario: Both accessors with local promotion, but the second kernel does
+  // not specify a work-group size. No promotion should happen and a warning
+  // should be printed.
+  std::cout << "Local(8), Local(0)\n";
+  performFusion(q, Internalization::Local, 8, Internalization::Local, 0);
+  // CHECK-NEXT: Local(8), Local(0)
+  // CHECK-NEXT: WARNING: Work-group size for local promotion not specified, not performing internalization
+  // CHECK-NEXT: COMPUTATION OK
+
+  // Scenario: Both accessors with local promotion, but the first kernel does
+  // not specify a work-group size. No promotion should happen and a warning
+  // should be printed.
+  std::cout << "Local(0), Local(8)\n";
+  performFusion(q, Internalization::Local, 0, Internalization::Local, 8);
+  // CHECK-NEXT: Local(0), Local(8)
+  // CHECK-NEXT: WARNING: Work-group size for local promotion not specified, not performing internalization
+  // CHECK-NEXT: WARNING: Not performing specified local promotion, due to previous mismatch or because previous accessor specified no promotion
+  // CHECK-NEXT: WARNING: Cannot fuse kernels with different local size
+  // CHECK-NEXT: COMPUTATION OK
+
+  // Scenario: Both accessors with local promotion, but the kernels specify
+  // different work-group sizes. No promotion should happen and a warning
+  // should be printed.
+  std::cout << "Local(8), Local(16)\n";
+  performFusion(q, Internalization::Local, 8, Internalization::Local, 16);
+  // CHECK-NEXT: Local(8), Local(16)
+  // CHECK-NEXT: WARNING: Not performing specified local promotion due to work-group size mismatch
+  // CHECK-NEXT: WARNING: Cannot fuse kernels with different local size
+  // CHECK-NEXT: COMPUTATION OK
+
+  // Scenario: One accessor with local internalization, one with private
+  // internalization. Should fall back to local internalization and print a
+  // warning.
+ std::cout << "Local(8), Private(8)\n"; + performFusion(q, Internalization::Local, 8, Internalization::Private, 8, + /* expectInternalization */ true); + // CHECK-NEXT: Local(8), Private(8) + // CHECK-NEXT: WARNING: Performing local internalization instead, because previous accessor specified local promotion + // CHECK-NEXT: COMPUTATION OK + + return 0; +} diff --git a/SYCL/KernelFusion/barrier_local_internalization.cpp b/SYCL/KernelFusion/barrier_local_internalization.cpp new file mode 100644 index 0000000000..12302c3177 --- /dev/null +++ b/SYCL/KernelFusion/barrier_local_internalization.cpp @@ -0,0 +1,83 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// UNSUPPORTED: cuda || hip +// REQUIRES: fusion + +// Test complete fusion with local internalization and a combination of kernels +// that require a work-group barrier to be inserted by fusion. + +#include + +using namespace sycl; + +int main() { + constexpr size_t dataSize = 512; + int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize]; + + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = i * 2; + in2[i] = i * 3; + in3[i] = i * 4; + tmp[i] = -1; + out[i] = -1; + } + + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + { + buffer bIn1{in1, range{dataSize}}; + buffer bIn2{in2, range{dataSize}}; + buffer bIn3{in3, range{dataSize}}; + buffer bTmp{ + tmp, + range{dataSize}, + {sycl::ext::codeplay::experimental::property::promote_local{}}}; + buffer bOut{out, range{dataSize}}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q.submit([&](handler &cgh) { + auto accIn1 = bIn1.get_access(cgh); + auto accIn2 = bIn2.get_access(cgh); + auto accTmp = bTmp.get_access(cgh); + cgh.parallel_for( + nd_range<1>{{dataSize}, {32}}, [=](nd_item<1> i) { + auto workgroupSize = i.get_local_range(0); + auto baseOffset = i.get_group_linear_id() * workgroupSize; + auto localIndex = i.get_local_linear_id(); + auto localOffset = (workgroupSize - 1) - localIndex; + accTmp[baseOffset + localOffset] = + accIn1[baseOffset + localOffset] + + accIn2[baseOffset + localOffset]; + }); + }); + + q.submit([&](handler &cgh) { + auto accTmp = bTmp.get_access(cgh); + auto accIn3 = bIn3.get_access(cgh); + auto accOut = bOut.get_access(cgh); + cgh.parallel_for( + nd_range<1>{{dataSize}, {32}}, [=](nd_item<1> i) { + auto index = i.get_global_linear_id(); + accOut[index] = accTmp[index] * accIn3[index]; + }); + }); + + fw.complete_fusion(); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + } + + // Check the results + for (size_t i = 0; i < dataSize; ++i) { + assert(out[i] == (20 * i * i) && "Computation error"); + assert(tmp[i] == -1 && "Not internalized"); + } + + return 0; +} diff --git a/SYCL/KernelFusion/buffer_internalization.cpp b/SYCL/KernelFusion/buffer_internalization.cpp new file mode 100644 index 0000000000..22251fbb21 --- /dev/null +++ b/SYCL/KernelFusion/buffer_internalization.cpp @@ -0,0 +1,72 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// UNSUPPORTED: cuda || hip +// REQUIRES: fusion + +// Test complete fusion with private internalization specified on the +// buffer. 
+ +#include + +using namespace sycl; + +int main() { + constexpr size_t dataSize = 512; + int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize]; + + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = i * 2; + in2[i] = i * 3; + in3[i] = i * 4; + tmp[i] = -1; + out[i] = -1; + } + + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + { + buffer bIn1{in1, range{dataSize}}; + buffer bIn2{in2, range{dataSize}}; + buffer bIn3{in3, range{dataSize}}; + buffer bTmp{ + tmp, + range{dataSize}, + {sycl::ext::codeplay::experimental::property::promote_private{}}}; + buffer bOut{out, range{dataSize}}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q.submit([&](handler &cgh) { + auto accIn1 = bIn1.get_access(cgh); + auto accIn2 = bIn2.get_access(cgh); + auto accTmp = bTmp.get_access(cgh); + cgh.parallel_for( + dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; }); + }); + + q.submit([&](handler &cgh) { + auto accTmp = bTmp.get_access(cgh); + auto accIn3 = bIn3.get_access(cgh); + auto accOut = bOut.get_access(cgh); + cgh.parallel_for( + dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; }); + }); + + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + } + + // Check the results + for (size_t i = 0; i < dataSize; ++i) { + assert(out[i] == (20 * i * i) && "Computation error"); + assert(tmp[i] == -1 && "Not internalized"); + } + + return 0; +} diff --git a/SYCL/KernelFusion/cancel_fusion.cpp b/SYCL/KernelFusion/cancel_fusion.cpp index 3005a78d5b..6c94f99025 100644 --- a/SYCL/KernelFusion/cancel_fusion.cpp +++ b/SYCL/KernelFusion/cancel_fusion.cpp @@ -2,6 +2,7 @@ // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // UNSUPPORTED: cuda || hip +// REQUIRES: fusion // Test cancel fusion diff --git a/SYCL/KernelFusion/complete_fusion.cpp b/SYCL/KernelFusion/complete_fusion.cpp new file mode 100644 index 0000000000..0ffeca17a5 --- /dev/null +++ b/SYCL/KernelFusion/complete_fusion.cpp @@ -0,0 +1,67 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// UNSUPPORTED: cuda || hip +// REQUIRES: fusion + +// Test complete fusion without any internalization + +#include + +using namespace sycl; + +int main() { + constexpr size_t dataSize = 512; + int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize]; + + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = i * 2; + in2[i] = i * 3; + in3[i] = i * 4; + tmp[i] = -1; + out[i] = -1; + } + + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + { + buffer bIn1{in1, range{dataSize}}; + buffer bIn2{in2, range{dataSize}}; + buffer bIn3{in3, range{dataSize}}; + buffer bTmp{tmp, range{dataSize}}; + buffer bOut{out, range{dataSize}}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q.submit([&](handler &cgh) { + auto accIn1 = bIn1.get_access(cgh); + auto accIn2 = bIn2.get_access(cgh); + auto accTmp = bTmp.get_access(cgh); + cgh.parallel_for( + dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; }); + }); + + q.submit([&](handler &cgh) { + auto accTmp = bTmp.get_access(cgh); + auto accIn3 = bIn3.get_access(cgh); + auto accOut = 
bOut.get_access(cgh); + cgh.parallel_for( + dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; }); + }); + + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + } + + // Check the results + for (size_t i = 0; i < dataSize; ++i) { + assert(out[i] == (20 * i * i) && "Computation error"); + } + + return 0; +} diff --git a/SYCL/KernelFusion/diamond_shape.cpp b/SYCL/KernelFusion/diamond_shape.cpp new file mode 100644 index 0000000000..0f009a1d60 --- /dev/null +++ b/SYCL/KernelFusion/diamond_shape.cpp @@ -0,0 +1,106 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// UNSUPPORTED: cuda || hip +// REQUIRES: fusion + +// Test complete fusion with private internalization specified on the +// accessors for a combination of four kernels, forming a diamond-like shape and +// repeating one of the kernels. + +#include + +using namespace sycl; + +struct AddKernel { + accessor accIn1; + accessor accIn2; + accessor accOut; + + void operator()(id<1> i) const { accOut[i] = accIn1[i] + accIn2[i]; } +}; + +int main() { + constexpr size_t dataSize = 512; + int in1[dataSize], in2[dataSize], in3[dataSize], tmp1[dataSize], + tmp2[dataSize], tmp3[dataSize], out[dataSize]; + + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = i * 2; + in2[i] = i * 3; + in3[i] = i * 4; + tmp1[i] = -1; + tmp2[i] = -1; + tmp3[i] = -1; + out[i] = -1; + } + + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + { + buffer bIn1{in1, range{dataSize}}; + buffer bIn2{in2, range{dataSize}}; + buffer bIn3{in3, range{dataSize}}; + buffer bTmp1{tmp1, range{dataSize}}; + buffer bTmp2{tmp2, range{dataSize}}; + buffer bTmp3{tmp3, range{dataSize}}; + buffer bOut{out, range{dataSize}}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q.submit([&](handler &cgh) { + auto accIn1 = bIn1.get_access(cgh); + auto accIn2 = bIn2.get_access(cgh); + auto accTmp1 = bTmp1.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_private{}); + cgh.parallel_for(dataSize, AddKernel{accIn1, accIn2, accTmp1}); + }); + + q.submit([&](handler &cgh) { + auto accTmp1 = bTmp1.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_private{}); + auto accIn3 = bIn3.get_access(cgh); + auto accTmp2 = bTmp2.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_private{}); + cgh.parallel_for( + dataSize, [=](id<1> i) { accTmp2[i] = accTmp1[i] * accIn3[i]; }); + }); + + q.submit([&](handler &cgh) { + auto accTmp1 = bTmp1.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_private{}); + auto accTmp3 = bTmp3.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_private{}); + cgh.parallel_for( + dataSize, [=](id<1> i) { accTmp3[i] = accTmp1[i] * 5; }); + }); + + q.submit([&](handler &cgh) { + auto accTmp2 = bTmp2.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_private{}); + auto accTmp3 = bTmp3.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_private{}); + auto accOut = bOut.get_access(cgh); + cgh.parallel_for(dataSize, + AddKernel{accTmp2, accTmp3, accOut}); + }); + + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion 
mode anymore"); + } + + // Check the results + for (size_t i = 0; i < dataSize; ++i) { + assert(out[i] == (20 * i * i + i * 25) && "Computation error"); + assert(tmp1[i] == -1 && "tmp1 not internalized"); + assert(tmp2[i] == -1 && "tmp2 not internalized"); + assert(tmp3[i] == -1 && "tmp3 not internalized"); + } + + return 0; +} diff --git a/SYCL/KernelFusion/event_wait_cancel.cpp b/SYCL/KernelFusion/event_wait_cancel.cpp index a7b04347c2..63a049aaa1 100644 --- a/SYCL/KernelFusion/event_wait_cancel.cpp +++ b/SYCL/KernelFusion/event_wait_cancel.cpp @@ -2,10 +2,12 @@ // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // UNSUPPORTED: cuda || hip +// REQUIRES: fusion // Test validity of events after cancel_fusion. #include "fusion_event_test_common.h" + #include using namespace sycl; @@ -15,6 +17,10 @@ int main() { queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + if (!q.get_device().has(sycl::aspect::usm_shared_allocations)) { + return 0; + } + int *in1 = sycl::malloc_shared(dataSize, q); int *in2 = sycl::malloc_shared(dataSize, q); int *in3 = sycl::malloc_shared(dataSize, q); diff --git a/SYCL/KernelFusion/event_wait_complete.cpp b/SYCL/KernelFusion/event_wait_complete.cpp new file mode 100644 index 0000000000..4d049242ad --- /dev/null +++ b/SYCL/KernelFusion/event_wait_complete.cpp @@ -0,0 +1,88 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// UNSUPPORTED: cuda || hip +// REQUIRES: fusion + +// Test validity of events after complete_fusion. + +#include "fusion_event_test_common.h" + +#include + +using namespace sycl; + +int main() { + constexpr size_t dataSize = 512; + + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + if (!q.get_device().has(sycl::aspect::usm_shared_allocations)) { + return 0; + } + + int *in1 = sycl::malloc_shared(dataSize, q); + int *in2 = sycl::malloc_shared(dataSize, q); + int *in3 = sycl::malloc_shared(dataSize, q); + int *tmp = sycl::malloc_shared(dataSize, q); + int *out = sycl::malloc_shared(dataSize, q); + + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = i * 2; + in2[i] = i * 3; + in3[i] = i * 4; + tmp[i] = -1; + out[i] = -1; + } + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + auto kernel1 = q.submit([&](handler &cgh) { + cgh.parallel_for( + dataSize, [=](id<1> i) { tmp[i] = in1[i] + in2[i]; }); + }); + + auto kernel2 = q.submit([&](handler &cgh) { + cgh.parallel_for( + dataSize, [=](id<1> i) { out[i] = tmp[i] * in3[i]; }); + }); + + auto complete = fw.complete_fusion( + {ext::codeplay::experimental::property::no_barriers{}}); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + + complete.wait(); + assert(isEventComplete(complete) && "Event should be complete"); + // The execution of the fused kennel does not depend on any events. + assert(complete.get_wait_list().size() == 0); + + assert(isEventComplete(kernel1) && "Event should be complete"); + // The event returned for submissions while in fusion mode depends on three + // events, for the two original kernels (which do not execute) and the fused + // kernel to be executed. 
+ assert(kernel1.get_wait_list().size() == 3); + + assert(isEventComplete(kernel2) && "Event should be complete"); + // The event returned for submissions while in fusion mode depends on three + // events, for the two original kernels (which do not execute) and the fused + // kernel to be executed. + assert(kernel2.get_wait_list().size() == 3); + + // Check the results. + for (size_t i = 0; i < dataSize; ++i) { + assert(out[i] == (20 * i * i) && "Computation error"); + } + + sycl::free(in1, q); + sycl::free(in2, q); + sycl::free(in3, q); + sycl::free(tmp, q); + sycl::free(out, q); + + return 0; +} diff --git a/SYCL/KernelFusion/internal_explicit_dependency.cpp b/SYCL/KernelFusion/internal_explicit_dependency.cpp new file mode 100644 index 0000000000..760cffb5b1 --- /dev/null +++ b/SYCL/KernelFusion/internal_explicit_dependency.cpp @@ -0,0 +1,80 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// UNSUPPORTED: cuda || hip +// REQUIRES: fusion + +// Test complete fusion where one kernel in the fusion list specifies an +// explicit dependency (via events) on another kernel in the fusion list. + +#include "fusion_event_test_common.h" + +#include + +using namespace sycl; + +int main() { + constexpr size_t dataSize = 512; + + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + if (!q.get_device().has(sycl::aspect::usm_shared_allocations)) { + return 0; + } + + int *in1 = sycl::malloc_shared(dataSize, q); + int *in2 = sycl::malloc_shared(dataSize, q); + int *in3 = sycl::malloc_shared(dataSize, q); + int *tmp = sycl::malloc_shared(dataSize, q); + int *out = sycl::malloc_shared(dataSize, q); + + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = i * 2; + in2[i] = i * 3; + in3[i] = i * 4; + tmp[i] = -1; + out[i] = -1; + } + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + auto kernel1 = q.submit([&](handler &cgh) { + cgh.parallel_for( + dataSize, [=](id<1> i) { tmp[i] = in1[i] + in2[i]; }); + }); + + auto kernel2 = q.submit([&](handler &cgh) { + cgh.depends_on(kernel1); + cgh.parallel_for( + dataSize, [=](id<1> i) { out[i] = tmp[i] * in3[i]; }); + }); + + auto complete = fw.complete_fusion( + {ext::codeplay::experimental::property::no_barriers{}}); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + + complete.wait(); + assert(isEventComplete(complete) && "Event should be complete"); + + assert(isEventComplete(kernel1) && "Event should be complete"); + + assert(isEventComplete(kernel2) && "Event should be complete"); + + // Check the results + for (size_t i = 0; i < dataSize; ++i) { + assert(out[i] == (20 * i * i) && "Computation error"); + } + + sycl::free(in1, q); + sycl::free(in2, q); + sycl::free(in3, q); + sycl::free(tmp, q); + sycl::free(out, q); + + return 0; +} diff --git a/SYCL/KernelFusion/internalize_array_wrapper.cpp b/SYCL/KernelFusion/internalize_array_wrapper.cpp new file mode 100644 index 0000000000..d1b41ea7bd --- /dev/null +++ b/SYCL/KernelFusion/internalize_array_wrapper.cpp @@ -0,0 +1,139 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// UNSUPPORTED: cuda || hip +// REQUIRES: fusion + +// Test internalization of a nested array type. 
+ +#include + +#include + +using namespace sycl; + +template struct array_wrapper { + static constexpr size_t rows{N}; + static constexpr size_t columns{M}; + static constexpr size_t vec_width{2}; + + using value_type = vec; + using reference_type = value_type &; + using const_reference_type = const value_type &; + + std::array, rows> vs; + + explicit array_wrapper(const_reference_type v) { + std::array el; + el.fill(v); + vs.fill(el); + } + + array_wrapper() : array_wrapper{value_type{}} {} + + constexpr std::array &operator[](size_t i) { + return vs[i]; + } + + constexpr const std::array &operator[](size_t i) const { + return vs[i]; + } +}; + +int main() { + constexpr size_t dataSize = 2; + constexpr size_t rows = 2; + constexpr size_t columns = 2; + + using array_type = array_wrapper; + + array_type in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], + out[dataSize]; + + for (size_t id = 0; id < dataSize; ++id) { + for (size_t i = 0; i < rows; ++i) { + for (size_t j = 0; j < columns; ++j) { + in1[id][i][j].s0() = in1[id][i][j].s1() = id * 2; + in2[id][i][j].s0() = in2[id][i][j].s1() = id * 3; + in3[id][i][j].s0() = in3[id][i][j].s1() = id * 4; + tmp[id][i][j].s0() = tmp[id][i][j].s1() = -1; + out[id][i][j].s0() = out[id][i][j].s1() = -1; + } + } + } + + queue q{default_selector_v, + {ext::codeplay::experimental::property::queue::enable_fusion{}}}; + + { + buffer bIn1{in1, range{dataSize}}; + buffer bIn2{in2, range{dataSize}}; + buffer bIn3{in3, range{dataSize}}; + buffer bTmp{tmp, range{dataSize}}; + buffer bOut{out, range{dataSize}}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q.submit([&](handler &cgh) { + auto accIn1 = bIn1.get_access(cgh); + auto accIn2 = bIn2.get_access(cgh); + auto accTmp = bTmp.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_private{}); + cgh.parallel_for(dataSize, [=](id<1> id) { + const auto &accIn1Wrapp = accIn1[id]; + const auto &accIn2Wrapp = accIn2[id]; + auto &accTmpWrapp = accTmp[id]; + for (size_t i = 0; i < dataSize; ++i) { + const auto &in1 = accIn1Wrapp[i]; + const auto &in2 = accIn2Wrapp[i]; + auto &tmp = accTmpWrapp[i]; + for (size_t j = 0; j < columns; ++j) { + tmp[j] = in1[j] + in2[j]; + } + } + }); + }); + + q.submit([&](handler &cgh) { + auto accTmp = bTmp.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_private{}); + auto accIn3 = bIn3.get_access(cgh); + auto accOut = bOut.get_access(cgh); + cgh.parallel_for(dataSize, [=](id<1> id) { + const auto &tmpWrapp = accTmp[id]; + const auto &accIn3Wrapp = accIn3[id]; + auto &accOutWrapp = accOut[id]; + for (size_t i = 0; i < dataSize; ++i) { + const auto &tmp = tmpWrapp[i]; + const auto &in3 = accIn3Wrapp[i]; + auto &out = accOutWrapp[i]; + for (size_t j = 0; j < columns; ++j) { + out[j] = tmp[j] * in3[j]; + } + } + }); + }); + + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + } + + // Check the results + constexpr array_type::value_type not_written{-1, -1}; + for (size_t id = 0; id < dataSize; ++id) { + const array_type::value_type expected{20 * id * id, 20 * id * id}; + for (size_t i = 0; i < rows; ++i) { + for (size_t j = 0; j < columns; ++j) { + assert(all(out[id][i][j] == expected) && "Computation error"); + assert(all(tmp[id][i][j] == not_written) && "Not internalizing"); + } + } + } + + return 0; +} diff --git 
a/SYCL/KernelFusion/internalize_deep.cpp b/SYCL/KernelFusion/internalize_deep.cpp new file mode 100644 index 0000000000..172ea3c750 --- /dev/null +++ b/SYCL/KernelFusion/internalize_deep.cpp @@ -0,0 +1,106 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// UNSUPPORTED: cuda || hip +// REQUIRES: fusion + +// Test complete fusion with internalization of a deep struct type. + +#include + +#include + +using namespace sycl; + +struct deep_vec { + using value_type = vec; + struct level_0 { + struct level_1 { + struct level_2 { + deep_vec::value_type v; + + constexpr level_2() = default; + constexpr explicit level_2(const deep_vec::value_type &v) : v{v} {} + } v; + constexpr level_1() = default; + constexpr explicit level_1(const deep_vec::value_type &v) : v{v} {} + } v; + + constexpr level_0() = default; + constexpr explicit level_0(const deep_vec::value_type &v) : v{v} {} + } v; + + constexpr deep_vec() = default; + constexpr explicit deep_vec(const value_type &v) : v{v} {} + + constexpr value_type &operator*() { return v.v.v.v; } + constexpr value_type *operator->() { return &this->operator*(); } +}; + +deep_vec operator+(deep_vec lhs, deep_vec rhs) { return deep_vec{*lhs + *rhs}; } +deep_vec operator*(deep_vec lhs, deep_vec rhs) { return deep_vec{*lhs * *rhs}; } + +int main() { + constexpr size_t dataSize = 512; + + deep_vec in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], + out[dataSize]; + + for (size_t i = 0; i < dataSize; ++i) { + in1[i]->s0() = in1[i]->s1() = in1[i]->s2() = in1[i]->s3() = i * 2; + in2[i]->s0() = in2[i]->s1() = in2[i]->s2() = in2[i]->s3() = i * 3; + in3[i]->s0() = in3[i]->s1() = in3[i]->s2() = in3[i]->s3() = i * 4; + tmp[i]->s0() = tmp[i]->s1() = tmp[i]->s2() = tmp[i]->s3() = -1; + out[i]->s0() = out[i]->s1() = out[i]->s2() = out[i]->s3() = -1; + } + + queue q{default_selector_v, + {ext::codeplay::experimental::property::queue::enable_fusion{}}}; + + { + buffer bIn1{in1, range{dataSize}}; + buffer bIn2{in2, range{dataSize}}; + buffer bIn3{in3, range{dataSize}}; + buffer bTmp{tmp, range{dataSize}}; + buffer bOut{out, range{dataSize}}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q.submit([&](handler &cgh) { + auto accIn1 = bIn1.get_access(cgh); + auto accIn2 = bIn2.get_access(cgh); + auto accTmp = bTmp.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_private{}); + cgh.parallel_for( + dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; }); + }); + + q.submit([&](handler &cgh) { + auto accTmp = bTmp.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_private{}); + auto accIn3 = bIn3.get_access(cgh); + auto accOut = bOut.get_access(cgh); + cgh.parallel_for( + dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; }); + }); + + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + } + + // Check the results + constexpr deep_vec::value_type not_written{-1, -1, -1, -1}; + for (size_t i = 0; i < dataSize; ++i) { + const deep_vec::value_type expected{20 * i * i, 20 * i * i, 20 * i * i, + 20 * i * i}; + assert(all(*out[i] == expected) && "Computation error"); + assert(all(*tmp[i] == not_written) && "Not internalizing"); + }; + + return 0; +} diff --git a/SYCL/KernelFusion/internalize_multi_ptr.cpp 
b/SYCL/KernelFusion/internalize_multi_ptr.cpp new file mode 100644 index 0000000000..b6937b0350 --- /dev/null +++ b/SYCL/KernelFusion/internalize_multi_ptr.cpp @@ -0,0 +1,81 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// UNSUPPORTED: cuda || hip +// REQUIRES: fusion + +// Test complete fusion with private internalization specified on the +// accessors for a device kernel using multi_ptr to global address space. + +#include + +using namespace sycl; + +int main() { + constexpr size_t dataSize = 512; + int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize]; + + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = i * 2; + in2[i] = i * 3; + in3[i] = i * 4; + tmp[i] = -1; + out[i] = -1; + } + + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + { + buffer bIn1{in1, range{dataSize}}; + buffer bIn2{in2, range{dataSize}}; + buffer bIn3{in3, range{dataSize}}; + buffer bTmp{tmp, range{dataSize}}; + buffer bOut{out, range{dataSize}}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q.submit([&](handler &cgh) { + auto accIn1 = bIn1.get_access(cgh); + auto accIn2 = bIn2.get_access(cgh); + auto accTmp = bTmp.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_private{}); + cgh.parallel_for(dataSize, [=](id<1> i) { + size_t offset = i; + decorated_global_ptr in1Ptr{accIn1}; + decorated_global_ptr in2Ptr{accIn2}; + decorated_global_ptr tmpPtr{accTmp}; + tmpPtr[offset] = in1Ptr[offset] + in2Ptr[offset]; + }); + }); + + q.submit([&](handler &cgh) { + auto accTmp = bTmp.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_private{}); + auto accIn3 = bIn3.get_access(cgh); + auto accOut = bOut.get_access(cgh); + cgh.parallel_for(dataSize, [=](id<1> i) { + size_t offset = i; + decorated_global_ptr in3Ptr{accIn3}; + decorated_global_ptr tmpPtr{accTmp}; + decorated_global_ptr outPtr{accOut}; + outPtr[offset] = in3Ptr[offset] * tmpPtr[offset]; + }); + }); + + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + } + + // Check the results + for (size_t i = 0; i < dataSize; ++i) { + assert(out[i] == (20 * i * i) && "Computation error"); + assert(tmp[i] == -1 && "Not internalized"); + } + + return 0; +} diff --git a/SYCL/KernelFusion/internalize_vec.cpp b/SYCL/KernelFusion/internalize_vec.cpp new file mode 100644 index 0000000000..9f3a24f715 --- /dev/null +++ b/SYCL/KernelFusion/internalize_vec.cpp @@ -0,0 +1,75 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// UNSUPPORTED: cuda || hip +// REQUIRES: fusion + +// Test complete fusion with internalization of a struct type. 
+ +#include + +using namespace sycl; + +int main() { + constexpr size_t dataSize = 512; + + vec in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], + out[dataSize]; + + for (size_t i = 0; i < dataSize; ++i) { + in1[i].s0() = in1[i].s1() = in1[i].s2() = in1[i].s3() = i * 2; + in2[i].s0() = in2[i].s1() = in2[i].s2() = in2[i].s3() = i * 3; + in3[i].s0() = in3[i].s1() = in3[i].s2() = in3[i].s3() = i * 4; + tmp[i].s0() = tmp[i].s1() = tmp[i].s2() = tmp[i].s3() = -1; + out[i].s0() = out[i].s1() = out[i].s2() = out[i].s3() = -1; + } + + queue q{default_selector_v, + {ext::codeplay::experimental::property::queue::enable_fusion{}}}; + + { + buffer> bIn1{in1, range{dataSize}}; + buffer> bIn2{in2, range{dataSize}}; + buffer> bIn3{in3, range{dataSize}}; + buffer> bTmp{tmp, range{dataSize}}; + buffer> bOut{out, range{dataSize}}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q.submit([&](handler &cgh) { + auto accIn1 = bIn1.get_access(cgh); + auto accIn2 = bIn2.get_access(cgh); + auto accTmp = bTmp.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_private{}); + cgh.parallel_for( + dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; }); + }); + + q.submit([&](handler &cgh) { + auto accTmp = bTmp.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_private{}); + auto accIn3 = bIn3.get_access(cgh); + auto accOut = bOut.get_access(cgh); + cgh.parallel_for( + dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; }); + }); + + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + } + + // Check the results + constexpr vec not_written{-1, -1, -1, -1}; + for (size_t i = 0; i < dataSize; ++i) { + const vec expected{20 * i * i, 20 * i * i, 20 * i * i, 20 * i * i}; + assert(all(out[i] == expected) && "Computation error"); + assert(all(tmp[i] == not_written) && "Not internalizing"); + }; + + return 0; +} diff --git a/SYCL/KernelFusion/internalize_vfunc.cpp b/SYCL/KernelFusion/internalize_vfunc.cpp new file mode 100644 index 0000000000..abc9c2419f --- /dev/null +++ b/SYCL/KernelFusion/internalize_vfunc.cpp @@ -0,0 +1,88 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// UNSUPPORTED: cuda || hip +// REQUIRES: fusion + +// Test complete fusion with private internalization specified on the +// accessors for a device kernel with sycl::vec::load and sycl::vec::store. 
+ +#define VEC 4 + +#include + +using namespace sycl; + +int main() { + constexpr size_t numVec = 512; + constexpr size_t dataSize = numVec * VEC; + int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize]; + + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = i * 2; + in2[i] = i * 3; + in3[i] = i * 4; + tmp[i] = -1; + out[i] = -1; + } + + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + { + buffer bIn1{in1, range{dataSize}}; + buffer bIn2{in2, range{dataSize}}; + buffer bIn3{in3, range{dataSize}}; + buffer bTmp{tmp, range{dataSize}}; + buffer bOut{out, range{dataSize}}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q.submit([&](handler &cgh) { + auto accIn1 = bIn1.get_access(cgh); + auto accIn2 = bIn2.get_access(cgh); + auto accTmp = bTmp.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_private{}); + cgh.parallel_for(numVec, [=](id<1> i) { + size_t offset = i; + vec in1; + in1.load(offset, accIn1.get_pointer()); + vec in2; + in2.load(offset, accIn2.get_pointer()); + auto tmp = in1 + in2; + tmp.store(offset, accTmp.get_pointer()); + }); + }); + + q.submit([&](handler &cgh) { + auto accTmp = bTmp.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_private{}); + auto accIn3 = bIn3.get_access(cgh); + auto accOut = bOut.get_access(cgh); + cgh.parallel_for(numVec, [=](id<1> i) { + size_t offset = i; + vec tmp; + tmp.load(offset, accTmp.get_pointer()); + vec in3; + in3.load(offset, accIn3.get_pointer()); + auto out = tmp * in3; + out.store(offset, accOut.get_pointer()); + }); + }); + + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + } + + // Check the results + for (size_t i = 0; i < dataSize; ++i) { + assert(out[i] == (20 * i * i) && "Computation error"); + assert(tmp[i] == -1 && "Not internalized"); + } + + return 0; +} diff --git a/SYCL/KernelFusion/local_internalization.cpp b/SYCL/KernelFusion/local_internalization.cpp new file mode 100644 index 0000000000..a9677b4a2e --- /dev/null +++ b/SYCL/KernelFusion/local_internalization.cpp @@ -0,0 +1,73 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// UNSUPPORTED: cuda || hip +// REQUIRES: fusion + +// Test complete fusion with local internalization specified on the +// accessors. 
+ +#include + +using namespace sycl; + +int main() { + constexpr size_t dataSize = 512; + int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize]; + + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = i * 2; + in2[i] = i * 3; + in3[i] = i * 4; + tmp[i] = -1; + out[i] = -1; + } + + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + { + buffer bIn1{in1, range{dataSize}}; + buffer bIn2{in2, range{dataSize}}; + buffer bIn3{in3, range{dataSize}}; + buffer bTmp{tmp, range{dataSize}}; + buffer bOut{out, range{dataSize}}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q.submit([&](handler &cgh) { + auto accIn1 = bIn1.get_access(cgh); + auto accIn2 = bIn2.get_access(cgh); + auto accTmp = bTmp.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_local{}); + cgh.parallel_for( + nd_range<1>{{dataSize}, {16}}, + [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; }); + }); + + q.submit([&](handler &cgh) { + auto accTmp = bTmp.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_local{}); + auto accIn3 = bIn3.get_access(cgh); + auto accOut = bOut.get_access(cgh); + cgh.parallel_for( + nd_range<1>{{dataSize}, {16}}, + [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; }); + }); + + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + } + + // Check the results + for (size_t i = 0; i < dataSize; ++i) { + assert(out[i] == (20 * i * i) && "Computation error"); + assert(tmp[i] == -1 && "Not internalized"); + } + + return 0; +} diff --git a/SYCL/KernelFusion/non_unit_local_size.cpp b/SYCL/KernelFusion/non_unit_local_size.cpp new file mode 100644 index 0000000000..917eda6e09 --- /dev/null +++ b/SYCL/KernelFusion/non_unit_local_size.cpp @@ -0,0 +1,82 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// UNSUPPORTED: cuda || hip +// REQUIRES: fusion + +// Test complete fusion with local internalization specified on the +// accessors, where each work-item processes multiple data-items. 
+ +#include + +using namespace sycl; + +int main() { + constexpr size_t dataSize = 512; + int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize]; + + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = i * 2; + in2[i] = i * 3; + in3[i] = i * 4; + tmp[i] = -1; + out[i] = -1; + } + + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + { + buffer bIn1{in1, range{dataSize}}; + buffer bIn2{in2, range{dataSize}}; + buffer bIn3{in3, range{dataSize}}; + buffer bTmp{tmp, range{dataSize}}; + buffer bOut{out, range{dataSize}}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q.submit([&](handler &cgh) { + auto accIn1 = bIn1.get_access(cgh); + auto accIn2 = bIn2.get_access(cgh); + auto accTmp = bTmp.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_local{}); + cgh.parallel_for( + nd_range<1>{{128}, {8}}, [=](item<1> i) { + auto baseOffset = i.get_linear_id() * 4; + for (size_t j = 0; j < 4; ++j) { + accTmp[baseOffset + j] = + accIn1[baseOffset + j] + accIn2[baseOffset + j]; + } + }); + }); + + q.submit([&](handler &cgh) { + auto accTmp = bTmp.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_local{}); + auto accIn3 = bIn3.get_access(cgh); + auto accOut = bOut.get_access(cgh); + cgh.parallel_for( + nd_range<1>{{128}, {8}}, [=](item<1> i) { + auto baseOffset = i.get_linear_id() * 4; + for (size_t j = 0; j < 4; ++j) { + accOut[baseOffset + j] = + accTmp[baseOffset + j] * accIn3[baseOffset + j]; + } + }); + }); + + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + } + + // Check the results + for (size_t i = 0; i < dataSize; ++i) { + assert(out[i] == (20 * i * i) && "Computation error"); + } + + return 0; +} diff --git a/SYCL/KernelFusion/pointer_arg_function.cpp b/SYCL/KernelFusion/pointer_arg_function.cpp new file mode 100644 index 0000000000..ffe5178cda --- /dev/null +++ b/SYCL/KernelFusion/pointer_arg_function.cpp @@ -0,0 +1,82 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// UNSUPPORTED: cuda || hip +// REQUIRES: fusion +// This test currently fails because InferAddressSpace is not able to remove all +// address-space casts, causing internalization to fail. +// XFAIL: * + +// Test complete fusion with private internalization specified on the +// accessors, calling a function with a raw pointer taken from an accessor in +// one of the kernels. 
+ +#include + +using namespace sycl; + +void __attribute__((noinline)) +addFunc(int *in1, int *in2, int *out, size_t linearID) { + out[linearID] = in1[linearID] + in2[linearID]; +} + +int main() { + constexpr size_t dataSize = 512; + int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize]; + + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = i * 2; + in2[i] = i * 3; + in3[i] = i * 4; + tmp[i] = -1; + out[i] = -1; + } + + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + { + buffer bIn1{in1, range{dataSize}}; + buffer bIn2{in2, range{dataSize}}; + buffer bIn3{in3, range{dataSize}}; + buffer bTmp{tmp, range{dataSize}}; + buffer bOut{out, range{dataSize}}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q.submit([&](handler &cgh) { + auto accIn1 = bIn1.get_access(cgh); + auto accIn2 = bIn2.get_access(cgh); + auto accTmp = bTmp.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_private{}); + cgh.parallel_for(dataSize, [=](item<1> i) { + addFunc(accIn1.get_pointer(), accIn2.get_pointer(), + accTmp.get_pointer(), i.get_linear_id()); + }); + }); + + q.submit([&](handler &cgh) { + auto accTmp = bTmp.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_private{}); + auto accIn3 = bIn3.get_access(cgh); + auto accOut = bOut.get_access(cgh); + cgh.parallel_for( + dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; }); + }); + + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + } + + // Check the results + for (size_t i = 0; i < dataSize; ++i) { + assert(out[i] == (20 * i * i) && "Computation error"); + assert(tmp[i] == -1 && "Not internalized"); + } + + return 0; +} diff --git a/SYCL/KernelFusion/private_internalization.cpp b/SYCL/KernelFusion/private_internalization.cpp new file mode 100644 index 0000000000..05120a68c3 --- /dev/null +++ b/SYCL/KernelFusion/private_internalization.cpp @@ -0,0 +1,71 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// UNSUPPORTED: cuda || hip +// REQUIRES: fusion + +// Test complete fusion with private internalization specified on the +// accessors. 
+ +#include + +using namespace sycl; + +int main() { + constexpr size_t dataSize = 512; + int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize]; + + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = i * 2; + in2[i] = i * 3; + in3[i] = i * 4; + tmp[i] = -1; + out[i] = -1; + } + + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + { + buffer bIn1{in1, range{dataSize}}; + buffer bIn2{in2, range{dataSize}}; + buffer bIn3{in3, range{dataSize}}; + buffer bTmp{tmp, range{dataSize}}; + buffer bOut{out, range{dataSize}}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q.submit([&](handler &cgh) { + auto accIn1 = bIn1.get_access(cgh); + auto accIn2 = bIn2.get_access(cgh); + auto accTmp = bTmp.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_private{}); + cgh.parallel_for( + dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; }); + }); + + q.submit([&](handler &cgh) { + auto accTmp = bTmp.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_private{}); + auto accIn3 = bIn3.get_access(cgh); + auto accOut = bOut.get_access(cgh); + cgh.parallel_for( + dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; }); + }); + + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + } + + // Check the results + for (size_t i = 0; i < dataSize; ++i) { + assert(out[i] == (20 * i * i) && "Computation error"); + assert(tmp[i] == -1 && "Not internalized"); + } + + return 0; +} diff --git a/SYCL/KernelFusion/ranged_offset_accessor.cpp b/SYCL/KernelFusion/ranged_offset_accessor.cpp new file mode 100644 index 0000000000..95f0b06c44 --- /dev/null +++ b/SYCL/KernelFusion/ranged_offset_accessor.cpp @@ -0,0 +1,80 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// UNSUPPORTED: cuda || hip +// REQUIRES: fusion + +// Test complete fusion with private internalization on accessors with different +// offset and range. 
+ +#include + +using namespace sycl; + +int main() { + constexpr size_t dataSize = 512; + int in1[dataSize * 5], in2[dataSize * 5], in3[dataSize * 5], + tmp[dataSize * 5], out[dataSize * 5]; + + size_t offsetIn1 = 0; + size_t offsetIn2 = 512; + size_t offsetIn3 = 1024; + size_t offsetTmp = 1536; + size_t offsetOut = 2048; + + for (size_t i = 0; i < dataSize; ++i) { + in1[offsetIn1 + i] = i * 2; + in2[offsetIn2 + i] = i * 3; + in3[offsetIn3 + i] = i * 4; + tmp[offsetTmp + i] = -1; + out[offsetOut + i] = -1; + } + + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + { + buffer bIn1{in1, range{dataSize * 5}}; + buffer bIn2{in2, range{dataSize * 5}}; + buffer bIn3{in3, range{dataSize * 5}}; + buffer bTmp{tmp, range{dataSize * 5}}; + buffer bOut{out, range{dataSize * 5}}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q.submit([&](handler &cgh) { + auto accIn1 = bIn1.get_access(cgh, range{516}, id{offsetIn1}); + auto accIn2 = bIn2.get_access(cgh, range{513}, id{offsetIn2}); + auto accTmp = bTmp.get_access( + cgh, range{514}, id{offsetTmp}, + sycl::ext::codeplay::experimental::property::promote_private{}); + cgh.parallel_for( + dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; }); + }); + + q.submit([&](handler &cgh) { + auto accTmp = bTmp.get_access( + cgh, range{514}, id{offsetTmp}, + sycl::ext::codeplay::experimental::property::promote_private{}); + auto accIn3 = bIn3.get_access(cgh, range{515}, id{offsetIn3}); + auto accOut = bOut.get_access(cgh, range{512}, id{offsetOut}); + cgh.parallel_for( + dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; }); + }); + + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + } + + // Check the results + for (size_t i = 0; i < dataSize; ++i) { + assert(out[offsetOut + i] == (20 * i * i) && "Computation error"); + assert(tmp[offsetTmp + i] == -1 && "Not internalized"); + } + + return 0; +} diff --git a/SYCL/KernelFusion/struct_with_array.cpp b/SYCL/KernelFusion/struct_with_array.cpp new file mode 100644 index 0000000000..dca54abfa2 --- /dev/null +++ b/SYCL/KernelFusion/struct_with_array.cpp @@ -0,0 +1,82 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// UNSUPPORTED: cuda || hip +// REQUIRES: fusion + +// Test complete fusion with private internalization on a kernel functor with an +// array member. 
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+struct KernelTwo {
+  accessor<int> buf;
+  accessor<int> out;
+  int coef[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+  KernelTwo(accessor<int> buf, accessor<int> out) : buf{buf}, out{out} {}
+
+  void operator()(nd_item<1> i) const {
+    out[i.get_global_linear_id()] =
+        buf[i.get_global_linear_id()] * coef[i.get_local_linear_id()];
+  }
+};
+
+int main() {
+  constexpr size_t dataSize = 512;
+  int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];
+
+  for (size_t i = 0; i < dataSize; ++i) {
+    in1[i] = i * 2;
+    in2[i] = i * 3;
+    in3[i] = i * 4;
+    tmp[i] = -1;
+    out[i] = -1;
+  }
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  {
+    buffer bIn1{in1, range{dataSize}};
+    buffer bIn2{in2, range{dataSize}};
+    buffer bIn3{in3, range{dataSize}};
+    buffer bTmp{tmp, range{dataSize}};
+    buffer bOut{out, range{dataSize}};
+
+    ext::codeplay::experimental::fusion_wrapper fw{q};
+    fw.start_fusion();
+
+    assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+    q.submit([&](handler &cgh) {
+      auto accIn1 = bIn1.get_access(cgh);
+      auto accIn2 = bIn2.get_access(cgh);
+      auto accTmp = bTmp.get_access(
+          cgh, sycl::ext::codeplay::experimental::property::promote_private{});
+      cgh.parallel_for(
+          nd_range<1>{{dataSize}, {8}},
+          [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; });
+    });
+
+    q.submit([&](handler &cgh) {
+      auto accTmp = bTmp.get_access(
+          cgh, sycl::ext::codeplay::experimental::property::promote_private{});
+      auto accOut = bOut.get_access(cgh);
+      cgh.parallel_for(nd_range<1>{{dataSize}, {8}}, KernelTwo{accTmp, accOut});
+    });
+
+    fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
+
+    assert(!fw.is_in_fusion_mode() &&
+           "Queue should not be in fusion mode anymore");
+  }
+
+  // Check the results
+  for (size_t i = 0; i < dataSize; ++i) {
+    assert(out[i] == (5 * i * (i % 8)) && "Computation error");
+    assert(tmp[i] == -1 && "Not internalized");
+  }
+
+  return 0;
+}
diff --git a/SYCL/KernelFusion/sync_two_queues_event_dep.cpp b/SYCL/KernelFusion/sync_two_queues_event_dep.cpp
index f1389bdf99..bb33fcdcb8 100644
--- a/SYCL/KernelFusion/sync_two_queues_event_dep.cpp
+++ b/SYCL/KernelFusion/sync_two_queues_event_dep.cpp
@@ -4,10 +4,8 @@
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %GPU_CHECK_PLACEHOLDER
 // UNSUPPORTED: cuda || hip
-
-// For this test, complete_fusion must be supported, which is currently not the
-// case on Windows.
-// REQUIRES: linux
+// For this test, complete_fusion must be supported.
+// REQUIRES: fusion
 
 // Test fusion cancellation on event dependency between two active fusions.
 
diff --git a/SYCL/KernelFusion/sync_two_queues_requirement.cpp b/SYCL/KernelFusion/sync_two_queues_requirement.cpp
index 716ebd36ea..990049a1f7 100644
--- a/SYCL/KernelFusion/sync_two_queues_requirement.cpp
+++ b/SYCL/KernelFusion/sync_two_queues_requirement.cpp
@@ -4,10 +4,8 @@
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %GPU_CHECK_PLACEHOLDER
 // UNSUPPORTED: cuda || hip
-
-// For this test, complete_fusion must be supported, which is currently not the
-// case on Windows.
-// REQUIRES: linux
+// For this test, complete_fusion must be supported.
+// REQUIRES: fusion
 
 // Test fusion cancellation for requirement between two active fusions.
diff --git a/SYCL/KernelFusion/three_dimensional.cpp b/SYCL/KernelFusion/three_dimensional.cpp
new file mode 100644
index 0000000000..e8006ca091
--- /dev/null
+++ b/SYCL/KernelFusion/three_dimensional.cpp
@@ -0,0 +1,75 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// UNSUPPORTED: cuda || hip
+// REQUIRES: fusion
+
+// Test complete fusion with private internalization specified on the
+// accessors for a three-dimensional range.
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+int main() {
+  constexpr size_t sizeX = 16;
+  constexpr size_t sizeY = 8;
+  constexpr size_t sizeZ = 4;
+  constexpr size_t dataSize = sizeX * sizeY * sizeZ;
+  int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];
+
+  for (size_t i = 0; i < dataSize; ++i) {
+    in1[i] = i * 2;
+    in2[i] = i * 3;
+    in3[i] = i * 4;
+    tmp[i] = -1;
+    out[i] = -1;
+  }
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  {
+    range<3> xyRange{sizeZ, sizeY, sizeX};
+    buffer bIn1{in1, xyRange};
+    buffer bIn2{in2, xyRange};
+    buffer bIn3{in3, xyRange};
+    buffer bTmp{tmp, xyRange};
+    buffer bOut{out, xyRange};
+
+    ext::codeplay::experimental::fusion_wrapper fw{q};
+    fw.start_fusion();
+
+    assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+    q.submit([&](handler &cgh) {
+      auto accIn1 = bIn1.get_access(cgh);
+      auto accIn2 = bIn2.get_access(cgh);
+      auto accTmp = bTmp.get_access(
+          cgh, sycl::ext::codeplay::experimental::property::promote_private{});
+      cgh.parallel_for(
+          xyRange, [=](id<3> i) { accTmp[i] = accIn1[i] + accIn2[i]; });
+    });
+
+    q.submit([&](handler &cgh) {
+      auto accTmp = bTmp.get_access(
+          cgh, sycl::ext::codeplay::experimental::property::promote_private{});
+      auto accIn3 = bIn3.get_access(cgh);
+      auto accOut = bOut.get_access(cgh);
+      cgh.parallel_for(
+          xyRange, [=](id<3> i) { accOut[i] = accTmp[i] * accIn3[i]; });
+    });
+
+    fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
+
+    assert(!fw.is_in_fusion_mode() &&
+           "Queue should not be in fusion mode anymore");
+  }
+
+  // Check the results
+  for (size_t i = 0; i < dataSize; ++i) {
+    assert(out[i] == (20 * i * i) && "Computation error");
+    assert(tmp[i] == -1 && "Not internalized");
+  }
+
+  return 0;
+}
diff --git a/SYCL/KernelFusion/two_dimensional.cpp b/SYCL/KernelFusion/two_dimensional.cpp
new file mode 100644
index 0000000000..62bca54ff1
--- /dev/null
+++ b/SYCL/KernelFusion/two_dimensional.cpp
@@ -0,0 +1,74 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// UNSUPPORTED: cuda || hip
+// REQUIRES: fusion
+
+// Test complete fusion with private internalization specified on the
+// accessors for a two-dimensional range.
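+// The buffers below are created over range<2>{sizeY, sizeX} and the kernels
+// use id<2>, but initialization and the final check still use the linearized
+// index, so the expected value remains out[i] == 20 * i * i.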
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+int main() {
+  constexpr size_t sizeX = 16;
+  constexpr size_t sizeY = 32;
+  constexpr size_t dataSize = sizeX * sizeY;
+  int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];
+
+  for (size_t i = 0; i < dataSize; ++i) {
+    in1[i] = i * 2;
+    in2[i] = i * 3;
+    in3[i] = i * 4;
+    tmp[i] = -1;
+    out[i] = -1;
+  }
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  {
+    range<2> xyRange{sizeY, sizeX};
+    buffer bIn1{in1, xyRange};
+    buffer bIn2{in2, xyRange};
+    buffer bIn3{in3, xyRange};
+    buffer bTmp{tmp, xyRange};
+    buffer bOut{out, xyRange};
+
+    ext::codeplay::experimental::fusion_wrapper fw{q};
+    fw.start_fusion();
+
+    assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+    q.submit([&](handler &cgh) {
+      auto accIn1 = bIn1.get_access(cgh);
+      auto accIn2 = bIn2.get_access(cgh);
+      auto accTmp = bTmp.get_access(
+          cgh, sycl::ext::codeplay::experimental::property::promote_private{});
+      cgh.parallel_for(
+          xyRange, [=](id<2> i) { accTmp[i] = accIn1[i] + accIn2[i]; });
+    });
+
+    q.submit([&](handler &cgh) {
+      auto accTmp = bTmp.get_access(
+          cgh, sycl::ext::codeplay::experimental::property::promote_private{});
+      auto accIn3 = bIn3.get_access(cgh);
+      auto accOut = bOut.get_access(cgh);
+      cgh.parallel_for(
+          xyRange, [=](id<2> i) { accOut[i] = accTmp[i] * accIn3[i]; });
+    });
+
+    fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
+
+    assert(!fw.is_in_fusion_mode() &&
+           "Queue should not be in fusion mode anymore");
+  }
+
+  // Check the results
+  for (size_t i = 0; i < dataSize; ++i) {
+    assert(out[i] == (20 * i * i) && "Computation error");
+    assert(tmp[i] == -1 && "Not internalized");
+  }
+
+  return 0;
+}
diff --git a/SYCL/KernelFusion/usm_no_dependencies.cpp b/SYCL/KernelFusion/usm_no_dependencies.cpp
new file mode 100644
index 0000000000..2f18f758ba
--- /dev/null
+++ b/SYCL/KernelFusion/usm_no_dependencies.cpp
@@ -0,0 +1,70 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// UNSUPPORTED: cuda || hip
+// REQUIRES: fusion
+
+// Test complete fusion using USM pointers.
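+// The two kernels below communicate through the shared USM allocation tmp but
+// declare no buffer requirements or event dependencies; the test exits early
+// if the device does not support usm_shared_allocations.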
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+int main() {
+  constexpr size_t dataSize = 512;
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  if (!q.get_device().has(sycl::aspect::usm_shared_allocations)) {
+    return 0;
+  }
+
+  int *in1 = sycl::malloc_shared<int>(dataSize, q);
+  int *in2 = sycl::malloc_shared<int>(dataSize, q);
+  int *in3 = sycl::malloc_shared<int>(dataSize, q);
+  int *tmp = sycl::malloc_shared<int>(dataSize, q);
+  int *out = sycl::malloc_shared<int>(dataSize, q);
+
+  for (size_t i = 0; i < dataSize; ++i) {
+    in1[i] = i * 2;
+    in2[i] = i * 3;
+    in3[i] = i * 4;
+    tmp[i] = -1;
+    out[i] = -1;
+  }
+
+  ext::codeplay::experimental::fusion_wrapper fw{q};
+  fw.start_fusion();
+
+  assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+  q.submit([&](handler &cgh) {
+    cgh.parallel_for(
+        dataSize, [=](id<1> i) { tmp[i] = in1[i] + in2[i]; });
+  });
+
+  q.submit([&](handler &cgh) {
+    cgh.parallel_for(
+        dataSize, [=](id<1> i) { out[i] = tmp[i] * in3[i]; });
+  });
+
+  fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
+
+  assert(!fw.is_in_fusion_mode() &&
+         "Queue should not be in fusion mode anymore");
+
+  q.wait();
+
+  // Check the results
+  for (size_t i = 0; i < dataSize; ++i) {
+    assert(out[i] == (20 * i * i) && "Computation error");
+  }
+
+  sycl::free(in1, q);
+  sycl::free(in2, q);
+  sycl::free(in3, q);
+  sycl::free(tmp, q);
+  sycl::free(out, q);
+
+  return 0;
+}
diff --git a/SYCL/KernelFusion/work_group_barrier.cpp b/SYCL/KernelFusion/work_group_barrier.cpp
new file mode 100644
index 0000000000..7141c37be8
--- /dev/null
+++ b/SYCL/KernelFusion/work_group_barrier.cpp
@@ -0,0 +1,79 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// UNSUPPORTED: cuda || hip
+// REQUIRES: fusion
+
+// Test complete fusion with a combination of kernels that require a work-group
+// barrier to be inserted by fusion.
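+// In the first kernel below, each work-item writes accTmp at a mirrored
+// position within its work-group, while the second kernel reads accTmp at the
+// plain global linear id. A fused kernel therefore needs a work-group barrier
+// between the two parts, which is why this test calls complete_fusion()
+// without the no_barriers property.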
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+int main() {
+  constexpr size_t dataSize = 512;
+  int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];
+
+  for (size_t i = 0; i < dataSize; ++i) {
+    in1[i] = i * 2;
+    in2[i] = i * 3;
+    in3[i] = i * 4;
+    tmp[i] = -1;
+    out[i] = -1;
+  }
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  {
+    buffer bIn1{in1, range{dataSize}};
+    buffer bIn2{in2, range{dataSize}};
+    buffer bIn3{in3, range{dataSize}};
+    buffer bTmp{tmp, range{dataSize}};
+    buffer bOut{out, range{dataSize}};
+
+    ext::codeplay::experimental::fusion_wrapper fw{q};
+    fw.start_fusion();
+
+    assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+    q.submit([&](handler &cgh) {
+      auto accIn1 = bIn1.get_access(cgh);
+      auto accIn2 = bIn2.get_access(cgh);
+      auto accTmp = bTmp.get_access(cgh);
+      cgh.parallel_for(
+          nd_range<1>{{dataSize}, {32}}, [=](nd_item<1> i) {
+            auto workgroupSize = i.get_local_range(0);
+            auto baseOffset = i.get_group_linear_id() * workgroupSize;
+            auto localIndex = i.get_local_linear_id();
+            auto localOffset = (workgroupSize - 1) - localIndex;
+            accTmp[baseOffset + localOffset] =
+                accIn1[baseOffset + localOffset] +
+                accIn2[baseOffset + localOffset];
+          });
+    });
+
+    q.submit([&](handler &cgh) {
+      auto accTmp = bTmp.get_access(cgh);
+      auto accIn3 = bIn3.get_access(cgh);
+      auto accOut = bOut.get_access(cgh);
+      cgh.parallel_for(
+          nd_range<1>{{dataSize}, {32}}, [=](nd_item<1> i) {
+            auto index = i.get_global_linear_id();
+            accOut[index] = accTmp[index] * accIn3[index];
+          });
+    });
+
+    fw.complete_fusion();
+
+    assert(!fw.is_in_fusion_mode() &&
+           "Queue should not be in fusion mode anymore");
+  }
+
+  // Check the results
+  for (size_t i = 0; i < dataSize; ++i) {
+    assert(out[i] == (20 * i * i) && "Computation error");
+  }
+
+  return 0;
+}
diff --git a/SYCL/KernelFusion/wrapped_usm.cpp b/SYCL/KernelFusion/wrapped_usm.cpp
new file mode 100644
index 0000000000..8532a9dadf
--- /dev/null
+++ b/SYCL/KernelFusion/wrapped_usm.cpp
@@ -0,0 +1,80 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// UNSUPPORTED: cuda || hip
+// REQUIRES: fusion
+
+// Test complete fusion using a wrapped USM pointer as a kernel functor argument.
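+// The wrapper struct below owns a raw shared USM pointer; the kernels capture
+// the wrapper objects by value and dereference their data members, so the USM
+// pointers reach the fused kernel embedded in a struct argument rather than as
+// plain pointer arguments.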
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+template <typename T> struct wrapper {
+  T *data;
+
+  wrapper(size_t dataSize, queue &q)
+      : data{sycl::malloc_shared<T>(dataSize, q)} {}
+
+  T &operator[](size_t i) { return data[i]; }
+  const T &operator[](size_t i) const { return data[i]; }
+};
+
+int main() {
+  constexpr size_t dataSize = 512;
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  if (!q.get_device().has(sycl::aspect::usm_shared_allocations)) {
+    return 0;
+  }
+
+  wrapper<int> in1{dataSize, q};
+  wrapper<int> in2{dataSize, q};
+  wrapper<int> in3{dataSize, q};
+  wrapper<int> tmp{dataSize, q};
+  wrapper<int> out{dataSize, q};
+
+  for (size_t i = 0; i < dataSize; ++i) {
+    in1[i] = i * 2;
+    in2[i] = i * 3;
+    in3[i] = i * 4;
+    tmp[i] = -1;
+    out[i] = -1;
+  }
+
+  ext::codeplay::experimental::fusion_wrapper fw{q};
+  fw.start_fusion();
+
+  assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+  q.submit([&](handler &cgh) {
+    cgh.parallel_for(
+        dataSize, [=](id<1> i) { tmp.data[i] = in1.data[i] + in2.data[i]; });
+  });
+
+  q.submit([&](handler &cgh) {
+    cgh.parallel_for(
+        dataSize, [=](id<1> i) { out.data[i] = tmp.data[i] * in3.data[i]; });
+  });
+
+  fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
+
+  assert(!fw.is_in_fusion_mode() &&
+         "Queue should not be in fusion mode anymore");
+
+  q.wait();
+
+  // Check the results
+  for (size_t i = 0; i < dataSize; ++i) {
+    assert(out[i] == (20 * i * i) && "Computation error");
+  }
+
+  sycl::free(in1.data, q);
+  sycl::free(in2.data, q);
+  sycl::free(in3.data, q);
+  sycl::free(tmp.data, q);
+  sycl::free(out.data, q);
+
+  return 0;
+}
diff --git a/SYCL/lit.cfg.py b/SYCL/lit.cfg.py
index 3a7e281285..144f4f0aff 100644
--- a/SYCL/lit.cfg.py
+++ b/SYCL/lit.cfg.py
@@ -427,6 +427,23 @@ else:
     lit_config.warning("Couldn't find pre-installed AOT device compiler " +
                        aot_tool)
 
+# Check if kernel fusion is available by compiling a small program that will
+# be ill-formed (compilation stops with a non-zero exit code) if the feature
+# test macro for kernel fusion is not defined.
+check_fusion_file = 'check_fusion.cpp'
+with open(check_fusion_file, 'w') as ff:
+    ff.write('#include <sycl/sycl.hpp>\n')
+    ff.write('#ifndef SYCL_EXT_CODEPLAY_KERNEL_FUSION\n')
+    ff.write('#error \"Feature test for fusion failed\"\n')
+    ff.write('#endif // SYCL_EXT_CODEPLAY_KERNEL_FUSION\n')
+    ff.write('int main() { return 0; }\n')
+
+status = subprocess.getstatusoutput(config.dpcpp_compiler + ' -fsycl ' +
+                                    check_fusion_file)
+if status[0] == 0:
+    lit_config.note('Kernel fusion extension enabled')
+    config.available_features.add('fusion')
+
 # Set timeout for a single test
 try:
     import psutil