diff --git a/SYCL/KernelFusion/cancel_fusion.cpp b/SYCL/KernelFusion/cancel_fusion.cpp
index 77959fdc74..3005a78d5b 100644
--- a/SYCL/KernelFusion/cancel_fusion.cpp
+++ b/SYCL/KernelFusion/cancel_fusion.cpp
@@ -1,4 +1,6 @@
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
 // UNSUPPORTED: cuda || hip
 
 // Test cancel fusion
diff --git a/SYCL/KernelFusion/event_wait_cancel.cpp b/SYCL/KernelFusion/event_wait_cancel.cpp
new file mode 100644
index 0000000000..a7b04347c2
--- /dev/null
+++ b/SYCL/KernelFusion/event_wait_cancel.cpp
@@ -0,0 +1,77 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// UNSUPPORTED: cuda || hip
+
+// Test validity of events after cancel_fusion.
+
+#include "fusion_event_test_common.h"
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+int main() {
+  constexpr size_t dataSize = 512;
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  int *in1 = sycl::malloc_shared<int>(dataSize, q);
+  int *in2 = sycl::malloc_shared<int>(dataSize, q);
+  int *in3 = sycl::malloc_shared<int>(dataSize, q);
+  int *tmp = sycl::malloc_shared<int>(dataSize, q);
+  int *out = sycl::malloc_shared<int>(dataSize, q);
+
+  for (size_t i = 0; i < dataSize; ++i) {
+    in1[i] = i * 2;
+    in2[i] = i * 3;
+    in3[i] = i * 4;
+    tmp[i] = -1;
+    out[i] = -1;
+  }
+
+  ext::codeplay::experimental::fusion_wrapper fw{q};
+  fw.start_fusion();
+
+  assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+  auto kernel1 = q.submit([&](handler &cgh) {
+    cgh.parallel_for<class KernelOne>(
+        dataSize, [=](id<1> i) { tmp[i] = in1[i] + in2[i]; });
+  });
+
+  auto kernel2 = q.submit([&](handler &cgh) {
+    cgh.depends_on(kernel1);
+    cgh.parallel_for<class KernelTwo>(
+        dataSize, [=](id<1> i) { out[i] = tmp[i] * in3[i]; });
+  });
+
+  fw.cancel_fusion();
+
+  assert(!fw.is_in_fusion_mode() &&
+         "Queue should not be in fusion mode anymore");
+
+  kernel1.wait();
+  assert(isEventComplete(kernel1) && "Event should be complete");
+  // The event returned by submit while in fusion mode depends on both
+  // individual kernels to be executed.
+  assert(kernel1.get_wait_list().size() == 2);
+
+  kernel2.wait();
+  assert(isEventComplete(kernel2) && "Event should be complete");
+  // The event returned by submit while in fusion mode depends on both
+  // individual kernels to be executed.
+  assert(kernel2.get_wait_list().size() == 2);
+
+  // Check the results
+  for (size_t i = 0; i < dataSize; ++i) {
+    assert(out[i] == (20 * i * i) && "Computation error");
+  }
+
+  sycl::free(in1, q);
+  sycl::free(in2, q);
+  sycl::free(in3, q);
+  sycl::free(tmp, q);
+  sycl::free(out, q);
+
+  return 0;
+}
diff --git a/SYCL/KernelFusion/fusion_event_test_common.h b/SYCL/KernelFusion/fusion_event_test_common.h
new file mode 100644
index 0000000000..041e8c0b2e
--- /dev/null
+++ b/SYCL/KernelFusion/fusion_event_test_common.h
@@ -0,0 +1,8 @@
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+static bool isEventComplete(sycl::event &ev) {
+  return ev.get_info<info::event::command_execution_status>() ==
+         info::event_command_status::complete;
+}
diff --git a/SYCL/KernelFusion/sync_acc_mem_op.cpp b/SYCL/KernelFusion/sync_acc_mem_op.cpp
new file mode 100644
index 0000000000..14643a3d81
--- /dev/null
+++ b/SYCL/KernelFusion/sync_acc_mem_op.cpp
@@ -0,0 +1,84 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
+// RUN: %CPU_CHECK_PLACEHOLDER
+// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
+// RUN: %GPU_CHECK_PLACEHOLDER
+// UNSUPPORTED: cuda || hip
+
+// Test fusion cancellation on an explicit memory operation on an accessor
+// happening before complete_fusion.
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+int main() {
+  constexpr size_t dataSize = 512;
+  int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];
+  int dst[dataSize];
+
+  for (size_t i = 0; i < dataSize; ++i) {
+    in1[i] = i * 2;
+    in2[i] = i * 3;
+    in3[i] = i * 4;
+    tmp[i] = -1;
+    out[i] = -1;
+    dst[i] = -1;
+  }
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  {
+    buffer<int> bIn1{in1, range{dataSize}};
+    buffer<int> bIn2{in2, range{dataSize}};
+    buffer<int> bIn3{in3, range{dataSize}};
+    buffer<int> bTmp{tmp, range{dataSize}};
+    buffer<int> bOut{out, range{dataSize}};
+
+    ext::codeplay::experimental::fusion_wrapper fw{q};
+    fw.start_fusion();
+
+    assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+    q.submit([&](handler &cgh) {
+      auto accIn1 = bIn1.get_access<access::mode::read>(cgh);
+      auto accIn2 = bIn2.get_access<access::mode::read>(cgh);
+      auto accTmp = bTmp.get_access(cgh);
+      cgh.parallel_for<class KernelOne>(
+          dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; });
+    });
+
+    q.submit([&](handler &cgh) {
+      auto accTmp = bTmp.get_access(cgh);
+      auto accIn3 = bIn3.get_access(cgh);
+      auto accOut = bOut.get_access(cgh);
+      cgh.parallel_for<class KernelTwo>(
+          dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; });
+    });
+
+    // This explicit copy operation has an overlapping requirement with one of
+    // the kernels and therefore requires synchronization. This should lead to
+    // cancellation of the fusion.
+    auto copyEvt = q.submit([&](handler &cgh) {
+      auto accTmp = bTmp.get_access(cgh);
+      cgh.copy(accTmp, dst);
+    });
+
+    copyEvt.wait();
+
+    assert(!fw.is_in_fusion_mode() &&
+           "Queue should not be in fusion mode anymore");
+
+    fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
+  }
+
+  // Check the results
+  for (size_t i = 0; i < dataSize; ++i) {
+    assert(out[i] == (20 * i * i) && "Computation error");
+    assert(dst[i] == (5 * i) && "Computation error");
+  }
+
+  return 0;
+}
+
+// CHECK: WARNING: Aborting fusion because synchronization with one of the kernels in the fusion list was requested
diff --git a/SYCL/KernelFusion/sync_buffer_destruction.cpp b/SYCL/KernelFusion/sync_buffer_destruction.cpp
new file mode 100644
index 0000000000..627a8cdbfe
--- /dev/null
+++ b/SYCL/KernelFusion/sync_buffer_destruction.cpp
@@ -0,0 +1,76 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
+// RUN: %CPU_CHECK_PLACEHOLDER
+// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
+// RUN: %GPU_CHECK_PLACEHOLDER
+// UNSUPPORTED: cuda || hip
+
+// Test fusion cancellation on buffer destruction happening before
+// complete_fusion.
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+int main() {
+  constexpr size_t dataSize = 512;
+  int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];
+
+  for (size_t i = 0; i < dataSize; ++i) {
+    in1[i] = i * 2;
+    in2[i] = i * 3;
+    in3[i] = i * 4;
+    tmp[i] = -1;
+    out[i] = -1;
+  }
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  {
+    buffer<int> bIn1{in1, range{dataSize}};
+    buffer<int> bIn2{in2, range{dataSize}};
+    buffer<int> bTmp{tmp, range{dataSize}};
+    buffer<int> bOut{out, range{dataSize}};
+
+    ext::codeplay::experimental::fusion_wrapper fw{q};
+    {
+      buffer<int> bIn3{in3, range{dataSize}};
+
+      fw.start_fusion();
+
+      assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+      q.submit([&](handler &cgh) {
+        auto accIn1 = bIn1.get_access<access::mode::read>(cgh);
+        auto accIn2 = bIn2.get_access<access::mode::read>(cgh);
+        auto accTmp = bTmp.get_access(cgh);
+        cgh.parallel_for<class KernelOne>(
+            dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; });
+      });
+
+      q.submit([&](handler &cgh) {
+        auto accTmp = bTmp.get_access(cgh);
+        auto accIn3 = bIn3.get_access(cgh);
+        auto accOut = bOut.get_access(cgh);
+        cgh.parallel_for<class KernelTwo>(
+            dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; });
+      });
+      // Buffer bIn3, which is accessed by one of the kernels in the fusion list
+      // goes out scope, causing a blocking wait for one of the kernels in the
+      // fusion list. This should lead to cancellation of the fusion.
+    }
+    assert(!fw.is_in_fusion_mode() &&
+           "Queue should not be in fusion mode anymore");
+
+    fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
+  }
+
+  // Check the results
+  for (size_t i = 0; i < dataSize; ++i) {
+    assert(out[i] == (20 * i * i) && "Computation error");
+  }
+
+  return 0;
+}
+
+// CHECK: WARNING: Aborting fusion because synchronization with one of the kernels in the fusion list was requested
diff --git a/SYCL/KernelFusion/sync_event_wait.cpp b/SYCL/KernelFusion/sync_event_wait.cpp
new file mode 100644
index 0000000000..d34393638e
--- /dev/null
+++ b/SYCL/KernelFusion/sync_event_wait.cpp
@@ -0,0 +1,75 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
+// RUN: %CPU_CHECK_PLACEHOLDER
+// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
+// RUN: %GPU_CHECK_PLACEHOLDER
+// UNSUPPORTED: cuda || hip
+
+// Test fusion cancellation on event::wait() happening before
+// complete_fusion.
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+int main() {
+  constexpr size_t dataSize = 512;
+  int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];
+
+  for (size_t i = 0; i < dataSize; ++i) {
+    in1[i] = i * 2;
+    in2[i] = i * 3;
+    in3[i] = i * 4;
+    tmp[i] = -1;
+    out[i] = -1;
+  }
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  {
+    buffer<int> bIn1{in1, range{dataSize}};
+    buffer<int> bIn2{in2, range{dataSize}};
+    buffer<int> bIn3{in3, range{dataSize}};
+    buffer<int> bTmp{tmp, range{dataSize}};
+    buffer<int> bOut{out, range{dataSize}};
+
+    ext::codeplay::experimental::fusion_wrapper fw{q};
+    fw.start_fusion();
+
+    assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+    auto kernel1Ev = q.submit([&](handler &cgh) {
+      auto accIn1 = bIn1.get_access<access::mode::read>(cgh);
+      auto accIn2 = bIn2.get_access<access::mode::read>(cgh);
+      auto accTmp = bTmp.get_access(cgh);
+      cgh.parallel_for<class KernelOne>(
+          dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; });
+    });
+
+    q.submit([&](handler &cgh) {
+      auto accTmp = bTmp.get_access(cgh);
+      auto accIn3 = bIn3.get_access(cgh);
+      auto accOut = bOut.get_access(cgh);
+      cgh.parallel_for<class KernelTwo>(
+          dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; });
+    });
+
+    // This event::wait() causes a blocking wait for one of the kernels in the
+    // fusion list. This should lead to cancellation of the fusion.
+    kernel1Ev.wait();
+
+    assert(!fw.is_in_fusion_mode() &&
+           "Queue should not be in fusion mode anymore");
+
+    fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
+  }
+
+  // Check the results
+  for (size_t i = 0; i < dataSize; ++i) {
+    assert(out[i] == (20 * i * i) && "Computation error");
+  }
+
+  return 0;
+}
+
+// CHECK: WARNING: Aborting fusion because synchronization with one of the kernels in the fusion list was requested
diff --git a/SYCL/KernelFusion/sync_host_accessor.cpp b/SYCL/KernelFusion/sync_host_accessor.cpp
new file mode 100644
index 0000000000..854803f347
--- /dev/null
+++ b/SYCL/KernelFusion/sync_host_accessor.cpp
@@ -0,0 +1,77 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
+// RUN: %CPU_CHECK_PLACEHOLDER
+// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
+// RUN: %GPU_CHECK_PLACEHOLDER
+// UNSUPPORTED: cuda || hip
+
+// Test fusion cancellation on host accessor creation happening before
+// complete_fusion.
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+int main() {
+  constexpr size_t dataSize = 512;
+  int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];
+
+  for (size_t i = 0; i < dataSize; ++i) {
+    in1[i] = i * 2;
+    in2[i] = i * 3;
+    in3[i] = i * 4;
+    tmp[i] = -1;
+    out[i] = -1;
+  }
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  {
+    buffer<int> bIn1{in1, range{dataSize}};
+    buffer<int> bIn2{in2, range{dataSize}};
+    buffer<int> bIn3{in3, range{dataSize}};
+    buffer<int> bTmp{tmp, range{dataSize}};
+    buffer<int> bOut{out, range{dataSize}};
+
+    ext::codeplay::experimental::fusion_wrapper fw{q};
+    fw.start_fusion();
+
+    assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+    q.submit([&](handler &cgh) {
+      auto accIn1 = bIn1.get_access<access::mode::read>(cgh);
+      auto accIn2 = bIn2.get_access<access::mode::read>(cgh);
+      auto accTmp = bTmp.get_access(cgh);
+      cgh.parallel_for<class KernelOne>(
+          dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; });
+    });
+
+    q.submit([&](handler &cgh) {
+      auto accTmp = bTmp.get_access(cgh);
+      auto accIn3 = bIn3.get_access(cgh);
+      auto accOut = bOut.get_access(cgh);
+      cgh.parallel_for<class KernelTwo>(
+          dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; });
+    });
+
+    // This host accessor requests access to bIn3, which is accessed by one of
+    // the kernels in the fusion list. This causes a blocking wait for one of
+    // the kernels in the fusion list. This should lead to cancellation of the
+    // fusion.
+    auto hostAcc = bIn3.get_access<access::mode::read>();
+
+    assert(!fw.is_in_fusion_mode() &&
+           "Queue should not be in fusion mode anymore");
+
+    fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
+  }
+
+  // Check the results
+  for (size_t i = 0; i < dataSize; ++i) {
+    assert(out[i] == (20 * i * i) && "Computation error");
+  }
+
+  return 0;
+}
+
+// CHECK: WARNING: Aborting fusion because synchronization with one of the kernels in the fusion list was requested
diff --git a/SYCL/KernelFusion/sync_host_task.cpp b/SYCL/KernelFusion/sync_host_task.cpp
new file mode 100644
index 0000000000..fc94fa9b3d
--- /dev/null
+++ b/SYCL/KernelFusion/sync_host_task.cpp
@@ -0,0 +1,84 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
+// RUN: %CPU_CHECK_PLACEHOLDER
+// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
+// RUN: %GPU_CHECK_PLACEHOLDER
+// UNSUPPORTED: cuda || hip
+
+// Test fusion cancellation on host task submission happening before
+// complete_fusion.
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+int main() {
+  constexpr size_t dataSize = 512;
+  int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];
+
+  for (size_t i = 0; i < dataSize; ++i) {
+    in1[i] = i * 2;
+    in2[i] = i * 3;
+    in3[i] = i * 4;
+    tmp[i] = -1;
+    out[i] = -1;
+  }
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  {
+    buffer<int> bIn1{in1, range{dataSize}};
+    buffer<int> bIn2{in2, range{dataSize}};
+    buffer<int> bIn3{in3, range{dataSize}};
+    buffer<int> bTmp{tmp, range{dataSize}};
+    buffer<int> bOut{out, range{dataSize}};
+
+    ext::codeplay::experimental::fusion_wrapper fw{q};
+    fw.start_fusion();
+
+    assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+    q.submit([&](handler &cgh) {
+      auto accIn1 = bIn1.get_access<access::mode::read>(cgh);
+      auto accIn2 = bIn2.get_access<access::mode::read>(cgh);
+      auto accTmp = bTmp.get_access(cgh);
+      cgh.parallel_for<class KernelOne>(
+          dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; });
+    });
+
+    q.submit([&](handler &cgh) {
+      auto accTmp = bTmp.get_access(cgh);
+      auto accIn3 = bIn3.get_access(cgh);
+      auto accOut = bOut.get_access(cgh);
+      cgh.parallel_for<class KernelTwo>(
+          dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; });
+    });
+
+    // This host task requests access to bOut, which is accessed by one of
+    // the kernels in the fusion list, creating a requirement for one of the
+    // kernels in the fusion list. This should lead to cancellation of the
+    // fusion.
+    q.submit([&](handler &cgh) {
+      auto accOut = bOut.get_access(cgh);
+      cgh.host_task([=]() { accOut[256] = 42; });
+    });
+
+    assert(!fw.is_in_fusion_mode() &&
+           "Queue should not be in fusion mode anymore");
+
+    fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
+  }
+
+  // Check the results
+  for (size_t i = 0; i < dataSize; ++i) {
+    if (i == 256) {
+      assert(out[i] == 42 && "Computation error");
+    } else {
+      assert(out[i] == (20 * i * i) && "Computation error");
+    }
+  }
+
+  return 0;
+}
+
+// CHECK: WARNING: Aborting fusion because synchronization with one of the kernels in the fusion list was requested
diff --git a/SYCL/KernelFusion/sync_queue_destruction.cpp b/SYCL/KernelFusion/sync_queue_destruction.cpp
new file mode 100644
index 0000000000..145fde97b5
--- /dev/null
+++ b/SYCL/KernelFusion/sync_queue_destruction.cpp
@@ -0,0 +1,71 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
+// RUN: %CPU_CHECK_PLACEHOLDER
+// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
+// RUN: %GPU_CHECK_PLACEHOLDER
+// UNSUPPORTED: cuda || hip
+
+// Test fusion cancellation on queue destruction happening before
+// complete_fusion.
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+int main() {
+  constexpr size_t dataSize = 512;
+  int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];
+
+  for (size_t i = 0; i < dataSize; ++i) {
+    in1[i] = i * 2;
+    in2[i] = i * 3;
+    in3[i] = i * 4;
+    tmp[i] = -1;
+    out[i] = -1;
+  }
+
+  {
+    buffer<int> bIn1{in1, range{dataSize}};
+    buffer<int> bIn2{in2, range{dataSize}};
+    buffer<int> bTmp{tmp, range{dataSize}};
+    buffer<int> bOut{out, range{dataSize}};
+    buffer<int> bIn3{in3, range{dataSize}};
+
+    {
+      queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+      ext::codeplay::experimental::fusion_wrapper fw{q};
+      fw.start_fusion();
+
+      assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+      q.submit([&](handler &cgh) {
+        auto accIn1 = bIn1.get_access<access::mode::read>(cgh);
+        auto accIn2 = bIn2.get_access<access::mode::read>(cgh);
+        auto accTmp = bTmp.get_access(cgh);
+        cgh.parallel_for<class KernelOne>(
+            dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; });
+      });
+
+      q.submit([&](handler &cgh) {
+        auto accTmp = bTmp.get_access(cgh);
+        auto accIn3 = bIn3.get_access(cgh);
+        auto accOut = bOut.get_access(cgh);
+        cgh.parallel_for<class KernelTwo>(
+            dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; });
+      });
+      // Queue q, which is still in fusion mode and to which all kernels have
+      // been submitted, goes out-of-scope here. This should lead to
+      // cancellation of the fusion.
+    }
+  }
+
+  // Check the results
+  for (size_t i = 0; i < dataSize; ++i) {
+    assert(out[i] == (20 * i * i) && "Computation error");
+  }
+
+  return 0;
+}
+
+// CHECK: WARNING: Aborting fusion because synchronization with one of the kernels in the fusion list was requested
diff --git a/SYCL/KernelFusion/complete_fusion.cpp b/SYCL/KernelFusion/sync_queue_wait.cpp
similarity index 70%
rename from SYCL/KernelFusion/complete_fusion.cpp
rename to SYCL/KernelFusion/sync_queue_wait.cpp
index fda3932cd9..5fe768d60c 100644
--- a/SYCL/KernelFusion/complete_fusion.cpp
+++ b/SYCL/KernelFusion/sync_queue_wait.cpp
@@ -1,7 +1,12 @@
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
+// RUN: %CPU_CHECK_PLACEHOLDER
+// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
+// RUN: %GPU_CHECK_PLACEHOLDER
 // UNSUPPORTED: cuda || hip
 
-// Test complete fusion without any internalization
+// Test fusion cancellation on queue::wait() happening before
+// complete_fusion.
 
 #include <sycl/sycl.hpp>
 
@@ -34,8 +39,8 @@ int main() {
     assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
 
     q.submit([&](handler &cgh) {
-      auto accIn1 = bIn1.get_access(cgh);
-      auto accIn2 = bIn2.get_access(cgh);
+      auto accIn1 = bIn1.get_access<access::mode::read>(cgh);
+      auto accIn2 = bIn2.get_access<access::mode::read>(cgh);
       auto accTmp = bTmp.get_access(cgh);
       cgh.parallel_for<class KernelOne>(
           dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; });
@@ -49,10 +54,14 @@ int main() {
           dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; });
     });
 
-    fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
+    // This queue.wait() causes a blocking wait for all of the kernels in the
+    // fusion list. This should lead to cancellation of the fusion.
+    q.wait();
 
     assert(!fw.is_in_fusion_mode() &&
            "Queue should not be in fusion mode anymore");
+
+    fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
   }
 
   // Check the results
@@ -62,3 +71,5 @@ int main() {
 
   return 0;
 }
+
+// CHECK: WARNING: Aborting fusion because synchronization with one of the kernels in the fusion list was requested
diff --git a/SYCL/KernelFusion/sync_second_queue.cpp b/SYCL/KernelFusion/sync_second_queue.cpp
new file mode 100644
index 0000000000..057c969353
--- /dev/null
+++ b/SYCL/KernelFusion/sync_second_queue.cpp
@@ -0,0 +1,84 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
+// RUN: %CPU_CHECK_PLACEHOLDER
+// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
+// RUN: %GPU_CHECK_PLACEHOLDER
+// UNSUPPORTED: cuda || hip
+
+// Test fusion cancellation on submission of kernel with requirements to a
+// different queue happening before complete_fusion.
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+int main() {
+  constexpr size_t dataSize = 512;
+  int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];
+
+  for (size_t i = 0; i < dataSize; ++i) {
+    in1[i] = i * 2;
+    in2[i] = i * 3;
+    in3[i] = i * 4;
+    tmp[i] = -1;
+    out[i] = -1;
+  }
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  {
+    buffer<int> bIn1{in1, range{dataSize}};
+    buffer<int> bIn2{in2, range{dataSize}};
+    buffer<int> bIn3{in3, range{dataSize}};
+    buffer<int> bTmp{tmp, range{dataSize}};
+    buffer<int> bOut{out, range{dataSize}};
+
+    ext::codeplay::experimental::fusion_wrapper fw{q};
+    fw.start_fusion();
+
+    assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+    q.submit([&](handler &cgh) {
+      auto accIn1 = bIn1.get_access<access::mode::read>(cgh);
+      auto accIn2 = bIn2.get_access<access::mode::read>(cgh);
+      auto accTmp = bTmp.get_access(cgh);
+      cgh.parallel_for<class KernelOne>(
+          dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; });
+    });
+
+    q.submit([&](handler &cgh) {
+      auto accTmp = bTmp.get_access(cgh);
+      auto accIn3 = bIn3.get_access(cgh);
+      auto accOut = bOut.get_access(cgh);
+      cgh.parallel_for<class KernelTwo>(
+          dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; });
+    });
+
+    queue q2{};
+
+    // This kernel in a different queue creates a requirement for buffer bOut,
+    // which is also accessed by one of the kernels in the fusion list. This
+    // should lead to cancellation of the fusion to avoid circular dependencies.
+    q2.submit([&](handler &cgh) {
+      auto accOut = bOut.get_access(cgh);
+      cgh.parallel_for<class OtherKernel>(
+          dataSize, [=](id<1> i) { accOut[i] = accOut[i] * 2; });
+    });
+
+    q2.wait();
+
+    assert(!fw.is_in_fusion_mode() &&
+           "Queue should not be in fusion mode anymore");
+
+    fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
+  }
+
+  // Check the results
+  for (size_t i = 0; i < dataSize; ++i) {
+    assert(out[i] == (40 * i * i) && "Computation error");
+  }
+
+  return 0;
+}
+
+// CHECK: WARNING: Aborting fusion because synchronization with one of the kernels in the fusion list was requested
diff --git a/SYCL/KernelFusion/sync_two_queues_event_dep.cpp b/SYCL/KernelFusion/sync_two_queues_event_dep.cpp
new file mode 100644
index 0000000000..f1389bdf99
--- /dev/null
+++ b/SYCL/KernelFusion/sync_two_queues_event_dep.cpp
@@ -0,0 +1,91 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
+// RUN: %CPU_CHECK_PLACEHOLDER
+// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
+// RUN: %GPU_CHECK_PLACEHOLDER
+// UNSUPPORTED: cuda || hip
+
+// For this test, complete_fusion must be supported, which is currently not the
+// case on Windows.
+// REQUIRES: linux
+
+// Test fusion cancellation on event dependency between two active fusions.
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+int main() {
+  constexpr size_t dataSize = 512;
+
+  queue q1{ext::codeplay::experimental::property::queue::enable_fusion{}};
+  queue q2{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  int *in1 = sycl::malloc_shared<int>(dataSize, q1);
+  int *in2 = sycl::malloc_shared<int>(dataSize, q1);
+  int *in3 = sycl::malloc_shared<int>(dataSize, q1);
+  int *tmp = sycl::malloc_shared<int>(dataSize, q1);
+  int *out = sycl::malloc_shared<int>(dataSize, q1);
+
+  for (size_t i = 0; i < dataSize; ++i) {
+    in1[i] = i * 2;
+    in2[i] = i * 3;
+    in3[i] = i * 4;
+    tmp[i] = -1;
+    out[i] = -1;
+  }
+
+  ext::codeplay::experimental::fusion_wrapper fw1{q1};
+  fw1.start_fusion();
+
+  assert(fw1.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+  auto kernel1 = q1.submit([&](handler &cgh) {
+    cgh.parallel_for<class KernelOne>(
+        dataSize, [=](id<1> i) { tmp[i] = in1[i] + in2[i]; });
+  });
+
+  ext::codeplay::experimental::fusion_wrapper fw2{q2};
+  fw2.start_fusion();
+
+  auto kernel3 = q2.submit([&](handler &cgh) {
+    cgh.depends_on(kernel1);
+    cgh.parallel_for<class KernelThree>(dataSize,
+                                        [=](id<1> i) { tmp[i] *= 2; });
+  });
+
+  // kernel3 specifies an event dependency on kernel1. To avoid circular
+  // dependencies between two fusions, the fusion for q1 needs to cancelled.
+  assert(!fw1.is_in_fusion_mode() &&
+         "Queue should not be in fusion mode anymore");
+
+  assert(fw2.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+  auto kernel2 = q1.submit([&](handler &cgh) {
+    cgh.depends_on(kernel3);
+    cgh.parallel_for<class KernelTwo>(
+        dataSize, [=](id<1> i) { out[i] = tmp[i] * in3[i]; });
+  });
+
+  // kernel2 specifies an event dependency on kernel3, which leads to
+  // cancellation of the fusion for q2.
+  assert(!fw2.is_in_fusion_mode() &&
+         "Queue should not be in fusion mode anymore");
+
+  fw1.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
+
+  fw2.cancel_fusion();
+
+  q1.wait();
+  q2.wait();
+
+  // Check the results
+  for (size_t i = 0; i < dataSize; ++i) {
+    assert(out[i] == (40 * i * i) && "Computation error");
+  }
+
+  return 0;
+}
+
+// CHECK: WARNING: Aborting fusion because of event dependency from a different fusion
+// CHECK-NEXT: WARNING: Aborting fusion because synchronization with one of the kernels in the fusion list was requested
diff --git a/SYCL/KernelFusion/sync_two_queues_requirement.cpp b/SYCL/KernelFusion/sync_two_queues_requirement.cpp
new file mode 100644
index 0000000000..716ebd36ea
--- /dev/null
+++ b/SYCL/KernelFusion/sync_two_queues_requirement.cpp
@@ -0,0 +1,96 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
+// RUN: %CPU_CHECK_PLACEHOLDER
+// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
+// RUN: %GPU_CHECK_PLACEHOLDER
+// UNSUPPORTED: cuda || hip
+
+// For this test, complete_fusion must be supported, which is currently not the
+// case on Windows.
+// REQUIRES: linux
+
+// Test fusion cancellation for requirement between two active fusions.
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+int main() {
+  constexpr size_t dataSize = 512;
+  int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];
+
+  for (size_t i = 0; i < dataSize; ++i) {
+    in1[i] = i * 2;
+    in2[i] = i * 3;
+    in3[i] = i * 4;
+    tmp[i] = -1;
+    out[i] = -1;
+  }
+
+  queue q1{ext::codeplay::experimental::property::queue::enable_fusion{}};
+  queue q2{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  {
+    buffer<int> bIn1{in1, range{dataSize}};
+    buffer<int> bIn2{in2, range{dataSize}};
+    buffer<int> bTmp{tmp, range{dataSize}};
+    buffer<int> bOut{out, range{dataSize}};
+    buffer<int> bIn3{in3, range{dataSize}};
+
+    ext::codeplay::experimental::fusion_wrapper fw1{q1};
+    fw1.start_fusion();
+
+    assert(fw1.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+    q1.submit([&](handler &cgh) {
+      auto accIn1 = bIn1.get_access<access::mode::read>(cgh);
+      auto accIn2 = bIn2.get_access<access::mode::read>(cgh);
+      auto accTmp = bTmp.get_access(cgh);
+      cgh.parallel_for<class KernelOne>(
+          dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; });
+    });
+
+    ext::codeplay::experimental::fusion_wrapper fw2{q2};
+    fw2.start_fusion();
+
+    q2.submit([&](handler &cgh) {
+      auto accTmp = bTmp.get_access(cgh);
+      cgh.parallel_for<class KernelThree>(dataSize,
+                                          [=](id<1> i) { accTmp[i] *= 2; });
+    });
+
+    // KernelThree specifies a requirement on KernelOne. To avoid circular
+    // dependencies between two fusions, the fusion for q1 needs to cancelled.
+    assert(!fw1.is_in_fusion_mode() &&
+           "Queue should not be in fusion mode anymore");
+
+    assert(fw2.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+    q1.submit([&](handler &cgh) {
+      auto accTmp = bTmp.get_access(cgh);
+      auto accIn3 = bIn3.get_access(cgh);
+      auto accOut = bOut.get_access(cgh);
+      cgh.parallel_for<class KernelTwo>(
+          dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; });
+    });
+
+    // KernelTwo specifies a requirement on KernelThree, which leads to
+    // cancellation of the fusion for q2.
+    assert(!fw2.is_in_fusion_mode() &&
+           "Queue should not be in fusion mode anymore");
+
+    fw1.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
+
+    fw2.cancel_fusion();
+  }
+
+  // Check the results
+  for (size_t i = 0; i < dataSize; ++i) {
+    assert(out[i] == (40 * i * i) && "Computation error");
+  }
+
+  return 0;
+}
+
+// CHECK: WARNING: Aborting fusion because of requirement from a different fusion
+// CHECK-NEXT: WARNING: Aborting fusion because synchronization with one of the kernels in the fusion list was requested
diff --git a/SYCL/KernelFusion/sync_usm_mem_op.cpp b/SYCL/KernelFusion/sync_usm_mem_op.cpp
new file mode 100644
index 0000000000..67af367316
--- /dev/null
+++ b/SYCL/KernelFusion/sync_usm_mem_op.cpp
@@ -0,0 +1,79 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
+// RUN: %CPU_CHECK_PLACEHOLDER
+// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
+// RUN: %GPU_CHECK_PLACEHOLDER
+// UNSUPPORTED: cuda || hip
+
+// Test fusion cancellation on an explicit memory operation on an USM pointer
+// happening before complete_fusion.
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+int main() {
+  constexpr size_t dataSize = 512;
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  int *in1 = sycl::malloc_shared<int>(dataSize, q);
+  int *in2 = sycl::malloc_shared<int>(dataSize, q);
+  int *in3 = sycl::malloc_shared<int>(dataSize, q);
+  int *tmp = sycl::malloc_shared<int>(dataSize, q);
+  int *out = sycl::malloc_shared<int>(dataSize, q);
+  int dst[dataSize];
+
+  for (size_t i = 0; i < dataSize; ++i) {
+    in1[i] = i * 2;
+    in2[i] = i * 3;
+    in3[i] = i * 4;
+    tmp[i] = -1;
+    out[i] = -1;
+    dst[i] = -1;
+  }
+
+  ext::codeplay::experimental::fusion_wrapper fw{q};
+  fw.start_fusion();
+
+  assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+  auto kernel1 = q.submit([&](handler &cgh) {
+    cgh.parallel_for<class KernelOne>(
+        dataSize, [=](id<1> i) { tmp[i] = in1[i] + in2[i]; });
+  });
+
+  auto kernel2 = q.submit([&](handler &cgh) {
+    cgh.depends_on(kernel1);
+    cgh.parallel_for<class KernelTwo>(
+        dataSize, [=](id<1> i) { out[i] = tmp[i] * in3[i]; });
+  });
+
+  // This explicit copy operation has an explicit dependency on one of the
+  // kernels and therefore requires synchronization. This should lead to
+  // cancellation of the fusion.
+  auto copyEvt = q.copy(tmp, dst, dataSize, kernel1);
+
+  copyEvt.wait();
+
+  assert(!fw.is_in_fusion_mode() &&
+         "Queue should not be in fusion mode anymore");
+
+  fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
+
+  // Check the results
+  for (size_t i = 0; i < dataSize; ++i) {
+    assert(out[i] == (20 * i * i) && "Computation error");
+    assert(dst[i] == (5 * i) && "Computation error");
+  }
+
+  sycl::free(in1, q);
+  sycl::free(in2, q);
+  sycl::free(in3, q);
+  sycl::free(tmp, q);
+  sycl::free(out, q);
+
+  return 0;
+}
+
+// CHECK: WARNING: Aborting fusion because synchronization with one of the kernels in the fusion list was requested