diff --git a/SYCL/KernelFusion/cancel_fusion.cpp b/SYCL/KernelFusion/cancel_fusion.cpp index 77959fdc74..3005a78d5b 100644 --- a/SYCL/KernelFusion/cancel_fusion.cpp +++ b/SYCL/KernelFusion/cancel_fusion.cpp @@ -1,4 +1,6 @@ // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out // UNSUPPORTED: cuda || hip // Test cancel fusion diff --git a/SYCL/KernelFusion/event_wait_cancel.cpp b/SYCL/KernelFusion/event_wait_cancel.cpp new file mode 100644 index 0000000000..a7b04347c2 --- /dev/null +++ b/SYCL/KernelFusion/event_wait_cancel.cpp @@ -0,0 +1,77 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// UNSUPPORTED: cuda || hip + +// Test validity of events after cancel_fusion. + +#include "fusion_event_test_common.h" +#include + +using namespace sycl; + +int main() { + constexpr size_t dataSize = 512; + + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + int *in1 = sycl::malloc_shared(dataSize, q); + int *in2 = sycl::malloc_shared(dataSize, q); + int *in3 = sycl::malloc_shared(dataSize, q); + int *tmp = sycl::malloc_shared(dataSize, q); + int *out = sycl::malloc_shared(dataSize, q); + + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = i * 2; + in2[i] = i * 3; + in3[i] = i * 4; + tmp[i] = -1; + out[i] = -1; + } + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + auto kernel1 = q.submit([&](handler &cgh) { + cgh.parallel_for( + dataSize, [=](id<1> i) { tmp[i] = in1[i] + in2[i]; }); + }); + + auto kernel2 = q.submit([&](handler &cgh) { + cgh.depends_on(kernel1); + cgh.parallel_for( + dataSize, [=](id<1> i) { out[i] = tmp[i] * in3[i]; }); + }); + + fw.cancel_fusion(); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + + kernel1.wait(); + assert(isEventComplete(kernel1) && "Event should be complete"); + // The event returned by submit while in fusion mode depends on both + // individual kernels to be executed. + assert(kernel1.get_wait_list().size() == 2); + + kernel2.wait(); + assert(isEventComplete(kernel2) && "Event should be complete"); + // The event returned by submit while in fusion mode depends on both + // individual kernels to be executed. + assert(kernel2.get_wait_list().size() == 2); + + // Check the results + for (size_t i = 0; i < dataSize; ++i) { + assert(out[i] == (20 * i * i) && "Computation error"); + } + + sycl::free(in1, q); + sycl::free(in2, q); + sycl::free(in3, q); + sycl::free(tmp, q); + sycl::free(out, q); + + return 0; +} diff --git a/SYCL/KernelFusion/fusion_event_test_common.h b/SYCL/KernelFusion/fusion_event_test_common.h new file mode 100644 index 0000000000..041e8c0b2e --- /dev/null +++ b/SYCL/KernelFusion/fusion_event_test_common.h @@ -0,0 +1,8 @@ +#include + +using namespace sycl; + +static bool isEventComplete(sycl::event &ev) { + return ev.get_info() == + info::event_command_status::complete; +} diff --git a/SYCL/KernelFusion/sync_acc_mem_op.cpp b/SYCL/KernelFusion/sync_acc_mem_op.cpp new file mode 100644 index 0000000000..14643a3d81 --- /dev/null +++ b/SYCL/KernelFusion/sync_acc_mem_op.cpp @@ -0,0 +1,84 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\ +// RUN: %CPU_CHECK_PLACEHOLDER +// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\ +// RUN: %GPU_CHECK_PLACEHOLDER +// UNSUPPORTED: cuda || hip + +// Test fusion cancellation on an explicit memory operation on an accessor +// happening before complete_fusion. + +#include + +using namespace sycl; + +int main() { + constexpr size_t dataSize = 512; + int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize]; + int dst[dataSize]; + + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = i * 2; + in2[i] = i * 3; + in3[i] = i * 4; + tmp[i] = -1; + out[i] = -1; + dst[i] = -1; + } + + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + { + buffer bIn1{in1, range{dataSize}}; + buffer bIn2{in2, range{dataSize}}; + buffer bIn3{in3, range{dataSize}}; + buffer bTmp{tmp, range{dataSize}}; + buffer bOut{out, range{dataSize}}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q.submit([&](handler &cgh) { + auto accIn1 = bIn1.get_access(cgh); + auto accIn2 = bIn2.get_access(cgh); + auto accTmp = bTmp.get_access(cgh); + cgh.parallel_for( + dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; }); + }); + + q.submit([&](handler &cgh) { + auto accTmp = bTmp.get_access(cgh); + auto accIn3 = bIn3.get_access(cgh); + auto accOut = bOut.get_access(cgh); + cgh.parallel_for( + dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; }); + }); + + // This explicit copy operation has an overlapping requirement with one of + // the kernels and therefore requires synchronization. This should lead to + // cancellation of the fusion. + auto copyEvt = q.submit([&](handler &cgh) { + auto accTmp = bTmp.get_access(cgh); + cgh.copy(accTmp, dst); + }); + + copyEvt.wait(); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + } + + // Check the results + for (size_t i = 0; i < dataSize; ++i) { + assert(out[i] == (20 * i * i) && "Computation error"); + assert(dst[i] == (5 * i) && "Computation error"); + } + + return 0; +} + +// CHECK: WARNING: Aborting fusion because synchronization with one of the kernels in the fusion list was requested diff --git a/SYCL/KernelFusion/sync_buffer_destruction.cpp b/SYCL/KernelFusion/sync_buffer_destruction.cpp new file mode 100644 index 0000000000..627a8cdbfe --- /dev/null +++ b/SYCL/KernelFusion/sync_buffer_destruction.cpp @@ -0,0 +1,76 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\ +// RUN: %CPU_CHECK_PLACEHOLDER +// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\ +// RUN: %GPU_CHECK_PLACEHOLDER +// UNSUPPORTED: cuda || hip + +// Test fusion cancellation on buffer destruction happening before +// complete_fusion. + +#include + +using namespace sycl; + +int main() { + constexpr size_t dataSize = 512; + int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize]; + + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = i * 2; + in2[i] = i * 3; + in3[i] = i * 4; + tmp[i] = -1; + out[i] = -1; + } + + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + { + buffer bIn1{in1, range{dataSize}}; + buffer bIn2{in2, range{dataSize}}; + buffer bTmp{tmp, range{dataSize}}; + buffer bOut{out, range{dataSize}}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + { + buffer bIn3{in3, range{dataSize}}; + + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q.submit([&](handler &cgh) { + auto accIn1 = bIn1.get_access(cgh); + auto accIn2 = bIn2.get_access(cgh); + auto accTmp = bTmp.get_access(cgh); + cgh.parallel_for( + dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; }); + }); + + q.submit([&](handler &cgh) { + auto accTmp = bTmp.get_access(cgh); + auto accIn3 = bIn3.get_access(cgh); + auto accOut = bOut.get_access(cgh); + cgh.parallel_for( + dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; }); + }); + // Buffer bIn3, which is accessed by one of the kernels in the fusion list + // goes out scope, causing a blocking wait for one of the kernels in the + // fusion list. This should lead to cancellation of the fusion. + } + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + } + + // Check the results + for (size_t i = 0; i < dataSize; ++i) { + assert(out[i] == (20 * i * i) && "Computation error"); + } + + return 0; +} + +// CHECK: WARNING: Aborting fusion because synchronization with one of the kernels in the fusion list was requested diff --git a/SYCL/KernelFusion/sync_event_wait.cpp b/SYCL/KernelFusion/sync_event_wait.cpp new file mode 100644 index 0000000000..d34393638e --- /dev/null +++ b/SYCL/KernelFusion/sync_event_wait.cpp @@ -0,0 +1,75 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\ +// RUN: %CPU_CHECK_PLACEHOLDER +// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\ +// RUN: %GPU_CHECK_PLACEHOLDER +// UNSUPPORTED: cuda || hip + +// Test fusion cancellation on event::wait() happening before +// complete_fusion. + +#include + +using namespace sycl; + +int main() { + constexpr size_t dataSize = 512; + int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize]; + + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = i * 2; + in2[i] = i * 3; + in3[i] = i * 4; + tmp[i] = -1; + out[i] = -1; + } + + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + { + buffer bIn1{in1, range{dataSize}}; + buffer bIn2{in2, range{dataSize}}; + buffer bIn3{in3, range{dataSize}}; + buffer bTmp{tmp, range{dataSize}}; + buffer bOut{out, range{dataSize}}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + auto kernel1Ev = q.submit([&](handler &cgh) { + auto accIn1 = bIn1.get_access(cgh); + auto accIn2 = bIn2.get_access(cgh); + auto accTmp = bTmp.get_access(cgh); + cgh.parallel_for( + dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; }); + }); + + q.submit([&](handler &cgh) { + auto accTmp = bTmp.get_access(cgh); + auto accIn3 = bIn3.get_access(cgh); + auto accOut = bOut.get_access(cgh); + cgh.parallel_for( + dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; }); + }); + + // This event::wait() causes a blocking wait for one of the kernels in the + // fusion list. This should lead to cancellation of the fusion. + kernel1Ev.wait(); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + } + + // Check the results + for (size_t i = 0; i < dataSize; ++i) { + assert(out[i] == (20 * i * i) && "Computation error"); + } + + return 0; +} + +// CHECK: WARNING: Aborting fusion because synchronization with one of the kernels in the fusion list was requested diff --git a/SYCL/KernelFusion/sync_host_accessor.cpp b/SYCL/KernelFusion/sync_host_accessor.cpp new file mode 100644 index 0000000000..854803f347 --- /dev/null +++ b/SYCL/KernelFusion/sync_host_accessor.cpp @@ -0,0 +1,77 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\ +// RUN: %CPU_CHECK_PLACEHOLDER +// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\ +// RUN: %GPU_CHECK_PLACEHOLDER +// UNSUPPORTED: cuda || hip + +// Test fusion cancellation on host accessor creation happening before +// complete_fusion. + +#include + +using namespace sycl; + +int main() { + constexpr size_t dataSize = 512; + int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize]; + + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = i * 2; + in2[i] = i * 3; + in3[i] = i * 4; + tmp[i] = -1; + out[i] = -1; + } + + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + { + buffer bIn1{in1, range{dataSize}}; + buffer bIn2{in2, range{dataSize}}; + buffer bIn3{in3, range{dataSize}}; + buffer bTmp{tmp, range{dataSize}}; + buffer bOut{out, range{dataSize}}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q.submit([&](handler &cgh) { + auto accIn1 = bIn1.get_access(cgh); + auto accIn2 = bIn2.get_access(cgh); + auto accTmp = bTmp.get_access(cgh); + cgh.parallel_for( + dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; }); + }); + + q.submit([&](handler &cgh) { + auto accTmp = bTmp.get_access(cgh); + auto accIn3 = bIn3.get_access(cgh); + auto accOut = bOut.get_access(cgh); + cgh.parallel_for( + dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; }); + }); + + // This host accessor requests access to bIn3, which is accessed by one of + // the kernels in the fusion list. This causes a blocking wait for one of + // the kernels in the fusion list. This should lead to cancellation of the + // fusion. + auto hostAcc = bIn3.get_access(); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + } + + // Check the results + for (size_t i = 0; i < dataSize; ++i) { + assert(out[i] == (20 * i * i) && "Computation error"); + } + + return 0; +} + +// CHECK: WARNING: Aborting fusion because synchronization with one of the kernels in the fusion list was requested diff --git a/SYCL/KernelFusion/sync_host_task.cpp b/SYCL/KernelFusion/sync_host_task.cpp new file mode 100644 index 0000000000..fc94fa9b3d --- /dev/null +++ b/SYCL/KernelFusion/sync_host_task.cpp @@ -0,0 +1,84 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\ +// RUN: %CPU_CHECK_PLACEHOLDER +// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\ +// RUN: %GPU_CHECK_PLACEHOLDER +// UNSUPPORTED: cuda || hip + +// Test fusion cancellation on host task submission happening before +// complete_fusion. + +#include + +using namespace sycl; + +int main() { + constexpr size_t dataSize = 512; + int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize]; + + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = i * 2; + in2[i] = i * 3; + in3[i] = i * 4; + tmp[i] = -1; + out[i] = -1; + } + + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + { + buffer bIn1{in1, range{dataSize}}; + buffer bIn2{in2, range{dataSize}}; + buffer bIn3{in3, range{dataSize}}; + buffer bTmp{tmp, range{dataSize}}; + buffer bOut{out, range{dataSize}}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q.submit([&](handler &cgh) { + auto accIn1 = bIn1.get_access(cgh); + auto accIn2 = bIn2.get_access(cgh); + auto accTmp = bTmp.get_access(cgh); + cgh.parallel_for( + dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; }); + }); + + q.submit([&](handler &cgh) { + auto accTmp = bTmp.get_access(cgh); + auto accIn3 = bIn3.get_access(cgh); + auto accOut = bOut.get_access(cgh); + cgh.parallel_for( + dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; }); + }); + + // This host task requests access to bOut, which is accessed by one of + // the kernels in the fusion list, creating a requirement for one of the + // kernels in the fusion list. This should lead to cancellation of the + // fusion. + q.submit([&](handler &cgh) { + auto accOut = bOut.get_access(cgh); + cgh.host_task([=]() { accOut[256] = 42; }); + }); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + } + + // Check the results + for (size_t i = 0; i < dataSize; ++i) { + if (i == 256) { + assert(out[i] == 42 && "Computation error"); + } else { + assert(out[i] == (20 * i * i) && "Computation error"); + } + } + + return 0; +} + +// CHECK: WARNING: Aborting fusion because synchronization with one of the kernels in the fusion list was requested diff --git a/SYCL/KernelFusion/sync_queue_destruction.cpp b/SYCL/KernelFusion/sync_queue_destruction.cpp new file mode 100644 index 0000000000..145fde97b5 --- /dev/null +++ b/SYCL/KernelFusion/sync_queue_destruction.cpp @@ -0,0 +1,71 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\ +// RUN: %CPU_CHECK_PLACEHOLDER +// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\ +// RUN: %GPU_CHECK_PLACEHOLDER +// UNSUPPORTED: cuda || hip + +// Test fusion cancellation on queue destruction happening before +// complete_fusion. + +#include + +using namespace sycl; + +int main() { + constexpr size_t dataSize = 512; + int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize]; + + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = i * 2; + in2[i] = i * 3; + in3[i] = i * 4; + tmp[i] = -1; + out[i] = -1; + } + + { + buffer bIn1{in1, range{dataSize}}; + buffer bIn2{in2, range{dataSize}}; + buffer bTmp{tmp, range{dataSize}}; + buffer bOut{out, range{dataSize}}; + buffer bIn3{in3, range{dataSize}}; + + { + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q.submit([&](handler &cgh) { + auto accIn1 = bIn1.get_access(cgh); + auto accIn2 = bIn2.get_access(cgh); + auto accTmp = bTmp.get_access(cgh); + cgh.parallel_for( + dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; }); + }); + + q.submit([&](handler &cgh) { + auto accTmp = bTmp.get_access(cgh); + auto accIn3 = bIn3.get_access(cgh); + auto accOut = bOut.get_access(cgh); + cgh.parallel_for( + dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; }); + }); + // Queue q, which is still in fusion mode and to which all kernels have + // been submitted, goes out-of-scope here. This should lead to + // cancellation of the fusion. + } + } + + // Check the results + for (size_t i = 0; i < dataSize; ++i) { + assert(out[i] == (20 * i * i) && "Computation error"); + } + + return 0; +} + +// CHECK: WARNING: Aborting fusion because synchronization with one of the kernels in the fusion list was requested diff --git a/SYCL/KernelFusion/complete_fusion.cpp b/SYCL/KernelFusion/sync_queue_wait.cpp similarity index 70% rename from SYCL/KernelFusion/complete_fusion.cpp rename to SYCL/KernelFusion/sync_queue_wait.cpp index fda3932cd9..5fe768d60c 100644 --- a/SYCL/KernelFusion/complete_fusion.cpp +++ b/SYCL/KernelFusion/sync_queue_wait.cpp @@ -1,7 +1,12 @@ // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\ +// RUN: %CPU_CHECK_PLACEHOLDER +// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\ +// RUN: %GPU_CHECK_PLACEHOLDER // UNSUPPORTED: cuda || hip -// Test complete fusion without any internalization +// Test fusion cancellation on queue::wait() happening before +// complete_fusion. #include @@ -34,8 +39,8 @@ int main() { assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); q.submit([&](handler &cgh) { - auto accIn1 = bIn1.get_access(cgh); - auto accIn2 = bIn2.get_access(cgh); + auto accIn1 = bIn1.get_access(cgh); + auto accIn2 = bIn2.get_access(cgh); auto accTmp = bTmp.get_access(cgh); cgh.parallel_for( dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; }); @@ -49,10 +54,14 @@ int main() { dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; }); }); - fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + // This queue.wait() causes a blocking wait for all of the kernels in the + // fusion list. This should lead to cancellation of the fusion. + q.wait(); assert(!fw.is_in_fusion_mode() && "Queue should not be in fusion mode anymore"); + + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); } // Check the results @@ -62,3 +71,5 @@ int main() { return 0; } + +// CHECK: WARNING: Aborting fusion because synchronization with one of the kernels in the fusion list was requested diff --git a/SYCL/KernelFusion/sync_second_queue.cpp b/SYCL/KernelFusion/sync_second_queue.cpp new file mode 100644 index 0000000000..057c969353 --- /dev/null +++ b/SYCL/KernelFusion/sync_second_queue.cpp @@ -0,0 +1,84 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\ +// RUN: %CPU_CHECK_PLACEHOLDER +// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\ +// RUN: %GPU_CHECK_PLACEHOLDER +// UNSUPPORTED: cuda || hip + +// Test fusion cancellation on submission of kernel with requirements to a +// different queue happening before complete_fusion. + +#include + +using namespace sycl; + +int main() { + constexpr size_t dataSize = 512; + int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize]; + + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = i * 2; + in2[i] = i * 3; + in3[i] = i * 4; + tmp[i] = -1; + out[i] = -1; + } + + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + { + buffer bIn1{in1, range{dataSize}}; + buffer bIn2{in2, range{dataSize}}; + buffer bIn3{in3, range{dataSize}}; + buffer bTmp{tmp, range{dataSize}}; + buffer bOut{out, range{dataSize}}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q.submit([&](handler &cgh) { + auto accIn1 = bIn1.get_access(cgh); + auto accIn2 = bIn2.get_access(cgh); + auto accTmp = bTmp.get_access(cgh); + cgh.parallel_for( + dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; }); + }); + + q.submit([&](handler &cgh) { + auto accTmp = bTmp.get_access(cgh); + auto accIn3 = bIn3.get_access(cgh); + auto accOut = bOut.get_access(cgh); + cgh.parallel_for( + dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; }); + }); + + queue q2{}; + + // This kernel in a different queue creates a requirement for buffer bOut, + // which is also accessed by one of the kernels in the fusion list. This + // should lead to cancellation of the fusion to avoid circular dependencies. + q2.submit([&](handler &cgh) { + auto accOut = bOut.get_access(cgh); + cgh.parallel_for( + dataSize, [=](id<1> i) { accOut[i] = accOut[i] * 2; }); + }); + + q2.wait(); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + } + + // Check the results + for (size_t i = 0; i < dataSize; ++i) { + assert(out[i] == (40 * i * i) && "Computation error"); + } + + return 0; +} + +// CHECK: WARNING: Aborting fusion because synchronization with one of the kernels in the fusion list was requested diff --git a/SYCL/KernelFusion/sync_two_queues_event_dep.cpp b/SYCL/KernelFusion/sync_two_queues_event_dep.cpp new file mode 100644 index 0000000000..f1389bdf99 --- /dev/null +++ b/SYCL/KernelFusion/sync_two_queues_event_dep.cpp @@ -0,0 +1,91 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\ +// RUN: %CPU_CHECK_PLACEHOLDER +// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\ +// RUN: %GPU_CHECK_PLACEHOLDER +// UNSUPPORTED: cuda || hip + +// For this test, complete_fusion must be supported, which is currently not the +// case on Windows. +// REQUIRES: linux + +// Test fusion cancellation on event dependency between two active fusions. + +#include + +using namespace sycl; + +int main() { + constexpr size_t dataSize = 512; + + queue q1{ext::codeplay::experimental::property::queue::enable_fusion{}}; + queue q2{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + int *in1 = sycl::malloc_shared(dataSize, q1); + int *in2 = sycl::malloc_shared(dataSize, q1); + int *in3 = sycl::malloc_shared(dataSize, q1); + int *tmp = sycl::malloc_shared(dataSize, q1); + int *out = sycl::malloc_shared(dataSize, q1); + + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = i * 2; + in2[i] = i * 3; + in3[i] = i * 4; + tmp[i] = -1; + out[i] = -1; + } + + ext::codeplay::experimental::fusion_wrapper fw1{q1}; + fw1.start_fusion(); + + assert(fw1.is_in_fusion_mode() && "Queue should be in fusion mode"); + + auto kernel1 = q1.submit([&](handler &cgh) { + cgh.parallel_for( + dataSize, [=](id<1> i) { tmp[i] = in1[i] + in2[i]; }); + }); + + ext::codeplay::experimental::fusion_wrapper fw2{q2}; + fw2.start_fusion(); + + auto kernel3 = q2.submit([&](handler &cgh) { + cgh.depends_on(kernel1); + cgh.parallel_for(dataSize, + [=](id<1> i) { tmp[i] *= 2; }); + }); + + // kernel3 specifies an event dependency on kernel1. To avoid circular + // dependencies between two fusions, the fusion for q1 needs to cancelled. + assert(!fw1.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + + assert(fw2.is_in_fusion_mode() && "Queue should be in fusion mode"); + + auto kernel2 = q1.submit([&](handler &cgh) { + cgh.depends_on(kernel3); + cgh.parallel_for( + dataSize, [=](id<1> i) { out[i] = tmp[i] * in3[i]; }); + }); + + // kernel2 specifies an event dependency on kernel3, which leads to + // cancellation of the fusion for q2. + assert(!fw2.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + + fw1.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + + fw2.cancel_fusion(); + + q1.wait(); + q2.wait(); + + // Check the results + for (size_t i = 0; i < dataSize; ++i) { + assert(out[i] == (40 * i * i) && "Computation error"); + } + + return 0; +} + +// CHECK: WARNING: Aborting fusion because of event dependency from a different fusion +// CHECK-NEXT: WARNING: Aborting fusion because synchronization with one of the kernels in the fusion list was requested diff --git a/SYCL/KernelFusion/sync_two_queues_requirement.cpp b/SYCL/KernelFusion/sync_two_queues_requirement.cpp new file mode 100644 index 0000000000..716ebd36ea --- /dev/null +++ b/SYCL/KernelFusion/sync_two_queues_requirement.cpp @@ -0,0 +1,96 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\ +// RUN: %CPU_CHECK_PLACEHOLDER +// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\ +// RUN: %GPU_CHECK_PLACEHOLDER +// UNSUPPORTED: cuda || hip + +// For this test, complete_fusion must be supported, which is currently not the +// case on Windows. +// REQUIRES: linux + +// Test fusion cancellation for requirement between two active fusions. + +#include + +using namespace sycl; + +int main() { + constexpr size_t dataSize = 512; + int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize]; + + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = i * 2; + in2[i] = i * 3; + in3[i] = i * 4; + tmp[i] = -1; + out[i] = -1; + } + + queue q1{ext::codeplay::experimental::property::queue::enable_fusion{}}; + queue q2{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + { + buffer bIn1{in1, range{dataSize}}; + buffer bIn2{in2, range{dataSize}}; + buffer bTmp{tmp, range{dataSize}}; + buffer bOut{out, range{dataSize}}; + buffer bIn3{in3, range{dataSize}}; + + ext::codeplay::experimental::fusion_wrapper fw1{q1}; + fw1.start_fusion(); + + assert(fw1.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q1.submit([&](handler &cgh) { + auto accIn1 = bIn1.get_access(cgh); + auto accIn2 = bIn2.get_access(cgh); + auto accTmp = bTmp.get_access(cgh); + cgh.parallel_for( + dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; }); + }); + + ext::codeplay::experimental::fusion_wrapper fw2{q2}; + fw2.start_fusion(); + + q2.submit([&](handler &cgh) { + auto accTmp = bTmp.get_access(cgh); + cgh.parallel_for(dataSize, + [=](id<1> i) { accTmp[i] *= 2; }); + }); + + // KernelThree specifies a requirement on KernelOne. To avoid circular + // dependencies between two fusions, the fusion for q1 needs to cancelled. + assert(!fw1.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + + assert(fw2.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q1.submit([&](handler &cgh) { + auto accTmp = bTmp.get_access(cgh); + auto accIn3 = bIn3.get_access(cgh); + auto accOut = bOut.get_access(cgh); + cgh.parallel_for( + dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; }); + }); + + // KernelTwo specifies a requirement on KernelThree, which leads to + // cancellation of the fusion for q2. + assert(!fw2.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + + fw1.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + + fw2.cancel_fusion(); + } + + // Check the results + for (size_t i = 0; i < dataSize; ++i) { + assert(out[i] == (40 * i * i) && "Computation error"); + } + + return 0; +} + +// CHECK: WARNING: Aborting fusion because of requirement from a different fusion +// CHECK-NEXT: WARNING: Aborting fusion because synchronization with one of the kernels in the fusion list was requested diff --git a/SYCL/KernelFusion/sync_usm_mem_op.cpp b/SYCL/KernelFusion/sync_usm_mem_op.cpp new file mode 100644 index 0000000000..67af367316 --- /dev/null +++ b/SYCL/KernelFusion/sync_usm_mem_op.cpp @@ -0,0 +1,79 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\ +// RUN: %CPU_CHECK_PLACEHOLDER +// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\ +// RUN: %GPU_CHECK_PLACEHOLDER +// UNSUPPORTED: cuda || hip + +// Test fusion cancellation on an explicit memory operation on an USM pointer +// happening before complete_fusion. + +#include + +using namespace sycl; + +int main() { + constexpr size_t dataSize = 512; + + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + int *in1 = sycl::malloc_shared(dataSize, q); + int *in2 = sycl::malloc_shared(dataSize, q); + int *in3 = sycl::malloc_shared(dataSize, q); + int *tmp = sycl::malloc_shared(dataSize, q); + int *out = sycl::malloc_shared(dataSize, q); + int dst[dataSize]; + + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = i * 2; + in2[i] = i * 3; + in3[i] = i * 4; + tmp[i] = -1; + out[i] = -1; + dst[i] = -1; + } + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + auto kernel1 = q.submit([&](handler &cgh) { + cgh.parallel_for( + dataSize, [=](id<1> i) { tmp[i] = in1[i] + in2[i]; }); + }); + + auto kernel2 = q.submit([&](handler &cgh) { + cgh.depends_on(kernel1); + cgh.parallel_for( + dataSize, [=](id<1> i) { out[i] = tmp[i] * in3[i]; }); + }); + + // This explicit copy operation has an explicit dependency on one of the + // kernels and therefore requires synchronization. This should lead to + // cancellation of the fusion. + auto copyEvt = q.copy(tmp, dst, dataSize, kernel1); + + copyEvt.wait(); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + + // Check the results + for (size_t i = 0; i < dataSize; ++i) { + assert(out[i] == (20 * i * i) && "Computation error"); + assert(dst[i] == (5 * i) && "Computation error"); + } + + sycl::free(in1, q); + sycl::free(in2, q); + sycl::free(in3, q); + sycl::free(tmp, q); + sycl::free(out, q); + + return 0; +} + +// CHECK: WARNING: Aborting fusion because synchronization with one of the kernels in the fusion list was requested