diff --git a/SYCL/KernelFusion/cancel_fusion.cpp b/SYCL/KernelFusion/cancel_fusion.cpp
new file mode 100644
index 0000000000..77959fdc74
--- /dev/null
+++ b/SYCL/KernelFusion/cancel_fusion.cpp
@@ -0,0 +1,64 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// UNSUPPORTED: cuda || hip
+
+// Test cancel fusion: two dependent kernels are submitted in fusion mode, the
+// fusion is cancelled, and `out` must still hold the correct results.
+
+#include <cassert>
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+int main() {
+  constexpr size_t dataSize = 512;
+  int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];
+
+  for (size_t i = 0; i < dataSize; ++i) {
+    in1[i] = i * 2;
+    in2[i] = i * 3;
+    in3[i] = i * 4;
+    tmp[i] = -1;
+    out[i] = -1;
+  }
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  {
+    buffer bIn1{in1, range{dataSize}};
+    buffer bIn2{in2, range{dataSize}};
+    buffer bIn3{in3, range{dataSize}};
+    buffer bTmp{tmp, range{dataSize}};
+    buffer bOut{out, range{dataSize}};
+
+    ext::codeplay::experimental::fusion_wrapper fw{q};
+    fw.start_fusion();
+    assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+    // Kernel one: tmp = in1 + in2.
+    q.submit([&](handler &cgh) {
+      auto accIn1 = bIn1.get_access(cgh);
+      auto accIn2 = bIn2.get_access(cgh);
+      auto accTmp = bTmp.get_access(cgh);
+      cgh.parallel_for<class KernelOne>(
+          dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; });
+    });
+
+    // Kernel two: out = tmp * in3; consumes the output of kernel one.
+    q.submit([&](handler &cgh) {
+      auto accTmp = bTmp.get_access(cgh);
+      auto accIn3 = bIn3.get_access(cgh);
+      auto accOut = bOut.get_access(cgh);
+      cgh.parallel_for<class KernelTwo>(
+          dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; });
+    });
+
+    fw.cancel_fusion();
+    assert(!fw.is_in_fusion_mode() &&
+           "Queue should not be in fusion mode anymore");
+  } // Buffer destruction here writes results back to the host arrays.
+
+  // (2i + 3i) * 4i == 20 * i * i; the cast keeps the comparison signed.
+  for (size_t i = 0; i < dataSize; ++i)
+    assert(out[i] == static_cast<int>(20 * i * i) && "Computation error");
+  return 0;
+}
diff --git a/SYCL/KernelFusion/complete_fusion.cpp b/SYCL/KernelFusion/complete_fusion.cpp
new file mode 100644
index 0000000000..fda3932cd9
--- /dev/null
+++ b/SYCL/KernelFusion/complete_fusion.cpp
@@ -0,0 +1,64 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// UNSUPPORTED: cuda || hip
+
+// Test complete fusion without any internalization: two dependent kernels are
+// submitted, complete_fusion is called with no_barriers, and `out` is checked.
+
+#include <cassert>
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+int main() {
+  constexpr size_t dataSize = 512;
+  int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];
+
+  for (size_t i = 0; i < dataSize; ++i) {
+    in1[i] = i * 2;
+    in2[i] = i * 3;
+    in3[i] = i * 4;
+    tmp[i] = -1;
+    out[i] = -1;
+  }
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  {
+    buffer bIn1{in1, range{dataSize}};
+    buffer bIn2{in2, range{dataSize}};
+    buffer bIn3{in3, range{dataSize}};
+    buffer bTmp{tmp, range{dataSize}};
+    buffer bOut{out, range{dataSize}};
+
+    ext::codeplay::experimental::fusion_wrapper fw{q};
+    fw.start_fusion();
+    assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+    // Kernel one: tmp = in1 + in2.
+    q.submit([&](handler &cgh) {
+      auto accIn1 = bIn1.get_access(cgh);
+      auto accIn2 = bIn2.get_access(cgh);
+      auto accTmp = bTmp.get_access(cgh);
+      cgh.parallel_for<class KernelOne>(
+          dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; });
+    });
+
+    // Kernel two: out = tmp * in3; consumes the output of kernel one.
+    q.submit([&](handler &cgh) {
+      auto accTmp = bTmp.get_access(cgh);
+      auto accIn3 = bIn3.get_access(cgh);
+      auto accOut = bOut.get_access(cgh);
+      cgh.parallel_for<class KernelTwo>(
+          dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; });
+    });
+
+    fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
+    assert(!fw.is_in_fusion_mode() &&
+           "Queue should not be in fusion mode anymore");
+  } // Buffer destruction here writes results back to the host arrays.
+
+  // (2i + 3i) * 4i == 20 * i * i; the cast keeps the comparison signed.
+  for (size_t i = 0; i < dataSize; ++i)
+    assert(out[i] == static_cast<int>(20 * i * i) && "Computation error");
+  return 0;
+}