From 87158a295cd1c23f58af45f15ab797760e9b19ad Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Wed, 7 Dec 2022 14:52:14 +0000 Subject: [PATCH] [SYCL][Fusion] Test kernel fusion and optimization Signed-off-by: Lukas Sommer --- SYCL/KernelFusion/abort_fusion.cpp | 105 +++++++++++ SYCL/KernelFusion/abort_internalization.cpp | 174 ++++++++++++++++++ .../barrier_local_internalization.cpp | 83 +++++++++ SYCL/KernelFusion/buffer_internalization.cpp | 72 ++++++++ SYCL/KernelFusion/cancel_fusion.cpp | 1 + SYCL/KernelFusion/complete_fusion.cpp | 67 +++++++ SYCL/KernelFusion/diamond_shape.cpp | 106 +++++++++++ SYCL/KernelFusion/event_wait_cancel.cpp | 6 + SYCL/KernelFusion/event_wait_complete.cpp | 88 +++++++++ .../internal_explicit_dependency.cpp | 80 ++++++++ .../internalize_array_wrapper.cpp | 139 ++++++++++++++ SYCL/KernelFusion/internalize_deep.cpp | 106 +++++++++++ SYCL/KernelFusion/internalize_multi_ptr.cpp | 81 ++++++++ SYCL/KernelFusion/internalize_vec.cpp | 75 ++++++++ SYCL/KernelFusion/internalize_vfunc.cpp | 88 +++++++++ SYCL/KernelFusion/local_internalization.cpp | 73 ++++++++ SYCL/KernelFusion/non_unit_local_size.cpp | 82 +++++++++ SYCL/KernelFusion/pointer_arg_function.cpp | 82 +++++++++ SYCL/KernelFusion/private_internalization.cpp | 71 +++++++ SYCL/KernelFusion/ranged_offset_accessor.cpp | 80 ++++++++ SYCL/KernelFusion/struct_with_array.cpp | 82 +++++++++ .../sync_two_queues_event_dep.cpp | 6 +- .../sync_two_queues_requirement.cpp | 6 +- SYCL/KernelFusion/three_dimensional.cpp | 75 ++++++++ SYCL/KernelFusion/two_dimensional.cpp | 74 ++++++++ SYCL/KernelFusion/usm_no_dependencies.cpp | 70 +++++++ SYCL/KernelFusion/work_group_barrier.cpp | 79 ++++++++ SYCL/KernelFusion/wrapped_usm.cpp | 80 ++++++++ SYCL/lit.cfg.py | 17 ++ 29 files changed, 2140 insertions(+), 8 deletions(-) create mode 100644 SYCL/KernelFusion/abort_fusion.cpp create mode 100644 SYCL/KernelFusion/abort_internalization.cpp create mode 100644 SYCL/KernelFusion/barrier_local_internalization.cpp create mode 100644 SYCL/KernelFusion/buffer_internalization.cpp create mode 100644 SYCL/KernelFusion/complete_fusion.cpp create mode 100644 SYCL/KernelFusion/diamond_shape.cpp create mode 100644 SYCL/KernelFusion/event_wait_complete.cpp create mode 100644 SYCL/KernelFusion/internal_explicit_dependency.cpp create mode 100644 SYCL/KernelFusion/internalize_array_wrapper.cpp create mode 100644 SYCL/KernelFusion/internalize_deep.cpp create mode 100644 SYCL/KernelFusion/internalize_multi_ptr.cpp create mode 100644 SYCL/KernelFusion/internalize_vec.cpp create mode 100644 SYCL/KernelFusion/internalize_vfunc.cpp create mode 100644 SYCL/KernelFusion/local_internalization.cpp create mode 100644 SYCL/KernelFusion/non_unit_local_size.cpp create mode 100644 SYCL/KernelFusion/pointer_arg_function.cpp create mode 100644 SYCL/KernelFusion/private_internalization.cpp create mode 100644 SYCL/KernelFusion/ranged_offset_accessor.cpp create mode 100644 SYCL/KernelFusion/struct_with_array.cpp create mode 100644 SYCL/KernelFusion/three_dimensional.cpp create mode 100644 SYCL/KernelFusion/two_dimensional.cpp create mode 100644 SYCL/KernelFusion/usm_no_dependencies.cpp create mode 100644 SYCL/KernelFusion/work_group_barrier.cpp create mode 100644 SYCL/KernelFusion/wrapped_usm.cpp diff --git a/SYCL/KernelFusion/abort_fusion.cpp b/SYCL/KernelFusion/abort_fusion.cpp new file mode 100644 index 0000000000..cdc2e67caf --- /dev/null +++ b/SYCL/KernelFusion/abort_fusion.cpp @@ -0,0 +1,105 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// 
RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
+// RUN: %CPU_CHECK_PLACEHOLDER
+// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
+// RUN: %GPU_CHECK_PLACEHOLDER
+// UNSUPPORTED: cuda || hip
+// REQUIRES: fusion
+
+// Test fusion being aborted: Different scenarios causing the JIT compiler
+// to abort fusion due to constraint violations for fusion. Also check that
+// warnings are printed when SYCL_RT_WARNING_LEVEL=1.
+
+#include <iostream>
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+constexpr size_t dataSize = 512;
+
+enum class Internalization { None, Local, Private };
+
+template <int Kernel1Dim>
+void performFusion(queue &q, range<Kernel1Dim> k1Global,
+                   range<Kernel1Dim> k1Local) {
+  int in[dataSize], tmp[dataSize], out[dataSize];
+
+  for (size_t i = 0; i < dataSize; ++i) {
+    in[i] = i;
+    tmp[i] = -1;
+    out[i] = -1;
+  }
+  {
+    buffer bIn{in, range{dataSize}};
+    buffer bTmp{tmp, range{dataSize}};
+    buffer bOut{out, range{dataSize}};
+
+    ext::codeplay::experimental::fusion_wrapper fw(q);
+    fw.start_fusion();
+
+    assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+    q.submit([&](handler &cgh) {
+      auto accIn = bIn.get_access(cgh);
+      auto accTmp = bTmp.get_access(cgh);
+      cgh.parallel_for(nd_range<Kernel1Dim>{k1Global, k1Local},
+                       [=](item<Kernel1Dim> i) {
+                         auto LID = i.get_linear_id();
+                         accTmp[LID] = accIn[LID] + 5;
+                       });
+    });
+
+    q.submit([&](handler &cgh) {
+      auto accTmp = bTmp.get_access(cgh);
+      auto accOut = bOut.get_access(cgh);
+      cgh.parallel_for(nd_range<1>{{dataSize}, {8}},
+                       [=](id<1> i) { accOut[i] = accTmp[i] * 2; });
+    });
+
+    fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
+
+    assert(!fw.is_in_fusion_mode() &&
+           "Queue should not be in fusion mode anymore");
+  }
+
+  // Check the results
+  size_t numErrors = 0;
+  for (size_t i = 0; i < k1Global.size(); ++i) {
+    if (out[i] != ((i + 5) * 2)) {
+      ++numErrors;
+    }
+  }
+  if (numErrors) {
+    std::cout << "COMPUTATION ERROR\n";
+  } else {
+    std::cout << "COMPUTATION OK\n";
+  }
+}
+
+int main() {
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  // Scenario: Fusing two kernels with different dimensionality should lead to
+  // fusion being aborted.
+  performFusion(q, range<2>{32, 16}, range<2>{1, 8});
+  // CHECK: WARNING: Cannot fuse kernels with different dimensionality
+  // CHECK-NEXT: COMPUTATION OK
+
+  // Scenario: Fusing two kernels with different global size should lead to
+  // fusion being aborted.
+  performFusion(q, range<1>{256}, range<1>{8});
+  // CHECK-NEXT: WARNING: Cannot fuse kernels with different global size
+  // CHECK-NEXT: COMPUTATION OK
+
+  // Scenario: Fusing two kernels with different local size should lead to
+  // fusion being aborted.
+ performFusion(q, range<1>{dataSize}, + range<1>{16}); + // CHECK-NEXT: WARNING: Cannot fuse kernels with different local size + // CHECK-NEXT: COMPUTATION OK + + return 0; +} diff --git a/SYCL/KernelFusion/abort_internalization.cpp b/SYCL/KernelFusion/abort_internalization.cpp new file mode 100644 index 0000000000..23a74dd00c --- /dev/null +++ b/SYCL/KernelFusion/abort_internalization.cpp @@ -0,0 +1,174 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: env SYCL_ENABLE_FUSION_CACHING=0 SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\ +// RUN: %CPU_CHECK_PLACEHOLDER +// RUN: env SYCL_ENABLE_FUSION_CACHING=0 SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\ +// RUN: %GPU_CHECK_PLACEHOLDER +// UNSUPPORTED: cuda || hip +// REQUIRES: fusion + +// Test incomplete internalization: Different scenarios causing the JIT compiler +// to abort internalization due to target or parameter mismatch. Also check that +// warnings are printed when SYCL_RT_WARNING_LEVEL=1. + +#include + +using namespace sycl; + +constexpr size_t dataSize = 512; + +enum class Internalization { None, Local, Private }; + +void performFusion(queue &q, Internalization intKernel1, + size_t localSizeKernel1, Internalization intKernel2, + size_t localSizeKernel2, + bool expectInternalization = false) { + int in[dataSize], tmp[dataSize], out[dataSize]; + for (size_t i = 0; i < dataSize; ++i) { + in[i] = i; + tmp[i] = -1; + out[i] = -1; + } + { + buffer bIn{in, range{dataSize}}; + buffer bTmp{tmp, range{dataSize}}; + buffer bOut{out, range{dataSize}}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q.submit([&](handler &cgh) { + auto accIn = bIn.get_access(cgh); + property_list properties{}; + if (intKernel1 == Internalization::Private) { + properties = { + sycl::ext::codeplay::experimental::property::promote_private{}}; + } else if (intKernel1 == Internalization::Local) { + properties = { + sycl::ext::codeplay::experimental::property::promote_local{}}; + } + accessor accTmp = bTmp.get_access(cgh, properties); + + if (localSizeKernel1 > 0) { + cgh.parallel_for( + nd_range<1>{{dataSize}, {localSizeKernel1}}, + [=](id<1> i) { accTmp[i] = accIn[i] + 5; }); + } else { + cgh.parallel_for( + dataSize, [=](id<1> i) { accTmp[i] = accIn[i] + 5; }); + } + }); + + q.submit([&](handler &cgh) { + property_list properties{}; + if (intKernel2 == Internalization::Private) { + properties = { + sycl::ext::codeplay::experimental::property::promote_private{}}; + } else if (intKernel2 == Internalization::Local) { + properties = { + sycl::ext::codeplay::experimental::property::promote_local{}}; + } + accessor accTmp = bTmp.get_access(cgh, properties); + auto accOut = bOut.get_access(cgh); + if (localSizeKernel2 > 0) { + cgh.parallel_for( + nd_range<1>{{dataSize}, {localSizeKernel2}}, + [=](id<1> i) { accOut[i] = accTmp[i] * 2; }); + } else { + cgh.parallel_for( + dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * 2; }); + } + }); + + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + } + + // Check the results + size_t numErrors = 0; + size_t numInternalized = 0; + for (size_t i = 0; i < dataSize; ++i) { + if (out[i] != ((i + 5) * 2)) { + ++numErrors; + } + if (tmp[i] == -1) { + ++numInternalized; + } + } + if (numErrors) { + std::cout << "COMPUTATION ERROR\n"; + return; + } + if 
(!expectInternalization && numInternalized) {
+    std::cout << "WRONG INTERNALIZATION\n";
+    return;
+  }
+  std::cout << "COMPUTATION OK\n";
+}
+
+int main() {
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  // Scenario: One accessor without internalization, one with local
+  // internalization. Should fall back to no internalization and print a
+  // warning.
+  std::cout << "None, Local(0)\n";
+  performFusion(q, Internalization::None, 0, Internalization::Local, 0);
+  // CHECK: None, Local(0)
+  // CHECK-NEXT: WARNING: Not performing specified local promotion, due to previous mismatch or because previous accessor specified no promotion
+  // CHECK-NEXT: COMPUTATION OK
+
+  // Scenario: One accessor without internalization, one with private
+  // internalization. Should fall back to no internalization and print a
+  // warning.
+  std::cout << "None, Private\n";
+  performFusion(q, Internalization::None, 0, Internalization::Private, 0);
+  // CHECK-NEXT: None, Private
+  // CHECK-NEXT: WARNING: Not performing specified private promotion, due to previous mismatch or because previous accessor specified no promotion
+  // CHECK-NEXT: COMPUTATION OK
+
+  // Scenario: Both accessors with local promotion, but the second kernel does
+  // not specify a work-group size. No promotion should happen and a warning
+  // should be printed.
+  std::cout << "Local(8), Local(0)\n";
+  performFusion(q, Internalization::Local, 8, Internalization::Local, 0);
+  // CHECK-NEXT: Local(8), Local(0)
+  // CHECK-NEXT: WARNING: Work-group size for local promotion not specified, not performing internalization
+  // CHECK-NEXT: COMPUTATION OK
+
+  // Scenario: Both accessors with local promotion, but the first kernel does
+  // not specify a work-group size. No promotion should happen and a warning
+  // should be printed.
+  std::cout << "Local(0), Local(8)\n";
+  performFusion(q, Internalization::Local, 0, Internalization::Local, 8);
+  // CHECK-NEXT: Local(0), Local(8)
+  // CHECK-NEXT: WARNING: Work-group size for local promotion not specified, not performing internalization
+  // CHECK-NEXT: WARNING: Not performing specified local promotion, due to previous mismatch or because previous accessor specified no promotion
+  // CHECK-NEXT: WARNING: Cannot fuse kernels with different local size
+  // CHECK-NEXT: COMPUTATION OK
+
+  // Scenario: Both accessors with local promotion, but the kernels specify
+  // different work-group sizes. No promotion should happen and a warning
+  // should be printed.
+  std::cout << "Local(8), Local(16)\n";
+  performFusion(q, Internalization::Local, 8, Internalization::Local, 16);
+  // CHECK-NEXT: Local(8), Local(16)
+  // CHECK-NEXT: WARNING: Not performing specified local promotion due to work-group size mismatch
+  // CHECK-NEXT: WARNING: Cannot fuse kernels with different local size
+  // CHECK-NEXT: COMPUTATION OK
+
+  // Scenario: One accessor with local internalization, one with private
+  // internalization. Should fall back to local internalization and print a
+  // warning.
+ std::cout << "Local(8), Private(8)\n"; + performFusion(q, Internalization::Local, 8, Internalization::Private, 8, + /* expectInternalization */ true); + // CHECK-NEXT: Local(8), Private(8) + // CHECK-NEXT: WARNING: Performing local internalization instead, because previous accessor specified local promotion + // CHECK-NEXT: COMPUTATION OK + + return 0; +} diff --git a/SYCL/KernelFusion/barrier_local_internalization.cpp b/SYCL/KernelFusion/barrier_local_internalization.cpp new file mode 100644 index 0000000000..12302c3177 --- /dev/null +++ b/SYCL/KernelFusion/barrier_local_internalization.cpp @@ -0,0 +1,83 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// UNSUPPORTED: cuda || hip +// REQUIRES: fusion + +// Test complete fusion with local internalization and a combination of kernels +// that require a work-group barrier to be inserted by fusion. + +#include + +using namespace sycl; + +int main() { + constexpr size_t dataSize = 512; + int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize]; + + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = i * 2; + in2[i] = i * 3; + in3[i] = i * 4; + tmp[i] = -1; + out[i] = -1; + } + + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + { + buffer bIn1{in1, range{dataSize}}; + buffer bIn2{in2, range{dataSize}}; + buffer bIn3{in3, range{dataSize}}; + buffer bTmp{ + tmp, + range{dataSize}, + {sycl::ext::codeplay::experimental::property::promote_local{}}}; + buffer bOut{out, range{dataSize}}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q.submit([&](handler &cgh) { + auto accIn1 = bIn1.get_access(cgh); + auto accIn2 = bIn2.get_access(cgh); + auto accTmp = bTmp.get_access(cgh); + cgh.parallel_for( + nd_range<1>{{dataSize}, {32}}, [=](nd_item<1> i) { + auto workgroupSize = i.get_local_range(0); + auto baseOffset = i.get_group_linear_id() * workgroupSize; + auto localIndex = i.get_local_linear_id(); + auto localOffset = (workgroupSize - 1) - localIndex; + accTmp[baseOffset + localOffset] = + accIn1[baseOffset + localOffset] + + accIn2[baseOffset + localOffset]; + }); + }); + + q.submit([&](handler &cgh) { + auto accTmp = bTmp.get_access(cgh); + auto accIn3 = bIn3.get_access(cgh); + auto accOut = bOut.get_access(cgh); + cgh.parallel_for( + nd_range<1>{{dataSize}, {32}}, [=](nd_item<1> i) { + auto index = i.get_global_linear_id(); + accOut[index] = accTmp[index] * accIn3[index]; + }); + }); + + fw.complete_fusion(); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + } + + // Check the results + for (size_t i = 0; i < dataSize; ++i) { + assert(out[i] == (20 * i * i) && "Computation error"); + assert(tmp[i] == -1 && "Not internalized"); + } + + return 0; +} diff --git a/SYCL/KernelFusion/buffer_internalization.cpp b/SYCL/KernelFusion/buffer_internalization.cpp new file mode 100644 index 0000000000..22251fbb21 --- /dev/null +++ b/SYCL/KernelFusion/buffer_internalization.cpp @@ -0,0 +1,72 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// UNSUPPORTED: cuda || hip +// REQUIRES: fusion + +// Test complete fusion with private internalization specified on the +// buffer. 
+ +#include + +using namespace sycl; + +int main() { + constexpr size_t dataSize = 512; + int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize]; + + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = i * 2; + in2[i] = i * 3; + in3[i] = i * 4; + tmp[i] = -1; + out[i] = -1; + } + + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + { + buffer bIn1{in1, range{dataSize}}; + buffer bIn2{in2, range{dataSize}}; + buffer bIn3{in3, range{dataSize}}; + buffer bTmp{ + tmp, + range{dataSize}, + {sycl::ext::codeplay::experimental::property::promote_private{}}}; + buffer bOut{out, range{dataSize}}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q.submit([&](handler &cgh) { + auto accIn1 = bIn1.get_access(cgh); + auto accIn2 = bIn2.get_access(cgh); + auto accTmp = bTmp.get_access(cgh); + cgh.parallel_for( + dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; }); + }); + + q.submit([&](handler &cgh) { + auto accTmp = bTmp.get_access(cgh); + auto accIn3 = bIn3.get_access(cgh); + auto accOut = bOut.get_access(cgh); + cgh.parallel_for( + dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; }); + }); + + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + } + + // Check the results + for (size_t i = 0; i < dataSize; ++i) { + assert(out[i] == (20 * i * i) && "Computation error"); + assert(tmp[i] == -1 && "Not internalized"); + } + + return 0; +} diff --git a/SYCL/KernelFusion/cancel_fusion.cpp b/SYCL/KernelFusion/cancel_fusion.cpp index 3005a78d5b..6c94f99025 100644 --- a/SYCL/KernelFusion/cancel_fusion.cpp +++ b/SYCL/KernelFusion/cancel_fusion.cpp @@ -2,6 +2,7 @@ // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // UNSUPPORTED: cuda || hip +// REQUIRES: fusion // Test cancel fusion diff --git a/SYCL/KernelFusion/complete_fusion.cpp b/SYCL/KernelFusion/complete_fusion.cpp new file mode 100644 index 0000000000..0ffeca17a5 --- /dev/null +++ b/SYCL/KernelFusion/complete_fusion.cpp @@ -0,0 +1,67 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// UNSUPPORTED: cuda || hip +// REQUIRES: fusion + +// Test complete fusion without any internalization + +#include + +using namespace sycl; + +int main() { + constexpr size_t dataSize = 512; + int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize]; + + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = i * 2; + in2[i] = i * 3; + in3[i] = i * 4; + tmp[i] = -1; + out[i] = -1; + } + + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + { + buffer bIn1{in1, range{dataSize}}; + buffer bIn2{in2, range{dataSize}}; + buffer bIn3{in3, range{dataSize}}; + buffer bTmp{tmp, range{dataSize}}; + buffer bOut{out, range{dataSize}}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q.submit([&](handler &cgh) { + auto accIn1 = bIn1.get_access(cgh); + auto accIn2 = bIn2.get_access(cgh); + auto accTmp = bTmp.get_access(cgh); + cgh.parallel_for( + dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; }); + }); + + q.submit([&](handler &cgh) { + auto accTmp = bTmp.get_access(cgh); + auto accIn3 = bIn3.get_access(cgh); + auto accOut = 
bOut.get_access(cgh); + cgh.parallel_for( + dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; }); + }); + + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + } + + // Check the results + for (size_t i = 0; i < dataSize; ++i) { + assert(out[i] == (20 * i * i) && "Computation error"); + } + + return 0; +} diff --git a/SYCL/KernelFusion/diamond_shape.cpp b/SYCL/KernelFusion/diamond_shape.cpp new file mode 100644 index 0000000000..0f009a1d60 --- /dev/null +++ b/SYCL/KernelFusion/diamond_shape.cpp @@ -0,0 +1,106 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// UNSUPPORTED: cuda || hip +// REQUIRES: fusion + +// Test complete fusion with private internalization specified on the +// accessors for a combination of four kernels, forming a diamond-like shape and +// repeating one of the kernels. + +#include + +using namespace sycl; + +struct AddKernel { + accessor accIn1; + accessor accIn2; + accessor accOut; + + void operator()(id<1> i) const { accOut[i] = accIn1[i] + accIn2[i]; } +}; + +int main() { + constexpr size_t dataSize = 512; + int in1[dataSize], in2[dataSize], in3[dataSize], tmp1[dataSize], + tmp2[dataSize], tmp3[dataSize], out[dataSize]; + + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = i * 2; + in2[i] = i * 3; + in3[i] = i * 4; + tmp1[i] = -1; + tmp2[i] = -1; + tmp3[i] = -1; + out[i] = -1; + } + + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + { + buffer bIn1{in1, range{dataSize}}; + buffer bIn2{in2, range{dataSize}}; + buffer bIn3{in3, range{dataSize}}; + buffer bTmp1{tmp1, range{dataSize}}; + buffer bTmp2{tmp2, range{dataSize}}; + buffer bTmp3{tmp3, range{dataSize}}; + buffer bOut{out, range{dataSize}}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q.submit([&](handler &cgh) { + auto accIn1 = bIn1.get_access(cgh); + auto accIn2 = bIn2.get_access(cgh); + auto accTmp1 = bTmp1.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_private{}); + cgh.parallel_for(dataSize, AddKernel{accIn1, accIn2, accTmp1}); + }); + + q.submit([&](handler &cgh) { + auto accTmp1 = bTmp1.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_private{}); + auto accIn3 = bIn3.get_access(cgh); + auto accTmp2 = bTmp2.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_private{}); + cgh.parallel_for( + dataSize, [=](id<1> i) { accTmp2[i] = accTmp1[i] * accIn3[i]; }); + }); + + q.submit([&](handler &cgh) { + auto accTmp1 = bTmp1.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_private{}); + auto accTmp3 = bTmp3.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_private{}); + cgh.parallel_for( + dataSize, [=](id<1> i) { accTmp3[i] = accTmp1[i] * 5; }); + }); + + q.submit([&](handler &cgh) { + auto accTmp2 = bTmp2.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_private{}); + auto accTmp3 = bTmp3.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_private{}); + auto accOut = bOut.get_access(cgh); + cgh.parallel_for(dataSize, + AddKernel{accTmp2, accTmp3, accOut}); + }); + + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion 
mode anymore"); + } + + // Check the results + for (size_t i = 0; i < dataSize; ++i) { + assert(out[i] == (20 * i * i + i * 25) && "Computation error"); + assert(tmp1[i] == -1 && "tmp1 not internalized"); + assert(tmp2[i] == -1 && "tmp2 not internalized"); + assert(tmp3[i] == -1 && "tmp3 not internalized"); + } + + return 0; +} diff --git a/SYCL/KernelFusion/event_wait_cancel.cpp b/SYCL/KernelFusion/event_wait_cancel.cpp index a7b04347c2..63a049aaa1 100644 --- a/SYCL/KernelFusion/event_wait_cancel.cpp +++ b/SYCL/KernelFusion/event_wait_cancel.cpp @@ -2,10 +2,12 @@ // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // UNSUPPORTED: cuda || hip +// REQUIRES: fusion // Test validity of events after cancel_fusion. #include "fusion_event_test_common.h" + #include using namespace sycl; @@ -15,6 +17,10 @@ int main() { queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + if (!q.get_device().has(sycl::aspect::usm_shared_allocations)) { + return 0; + } + int *in1 = sycl::malloc_shared(dataSize, q); int *in2 = sycl::malloc_shared(dataSize, q); int *in3 = sycl::malloc_shared(dataSize, q); diff --git a/SYCL/KernelFusion/event_wait_complete.cpp b/SYCL/KernelFusion/event_wait_complete.cpp new file mode 100644 index 0000000000..4d049242ad --- /dev/null +++ b/SYCL/KernelFusion/event_wait_complete.cpp @@ -0,0 +1,88 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// UNSUPPORTED: cuda || hip +// REQUIRES: fusion + +// Test validity of events after complete_fusion. + +#include "fusion_event_test_common.h" + +#include + +using namespace sycl; + +int main() { + constexpr size_t dataSize = 512; + + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + if (!q.get_device().has(sycl::aspect::usm_shared_allocations)) { + return 0; + } + + int *in1 = sycl::malloc_shared(dataSize, q); + int *in2 = sycl::malloc_shared(dataSize, q); + int *in3 = sycl::malloc_shared(dataSize, q); + int *tmp = sycl::malloc_shared(dataSize, q); + int *out = sycl::malloc_shared(dataSize, q); + + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = i * 2; + in2[i] = i * 3; + in3[i] = i * 4; + tmp[i] = -1; + out[i] = -1; + } + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + auto kernel1 = q.submit([&](handler &cgh) { + cgh.parallel_for( + dataSize, [=](id<1> i) { tmp[i] = in1[i] + in2[i]; }); + }); + + auto kernel2 = q.submit([&](handler &cgh) { + cgh.parallel_for( + dataSize, [=](id<1> i) { out[i] = tmp[i] * in3[i]; }); + }); + + auto complete = fw.complete_fusion( + {ext::codeplay::experimental::property::no_barriers{}}); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + + complete.wait(); + assert(isEventComplete(complete) && "Event should be complete"); + // The execution of the fused kennel does not depend on any events. + assert(complete.get_wait_list().size() == 0); + + assert(isEventComplete(kernel1) && "Event should be complete"); + // The event returned for submissions while in fusion mode depends on three + // events, for the two original kernels (which do not execute) and the fused + // kernel to be executed. 
+ assert(kernel1.get_wait_list().size() == 3); + + assert(isEventComplete(kernel2) && "Event should be complete"); + // The event returned for submissions while in fusion mode depends on three + // events, for the two original kernels (which do not execute) and the fused + // kernel to be executed. + assert(kernel2.get_wait_list().size() == 3); + + // Check the results. + for (size_t i = 0; i < dataSize; ++i) { + assert(out[i] == (20 * i * i) && "Computation error"); + } + + sycl::free(in1, q); + sycl::free(in2, q); + sycl::free(in3, q); + sycl::free(tmp, q); + sycl::free(out, q); + + return 0; +} diff --git a/SYCL/KernelFusion/internal_explicit_dependency.cpp b/SYCL/KernelFusion/internal_explicit_dependency.cpp new file mode 100644 index 0000000000..760cffb5b1 --- /dev/null +++ b/SYCL/KernelFusion/internal_explicit_dependency.cpp @@ -0,0 +1,80 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// UNSUPPORTED: cuda || hip +// REQUIRES: fusion + +// Test complete fusion where one kernel in the fusion list specifies an +// explicit dependency (via events) on another kernel in the fusion list. + +#include "fusion_event_test_common.h" + +#include + +using namespace sycl; + +int main() { + constexpr size_t dataSize = 512; + + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + if (!q.get_device().has(sycl::aspect::usm_shared_allocations)) { + return 0; + } + + int *in1 = sycl::malloc_shared(dataSize, q); + int *in2 = sycl::malloc_shared(dataSize, q); + int *in3 = sycl::malloc_shared(dataSize, q); + int *tmp = sycl::malloc_shared(dataSize, q); + int *out = sycl::malloc_shared(dataSize, q); + + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = i * 2; + in2[i] = i * 3; + in3[i] = i * 4; + tmp[i] = -1; + out[i] = -1; + } + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + auto kernel1 = q.submit([&](handler &cgh) { + cgh.parallel_for( + dataSize, [=](id<1> i) { tmp[i] = in1[i] + in2[i]; }); + }); + + auto kernel2 = q.submit([&](handler &cgh) { + cgh.depends_on(kernel1); + cgh.parallel_for( + dataSize, [=](id<1> i) { out[i] = tmp[i] * in3[i]; }); + }); + + auto complete = fw.complete_fusion( + {ext::codeplay::experimental::property::no_barriers{}}); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + + complete.wait(); + assert(isEventComplete(complete) && "Event should be complete"); + + assert(isEventComplete(kernel1) && "Event should be complete"); + + assert(isEventComplete(kernel2) && "Event should be complete"); + + // Check the results + for (size_t i = 0; i < dataSize; ++i) { + assert(out[i] == (20 * i * i) && "Computation error"); + } + + sycl::free(in1, q); + sycl::free(in2, q); + sycl::free(in3, q); + sycl::free(tmp, q); + sycl::free(out, q); + + return 0; +} diff --git a/SYCL/KernelFusion/internalize_array_wrapper.cpp b/SYCL/KernelFusion/internalize_array_wrapper.cpp new file mode 100644 index 0000000000..d1b41ea7bd --- /dev/null +++ b/SYCL/KernelFusion/internalize_array_wrapper.cpp @@ -0,0 +1,139 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// UNSUPPORTED: cuda || hip +// REQUIRES: fusion + +// Test internalization of a nested array type. 
+ +#include + +#include + +using namespace sycl; + +template struct array_wrapper { + static constexpr size_t rows{N}; + static constexpr size_t columns{M}; + static constexpr size_t vec_width{2}; + + using value_type = vec; + using reference_type = value_type &; + using const_reference_type = const value_type &; + + std::array, rows> vs; + + explicit array_wrapper(const_reference_type v) { + std::array el; + el.fill(v); + vs.fill(el); + } + + array_wrapper() : array_wrapper{value_type{}} {} + + constexpr std::array &operator[](size_t i) { + return vs[i]; + } + + constexpr const std::array &operator[](size_t i) const { + return vs[i]; + } +}; + +int main() { + constexpr size_t dataSize = 2; + constexpr size_t rows = 2; + constexpr size_t columns = 2; + + using array_type = array_wrapper; + + array_type in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], + out[dataSize]; + + for (size_t id = 0; id < dataSize; ++id) { + for (size_t i = 0; i < rows; ++i) { + for (size_t j = 0; j < columns; ++j) { + in1[id][i][j].s0() = in1[id][i][j].s1() = id * 2; + in2[id][i][j].s0() = in2[id][i][j].s1() = id * 3; + in3[id][i][j].s0() = in3[id][i][j].s1() = id * 4; + tmp[id][i][j].s0() = tmp[id][i][j].s1() = -1; + out[id][i][j].s0() = out[id][i][j].s1() = -1; + } + } + } + + queue q{default_selector_v, + {ext::codeplay::experimental::property::queue::enable_fusion{}}}; + + { + buffer bIn1{in1, range{dataSize}}; + buffer bIn2{in2, range{dataSize}}; + buffer bIn3{in3, range{dataSize}}; + buffer bTmp{tmp, range{dataSize}}; + buffer bOut{out, range{dataSize}}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q.submit([&](handler &cgh) { + auto accIn1 = bIn1.get_access(cgh); + auto accIn2 = bIn2.get_access(cgh); + auto accTmp = bTmp.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_private{}); + cgh.parallel_for(dataSize, [=](id<1> id) { + const auto &accIn1Wrapp = accIn1[id]; + const auto &accIn2Wrapp = accIn2[id]; + auto &accTmpWrapp = accTmp[id]; + for (size_t i = 0; i < dataSize; ++i) { + const auto &in1 = accIn1Wrapp[i]; + const auto &in2 = accIn2Wrapp[i]; + auto &tmp = accTmpWrapp[i]; + for (size_t j = 0; j < columns; ++j) { + tmp[j] = in1[j] + in2[j]; + } + } + }); + }); + + q.submit([&](handler &cgh) { + auto accTmp = bTmp.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_private{}); + auto accIn3 = bIn3.get_access(cgh); + auto accOut = bOut.get_access(cgh); + cgh.parallel_for(dataSize, [=](id<1> id) { + const auto &tmpWrapp = accTmp[id]; + const auto &accIn3Wrapp = accIn3[id]; + auto &accOutWrapp = accOut[id]; + for (size_t i = 0; i < dataSize; ++i) { + const auto &tmp = tmpWrapp[i]; + const auto &in3 = accIn3Wrapp[i]; + auto &out = accOutWrapp[i]; + for (size_t j = 0; j < columns; ++j) { + out[j] = tmp[j] * in3[j]; + } + } + }); + }); + + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + } + + // Check the results + constexpr array_type::value_type not_written{-1, -1}; + for (size_t id = 0; id < dataSize; ++id) { + const array_type::value_type expected{20 * id * id, 20 * id * id}; + for (size_t i = 0; i < rows; ++i) { + for (size_t j = 0; j < columns; ++j) { + assert(all(out[id][i][j] == expected) && "Computation error"); + assert(all(tmp[id][i][j] == not_written) && "Not internalizing"); + } + } + } + + return 0; +} diff --git 
a/SYCL/KernelFusion/internalize_deep.cpp b/SYCL/KernelFusion/internalize_deep.cpp new file mode 100644 index 0000000000..172ea3c750 --- /dev/null +++ b/SYCL/KernelFusion/internalize_deep.cpp @@ -0,0 +1,106 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// UNSUPPORTED: cuda || hip +// REQUIRES: fusion + +// Test complete fusion with internalization of a deep struct type. + +#include + +#include + +using namespace sycl; + +struct deep_vec { + using value_type = vec; + struct level_0 { + struct level_1 { + struct level_2 { + deep_vec::value_type v; + + constexpr level_2() = default; + constexpr explicit level_2(const deep_vec::value_type &v) : v{v} {} + } v; + constexpr level_1() = default; + constexpr explicit level_1(const deep_vec::value_type &v) : v{v} {} + } v; + + constexpr level_0() = default; + constexpr explicit level_0(const deep_vec::value_type &v) : v{v} {} + } v; + + constexpr deep_vec() = default; + constexpr explicit deep_vec(const value_type &v) : v{v} {} + + constexpr value_type &operator*() { return v.v.v.v; } + constexpr value_type *operator->() { return &this->operator*(); } +}; + +deep_vec operator+(deep_vec lhs, deep_vec rhs) { return deep_vec{*lhs + *rhs}; } +deep_vec operator*(deep_vec lhs, deep_vec rhs) { return deep_vec{*lhs * *rhs}; } + +int main() { + constexpr size_t dataSize = 512; + + deep_vec in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], + out[dataSize]; + + for (size_t i = 0; i < dataSize; ++i) { + in1[i]->s0() = in1[i]->s1() = in1[i]->s2() = in1[i]->s3() = i * 2; + in2[i]->s0() = in2[i]->s1() = in2[i]->s2() = in2[i]->s3() = i * 3; + in3[i]->s0() = in3[i]->s1() = in3[i]->s2() = in3[i]->s3() = i * 4; + tmp[i]->s0() = tmp[i]->s1() = tmp[i]->s2() = tmp[i]->s3() = -1; + out[i]->s0() = out[i]->s1() = out[i]->s2() = out[i]->s3() = -1; + } + + queue q{default_selector_v, + {ext::codeplay::experimental::property::queue::enable_fusion{}}}; + + { + buffer bIn1{in1, range{dataSize}}; + buffer bIn2{in2, range{dataSize}}; + buffer bIn3{in3, range{dataSize}}; + buffer bTmp{tmp, range{dataSize}}; + buffer bOut{out, range{dataSize}}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q.submit([&](handler &cgh) { + auto accIn1 = bIn1.get_access(cgh); + auto accIn2 = bIn2.get_access(cgh); + auto accTmp = bTmp.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_private{}); + cgh.parallel_for( + dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; }); + }); + + q.submit([&](handler &cgh) { + auto accTmp = bTmp.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_private{}); + auto accIn3 = bIn3.get_access(cgh); + auto accOut = bOut.get_access(cgh); + cgh.parallel_for( + dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; }); + }); + + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + } + + // Check the results + constexpr deep_vec::value_type not_written{-1, -1, -1, -1}; + for (size_t i = 0; i < dataSize; ++i) { + const deep_vec::value_type expected{20 * i * i, 20 * i * i, 20 * i * i, + 20 * i * i}; + assert(all(*out[i] == expected) && "Computation error"); + assert(all(*tmp[i] == not_written) && "Not internalizing"); + }; + + return 0; +} diff --git a/SYCL/KernelFusion/internalize_multi_ptr.cpp 
b/SYCL/KernelFusion/internalize_multi_ptr.cpp new file mode 100644 index 0000000000..b6937b0350 --- /dev/null +++ b/SYCL/KernelFusion/internalize_multi_ptr.cpp @@ -0,0 +1,81 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// UNSUPPORTED: cuda || hip +// REQUIRES: fusion + +// Test complete fusion with private internalization specified on the +// accessors for a device kernel using multi_ptr to global address space. + +#include + +using namespace sycl; + +int main() { + constexpr size_t dataSize = 512; + int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize]; + + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = i * 2; + in2[i] = i * 3; + in3[i] = i * 4; + tmp[i] = -1; + out[i] = -1; + } + + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + { + buffer bIn1{in1, range{dataSize}}; + buffer bIn2{in2, range{dataSize}}; + buffer bIn3{in3, range{dataSize}}; + buffer bTmp{tmp, range{dataSize}}; + buffer bOut{out, range{dataSize}}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q.submit([&](handler &cgh) { + auto accIn1 = bIn1.get_access(cgh); + auto accIn2 = bIn2.get_access(cgh); + auto accTmp = bTmp.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_private{}); + cgh.parallel_for(dataSize, [=](id<1> i) { + size_t offset = i; + decorated_global_ptr in1Ptr{accIn1}; + decorated_global_ptr in2Ptr{accIn2}; + decorated_global_ptr tmpPtr{accTmp}; + tmpPtr[offset] = in1Ptr[offset] + in2Ptr[offset]; + }); + }); + + q.submit([&](handler &cgh) { + auto accTmp = bTmp.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_private{}); + auto accIn3 = bIn3.get_access(cgh); + auto accOut = bOut.get_access(cgh); + cgh.parallel_for(dataSize, [=](id<1> i) { + size_t offset = i; + decorated_global_ptr in3Ptr{accIn3}; + decorated_global_ptr tmpPtr{accTmp}; + decorated_global_ptr outPtr{accOut}; + outPtr[offset] = in3Ptr[offset] * tmpPtr[offset]; + }); + }); + + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + } + + // Check the results + for (size_t i = 0; i < dataSize; ++i) { + assert(out[i] == (20 * i * i) && "Computation error"); + assert(tmp[i] == -1 && "Not internalized"); + } + + return 0; +} diff --git a/SYCL/KernelFusion/internalize_vec.cpp b/SYCL/KernelFusion/internalize_vec.cpp new file mode 100644 index 0000000000..9f3a24f715 --- /dev/null +++ b/SYCL/KernelFusion/internalize_vec.cpp @@ -0,0 +1,75 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// UNSUPPORTED: cuda || hip +// REQUIRES: fusion + +// Test complete fusion with internalization of a struct type. 
+ +#include + +using namespace sycl; + +int main() { + constexpr size_t dataSize = 512; + + vec in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], + out[dataSize]; + + for (size_t i = 0; i < dataSize; ++i) { + in1[i].s0() = in1[i].s1() = in1[i].s2() = in1[i].s3() = i * 2; + in2[i].s0() = in2[i].s1() = in2[i].s2() = in2[i].s3() = i * 3; + in3[i].s0() = in3[i].s1() = in3[i].s2() = in3[i].s3() = i * 4; + tmp[i].s0() = tmp[i].s1() = tmp[i].s2() = tmp[i].s3() = -1; + out[i].s0() = out[i].s1() = out[i].s2() = out[i].s3() = -1; + } + + queue q{default_selector_v, + {ext::codeplay::experimental::property::queue::enable_fusion{}}}; + + { + buffer> bIn1{in1, range{dataSize}}; + buffer> bIn2{in2, range{dataSize}}; + buffer> bIn3{in3, range{dataSize}}; + buffer> bTmp{tmp, range{dataSize}}; + buffer> bOut{out, range{dataSize}}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q.submit([&](handler &cgh) { + auto accIn1 = bIn1.get_access(cgh); + auto accIn2 = bIn2.get_access(cgh); + auto accTmp = bTmp.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_private{}); + cgh.parallel_for( + dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; }); + }); + + q.submit([&](handler &cgh) { + auto accTmp = bTmp.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_private{}); + auto accIn3 = bIn3.get_access(cgh); + auto accOut = bOut.get_access(cgh); + cgh.parallel_for( + dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; }); + }); + + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + } + + // Check the results + constexpr vec not_written{-1, -1, -1, -1}; + for (size_t i = 0; i < dataSize; ++i) { + const vec expected{20 * i * i, 20 * i * i, 20 * i * i, 20 * i * i}; + assert(all(out[i] == expected) && "Computation error"); + assert(all(tmp[i] == not_written) && "Not internalizing"); + }; + + return 0; +} diff --git a/SYCL/KernelFusion/internalize_vfunc.cpp b/SYCL/KernelFusion/internalize_vfunc.cpp new file mode 100644 index 0000000000..abc9c2419f --- /dev/null +++ b/SYCL/KernelFusion/internalize_vfunc.cpp @@ -0,0 +1,88 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// UNSUPPORTED: cuda || hip +// REQUIRES: fusion + +// Test complete fusion with private internalization specified on the +// accessors for a device kernel with sycl::vec::load and sycl::vec::store. 
+ +#define VEC 4 + +#include + +using namespace sycl; + +int main() { + constexpr size_t numVec = 512; + constexpr size_t dataSize = numVec * VEC; + int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize]; + + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = i * 2; + in2[i] = i * 3; + in3[i] = i * 4; + tmp[i] = -1; + out[i] = -1; + } + + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + { + buffer bIn1{in1, range{dataSize}}; + buffer bIn2{in2, range{dataSize}}; + buffer bIn3{in3, range{dataSize}}; + buffer bTmp{tmp, range{dataSize}}; + buffer bOut{out, range{dataSize}}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q.submit([&](handler &cgh) { + auto accIn1 = bIn1.get_access(cgh); + auto accIn2 = bIn2.get_access(cgh); + auto accTmp = bTmp.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_private{}); + cgh.parallel_for(numVec, [=](id<1> i) { + size_t offset = i; + vec in1; + in1.load(offset, accIn1.get_pointer()); + vec in2; + in2.load(offset, accIn2.get_pointer()); + auto tmp = in1 + in2; + tmp.store(offset, accTmp.get_pointer()); + }); + }); + + q.submit([&](handler &cgh) { + auto accTmp = bTmp.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_private{}); + auto accIn3 = bIn3.get_access(cgh); + auto accOut = bOut.get_access(cgh); + cgh.parallel_for(numVec, [=](id<1> i) { + size_t offset = i; + vec tmp; + tmp.load(offset, accTmp.get_pointer()); + vec in3; + in3.load(offset, accIn3.get_pointer()); + auto out = tmp * in3; + out.store(offset, accOut.get_pointer()); + }); + }); + + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + } + + // Check the results + for (size_t i = 0; i < dataSize; ++i) { + assert(out[i] == (20 * i * i) && "Computation error"); + assert(tmp[i] == -1 && "Not internalized"); + } + + return 0; +} diff --git a/SYCL/KernelFusion/local_internalization.cpp b/SYCL/KernelFusion/local_internalization.cpp new file mode 100644 index 0000000000..a9677b4a2e --- /dev/null +++ b/SYCL/KernelFusion/local_internalization.cpp @@ -0,0 +1,73 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// UNSUPPORTED: cuda || hip +// REQUIRES: fusion + +// Test complete fusion with local internalization specified on the +// accessors. 
+ +#include + +using namespace sycl; + +int main() { + constexpr size_t dataSize = 512; + int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize]; + + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = i * 2; + in2[i] = i * 3; + in3[i] = i * 4; + tmp[i] = -1; + out[i] = -1; + } + + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + { + buffer bIn1{in1, range{dataSize}}; + buffer bIn2{in2, range{dataSize}}; + buffer bIn3{in3, range{dataSize}}; + buffer bTmp{tmp, range{dataSize}}; + buffer bOut{out, range{dataSize}}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q.submit([&](handler &cgh) { + auto accIn1 = bIn1.get_access(cgh); + auto accIn2 = bIn2.get_access(cgh); + auto accTmp = bTmp.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_local{}); + cgh.parallel_for( + nd_range<1>{{dataSize}, {16}}, + [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; }); + }); + + q.submit([&](handler &cgh) { + auto accTmp = bTmp.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_local{}); + auto accIn3 = bIn3.get_access(cgh); + auto accOut = bOut.get_access(cgh); + cgh.parallel_for( + nd_range<1>{{dataSize}, {16}}, + [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; }); + }); + + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + } + + // Check the results + for (size_t i = 0; i < dataSize; ++i) { + assert(out[i] == (20 * i * i) && "Computation error"); + assert(tmp[i] == -1 && "Not internalized"); + } + + return 0; +} diff --git a/SYCL/KernelFusion/non_unit_local_size.cpp b/SYCL/KernelFusion/non_unit_local_size.cpp new file mode 100644 index 0000000000..917eda6e09 --- /dev/null +++ b/SYCL/KernelFusion/non_unit_local_size.cpp @@ -0,0 +1,82 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// UNSUPPORTED: cuda || hip +// REQUIRES: fusion + +// Test complete fusion with local internalization specified on the +// accessors, where each work-item processes multiple data-items. 
+ +#include + +using namespace sycl; + +int main() { + constexpr size_t dataSize = 512; + int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize]; + + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = i * 2; + in2[i] = i * 3; + in3[i] = i * 4; + tmp[i] = -1; + out[i] = -1; + } + + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + { + buffer bIn1{in1, range{dataSize}}; + buffer bIn2{in2, range{dataSize}}; + buffer bIn3{in3, range{dataSize}}; + buffer bTmp{tmp, range{dataSize}}; + buffer bOut{out, range{dataSize}}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q.submit([&](handler &cgh) { + auto accIn1 = bIn1.get_access(cgh); + auto accIn2 = bIn2.get_access(cgh); + auto accTmp = bTmp.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_local{}); + cgh.parallel_for( + nd_range<1>{{128}, {8}}, [=](item<1> i) { + auto baseOffset = i.get_linear_id() * 4; + for (size_t j = 0; j < 4; ++j) { + accTmp[baseOffset + j] = + accIn1[baseOffset + j] + accIn2[baseOffset + j]; + } + }); + }); + + q.submit([&](handler &cgh) { + auto accTmp = bTmp.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_local{}); + auto accIn3 = bIn3.get_access(cgh); + auto accOut = bOut.get_access(cgh); + cgh.parallel_for( + nd_range<1>{{128}, {8}}, [=](item<1> i) { + auto baseOffset = i.get_linear_id() * 4; + for (size_t j = 0; j < 4; ++j) { + accOut[baseOffset + j] = + accTmp[baseOffset + j] * accIn3[baseOffset + j]; + } + }); + }); + + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + } + + // Check the results + for (size_t i = 0; i < dataSize; ++i) { + assert(out[i] == (20 * i * i) && "Computation error"); + } + + return 0; +} diff --git a/SYCL/KernelFusion/pointer_arg_function.cpp b/SYCL/KernelFusion/pointer_arg_function.cpp new file mode 100644 index 0000000000..ffe5178cda --- /dev/null +++ b/SYCL/KernelFusion/pointer_arg_function.cpp @@ -0,0 +1,82 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// UNSUPPORTED: cuda || hip +// REQUIRES: fusion +// This test currently fails because InferAddressSpace is not able to remove all +// address-space casts, causing internalization to fail. +// XFAIL: * + +// Test complete fusion with private internalization specified on the +// accessors, calling a function with a raw pointer taken from an accessor in +// one of the kernels. 
+ +#include + +using namespace sycl; + +void __attribute__((noinline)) +addFunc(int *in1, int *in2, int *out, size_t linearID) { + out[linearID] = in1[linearID] + in2[linearID]; +} + +int main() { + constexpr size_t dataSize = 512; + int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize]; + + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = i * 2; + in2[i] = i * 3; + in3[i] = i * 4; + tmp[i] = -1; + out[i] = -1; + } + + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + { + buffer bIn1{in1, range{dataSize}}; + buffer bIn2{in2, range{dataSize}}; + buffer bIn3{in3, range{dataSize}}; + buffer bTmp{tmp, range{dataSize}}; + buffer bOut{out, range{dataSize}}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q.submit([&](handler &cgh) { + auto accIn1 = bIn1.get_access(cgh); + auto accIn2 = bIn2.get_access(cgh); + auto accTmp = bTmp.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_private{}); + cgh.parallel_for(dataSize, [=](item<1> i) { + addFunc(accIn1.get_pointer(), accIn2.get_pointer(), + accTmp.get_pointer(), i.get_linear_id()); + }); + }); + + q.submit([&](handler &cgh) { + auto accTmp = bTmp.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_private{}); + auto accIn3 = bIn3.get_access(cgh); + auto accOut = bOut.get_access(cgh); + cgh.parallel_for( + dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; }); + }); + + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + } + + // Check the results + for (size_t i = 0; i < dataSize; ++i) { + assert(out[i] == (20 * i * i) && "Computation error"); + assert(tmp[i] == -1 && "Not internalized"); + } + + return 0; +} diff --git a/SYCL/KernelFusion/private_internalization.cpp b/SYCL/KernelFusion/private_internalization.cpp new file mode 100644 index 0000000000..05120a68c3 --- /dev/null +++ b/SYCL/KernelFusion/private_internalization.cpp @@ -0,0 +1,71 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// UNSUPPORTED: cuda || hip +// REQUIRES: fusion + +// Test complete fusion with private internalization specified on the +// accessors. 
+ +#include + +using namespace sycl; + +int main() { + constexpr size_t dataSize = 512; + int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize]; + + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = i * 2; + in2[i] = i * 3; + in3[i] = i * 4; + tmp[i] = -1; + out[i] = -1; + } + + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + { + buffer bIn1{in1, range{dataSize}}; + buffer bIn2{in2, range{dataSize}}; + buffer bIn3{in3, range{dataSize}}; + buffer bTmp{tmp, range{dataSize}}; + buffer bOut{out, range{dataSize}}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q.submit([&](handler &cgh) { + auto accIn1 = bIn1.get_access(cgh); + auto accIn2 = bIn2.get_access(cgh); + auto accTmp = bTmp.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_private{}); + cgh.parallel_for( + dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; }); + }); + + q.submit([&](handler &cgh) { + auto accTmp = bTmp.get_access( + cgh, sycl::ext::codeplay::experimental::property::promote_private{}); + auto accIn3 = bIn3.get_access(cgh); + auto accOut = bOut.get_access(cgh); + cgh.parallel_for( + dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; }); + }); + + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + } + + // Check the results + for (size_t i = 0; i < dataSize; ++i) { + assert(out[i] == (20 * i * i) && "Computation error"); + assert(tmp[i] == -1 && "Not internalized"); + } + + return 0; +} diff --git a/SYCL/KernelFusion/ranged_offset_accessor.cpp b/SYCL/KernelFusion/ranged_offset_accessor.cpp new file mode 100644 index 0000000000..95f0b06c44 --- /dev/null +++ b/SYCL/KernelFusion/ranged_offset_accessor.cpp @@ -0,0 +1,80 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// UNSUPPORTED: cuda || hip +// REQUIRES: fusion + +// Test complete fusion with private internalization on accessors with different +// offset and range. 
+ +#include + +using namespace sycl; + +int main() { + constexpr size_t dataSize = 512; + int in1[dataSize * 5], in2[dataSize * 5], in3[dataSize * 5], + tmp[dataSize * 5], out[dataSize * 5]; + + size_t offsetIn1 = 0; + size_t offsetIn2 = 512; + size_t offsetIn3 = 1024; + size_t offsetTmp = 1536; + size_t offsetOut = 2048; + + for (size_t i = 0; i < dataSize; ++i) { + in1[offsetIn1 + i] = i * 2; + in2[offsetIn2 + i] = i * 3; + in3[offsetIn3 + i] = i * 4; + tmp[offsetTmp + i] = -1; + out[offsetOut + i] = -1; + } + + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + { + buffer bIn1{in1, range{dataSize * 5}}; + buffer bIn2{in2, range{dataSize * 5}}; + buffer bIn3{in3, range{dataSize * 5}}; + buffer bTmp{tmp, range{dataSize * 5}}; + buffer bOut{out, range{dataSize * 5}}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q.submit([&](handler &cgh) { + auto accIn1 = bIn1.get_access(cgh, range{516}, id{offsetIn1}); + auto accIn2 = bIn2.get_access(cgh, range{513}, id{offsetIn2}); + auto accTmp = bTmp.get_access( + cgh, range{514}, id{offsetTmp}, + sycl::ext::codeplay::experimental::property::promote_private{}); + cgh.parallel_for( + dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; }); + }); + + q.submit([&](handler &cgh) { + auto accTmp = bTmp.get_access( + cgh, range{514}, id{offsetTmp}, + sycl::ext::codeplay::experimental::property::promote_private{}); + auto accIn3 = bIn3.get_access(cgh, range{515}, id{offsetIn3}); + auto accOut = bOut.get_access(cgh, range{512}, id{offsetOut}); + cgh.parallel_for( + dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; }); + }); + + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + } + + // Check the results + for (size_t i = 0; i < dataSize; ++i) { + assert(out[offsetOut + i] == (20 * i * i) && "Computation error"); + assert(tmp[offsetTmp + i] == -1 && "Not internalized"); + } + + return 0; +} diff --git a/SYCL/KernelFusion/struct_with_array.cpp b/SYCL/KernelFusion/struct_with_array.cpp new file mode 100644 index 0000000000..dca54abfa2 --- /dev/null +++ b/SYCL/KernelFusion/struct_with_array.cpp @@ -0,0 +1,82 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// UNSUPPORTED: cuda || hip +// REQUIRES: fusion + +// Test complete fusion with private internalization on a kernel functor with an +// array member. 
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+struct KernelTwo {
+  accessor<int> buf;
+  accessor<int> out;
+  int coef[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+  KernelTwo(accessor<int> buf, accessor<int> out) : buf{buf}, out{out} {}
+
+  void operator()(nd_item<1> i) const {
+    out[i.get_global_linear_id()] =
+        buf[i.get_global_linear_id()] * coef[i.get_local_linear_id()];
+  }
+};
+
+int main() {
+  constexpr size_t dataSize = 512;
+  int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];
+
+  for (size_t i = 0; i < dataSize; ++i) {
+    in1[i] = i * 2;
+    in2[i] = i * 3;
+    in3[i] = i * 4;
+    tmp[i] = -1;
+    out[i] = -1;
+  }
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  {
+    buffer bIn1{in1, range{dataSize}};
+    buffer bIn2{in2, range{dataSize}};
+    buffer bIn3{in3, range{dataSize}};
+    buffer bTmp{tmp, range{dataSize}};
+    buffer bOut{out, range{dataSize}};
+
+    ext::codeplay::experimental::fusion_wrapper fw{q};
+    fw.start_fusion();
+
+    assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+    q.submit([&](handler &cgh) {
+      auto accIn1 = bIn1.get_access(cgh);
+      auto accIn2 = bIn2.get_access(cgh);
+      auto accTmp = bTmp.get_access(
+          cgh, sycl::ext::codeplay::experimental::property::promote_private{});
+      cgh.parallel_for(
+          nd_range<1>{{dataSize}, {8}},
+          [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; });
+    });
+
+    q.submit([&](handler &cgh) {
+      auto accTmp = bTmp.get_access(
+          cgh, sycl::ext::codeplay::experimental::property::promote_private{});
+      auto accOut = bOut.get_access(cgh);
+      cgh.parallel_for(nd_range<1>{{dataSize}, {8}}, KernelTwo{accTmp, accOut});
+    });
+
+    fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
+
+    assert(!fw.is_in_fusion_mode() &&
+           "Queue should not be in fusion mode anymore");
+  }
+
+  // Check the results
+  for (size_t i = 0; i < dataSize; ++i) {
+    assert(out[i] == (5 * i * (i % 8)) && "Computation error");
+    assert(tmp[i] == -1 && "Not internalized");
+  }
+
+  return 0;
+}
diff --git a/SYCL/KernelFusion/sync_two_queues_event_dep.cpp b/SYCL/KernelFusion/sync_two_queues_event_dep.cpp
index f1389bdf99..bb33fcdcb8 100644
--- a/SYCL/KernelFusion/sync_two_queues_event_dep.cpp
+++ b/SYCL/KernelFusion/sync_two_queues_event_dep.cpp
@@ -4,10 +4,8 @@
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %GPU_CHECK_PLACEHOLDER
 // UNSUPPORTED: cuda || hip
-
-// For this test, complete_fusion must be supported, which is currently not the
-// case on Windows.
-// REQUIRES: linux
+// For this test, complete_fusion must be supported.
+// REQUIRES: fusion
 
 // Test fusion cancellation on event dependency between two active fusions.
 
diff --git a/SYCL/KernelFusion/sync_two_queues_requirement.cpp b/SYCL/KernelFusion/sync_two_queues_requirement.cpp
index 716ebd36ea..990049a1f7 100644
--- a/SYCL/KernelFusion/sync_two_queues_requirement.cpp
+++ b/SYCL/KernelFusion/sync_two_queues_requirement.cpp
@@ -4,10 +4,8 @@
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %GPU_CHECK_PLACEHOLDER
 // UNSUPPORTED: cuda || hip
-
-// For this test, complete_fusion must be supported, which is currently not the
-// case on Windows.
-// REQUIRES: linux
+// For this test, complete_fusion must be supported.
+// REQUIRES: fusion
 
 // Test fusion cancellation for requirement between two active fusions.
diff --git a/SYCL/KernelFusion/three_dimensional.cpp b/SYCL/KernelFusion/three_dimensional.cpp
new file mode 100644
index 0000000000..e8006ca091
--- /dev/null
+++ b/SYCL/KernelFusion/three_dimensional.cpp
@@ -0,0 +1,75 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// UNSUPPORTED: cuda || hip
+// REQUIRES: fusion
+
+// Test complete fusion with private internalization specified on the
+// accessors for a three-dimensional range.
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+int main() {
+  constexpr size_t sizeX = 16;
+  constexpr size_t sizeY = 8;
+  constexpr size_t sizeZ = 4;
+  constexpr size_t dataSize = sizeX * sizeY * sizeZ;
+  int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];
+
+  for (size_t i = 0; i < dataSize; ++i) {
+    in1[i] = i * 2;
+    in2[i] = i * 3;
+    in3[i] = i * 4;
+    tmp[i] = -1;
+    out[i] = -1;
+  }
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  {
+    range<3> xyRange{sizeZ, sizeY, sizeX};
+    buffer bIn1{in1, xyRange};
+    buffer bIn2{in2, xyRange};
+    buffer bIn3{in3, xyRange};
+    buffer bTmp{tmp, xyRange};
+    buffer bOut{out, xyRange};
+
+    ext::codeplay::experimental::fusion_wrapper fw{q};
+    fw.start_fusion();
+
+    assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+    q.submit([&](handler &cgh) {
+      auto accIn1 = bIn1.get_access(cgh);
+      auto accIn2 = bIn2.get_access(cgh);
+      auto accTmp = bTmp.get_access(
+          cgh, sycl::ext::codeplay::experimental::property::promote_private{});
+      cgh.parallel_for(
+          xyRange, [=](id<3> i) { accTmp[i] = accIn1[i] + accIn2[i]; });
+    });
+
+    q.submit([&](handler &cgh) {
+      auto accTmp = bTmp.get_access(
+          cgh, sycl::ext::codeplay::experimental::property::promote_private{});
+      auto accIn3 = bIn3.get_access(cgh);
+      auto accOut = bOut.get_access(cgh);
+      cgh.parallel_for(
+          xyRange, [=](id<3> i) { accOut[i] = accTmp[i] * accIn3[i]; });
+    });
+
+    fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
+
+    assert(!fw.is_in_fusion_mode() &&
+           "Queue should not be in fusion mode anymore");
+  }
+
+  // Check the results
+  for (size_t i = 0; i < dataSize; ++i) {
+    assert(out[i] == (20 * i * i) && "Computation error");
+    assert(tmp[i] == -1 && "Not internalized");
+  }
+
+  return 0;
+}
diff --git a/SYCL/KernelFusion/two_dimensional.cpp b/SYCL/KernelFusion/two_dimensional.cpp
new file mode 100644
index 0000000000..62bca54ff1
--- /dev/null
+++ b/SYCL/KernelFusion/two_dimensional.cpp
@@ -0,0 +1,74 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// UNSUPPORTED: cuda || hip
+// REQUIRES: fusion
+
+// Test complete fusion with private internalization specified on the
+// accessors for a two-dimensional range.
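+// The buffers below are created over range<2>{sizeY, sizeX} and the kernels
+// use id<2>, but initialization and the final check still use the linearized
+// index, so the expected value remains out[i] == 20 * i * i.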
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+int main() {
+  constexpr size_t sizeX = 16;
+  constexpr size_t sizeY = 32;
+  constexpr size_t dataSize = sizeX * sizeY;
+  int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];
+
+  for (size_t i = 0; i < dataSize; ++i) {
+    in1[i] = i * 2;
+    in2[i] = i * 3;
+    in3[i] = i * 4;
+    tmp[i] = -1;
+    out[i] = -1;
+  }
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  {
+    range<2> xyRange{sizeY, sizeX};
+    buffer bIn1{in1, xyRange};
+    buffer bIn2{in2, xyRange};
+    buffer bIn3{in3, xyRange};
+    buffer bTmp{tmp, xyRange};
+    buffer bOut{out, xyRange};
+
+    ext::codeplay::experimental::fusion_wrapper fw{q};
+    fw.start_fusion();
+
+    assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+    q.submit([&](handler &cgh) {
+      auto accIn1 = bIn1.get_access(cgh);
+      auto accIn2 = bIn2.get_access(cgh);
+      auto accTmp = bTmp.get_access(
+          cgh, sycl::ext::codeplay::experimental::property::promote_private{});
+      cgh.parallel_for(
+          xyRange, [=](id<2> i) { accTmp[i] = accIn1[i] + accIn2[i]; });
+    });
+
+    q.submit([&](handler &cgh) {
+      auto accTmp = bTmp.get_access(
+          cgh, sycl::ext::codeplay::experimental::property::promote_private{});
+      auto accIn3 = bIn3.get_access(cgh);
+      auto accOut = bOut.get_access(cgh);
+      cgh.parallel_for(
+          xyRange, [=](id<2> i) { accOut[i] = accTmp[i] * accIn3[i]; });
+    });
+
+    fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
+
+    assert(!fw.is_in_fusion_mode() &&
+           "Queue should not be in fusion mode anymore");
+  }
+
+  // Check the results
+  for (size_t i = 0; i < dataSize; ++i) {
+    assert(out[i] == (20 * i * i) && "Computation error");
+    assert(tmp[i] == -1 && "Not internalized");
+  }
+
+  return 0;
+}
diff --git a/SYCL/KernelFusion/usm_no_dependencies.cpp b/SYCL/KernelFusion/usm_no_dependencies.cpp
new file mode 100644
index 0000000000..2f18f758ba
--- /dev/null
+++ b/SYCL/KernelFusion/usm_no_dependencies.cpp
@@ -0,0 +1,70 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// UNSUPPORTED: cuda || hip
+// REQUIRES: fusion
+
+// Test complete fusion using USM pointers.
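+// The two kernels below communicate through the shared USM allocation tmp but
+// declare no buffer requirements or event dependencies; the test exits early
+// if the device does not support usm_shared_allocations.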
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+int main() {
+  constexpr size_t dataSize = 512;
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  if (!q.get_device().has(sycl::aspect::usm_shared_allocations)) {
+    return 0;
+  }
+
+  int *in1 = sycl::malloc_shared<int>(dataSize, q);
+  int *in2 = sycl::malloc_shared<int>(dataSize, q);
+  int *in3 = sycl::malloc_shared<int>(dataSize, q);
+  int *tmp = sycl::malloc_shared<int>(dataSize, q);
+  int *out = sycl::malloc_shared<int>(dataSize, q);
+
+  for (size_t i = 0; i < dataSize; ++i) {
+    in1[i] = i * 2;
+    in2[i] = i * 3;
+    in3[i] = i * 4;
+    tmp[i] = -1;
+    out[i] = -1;
+  }
+
+  ext::codeplay::experimental::fusion_wrapper fw{q};
+  fw.start_fusion();
+
+  assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+  q.submit([&](handler &cgh) {
+    cgh.parallel_for(
+        dataSize, [=](id<1> i) { tmp[i] = in1[i] + in2[i]; });
+  });
+
+  q.submit([&](handler &cgh) {
+    cgh.parallel_for(
+        dataSize, [=](id<1> i) { out[i] = tmp[i] * in3[i]; });
+  });
+
+  fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
+
+  assert(!fw.is_in_fusion_mode() &&
+         "Queue should not be in fusion mode anymore");
+
+  q.wait();
+
+  // Check the results
+  for (size_t i = 0; i < dataSize; ++i) {
+    assert(out[i] == (20 * i * i) && "Computation error");
+  }
+
+  sycl::free(in1, q);
+  sycl::free(in2, q);
+  sycl::free(in3, q);
+  sycl::free(tmp, q);
+  sycl::free(out, q);
+
+  return 0;
+}
diff --git a/SYCL/KernelFusion/work_group_barrier.cpp b/SYCL/KernelFusion/work_group_barrier.cpp
new file mode 100644
index 0000000000..7141c37be8
--- /dev/null
+++ b/SYCL/KernelFusion/work_group_barrier.cpp
@@ -0,0 +1,79 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// UNSUPPORTED: cuda || hip
+// REQUIRES: fusion
+
+// Test complete fusion with a combination of kernels that require a work-group
+// barrier to be inserted by fusion.
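+// In the first kernel below, each work-item writes accTmp at a mirrored
+// position within its work-group, while the second kernel reads accTmp at the
+// plain global linear id. A fused kernel therefore needs a work-group barrier
+// between the two parts, which is why this test calls complete_fusion()
+// without the no_barriers property.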
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+int main() {
+  constexpr size_t dataSize = 512;
+  int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];
+
+  for (size_t i = 0; i < dataSize; ++i) {
+    in1[i] = i * 2;
+    in2[i] = i * 3;
+    in3[i] = i * 4;
+    tmp[i] = -1;
+    out[i] = -1;
+  }
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  {
+    buffer bIn1{in1, range{dataSize}};
+    buffer bIn2{in2, range{dataSize}};
+    buffer bIn3{in3, range{dataSize}};
+    buffer bTmp{tmp, range{dataSize}};
+    buffer bOut{out, range{dataSize}};
+
+    ext::codeplay::experimental::fusion_wrapper fw{q};
+    fw.start_fusion();
+
+    assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+    q.submit([&](handler &cgh) {
+      auto accIn1 = bIn1.get_access(cgh);
+      auto accIn2 = bIn2.get_access(cgh);
+      auto accTmp = bTmp.get_access(cgh);
+      cgh.parallel_for(
+          nd_range<1>{{dataSize}, {32}}, [=](nd_item<1> i) {
+            auto workgroupSize = i.get_local_range(0);
+            auto baseOffset = i.get_group_linear_id() * workgroupSize;
+            auto localIndex = i.get_local_linear_id();
+            auto localOffset = (workgroupSize - 1) - localIndex;
+            accTmp[baseOffset + localOffset] =
+                accIn1[baseOffset + localOffset] +
+                accIn2[baseOffset + localOffset];
+          });
+    });
+
+    q.submit([&](handler &cgh) {
+      auto accTmp = bTmp.get_access(cgh);
+      auto accIn3 = bIn3.get_access(cgh);
+      auto accOut = bOut.get_access(cgh);
+      cgh.parallel_for(
+          nd_range<1>{{dataSize}, {32}}, [=](nd_item<1> i) {
+            auto index = i.get_global_linear_id();
+            accOut[index] = accTmp[index] * accIn3[index];
+          });
+    });
+
+    fw.complete_fusion();
+
+    assert(!fw.is_in_fusion_mode() &&
+           "Queue should not be in fusion mode anymore");
+  }
+
+  // Check the results
+  for (size_t i = 0; i < dataSize; ++i) {
+    assert(out[i] == (20 * i * i) && "Computation error");
+  }
+
+  return 0;
+}
diff --git a/SYCL/KernelFusion/wrapped_usm.cpp b/SYCL/KernelFusion/wrapped_usm.cpp
new file mode 100644
index 0000000000..8532a9dadf
--- /dev/null
+++ b/SYCL/KernelFusion/wrapped_usm.cpp
@@ -0,0 +1,80 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// UNSUPPORTED: cuda || hip
+// REQUIRES: fusion
+
+// Test complete fusion using a wrapped USM pointer as a kernel functor argument.
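+// The wrapper struct below owns a raw shared USM pointer; the kernels capture
+// the wrapper objects by value and dereference their data members, so the USM
+// pointers reach the fused kernel embedded in a struct argument rather than as
+// plain pointer arguments.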
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+template <typename T> struct wrapper {
+  T *data;
+
+  wrapper(size_t dataSize, queue &q)
+      : data{sycl::malloc_shared<T>(dataSize, q)} {}
+
+  T &operator[](size_t i) { return data[i]; }
+  const T &operator[](size_t i) const { return data[i]; }
+};
+
+int main() {
+  constexpr size_t dataSize = 512;
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  if (!q.get_device().has(sycl::aspect::usm_shared_allocations)) {
+    return 0;
+  }
+
+  wrapper<int> in1{dataSize, q};
+  wrapper<int> in2{dataSize, q};
+  wrapper<int> in3{dataSize, q};
+  wrapper<int> tmp{dataSize, q};
+  wrapper<int> out{dataSize, q};
+
+  for (size_t i = 0; i < dataSize; ++i) {
+    in1[i] = i * 2;
+    in2[i] = i * 3;
+    in3[i] = i * 4;
+    tmp[i] = -1;
+    out[i] = -1;
+  }
+
+  ext::codeplay::experimental::fusion_wrapper fw{q};
+  fw.start_fusion();
+
+  assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+  q.submit([&](handler &cgh) {
+    cgh.parallel_for(
+        dataSize, [=](id<1> i) { tmp.data[i] = in1.data[i] + in2.data[i]; });
+  });
+
+  q.submit([&](handler &cgh) {
+    cgh.parallel_for(
+        dataSize, [=](id<1> i) { out.data[i] = tmp.data[i] * in3.data[i]; });
+  });
+
+  fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
+
+  assert(!fw.is_in_fusion_mode() &&
+         "Queue should not be in fusion mode anymore");
+
+  q.wait();
+
+  // Check the results
+  for (size_t i = 0; i < dataSize; ++i) {
+    assert(out[i] == (20 * i * i) && "Computation error");
+  }
+
+  sycl::free(in1.data, q);
+  sycl::free(in2.data, q);
+  sycl::free(in3.data, q);
+  sycl::free(tmp.data, q);
+  sycl::free(out.data, q);
+
+  return 0;
+}
diff --git a/SYCL/lit.cfg.py b/SYCL/lit.cfg.py
index 3a7e281285..144f4f0aff 100644
--- a/SYCL/lit.cfg.py
+++ b/SYCL/lit.cfg.py
@@ -427,6 +427,23 @@ else:
     lit_config.warning("Couldn't find pre-installed AOT device compiler " +
                        aot_tool)
 
+# Check if kernel fusion is available by compiling a small program that will
+# be ill-formed (compilation stops with a non-zero exit code) if the feature
+# test macro for kernel fusion is not defined.
+check_fusion_file = 'check_fusion.cpp'
+with open(check_fusion_file, 'w') as ff:
+    ff.write('#include <sycl/sycl.hpp>\n')
+    ff.write('#ifndef SYCL_EXT_CODEPLAY_KERNEL_FUSION\n')
+    ff.write('#error \"Feature test for fusion failed\"\n')
+    ff.write('#endif // SYCL_EXT_CODEPLAY_KERNEL_FUSION\n')
+    ff.write('int main() { return 0; }\n')
+
+status = subprocess.getstatusoutput(config.dpcpp_compiler + ' -fsycl ' +
+                                    check_fusion_file)
+if status[0] == 0:
+    lit_config.note('Kernel fusion extension enabled')
+    config.available_features.add('fusion')
+
 # Set timeout for a single test
 try:
     import psutil