From a4e4864f6234b2d0ef88dee5af0b6a3587b50d3b Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Thu, 19 Jan 2023 10:07:57 +0000 Subject: [PATCH] [SYCL][Fusion] Test caching of fused kernels Signed-off-by: Lukas Sommer --- SYCL/KernelFusion/jit_caching.cpp | 142 ++++++++++++++++++++++++++++++ 1 file changed, 142 insertions(+) create mode 100644 SYCL/KernelFusion/jit_caching.cpp diff --git a/SYCL/KernelFusion/jit_caching.cpp b/SYCL/KernelFusion/jit_caching.cpp new file mode 100644 index 0000000000..d49f7b63a6 --- /dev/null +++ b/SYCL/KernelFusion/jit_caching.cpp @@ -0,0 +1,142 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\ +// RUN: %CPU_CHECK_PLACEHOLDER --implicit-check-not "COMPUTATION ERROR" --implicit-check-not "WRONG INTERNALIZATION" +// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\ +// RUN: %GPU_CHECK_PLACEHOLDER --implicit-check-not "COMPUTATION ERROR" --implicit-check-not "WRONG INTERNALIZATION" +// UNSUPPORTED: cuda || hip +// REQUIRES: fusion + +// Test caching for JIT fused kernels. Also test for debug messages being +// printed when SYCL_RT_WARNING_LEVEL=1. + +#include +#include + +using namespace sycl; + +constexpr size_t dataSize = 512; + +enum class Internalization { None, Local, Private }; + +void performFusion(queue &q, Internalization internalize, range<1> globalSize, + int beta, int gamma, bool insertBarriers = false) { + int alpha = 1; + int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize]; + + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = i * 2; + in2[i] = i * 3; + in3[i] = i * 4; + tmp[i] = -1; + out[i] = -1; + } + { + buffer bIn1{in1, globalSize}; + buffer bIn2{in2, globalSize}; + buffer bIn3{in3, globalSize}; + buffer bTmp{tmp, globalSize}; + buffer bOut{out, globalSize}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + fw.start_fusion(); + + assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); + + q.submit([&](handler &cgh) { + auto accIn1 = bIn1.get_access(cgh); + auto accIn2 = bIn2.get_access(cgh); + property_list properties{}; + if (internalize == Internalization::Private) { + properties = { + sycl::ext::codeplay::experimental::property::promote_private{}}; + } else if (internalize == Internalization::Local) { + properties = { + sycl::ext::codeplay::experimental::property::promote_local{}}; + } + accessor accTmp = bTmp.get_access(cgh, properties); + cgh.parallel_for(globalSize, [=](id<1> i) { + accTmp[i] = accIn1[i] + accIn2[i] * alpha; + }); + }); + + q.submit([&](handler &cgh) { + property_list properties{}; + if (internalize == Internalization::Private) { + properties = { + sycl::ext::codeplay::experimental::property::promote_private{}}; + } else if (internalize == Internalization::Local) { + properties = { + sycl::ext::codeplay::experimental::property::promote_local{}}; + } + accessor accTmp = bTmp.get_access(cgh, properties); + auto accIn3 = bIn3.get_access(cgh); + auto accOut = bOut.get_access(cgh); + cgh.parallel_for(globalSize, [=](id<1> i) { + accOut[i] = accTmp[i] * accIn3[i] * beta * gamma; + }); + }); + + if (insertBarriers) { + fw.complete_fusion(); + } else { + fw.complete_fusion( + {ext::codeplay::experimental::property::no_barriers{}}); + } + + assert(!fw.is_in_fusion_mode() && + "Queue should not be in fusion mode anymore"); + } + + // Check the results + size_t numErrors = 0; + size_t numInternalized = 0; + for (size_t i = 0; i < dataSize; ++i) { + if (i < globalSize.size() && out[i] != (20 * i * i * beta * gamma)) { + ++numErrors; + } + if (tmp[i] == -1) { + ++numInternalized; + } + } + if (numErrors) { + std::cout << "COMPUTATION ERROR\n"; + } + if ((internalize == Internalization::None) && numInternalized) { + std::cout << "WRONG INTERNALIZATION\n"; + } +} + +int main() { + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + + // Initial invocation + performFusion(q, Internalization::Private, range<1>{dataSize}, 1, 1); + // CHECK: JIT DEBUG: Compiling new kernel, no suitable cached kernel found + + // Identical invocation, should lead to JIT cache hit. + performFusion(q, Internalization::Private, range<1>{dataSize}, 1, 1); + // CHECK-NEXT: JIT DEBUG: Re-using cached JIT kernel + // CHECK-NEXT: INFO: Re-using existing device binary for fused kernel + + // Invocation with a different beta. Because beta was identical to alpha so + // far, this should lead to a cache miss. + performFusion(q, Internalization::Private, range<1>{dataSize}, 2, 1); + // CHECK-NEXT: JIT DEBUG: Compiling new kernel, no suitable cached kernel found + + // Invocation with barrier insertion should lead to a cache miss. + performFusion(q, Internalization::Private, range<1>{dataSize}, 1, 1, + /* insertBarriers */ true); + // CHECK-NEXT: JIT DEBUG: Compiling new kernel, no suitable cached kernel found + + // Invocation with different internalization target should lead to a cache + // miss. + performFusion(q, Internalization::None, range<1>{dataSize}, 1, 1); + // CHECK-NEXT: JIT DEBUG: Compiling new kernel, no suitable cached kernel found + + // Invocation with a different gamma should lead to a cache miss because gamma + // participates in constant propagation. + performFusion(q, Internalization::Private, range<1>{dataSize}, 1, 2); + // CHECK-NEXT: JIT DEBUG: Compiling new kernel, no suitable cached kernel found + + return 0; +}