From 7d97996a2547d7831f679c0ac354ceaa36d33809 Mon Sep 17 00:00:00 2001 From: Julian Oppermann Date: Mon, 2 Dec 2024 10:56:16 +0000 Subject: [PATCH 1/3] [SYCL][RTC] Preliminary support for ESIMD kernels Signed-off-by: Julian Oppermann --- sycl-jit/common/include/Kernel.h | 4 + sycl-jit/jit-compiler/CMakeLists.txt | 2 + sycl-jit/jit-compiler/lib/KernelFusion.cpp | 9 +- .../lib/rtc/DeviceCompilation.cpp | 86 +++++++++++++----- .../jit-compiler/lib/rtc/DeviceCompilation.h | 5 +- sycl-jit/jit-compiler/lib/rtc/ESIMD.cpp | 77 ++++++++++++++++ sycl-jit/jit-compiler/lib/rtc/ESIMD.h | 23 +++++ .../kernel_compiler_sycl_jit.cpp | 90 ++++++++++++++++++- 8 files changed, 264 insertions(+), 32 deletions(-) create mode 100644 sycl-jit/jit-compiler/lib/rtc/ESIMD.cpp create mode 100644 sycl-jit/jit-compiler/lib/rtc/ESIMD.h diff --git a/sycl-jit/common/include/Kernel.h b/sycl-jit/common/include/Kernel.h index 794126a462e20..efd6e1ded3051 100644 --- a/sycl-jit/common/include/Kernel.h +++ b/sycl-jit/common/include/Kernel.h @@ -403,6 +403,10 @@ struct RTCBundleInfo { RTCBundleBinaryInfo BinaryInfo; FrozenSymbolTable SymbolTable; FrozenPropertyRegistry Properties; + + RTCBundleInfo() = default; + RTCBundleInfo(RTCBundleInfo &&) = default; + RTCBundleInfo &operator=(RTCBundleInfo &&) = default; }; } // namespace jit_compiler diff --git a/sycl-jit/jit-compiler/CMakeLists.txt b/sycl-jit/jit-compiler/CMakeLists.txt index 82d2356691c3f..63bb2ecc34ad9 100644 --- a/sycl-jit/jit-compiler/CMakeLists.txt +++ b/sycl-jit/jit-compiler/CMakeLists.txt @@ -8,6 +8,7 @@ add_llvm_library(sycl-jit lib/fusion/JITContext.cpp lib/fusion/ModuleHelper.cpp lib/rtc/DeviceCompilation.cpp + lib/rtc/ESIMD.cpp lib/helper/ConfigHelper.cpp SHARED @@ -32,6 +33,7 @@ add_llvm_library(sycl-jit TargetParser MC SYCLLowerIR + GenXIntrinsics ${LLVM_TARGETS_TO_BUILD} LINK_LIBS diff --git a/sycl-jit/jit-compiler/lib/KernelFusion.cpp b/sycl-jit/jit-compiler/lib/KernelFusion.cpp index fae9a3c29dcf4..2f58660e22c8b 100644 --- a/sycl-jit/jit-compiler/lib/KernelFusion.cpp +++ b/sycl-jit/jit-compiler/lib/KernelFusion.cpp @@ -258,12 +258,13 @@ compileSYCL(InMemoryFile SourceFile, View IncludeFiles, return errorTo(std::move(Error), "Device linking failed"); } - auto BundleInfoOrError = performPostLink(*Module, UserArgList); - if (!BundleInfoOrError) { - return errorTo(BundleInfoOrError.takeError(), + auto PostLinkResultOrError = performPostLink(std::move(Module), UserArgList); + if (!PostLinkResultOrError) { + return errorTo(PostLinkResultOrError.takeError(), "Post-link phase failed"); } - auto BundleInfo = std::move(*BundleInfoOrError); + RTCBundleInfo BundleInfo; + std::tie(BundleInfo, Module) = std::move(*PostLinkResultOrError); auto BinaryInfoOrError = translation::KernelTranslator::translateBundleToSPIRV( diff --git a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp index 1d75136160e99..3ed57fa4a9ad5 100644 --- a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp +++ b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "DeviceCompilation.h" +#include "ESIMD.h" #include #include @@ -23,6 +24,8 @@ #include #include #include +#include +#include #include #include #include @@ -376,42 +379,82 @@ template static bool runModulePass(llvm::Module &M) { return !Res.areAllPreserved(); } -Expected jit_compiler::performPostLink( - llvm::Module &Module, [[maybe_unused]] const InputArgList &UserArgList) { +llvm::Expected jit_compiler::performPostLink( + std::unique_ptr Module, + [[maybe_unused]] const llvm::opt::InputArgList &UserArgList) { // This is a simplified version of `processInputModule` in // `llvm/tools/sycl-post-link.cpp`. Assertions/TODOs point to functionality // left out of the algorithm for now. - assert(!Module.getGlobalVariable("llvm.used") && - !Module.getGlobalVariable("llvm.compiler.used")); + // TODO: SplitMode can be controlled by the user. + const auto SplitMode = SPLIT_NONE; + + // TODO: EmitOnlyKernelsAsEntryPoints is controlled by + // `shouldEmitOnlyKernelsAsEntryPoints` in + // `clang/lib/Driver/ToolChains/Clang.cpp`. + const bool EmitOnlyKernelsAsEntryPoints = true; + + // TODO: The optlevel passed to `sycl-post-link` is determined by + // `getSYCLPostLinkOptimizationLevel` in + // `clang/lib/Driver/ToolChains/Clang.cpp`. + const bool PerformOpts = true; + + // Propagate ESIMD attribute to wrapper functions to prevent spurious splits + // and kernel link errors. + runModulePass(*Module); + + assert(!Module->getGlobalVariable("llvm.used") && + !Module->getGlobalVariable("llvm.compiler.used")); // Otherwise: Port over the `removeSYCLKernelsConstRefArray` and // `removeDeviceGlobalFromCompilerUsed` methods. - assert(!isModuleUsingAsan(Module)); + assert(!isModuleUsingAsan(*Module)); // Otherwise: Need to instrument each image scope device globals if the module // has been instrumented by sanitizer pass. // Transform Joint Matrix builtin calls to align them with SPIR-V friendly // LLVM IR specification. - runModulePass(Module); + runModulePass(*Module); + + // Do invoke_simd processing before splitting because this: + // - saves processing time (the pass is run once, even though on larger IR) + // - doing it before SYCL/ESIMD splitting is required for correctness + if (runModulePass(*Module)) { + return createStringError("`invoke_simd` calls detected"); + } // TODO: Implement actual device code splitting. We're just using the splitter // to obtain additional information about the module for now. - // TODO: EmitOnlyKernelsAsEntryPoints is controlled by - // `shouldEmitOnlyKernelsAsEntryPoints` in - // `clang/lib/Driver/ToolChains/Clang.cpp`. + std::unique_ptr Splitter = getDeviceCodeSplitter( - ModuleDesc{std::unique_ptr{&Module}}, SPLIT_NONE, - /*IROutputOnly=*/false, - /*EmitOnlyKernelsAsEntryPoints=*/true); - assert(Splitter->remainingSplits() == 1); + ModuleDesc{std::move(Module)}, SplitMode, + /*IROutputOnly=*/false, EmitOnlyKernelsAsEntryPoints); + assert(Splitter->hasMoreSplits()); + if (Splitter->remainingSplits() > 1) { + return createStringError("Device code requires splitting"); + } // TODO: Call `verifyNoCrossModuleDeviceGlobalUsage` if device globals shall // be processed. - assert(Splitter->hasMoreSplits()); ModuleDesc MDesc = Splitter->nextSplit(); - assert(&Module == &MDesc.getModule()); + + // TODO: Call `MDesc.fixupLinkageOfDirectInvokeSimdTargets()` when + // `invoke_simd` is supported. + + SmallVector ESIMDSplits = + splitByESIMD(std::move(MDesc), EmitOnlyKernelsAsEntryPoints); + assert(!ESIMDSplits.empty()); + if (ESIMDSplits.size() > 1) { + return createStringError("Mixing SYCL and ESIMD code is unsupported"); + } + MDesc = std::move(ESIMDSplits.front()); + + if (MDesc.isESIMD()) { + // TODO: We're assuming ESIMD lowering is not deactivated (why would it?). + lowerEsimdConstructs(MDesc, PerformOpts); + } + MDesc.saveSplitInformationAsMetadata(); RTCBundleInfo BundleInfo; @@ -448,10 +491,7 @@ Expected jit_compiler::performPostLink( } }; - // Regain ownership of the module. - MDesc.releaseModulePtr().release(); - - return std::move(BundleInfo); + return PostLinkResult{std::move(BundleInfo), MDesc.releaseModulePtr()}; } Expected @@ -513,11 +553,9 @@ jit_compiler::parseUserArgs(View UserArgs) { return createStringError("Device code splitting is not yet supported"); } - if (AL.hasArg(OPT_fsycl_device_code_split_esimd, - OPT_fno_sycl_device_code_split_esimd)) { - // TODO: There are more ESIMD-related options. - return createStringError( - "Runtime compilation of ESIMD kernels is not yet supported"); + if (!AL.hasFlag(OPT_fsycl_device_code_split_esimd, + OPT_fno_sycl_device_code_split_esimd, true)) { + return createStringError("ESIMD device code split cannot be deactivated"); } if (AL.hasFlag(OPT_fsycl_dead_args_optimization, diff --git a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.h b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.h index 8aa47939e3b1d..9d5bea7639353 100644 --- a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.h +++ b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.h @@ -27,8 +27,9 @@ compileDeviceCode(InMemoryFile SourceFile, View IncludeFiles, llvm::Error linkDeviceLibraries(llvm::Module &Module, const llvm::opt::InputArgList &UserArgList); -llvm::Expected -performPostLink(llvm::Module &Module, +using PostLinkResult = std::pair>; +llvm::Expected +performPostLink(std::unique_ptr Module, const llvm::opt::InputArgList &UserArgList); llvm::Expected diff --git a/sycl-jit/jit-compiler/lib/rtc/ESIMD.cpp b/sycl-jit/jit-compiler/lib/rtc/ESIMD.cpp new file mode 100644 index 0000000000000..1caa5cf078d91 --- /dev/null +++ b/sycl-jit/jit-compiler/lib/rtc/ESIMD.cpp @@ -0,0 +1,77 @@ +//===------------- ESIMD.cpp - Driver for ESIMD lowering ------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ESIMD.h" + +#include "llvm/Analysis/CGSCCPassManager.h" +#include "llvm/Analysis/LoopAnalysisManager.h" +#include "llvm/GenXIntrinsics/GenXSPIRVWriterAdaptor.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Passes/PassBuilder.h" +#include "llvm/SYCLLowerIR/ESIMD/LowerESIMD.h" +#include "llvm/Transforms/InstCombine/InstCombine.h" +#include "llvm/Transforms/Scalar/DCE.h" +#include "llvm/Transforms/Scalar/EarlyCSE.h" +#include "llvm/Transforms/Scalar/SROA.h" + +using namespace llvm; + +using string_vector = std::vector; + +// When ESIMD code was separated from the regular SYCL code, +// we can safely process ESIMD part. +void jit_compiler::lowerEsimdConstructs(module_split::ModuleDesc &MD, + bool PerformOpts) { + LoopAnalysisManager LAM; + CGSCCAnalysisManager CGAM; + FunctionAnalysisManager FAM; + ModuleAnalysisManager MAM; + + PassBuilder PB; + PB.registerModuleAnalyses(MAM); + PB.registerCGSCCAnalyses(CGAM); + PB.registerFunctionAnalyses(FAM); + PB.registerLoopAnalyses(LAM); + PB.crossRegisterProxies(LAM, FAM, CGAM, MAM); + + ModulePassManager MPM; + MPM.addPass(SYCLLowerESIMDPass(/*ModuleContainsScalar=*/false)); + + if (PerformOpts) { + FunctionPassManager FPM; + FPM.addPass(SROAPass(SROAOptions::ModifyCFG)); + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + } + MPM.addPass(ESIMDOptimizeVecArgCallConvPass{}); + FunctionPassManager MainFPM; + MainFPM.addPass(ESIMDLowerLoadStorePass{}); + + if (!PerformOpts) { + MainFPM.addPass(SROAPass(SROAOptions::ModifyCFG)); + MainFPM.addPass(EarlyCSEPass(true)); + MainFPM.addPass(InstCombinePass{}); + MainFPM.addPass(DCEPass{}); + // TODO: maybe remove some passes below that don't affect code quality + MainFPM.addPass(SROAPass(SROAOptions::ModifyCFG)); + MainFPM.addPass(EarlyCSEPass(true)); + MainFPM.addPass(InstCombinePass{}); + MainFPM.addPass(DCEPass{}); + } + MPM.addPass(ESIMDLowerSLMReservationCalls{}); + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(MainFPM))); + MPM.addPass(GenXSPIRVWriterAdaptor(/*RewriteTypes=*/true, + /*RewriteSingleElementVectorsIn*/ false)); + // GenXSPIRVWriterAdaptor pass replaced some functions with "rewritten" + // versions so the entry point table must be rebuilt. + // TODO Change entry point search to analysis? + std::vector Names; + MD.saveEntryPointNames(Names); + MPM.run(MD.getModule(), MAM); + MD.rebuildEntryPoints(Names); +} diff --git a/sycl-jit/jit-compiler/lib/rtc/ESIMD.h b/sycl-jit/jit-compiler/lib/rtc/ESIMD.h new file mode 100644 index 0000000000000..ebf610b462fae --- /dev/null +++ b/sycl-jit/jit-compiler/lib/rtc/ESIMD.h @@ -0,0 +1,23 @@ +//===-------------- ESIMD.h - Driver for ESIMD lowering -------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef SYCL_JIT_COMPILER_RTC_ESIMD_H +#define SYCL_JIT_COMPILER_RTC_ESIMD_H + +#include "llvm/SYCLLowerIR/ModuleSplitter.h" + +namespace jit_compiler { + +// Runs a pass pipeline to lower ESIMD constructs on the given split model, +// which may only contain ESIMD entrypoints. This is a copy of the similar +// function in `sycl-post-link`. +void lowerEsimdConstructs(llvm::module_split::ModuleDesc &MD, bool PerformOpts); + +} // namespace jit_compiler + +#endif // SYCL_JIT_COMPILER_RTC_ESIMD_H diff --git a/sycl/test-e2e/KernelCompiler/kernel_compiler_sycl_jit.cpp b/sycl/test-e2e/KernelCompiler/kernel_compiler_sycl_jit.cpp index 563f75d313e95..8bb9cb96c398d 100644 --- a/sycl/test-e2e/KernelCompiler/kernel_compiler_sycl_jit.cpp +++ b/sycl/test-e2e/KernelCompiler/kernel_compiler_sycl_jit.cpp @@ -77,6 +77,27 @@ void ff_templated(T *ptr, T *unused) { } )==="; +auto constexpr ESIMDSource = R"===( +#include +#include + +using namespace sycl::ext::intel::esimd; + +constexpr int VL = 16; + +extern "C" SYCL_EXTERNAL SYCL_ESIMD_KERNEL SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((sycl::ext::oneapi::experimental::nd_range_kernel<1>)) +void vector_add_esimd(float *A, float *B, float *C) { + sycl::nd_item<1> item = sycl::ext::oneapi::this_work_item::get_nd_item<1>(); + unsigned int i = item.get_global_id(0); + unsigned int offset = i * VL ; + + simd va(A + offset); + simd vb(B + offset); + simd vc = va + vb; + vc.copy_to(C + offset); +} +)==="; + void test_1(sycl::queue &Queue, sycl::kernel &Kernel, int seed) { constexpr int Range = 10; int *usmPtr = sycl::malloc_shared(Range, Queue); @@ -205,16 +226,81 @@ int test_unsupported_options() { CheckUnsupported({"-Xsycl-target-frontend=spir64", "-fsanitize=address"}); CheckUnsupported({"-Xarch_device", "-fsanitize=address"}); CheckUnsupported({"-fsycl-device-code-split=kernel"}); - CheckUnsupported({"-fsycl-device-code-split-esimd"}); + CheckUnsupported({"-fno-sycl-device-code-split-esimd"}); CheckUnsupported({"-fsycl-dead-args-optimization"}); return 0; } +int test_esimd() { + namespace syclex = sycl::ext::oneapi::experimental; + using source_kb = sycl::kernel_bundle; + using exe_kb = sycl::kernel_bundle; + + sycl::queue q; + sycl::context ctx = q.get_context(); + + if (!q.get_device().has(sycl::aspect::ext_intel_esimd)) { + std::cout << "Device does not support ESIMD" << std::endl; + return -1; + } + + bool ok = + q.get_device().ext_oneapi_can_compile(syclex::source_language::sycl_jit); + if (!ok) { + return -1; + } + + std::string log; + + source_kb kbSrc = syclex::create_kernel_bundle_from_source( + ctx, syclex::source_language::sycl_jit, ESIMDSource); + exe_kb kbExe = + syclex::build(kbSrc, syclex::properties{syclex::save_log{&log}}); + + // extern "C" was used, so the name "vector_add_esimd" is not mangled and can + // be used directly. + sycl::kernel k = kbExe.ext_oneapi_get_kernel("vector_add_esimd"); + + // Now test it. + constexpr int VL = 16; // this constant also in ESIMDSource string. + constexpr int size = VL * 16; + + float *A = sycl::malloc_shared(size, q); + float *B = sycl::malloc_shared(size, q); + float *C = sycl::malloc_shared(size, q); + for (size_t i = 0; i < size; i++) { + A[i] = float(1); + B[i] = float(2); + C[i] = 0.0f; + } + sycl::range<1> GlobalRange{size / VL}; + sycl::range<1> LocalRange{1}; + sycl::nd_range<1> NDRange{GlobalRange, LocalRange}; + + q.submit([&](sycl::handler &h) { + h.set_arg(0, A); + h.set_arg(1, B); + h.set_arg(2, C); + h.parallel_for(NDRange, k); + }).wait(); + + // Check. + for (size_t i = 0; i < size; i++) { + assert(C[i] == 3.0f); + } + + sycl::free(A, q); + sycl::free(B, q); + sycl::free(C, q); + + return 0; +} + int main() { #ifdef SYCL_EXT_ONEAPI_KERNEL_COMPILER - return test_build_and_run() || test_unsupported_options(); + return test_build_and_run() || test_unsupported_options() || test_esimd(); #else static_assert(false, "Kernel Compiler feature test macro undefined"); #endif From 89098b457b682c4bbf201faa69eb275c72082e56 Mon Sep 17 00:00:00 2001 From: Julian Oppermann Date: Mon, 2 Dec 2024 13:15:29 +0000 Subject: [PATCH 2/3] Skip ESIMD test if device doesn't support it Signed-off-by: Julian Oppermann --- .../KernelCompiler/kernel_compiler_sycl_jit.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/sycl/test-e2e/KernelCompiler/kernel_compiler_sycl_jit.cpp b/sycl/test-e2e/KernelCompiler/kernel_compiler_sycl_jit.cpp index 8bb9cb96c398d..ab52bb2db81d3 100644 --- a/sycl/test-e2e/KernelCompiler/kernel_compiler_sycl_jit.cpp +++ b/sycl/test-e2e/KernelCompiler/kernel_compiler_sycl_jit.cpp @@ -241,13 +241,19 @@ int test_esimd() { sycl::context ctx = q.get_context(); if (!q.get_device().has(sycl::aspect::ext_intel_esimd)) { - std::cout << "Device does not support ESIMD" << std::endl; - return -1; + std::cout << "Device '" + << q.get_device().get_info() + << "' does not support ESIMD, skipping test." << std::endl; + return 0; } bool ok = q.get_device().ext_oneapi_can_compile(syclex::source_language::sycl_jit); if (!ok) { + std::cout << "Apparently this device does not support `sycl_jit` source " + "kernel bundle extension: " + << q.get_device().get_info() + << std::endl; return -1; } From 7ed02b0a076a9a1d437afe1fd19e3975833a163c Mon Sep 17 00:00:00 2001 From: Julian Oppermann Date: Mon, 2 Dec 2024 14:57:08 +0000 Subject: [PATCH 3/3] Address feedback Signed-off-by: Julian Oppermann --- sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp | 4 +++- sycl-jit/jit-compiler/lib/rtc/ESIMD.cpp | 2 +- sycl-jit/jit-compiler/lib/rtc/ESIMD.h | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp index 3ed57fa4a9ad5..74ddec06b299a 100644 --- a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp +++ b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp @@ -451,7 +451,9 @@ llvm::Expected jit_compiler::performPostLink( MDesc = std::move(ESIMDSplits.front()); if (MDesc.isESIMD()) { - // TODO: We're assuming ESIMD lowering is not deactivated (why would it?). + // `sycl-post-link` has a `-lower-esimd` option, but there's no clang driver + // option to influence it. Rather, the driver sets it unconditionally in the + // multi-file output mode, which we are mimicking here. lowerEsimdConstructs(MDesc, PerformOpts); } diff --git a/sycl-jit/jit-compiler/lib/rtc/ESIMD.cpp b/sycl-jit/jit-compiler/lib/rtc/ESIMD.cpp index 1caa5cf078d91..2341ec1dde323 100644 --- a/sycl-jit/jit-compiler/lib/rtc/ESIMD.cpp +++ b/sycl-jit/jit-compiler/lib/rtc/ESIMD.cpp @@ -52,7 +52,7 @@ void jit_compiler::lowerEsimdConstructs(module_split::ModuleDesc &MD, FunctionPassManager MainFPM; MainFPM.addPass(ESIMDLowerLoadStorePass{}); - if (!PerformOpts) { + if (PerformOpts) { MainFPM.addPass(SROAPass(SROAOptions::ModifyCFG)); MainFPM.addPass(EarlyCSEPass(true)); MainFPM.addPass(InstCombinePass{}); diff --git a/sycl-jit/jit-compiler/lib/rtc/ESIMD.h b/sycl-jit/jit-compiler/lib/rtc/ESIMD.h index ebf610b462fae..fb66a49c526d6 100644 --- a/sycl-jit/jit-compiler/lib/rtc/ESIMD.h +++ b/sycl-jit/jit-compiler/lib/rtc/ESIMD.h @@ -14,7 +14,7 @@ namespace jit_compiler { // Runs a pass pipeline to lower ESIMD constructs on the given split model, -// which may only contain ESIMD entrypoints. This is a copy of the similar +// which must only contain ESIMD entrypoints. This is a copy of the similar // function in `sycl-post-link`. void lowerEsimdConstructs(llvm::module_split::ModuleDesc &MD, bool PerformOpts);