diff --git a/sycl-jit/common/include/Kernel.h b/sycl-jit/common/include/Kernel.h index 794126a462e20..efd6e1ded3051 100644 --- a/sycl-jit/common/include/Kernel.h +++ b/sycl-jit/common/include/Kernel.h @@ -403,6 +403,10 @@ struct RTCBundleInfo { RTCBundleBinaryInfo BinaryInfo; FrozenSymbolTable SymbolTable; FrozenPropertyRegistry Properties; + + RTCBundleInfo() = default; + RTCBundleInfo(RTCBundleInfo &&) = default; + RTCBundleInfo &operator=(RTCBundleInfo &&) = default; }; } // namespace jit_compiler diff --git a/sycl-jit/jit-compiler/CMakeLists.txt b/sycl-jit/jit-compiler/CMakeLists.txt index 82d2356691c3f..63bb2ecc34ad9 100644 --- a/sycl-jit/jit-compiler/CMakeLists.txt +++ b/sycl-jit/jit-compiler/CMakeLists.txt @@ -8,6 +8,7 @@ add_llvm_library(sycl-jit lib/fusion/JITContext.cpp lib/fusion/ModuleHelper.cpp lib/rtc/DeviceCompilation.cpp + lib/rtc/ESIMD.cpp lib/helper/ConfigHelper.cpp SHARED @@ -32,6 +33,7 @@ add_llvm_library(sycl-jit TargetParser MC SYCLLowerIR + GenXIntrinsics ${LLVM_TARGETS_TO_BUILD} LINK_LIBS diff --git a/sycl-jit/jit-compiler/lib/KernelFusion.cpp b/sycl-jit/jit-compiler/lib/KernelFusion.cpp index 7b706ba88a16b..d7e18003d2a78 100644 --- a/sycl-jit/jit-compiler/lib/KernelFusion.cpp +++ b/sycl-jit/jit-compiler/lib/KernelFusion.cpp @@ -261,12 +261,13 @@ compileSYCL(InMemoryFile SourceFile, View IncludeFiles, return errorTo(std::move(Error), "Device linking failed"); } - auto BundleInfoOrError = performPostLink(*Module, UserArgList); - if (!BundleInfoOrError) { - return errorTo(BundleInfoOrError.takeError(), + auto PostLinkResultOrError = performPostLink(std::move(Module), UserArgList); + if (!PostLinkResultOrError) { + return errorTo(PostLinkResultOrError.takeError(), "Post-link phase failed"); } - auto BundleInfo = std::move(*BundleInfoOrError); + RTCBundleInfo BundleInfo; + std::tie(BundleInfo, Module) = std::move(*PostLinkResultOrError); auto BinaryInfoOrError = translation::KernelTranslator::translateBundleToSPIRV( diff --git a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp index 876c4e431d700..fe2c9756ce209 100644 --- a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp +++ b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "DeviceCompilation.h" +#include "ESIMD.h" #include #include @@ -27,6 +28,8 @@ #include #include #include +#include +#include #include #include #include @@ -432,42 +435,84 @@ template static bool runModulePass(llvm::Module &M) { return !Res.areAllPreserved(); } -Expected jit_compiler::performPostLink( - llvm::Module &Module, [[maybe_unused]] const InputArgList &UserArgList) { +llvm::Expected jit_compiler::performPostLink( + std::unique_ptr Module, + [[maybe_unused]] const llvm::opt::InputArgList &UserArgList) { // This is a simplified version of `processInputModule` in // `llvm/tools/sycl-post-link.cpp`. Assertions/TODOs point to functionality // left out of the algorithm for now. - assert(!Module.getGlobalVariable("llvm.used") && - !Module.getGlobalVariable("llvm.compiler.used")); + // TODO: SplitMode can be controlled by the user. + const auto SplitMode = SPLIT_NONE; + + // TODO: EmitOnlyKernelsAsEntryPoints is controlled by + // `shouldEmitOnlyKernelsAsEntryPoints` in + // `clang/lib/Driver/ToolChains/Clang.cpp`. + const bool EmitOnlyKernelsAsEntryPoints = true; + + // TODO: The optlevel passed to `sycl-post-link` is determined by + // `getSYCLPostLinkOptimizationLevel` in + // `clang/lib/Driver/ToolChains/Clang.cpp`. + const bool PerformOpts = true; + + // Propagate ESIMD attribute to wrapper functions to prevent spurious splits + // and kernel link errors. + runModulePass(*Module); + + assert(!Module->getGlobalVariable("llvm.used") && + !Module->getGlobalVariable("llvm.compiler.used")); // Otherwise: Port over the `removeSYCLKernelsConstRefArray` and // `removeDeviceGlobalFromCompilerUsed` methods. - assert(!isModuleUsingAsan(Module)); + assert(!isModuleUsingAsan(*Module)); // Otherwise: Need to instrument each image scope device globals if the module // has been instrumented by sanitizer pass. // Transform Joint Matrix builtin calls to align them with SPIR-V friendly // LLVM IR specification. - runModulePass(Module); + runModulePass(*Module); + + // Do invoke_simd processing before splitting because this: + // - saves processing time (the pass is run once, even though on larger IR) + // - doing it before SYCL/ESIMD splitting is required for correctness + if (runModulePass(*Module)) { + return createStringError("`invoke_simd` calls detected"); + } // TODO: Implement actual device code splitting. We're just using the splitter // to obtain additional information about the module for now. - // TODO: EmitOnlyKernelsAsEntryPoints is controlled by - // `shouldEmitOnlyKernelsAsEntryPoints` in - // `clang/lib/Driver/ToolChains/Clang.cpp`. + std::unique_ptr Splitter = getDeviceCodeSplitter( - ModuleDesc{std::unique_ptr{&Module}}, SPLIT_NONE, - /*IROutputOnly=*/false, - /*EmitOnlyKernelsAsEntryPoints=*/true); - assert(Splitter->remainingSplits() == 1); + ModuleDesc{std::move(Module)}, SplitMode, + /*IROutputOnly=*/false, EmitOnlyKernelsAsEntryPoints); + assert(Splitter->hasMoreSplits()); + if (Splitter->remainingSplits() > 1) { + return createStringError("Device code requires splitting"); + } // TODO: Call `verifyNoCrossModuleDeviceGlobalUsage` if device globals shall // be processed. - assert(Splitter->hasMoreSplits()); ModuleDesc MDesc = Splitter->nextSplit(); - assert(&Module == &MDesc.getModule()); + + // TODO: Call `MDesc.fixupLinkageOfDirectInvokeSimdTargets()` when + // `invoke_simd` is supported. + + SmallVector ESIMDSplits = + splitByESIMD(std::move(MDesc), EmitOnlyKernelsAsEntryPoints); + assert(!ESIMDSplits.empty()); + if (ESIMDSplits.size() > 1) { + return createStringError("Mixing SYCL and ESIMD code is unsupported"); + } + MDesc = std::move(ESIMDSplits.front()); + + if (MDesc.isESIMD()) { + // `sycl-post-link` has a `-lower-esimd` option, but there's no clang driver + // option to influence it. Rather, the driver sets it unconditionally in the + // multi-file output mode, which we are mimicking here. + lowerEsimdConstructs(MDesc, PerformOpts); + } + MDesc.saveSplitInformationAsMetadata(); RTCBundleInfo BundleInfo; @@ -504,10 +549,7 @@ Expected jit_compiler::performPostLink( } }; - // Regain ownership of the module. - MDesc.releaseModulePtr().release(); - - return std::move(BundleInfo); + return PostLinkResult{std::move(BundleInfo), MDesc.releaseModulePtr()}; } Expected @@ -569,11 +611,9 @@ jit_compiler::parseUserArgs(View UserArgs) { return createStringError("Device code splitting is not yet supported"); } - if (AL.hasArg(OPT_fsycl_device_code_split_esimd, - OPT_fno_sycl_device_code_split_esimd)) { - // TODO: There are more ESIMD-related options. - return createStringError( - "Runtime compilation of ESIMD kernels is not yet supported"); + if (!AL.hasFlag(OPT_fsycl_device_code_split_esimd, + OPT_fno_sycl_device_code_split_esimd, true)) { + return createStringError("ESIMD device code split cannot be deactivated"); } if (AL.hasFlag(OPT_fsycl_dead_args_optimization, diff --git a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.h b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.h index c378bb695a8f5..0fea81bdc1d35 100644 --- a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.h +++ b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.h @@ -30,8 +30,9 @@ llvm::Error linkDeviceLibraries(llvm::Module &Module, const llvm::opt::InputArgList &UserArgList, std::string &BuildLog); -llvm::Expected -performPostLink(llvm::Module &Module, +using PostLinkResult = std::pair>; +llvm::Expected +performPostLink(std::unique_ptr Module, const llvm::opt::InputArgList &UserArgList); llvm::Expected diff --git a/sycl-jit/jit-compiler/lib/rtc/ESIMD.cpp b/sycl-jit/jit-compiler/lib/rtc/ESIMD.cpp new file mode 100644 index 0000000000000..2341ec1dde323 --- /dev/null +++ b/sycl-jit/jit-compiler/lib/rtc/ESIMD.cpp @@ -0,0 +1,77 @@ +//===------------- ESIMD.cpp - Driver for ESIMD lowering ------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ESIMD.h" + +#include "llvm/Analysis/CGSCCPassManager.h" +#include "llvm/Analysis/LoopAnalysisManager.h" +#include "llvm/GenXIntrinsics/GenXSPIRVWriterAdaptor.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Passes/PassBuilder.h" +#include "llvm/SYCLLowerIR/ESIMD/LowerESIMD.h" +#include "llvm/Transforms/InstCombine/InstCombine.h" +#include "llvm/Transforms/Scalar/DCE.h" +#include "llvm/Transforms/Scalar/EarlyCSE.h" +#include "llvm/Transforms/Scalar/SROA.h" + +using namespace llvm; + +using string_vector = std::vector; + +// When ESIMD code was separated from the regular SYCL code, +// we can safely process ESIMD part. +void jit_compiler::lowerEsimdConstructs(module_split::ModuleDesc &MD, + bool PerformOpts) { + LoopAnalysisManager LAM; + CGSCCAnalysisManager CGAM; + FunctionAnalysisManager FAM; + ModuleAnalysisManager MAM; + + PassBuilder PB; + PB.registerModuleAnalyses(MAM); + PB.registerCGSCCAnalyses(CGAM); + PB.registerFunctionAnalyses(FAM); + PB.registerLoopAnalyses(LAM); + PB.crossRegisterProxies(LAM, FAM, CGAM, MAM); + + ModulePassManager MPM; + MPM.addPass(SYCLLowerESIMDPass(/*ModuleContainsScalar=*/false)); + + if (PerformOpts) { + FunctionPassManager FPM; + FPM.addPass(SROAPass(SROAOptions::ModifyCFG)); + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + } + MPM.addPass(ESIMDOptimizeVecArgCallConvPass{}); + FunctionPassManager MainFPM; + MainFPM.addPass(ESIMDLowerLoadStorePass{}); + + if (PerformOpts) { + MainFPM.addPass(SROAPass(SROAOptions::ModifyCFG)); + MainFPM.addPass(EarlyCSEPass(true)); + MainFPM.addPass(InstCombinePass{}); + MainFPM.addPass(DCEPass{}); + // TODO: maybe remove some passes below that don't affect code quality + MainFPM.addPass(SROAPass(SROAOptions::ModifyCFG)); + MainFPM.addPass(EarlyCSEPass(true)); + MainFPM.addPass(InstCombinePass{}); + MainFPM.addPass(DCEPass{}); + } + MPM.addPass(ESIMDLowerSLMReservationCalls{}); + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(MainFPM))); + MPM.addPass(GenXSPIRVWriterAdaptor(/*RewriteTypes=*/true, + /*RewriteSingleElementVectorsIn*/ false)); + // GenXSPIRVWriterAdaptor pass replaced some functions with "rewritten" + // versions so the entry point table must be rebuilt. + // TODO Change entry point search to analysis? + std::vector Names; + MD.saveEntryPointNames(Names); + MPM.run(MD.getModule(), MAM); + MD.rebuildEntryPoints(Names); +} diff --git a/sycl-jit/jit-compiler/lib/rtc/ESIMD.h b/sycl-jit/jit-compiler/lib/rtc/ESIMD.h new file mode 100644 index 0000000000000..fb66a49c526d6 --- /dev/null +++ b/sycl-jit/jit-compiler/lib/rtc/ESIMD.h @@ -0,0 +1,23 @@ +//===-------------- ESIMD.h - Driver for ESIMD lowering -------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef SYCL_JIT_COMPILER_RTC_ESIMD_H +#define SYCL_JIT_COMPILER_RTC_ESIMD_H + +#include "llvm/SYCLLowerIR/ModuleSplitter.h" + +namespace jit_compiler { + +// Runs a pass pipeline to lower ESIMD constructs on the given split model, +// which must only contain ESIMD entrypoints. This is a copy of the similar +// function in `sycl-post-link`. +void lowerEsimdConstructs(llvm::module_split::ModuleDesc &MD, bool PerformOpts); + +} // namespace jit_compiler + +#endif // SYCL_JIT_COMPILER_RTC_ESIMD_H diff --git a/sycl/test-e2e/KernelCompiler/kernel_compiler_sycl_jit.cpp b/sycl/test-e2e/KernelCompiler/kernel_compiler_sycl_jit.cpp index 798dc987cbffc..0835cf1116eba 100644 --- a/sycl/test-e2e/KernelCompiler/kernel_compiler_sycl_jit.cpp +++ b/sycl/test-e2e/KernelCompiler/kernel_compiler_sycl_jit.cpp @@ -78,6 +78,27 @@ void ff_templated(T *ptr, T *unused) { } )==="; +auto constexpr ESIMDSource = R"===( +#include +#include + +using namespace sycl::ext::intel::esimd; + +constexpr int VL = 16; + +extern "C" SYCL_EXTERNAL SYCL_ESIMD_KERNEL SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((sycl::ext::oneapi::experimental::nd_range_kernel<1>)) +void vector_add_esimd(float *A, float *B, float *C) { + sycl::nd_item<1> item = sycl::ext::oneapi::this_work_item::get_nd_item<1>(); + unsigned int i = item.get_global_id(0); + unsigned int offset = i * VL ; + + simd va(A + offset); + simd vb(B + offset); + simd vc = va + vb; + vc.copy_to(C + offset); + } +)==="; + auto constexpr BadSource = R"===( #include @@ -196,6 +217,77 @@ int test_build_and_run() { return 0; } +int test_esimd() { + namespace syclex = sycl::ext::oneapi::experimental; + using source_kb = sycl::kernel_bundle; + using exe_kb = sycl::kernel_bundle; + + sycl::queue q; + sycl::context ctx = q.get_context(); + + if (!q.get_device().has(sycl::aspect::ext_intel_esimd)) { + std::cout << "Device '" + << q.get_device().get_info() + << "' does not support ESIMD, skipping test." << std::endl; + return 0; + } + + bool ok = + q.get_device().ext_oneapi_can_compile(syclex::source_language::sycl_jit); + if (!ok) { + std::cout << "Apparently this device does not support `sycl_jit` source " + "kernel bundle extension: " + << q.get_device().get_info() + << std::endl; + return -1; + } + + std::string log; + + source_kb kbSrc = syclex::create_kernel_bundle_from_source( + ctx, syclex::source_language::sycl_jit, ESIMDSource); + exe_kb kbExe = + syclex::build(kbSrc, syclex::properties{syclex::save_log{&log}}); + + // extern "C" was used, so the name "vector_add_esimd" is not mangled and can + // be used directly. + sycl::kernel k = kbExe.ext_oneapi_get_kernel("vector_add_esimd"); + + // Now test it. + constexpr int VL = 16; // this constant also in ESIMDSource string. + constexpr int size = VL * 16; + + float *A = sycl::malloc_shared(size, q); + float *B = sycl::malloc_shared(size, q); + float *C = sycl::malloc_shared(size, q); + for (size_t i = 0; i < size; i++) { + A[i] = float(1); + B[i] = float(2); + C[i] = 0.0f; + } + sycl::range<1> GlobalRange{size / VL}; + sycl::range<1> LocalRange{1}; + sycl::nd_range<1> NDRange{GlobalRange, LocalRange}; + + q.submit([&](sycl::handler &h) { + h.set_arg(0, A); + h.set_arg(1, B); + h.set_arg(2, C); + h.parallel_for(NDRange, k); + }).wait(); + + // Check. + for (size_t i = 0; i < size; i++) { + assert(C[i] == 3.0f); + } + + sycl::free(A, q); + sycl::free(B, q); + sycl::free(C, q); + + return 0; +} + int test_unsupported_options() { namespace syclex = sycl::ext::oneapi::experimental; using source_kb = sycl::kernel_bundle; @@ -234,7 +326,7 @@ int test_unsupported_options() { CheckUnsupported({"-Xsycl-target-frontend=spir64", "-fsanitize=address"}); CheckUnsupported({"-Xarch_device", "-fsanitize=address"}); CheckUnsupported({"-fsycl-device-code-split=kernel"}); - CheckUnsupported({"-fsycl-device-code-split-esimd"}); + CheckUnsupported({"-fno-sycl-device-code-split-esimd"}); CheckUnsupported({"-fsycl-dead-args-optimization"}); return 0; @@ -298,8 +390,8 @@ int test_warning() { int main(int argc, char **) { #ifdef SYCL_EXT_ONEAPI_KERNEL_COMPILER int optional_tests = (argc > 1) ? test_warning() : 0; - return test_build_and_run() || test_unsupported_options() || test_error() || - optional_tests; + return test_build_and_run() || test_esimd() || test_unsupported_options() || + test_error() || optional_tests; #else static_assert(false, "Kernel Compiler feature test macro undefined"); #endif