From 4158578c14fea6e2645ed741da92e460ec6b6414 Mon Sep 17 00:00:00 2001
From: Jakub Chlanda <jakub@codeplay.com>
Date: Fri, 15 Mar 2024 05:58:15 -0400
Subject: [PATCH 01/22] [SYCL] Introduce SYCL_JIT_KERNELS env var

Also allow resetting of SYCL_CACHE_IN_MEM.
---
 sycl/source/detail/config.def |  1 +
 sycl/source/detail/config.hpp | 53 +++++++++++++++++++++++++----------
 2 files changed, 39 insertions(+), 15 deletions(-)

diff --git a/sycl/source/detail/config.def b/sycl/source/detail/config.def
index 04744c5c6841a..3e6ca9e081369 100644
--- a/sycl/source/detail/config.def
+++ b/sycl/source/detail/config.def
@@ -42,3 +42,4 @@ CONFIG(SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE, 16, __SYCL_REDUCTION_PREFERRED_W
 CONFIG(ONEAPI_DEVICE_SELECTOR, 1024, __ONEAPI_DEVICE_SELECTOR)
 CONFIG(SYCL_ENABLE_FUSION_CACHING, 1, __SYCL_ENABLE_FUSION_CACHING)
 CONFIG(SYCL_CACHE_IN_MEM, 1, __SYCL_CACHE_IN_MEM)
+CONFIG(SYCL_JIT_KERNELS, 1, __SYCL_JIT_KERNELS)
diff --git a/sycl/source/detail/config.hpp b/sycl/source/detail/config.hpp
index 71ba0310a24e8..46177036115fe 100644
--- a/sycl/source/detail/config.hpp
+++ b/sycl/source/detail/config.hpp
@@ -592,30 +592,53 @@ template <> class SYCLConfig<SYCL_CACHE_IN_MEM> {
   using BaseT = SYCLConfigBase<SYCL_CACHE_IN_MEM>;
 
 public:
-  static constexpr bool Default = true; // default is true
-  static bool get() { return getCachedValue(); }
+  static bool get() {
+    constexpr bool DefaultValue = true;
+
+    const char *ValStr = getCachedValue();
+
+    if (!ValStr)
+      return DefaultValue;
+
+    return ValStr[0] == '1';
+  }
+
+  static void reset() { (void)getCachedValue(/*ResetCache=*/true); }
+
   static const char *getName() { return BaseT::MConfigName; }
 
 private:
-  static bool parseValue() {
-    const char *ValStr = BaseT::getRawValue();
+  static const char *getCachedValue(bool ResetCache = false) {
+    static const char *ValStr = BaseT::getRawValue();
+    if (ResetCache)
+      ValStr = BaseT::getRawValue();
+    return ValStr;
+  }
+};
+
+template <> class SYCLConfig<SYCL_JIT_KERNELS> {
+  using BaseT = SYCLConfigBase<SYCL_JIT_KERNELS>;
+
+public:
+  static bool get() {
+    constexpr bool DefaultValue = false;
+    const char *ValStr = getCachedValue();
     if (!ValStr)
-      return Default;
-    if (strlen(ValStr) != 1 || (ValStr[0] != '0' && ValStr[0] != '1')) {
-      std::string Msg =
-          std::string{"Invalid value for bool configuration variable "} +
-          getName() + std::string{": "} + ValStr;
-      throw runtime_error(Msg, PI_ERROR_INVALID_OPERATION);
-    }
+      return DefaultValue;
+
     return ValStr[0] == '1';
   }
 
-  static bool getCachedValue() {
-    static bool Val = parseValue();
-    return Val;
+  static const char *getName() { return BaseT::MConfigName; }
+
+private:
+  static const char *getCachedValue(bool ResetCache = false) {
+    static const char *ValStr = BaseT::getRawValue();
+    if (ResetCache)
+      ValStr = BaseT::getRawValue();
+    return ValStr;
   }
 };
-
 #undef INVALID_CONFIG_EXCEPTION
 
 } // namespace detail

From 9bf827fdaa1502eace71a6a4cfa0a9bc61c238a9 Mon Sep 17 00:00:00 2001
From: Jakub Chlanda <jakub@codeplay.com>
Date: Fri, 15 Mar 2024 06:14:32 -0400
Subject: [PATCH 02/22]  [SYCL] Extend kernel fusion with JIT-ing

---
 .../jit-compiler/include/KernelFusion.h       |  5 +
 .../jit-compiler/ld-version-script.txt        |  1 +
 sycl-fusion/jit-compiler/lib/KernelFusion.cpp | 43 +++++++++
 sycl/source/detail/jit_compiler.cpp           | 91 +++++++++++++++++++
 sycl/source/detail/jit_compiler.hpp           |  7 ++
 sycl/source/detail/scheduler/commands.cpp     | 49 +++++++++-
 sycl/source/detail/scheduler/commands.hpp     |  5 +-
 sycl/source/detail/scheduler/scheduler.cpp    | 21 +++++
 sycl/source/detail/scheduler/scheduler.hpp    |  3 +
 sycl/source/handler.cpp                       |  7 +-
 10 files changed, 225 insertions(+), 7 deletions(-)

diff --git a/sycl-fusion/jit-compiler/include/KernelFusion.h b/sycl-fusion/jit-compiler/include/KernelFusion.h
index dd1865e2f8193..8b727e6ec03e5 100644
--- a/sycl-fusion/jit-compiler/include/KernelFusion.h
+++ b/sycl-fusion/jit-compiler/include/KernelFusion.h
@@ -63,6 +63,11 @@ FusionResult fuseKernels(View<SYCLKernelInfo> KernelInformation,
                          View<ParameterInternalization> Internalization,
                          View<jit_compiler::JITConstant> JITConstants);
 
+FusionResult
+materializeSpecConstants(const char *KernelName,
+                         jit_compiler::SYCLKernelBinaryInfo &BinInfo,
+                         std::vector<unsigned char> &SpecConstBlob);
+
 /// Clear all previously set options.
 void resetJITConfiguration();
 
diff --git a/sycl-fusion/jit-compiler/ld-version-script.txt b/sycl-fusion/jit-compiler/ld-version-script.txt
index 101355ab10c1a..eb7892fdfec9c 100644
--- a/sycl-fusion/jit-compiler/ld-version-script.txt
+++ b/sycl-fusion/jit-compiler/ld-version-script.txt
@@ -2,6 +2,7 @@
   global:
     /* Export the library entry points */
     fuseKernels;
+    materializeSpecConstants;
     resetJITConfiguration;
     addToJITConfiguration;
 
diff --git a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
index 28f099c537a2a..72bac027b6095 100644
--- a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
+++ b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
@@ -70,6 +70,49 @@ static bool isTargetFormatSupported(BinaryFormat TargetFormat) {
   }
 }
 
+extern "C" FusionResult
+materializeSpecConstants(const char *KernelName,
+                         jit_compiler::SYCLKernelBinaryInfo &BinInfo,
+                         std::vector<unsigned char> &SpecConstBlob) {
+  auto &JITCtx = JITContext::getInstance();
+
+  TargetInfo TargetInfo = ConfigHelper::get<option::JITTargetInfo>();
+  BinaryFormat TargetFormat = TargetInfo.getFormat();
+  if (TargetFormat != BinaryFormat::PTX &&
+      TargetFormat != BinaryFormat::AMDGCN) {
+    return FusionResult(
+        "Fusion output target format not supported by this build");
+  }
+
+  ::jit_compiler::SYCLKernelInfo KernelInfo{
+      KernelName, ::jit_compiler::SYCLArgumentDescriptor{},
+      ::jit_compiler::NDRange{}, BinInfo};
+  SYCLModuleInfo ModuleInfo;
+  ModuleInfo.kernels().insert(ModuleInfo.kernels().end(), KernelInfo);
+  // Load all input kernels from their respective SPIR-V modules into a single
+  // LLVM IR module.
+  llvm::Expected<std::unique_ptr<llvm::Module>> ModOrError =
+      translation::KernelTranslator::loadKernels(*JITCtx.getLLVMContext(),
+                                                 ModuleInfo.kernels());
+  if (auto Error = ModOrError.takeError()) {
+    return errorToFusionResult(std::move(Error), "Failed to load kernels");
+  }
+  std::unique_ptr<llvm::Module> NewMod = std::move(*ModOrError);
+  if (!fusion::FusionPipeline::runMaterializerPasses(*NewMod, SpecConstBlob) ||
+      !NewMod->getFunction(KernelName)) {
+    return FusionResult{"Materializer passes should not fail"};
+  }
+
+  SYCLKernelInfo &MaterializerKernelInfo = *ModuleInfo.getKernelFor(KernelName);
+  if (auto Error = translation::KernelTranslator::translateKernel(
+          MaterializerKernelInfo, *NewMod, JITCtx, TargetFormat)) {
+    return errorToFusionResult(std::move(Error),
+                               "Translation to output format failed");
+  }
+
+  return FusionResult{MaterializerKernelInfo};
+}
+
 extern "C" FusionResult
 fuseKernels(View<SYCLKernelInfo> KernelInformation, const char *FusedKernelName,
             View<ParameterIdentity> Identities, BarrierFlags BarriersFlags,
diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp
index 952482e42d79f..fafe37d0db14c 100644
--- a/sycl/source/detail/jit_compiler.cpp
+++ b/sycl/source/detail/jit_compiler.cpp
@@ -64,6 +64,16 @@ jit_compiler::jit_compiler() {
       return false;
     }
 
+    this->MaterializeSpecConstHandle =
+        reinterpret_cast<MaterializeSpecConstFuncT>(
+            sycl::detail::pi::getOsLibraryFuncAddress(
+                LibraryPtr, "materializeSpecConstants"));
+    if (!this->MaterializeSpecConstHandle) {
+      printPerformanceWarning(
+          "Cannot resolve JIT library function entry point");
+      return false;
+    }
+
     return true;
   };
   Available = checkJITLibrary();
@@ -678,6 +688,87 @@ updatePromotedArgs(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo,
   }
 }
 
+sycl::detail::pi::PiKernel jit_compiler::materializeSpecConstants(
+    QueueImplPtr Queue, RTDeviceBinaryImage *BinImage,
+    const std::string &KernelName, std::vector<unsigned char> &SpecConstBlob) {
+  if (!BinImage) {
+    throw sycl::exception(sycl::make_error_code(sycl::errc::invalid),
+                          "No suitable IR available for materializing");
+    return nullptr;
+  }
+  if (KernelName.empty()) {
+    throw sycl::exception(
+        sycl::make_error_code(sycl::errc::invalid),
+        "Cannot jit kernel with invalid kernel function name");
+    return nullptr;
+  }
+
+  auto &RawDeviceImage = BinImage->getRawData();
+  auto DeviceImageSize = static_cast<size_t>(RawDeviceImage.BinaryEnd -
+                                             RawDeviceImage.BinaryStart);
+  // Set 0 as the number of address bits, because the JIT compiler can set
+  // this field based on information from SPIR-V/LLVM module's data-layout.
+  auto BinaryImageFormat = translateBinaryImageFormat(BinImage->getFormat());
+  if (BinaryImageFormat == ::jit_compiler::BinaryFormat::INVALID) {
+    throw sycl::exception(sycl::make_error_code(sycl::errc::invalid),
+                          "No suitable IR available for materializing");
+    return nullptr;
+  }
+  ::jit_compiler::SYCLKernelBinaryInfo BinInfo{
+      BinaryImageFormat, 0, RawDeviceImage.BinaryStart, DeviceImageSize};
+
+  ::jit_compiler::TargetInfo TargetInfo = getTargetInfo(Queue);
+  ::jit_compiler::BinaryFormat TargetFormat = TargetInfo.getFormat();
+  AddToConfigHandle(
+      ::jit_compiler::option::JITTargetInfo::set(std::move(TargetInfo)));
+  bool DebugEnabled =
+      detail::SYCLConfig<detail::SYCL_RT_WARNING_LEVEL>::get() > 0;
+  AddToConfigHandle(
+      ::jit_compiler::option::JITEnableVerbose::set(DebugEnabled));
+  AddToConfigHandle(::jit_compiler::option::JITEnableCaching::set(
+      detail::SYCLConfig<detail::SYCL_ENABLE_FUSION_CACHING>::get()));
+
+  auto MaterializerResult =
+      MaterializeSpecConstHandle(KernelName.c_str(), BinInfo, SpecConstBlob);
+  if (MaterializerResult.failed()) {
+    std::string Message{"Compilation for kernel failed with message:\n"};
+    Message.append(MaterializerResult.getErrorMessage());
+    if (DebugEnabled) {
+      std::cerr << Message << "\n";
+    }
+    throw sycl::exception(sycl::make_error_code(sycl::errc::invalid), Message);
+    return nullptr;
+  }
+
+  auto &MaterializerKernelInfo = MaterializerResult.getKernelInfo();
+  auto PIDeviceBinaries =
+      createPIDeviceBinary(MaterializerKernelInfo, TargetFormat);
+  auto &PM = detail::ProgramManager::getInstance();
+  PM.addImages(PIDeviceBinaries);
+
+  const bool OrigCacheCfg = SYCLConfig<SYCL_CACHE_IN_MEM>::get();
+  if (OrigCacheCfg) {
+    if (0 != setenv("SYCL_CACHE_IN_MEM", "0", true)) {
+      throw sycl::exception(
+          sycl::make_error_code(sycl::errc::invalid),
+          "Failed to set env variable in materialize spec constel.");
+    }
+    SYCLConfig<SYCL_CACHE_IN_MEM>::reset();
+  }
+  auto NewKernel = std::get<0>(PM.getOrCreateKernel(
+      Queue->getContextImplPtr(), Queue->getDeviceImplPtr(), KernelName));
+  if (OrigCacheCfg) {
+    if (0 != setenv("SYCL_CACHE_IN_MEM", "1", true)) {
+      throw sycl::exception(
+          sycl::make_error_code(sycl::errc::invalid),
+          "Failed to set env variable in materialize spec const.");
+    }
+    SYCLConfig<SYCL_CACHE_IN_MEM>::reset();
+  }
+
+  return NewKernel;
+}
+
 std::unique_ptr<detail::CG>
 jit_compiler::fuseKernels(QueueImplPtr Queue,
                           std::vector<ExecCGCommand *> &InputKernels,
diff --git a/sycl/source/detail/jit_compiler.hpp b/sycl/source/detail/jit_compiler.hpp
index 2612c26943c94..c283c5c19c223 100644
--- a/sycl/source/detail/jit_compiler.hpp
+++ b/sycl/source/detail/jit_compiler.hpp
@@ -40,6 +40,10 @@ class jit_compiler {
   std::unique_ptr<detail::CG>
   fuseKernels(QueueImplPtr Queue, std::vector<ExecCGCommand *> &InputKernels,
               const property_list &);
+  sycl::detail::pi::PiKernel
+  materializeSpecConstants(QueueImplPtr Queue, RTDeviceBinaryImage *BinImage,
+                           const std::string &KernelName,
+                           std::vector<unsigned char> &SpecConstBlob);
 
   bool isAvailable() { return Available; }
 
@@ -75,9 +79,12 @@ class jit_compiler {
 #if SYCL_EXT_CODEPLAY_KERNEL_FUSION
   // Handles to the entry points of the lazily loaded JIT library.
   using FuseKernelsFuncT = decltype(::jit_compiler::fuseKernels) *;
+  using MaterializeSpecConstFuncT =
+      decltype(::jit_compiler::materializeSpecConstants) *;
   using ResetConfigFuncT = decltype(::jit_compiler::resetJITConfiguration) *;
   using AddToConfigFuncT = decltype(::jit_compiler::addToJITConfiguration) *;
   FuseKernelsFuncT FuseKernelsHandle = nullptr;
+  MaterializeSpecConstFuncT MaterializeSpecConstHandle = nullptr;
   ResetConfigFuncT ResetConfigHandle = nullptr;
   AddToConfigFuncT AddToConfigHandle = nullptr;
 #endif // SYCL_EXT_CODEPLAY_KERNEL_FUSION
diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp
index 6ea0fc569bced..42338e57f17a1 100644
--- a/sycl/source/detail/scheduler/commands.cpp
+++ b/sycl/source/detail/scheduler/commands.cpp
@@ -76,6 +76,33 @@ void emitInstrumentationGeneral(uint32_t StreamID, uint64_t InstanceID,
 }
 #endif
 
+RTDeviceBinaryImage *
+retrieveAMDGCNOrNVPTXKernelBinary(const DeviceImplPtr DeviceImpl,
+                                  const std::string &KernelName) {
+  const bool IsNvidia = DeviceImpl->getBackend() == backend::ext_oneapi_cuda;
+  const bool IsHIP = DeviceImpl->getBackend() == backend::ext_oneapi_hip;
+  if (!(IsNvidia || IsHIP))
+    throw sycl::exception(sycl::make_error_code(sycl::errc::runtime),
+                          "Unsupported backend" +
+                              codeToString(PI_ERROR_INVALID_VALUE));
+  auto &PM = ProgramManager::getInstance();
+  const auto KernelID = PM.getSYCLKernelID(KernelName);
+  std::vector<kernel_id> KernelIds{KernelID};
+  const auto DeviceImages = PM.getRawDeviceImages(KernelIds);
+  const auto DeviceImage = std::find_if(
+      DeviceImages.begin(), DeviceImages.end(),
+      [IsNvidia](RTDeviceBinaryImage *DI) {
+        const std::string &TargetSpec =
+            IsNvidia ? std::string("llvm_nvptx64") : std::string("llvm_amdgcn");
+        return DI->getFormat() == PI_DEVICE_BINARY_TYPE_LLVMIR_BITCODE &&
+               DI->getRawData().DeviceTargetSpec == TargetSpec;
+      });
+  if (DeviceImage == DeviceImages.end())
+    return nullptr;
+
+  return *DeviceImage;
+}
+
 #ifdef __SYCL_ENABLE_GNU_DEMANGLING
 struct DemangleHandle {
   char *p;
@@ -111,7 +138,7 @@ static std::string deviceToString(device Device) {
 void applyFuncOnFilteredArgs(
     const KernelArgMask *EliminatedArgMask, std::vector<ArgDesc> &Args,
     std::function<void(detail::ArgDesc &Arg, int NextTrueIndex)> Func) {
-  if (!EliminatedArgMask) {
+  if (!EliminatedArgMask || EliminatedArgMask->size() == 0) {
     for (ArgDesc &Arg : Args) {
       Func(Arg, Arg.MIndex);
     }
@@ -2388,9 +2415,15 @@ static pi_result SetKernelParamsAndLaunch(
     const detail::EventImplPtr &OutEventImpl,
     const KernelArgMask *EliminatedArgMask,
     const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
-    bool IsCooperative) {
+    bool IsCooperative, RTDeviceBinaryImage *BinImage,
+    const std::string &KernelName) {
   const PluginPtr &Plugin = Queue->getPlugin();
 
+  auto &SpecConstBlob = DeviceImageImpl->get_spec_const_blob_ref();
+  if (SYCLConfig<SYCL_JIT_KERNELS>::get() && !SpecConstBlob.empty()) {
+    Kernel = Scheduler::getInstance().completeSpecConstMaterialization(
+        Queue, BinImage, KernelName, SpecConstBlob);
+  }
   auto setFunc = [&Plugin, Kernel, &DeviceImageImpl, &getMemAllocationFunc,
                   &Queue](detail::ArgDesc &Arg, size_t NextTrueIndex) {
     SetArgBasedOnType(Plugin, Kernel, DeviceImageImpl, getMemAllocationFunc,
@@ -2579,7 +2612,7 @@ pi_int32 enqueueImpKernel(
     const detail::EventImplPtr &OutEventImpl,
     const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
     sycl::detail::pi::PiKernelCacheConfig KernelCacheConfig,
-    const bool KernelIsCooperative) {
+    const bool KernelIsCooperative, RTDeviceBinaryImage *BinImage) {
 
   // Run OpenCL kernel
   auto ContextImpl = Queue->getContextImplPtr();
@@ -2671,7 +2704,7 @@ pi_int32 enqueueImpKernel(
     Error = SetKernelParamsAndLaunch(Queue, Args, DeviceImageImpl, Kernel,
                                      NDRDesc, EventsWaitList, OutEventImpl,
                                      EliminatedArgMask, getMemAllocationFunc,
-                                     KernelIsCooperative);
+                                     KernelIsCooperative, BinImage, KernelName);
 
     const PluginPtr &Plugin = Queue->getPlugin();
     if (!SyclKernelImpl && !MSyclKernel) {
@@ -3039,10 +3072,16 @@ pi_int32 ExecCGCommand::enqueueImpQueue() {
       }
     }
 
+    RTDeviceBinaryImage *BinImage = nullptr;
+    if (detail::SYCLConfig<detail::SYCL_JIT_KERNELS>::get())
+      BinImage = retrieveAMDGCNOrNVPTXKernelBinary(MQueue->getDeviceImplPtr(),
+                                                   KernelName);
+
     return enqueueImpKernel(
         MQueue, NDRDesc, Args, ExecKernel->getKernelBundle(), SyclKernel,
         KernelName, RawEvents, EventImpl, getMemAllocationFunc,
-        ExecKernel->MKernelCacheConfig, ExecKernel->MKernelIsCooperative);
+        ExecKernel->MKernelCacheConfig, ExecKernel->MKernelIsCooperative,
+        BinImage);
   }
   case CG::CGTYPE::CopyUSM: {
     CGCopyUSM *Copy = (CGCopyUSM *)MCommandGroup.get();
diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp
index ea1a5b5111149..58ed5b12bc097 100644
--- a/sycl/source/detail/scheduler/commands.hpp
+++ b/sycl/source/detail/scheduler/commands.hpp
@@ -38,6 +38,9 @@ void emitInstrumentationGeneral(uint32_t StreamID, uint64_t InstanceID,
                                 xpti_td *TraceEvent, uint16_t Type,
                                 const void *Addr);
 #endif
+RTDeviceBinaryImage *
+retrieveAMDGCNOrNVPTXKernelBinary(const DeviceImplPtr DeviceImpl,
+                                  const std::string &KernelName);
 
 class queue_impl;
 class event_impl;
@@ -638,7 +641,7 @@ pi_int32 enqueueImpKernel(
     const detail::EventImplPtr &Event,
     const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
     sycl::detail::pi::PiKernelCacheConfig KernelCacheConfig,
-    bool KernelIsCooperative);
+    bool KernelIsCooperative, RTDeviceBinaryImage *BinImage = nullptr);
 
 class KernelFusionCommand;
 
diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp
index 78fd300460526..1cf457e0a86cc 100644
--- a/sycl/source/detail/scheduler/scheduler.cpp
+++ b/sycl/source/detail/scheduler/scheduler.cpp
@@ -9,6 +9,10 @@
 #include "detail/sycl_mem_obj_i.hpp"
 #include <detail/global_handler.hpp>
 #include <detail/graph_impl.hpp>
+#include <sycl/feature_test.hpp>
+#if SYCL_EXT_CODEPLAY_KERNEL_FUSION
+#include <detail/jit_compiler.hpp>
+#endif
 #include <detail/queue_impl.hpp>
 #include <detail/scheduler/scheduler.hpp>
 #include <detail/stream_impl.hpp>
@@ -633,6 +637,23 @@ void Scheduler::cancelFusion(QueueImplPtr Queue) {
   enqueueCommandForCG(nullptr, ToEnqueue);
 }
 
+sycl::detail::pi::PiKernel Scheduler::completeSpecConstMaterialization(
+    QueueImplPtr Queue, RTDeviceBinaryImage *BinImage,
+    const std::string &KernelName, std::vector<unsigned char> &SpecConstBlob) {
+#if SYCL_EXT_CODEPLAY_KERNEL_FUSION
+  return detail::jit_compiler::get_instance().materializeSpecConstants(
+      Queue, BinImage, KernelName, SpecConstBlob);
+#else  // SYCL_EXT_CODEPLAY_KERNEL_FUSION
+  printFusionWarning(
+      "Materialization of spec constants not supported by this build");
+  (void)Queue;
+  (void)BinImage;
+  (void)KernelName;
+  (void)SpecConstBlob;
+  return nullptr;
+#endif // SYCL_EXT_CODEPLAY_KERNEL_FUSION
+}
+
 EventImplPtr Scheduler::completeFusion(QueueImplPtr Queue,
                                        const property_list &PropList) {
   std::vector<Command *> ToEnqueue;
diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp
index 124fc1181116c..01e7739e53e01 100644
--- a/sycl/source/detail/scheduler/scheduler.hpp
+++ b/sycl/source/detail/scheduler/scheduler.hpp
@@ -467,6 +467,9 @@ class Scheduler {
   void cancelFusion(QueueImplPtr Queue);
 
   EventImplPtr completeFusion(QueueImplPtr Queue, const property_list &);
+  sycl::detail::pi::PiKernel completeSpecConstMaterialization(
+      QueueImplPtr Queue, RTDeviceBinaryImage *BinImage,
+      const std::string &KernelName, std::vector<unsigned char> &SpecConstBlob);
 
   bool isInFusionMode(QueueIdT Queue);
 
diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp
index 10ce364310912..c286780bec9f6 100644
--- a/sycl/source/handler.cpp
+++ b/sycl/source/handler.cpp
@@ -293,10 +293,15 @@ event handler::finalize() {
                                           : nullptr);
           Result = PI_SUCCESS;
         } else {
+          detail::RTDeviceBinaryImage *BinImage = nullptr;
+          if (detail::SYCLConfig<detail::SYCL_JIT_KERNELS>::get())
+            BinImage = detail::retrieveAMDGCNOrNVPTXKernelBinary(
+                MQueue->getDeviceImplPtr(), MKernelName.c_str());
+
           Result = enqueueImpKernel(
               MQueue, MNDRDesc, MArgs, KernelBundleImpPtr, MKernel,
               MKernelName.c_str(), RawEvents, NewEvent, nullptr,
-              MImpl->MKernelCacheConfig, MImpl->MKernelIsCooperative);
+              MImpl->MKernelCacheConfig, MImpl->MKernelIsCooperative, BinImage);
         }
 #ifdef XPTI_ENABLE_INSTRUMENTATION
         // Emit signal only when event is created

From 0bc1146b5bcd10df63efa947061764d09d1c8833 Mon Sep 17 00:00:00 2001
From: Jakub Chlanda <jakub@codeplay.com>
Date: Fri, 15 Mar 2024 06:37:43 -0400
Subject: [PATCH 03/22]  [SYCL] Define JIT pipeline and introduce materializer
 pass

---
 .../lib/fusion/FusionPipeline.cpp             |  49 +++
 .../jit-compiler/lib/fusion/FusionPipeline.h  |   6 +
 sycl-fusion/passes/CMakeLists.txt             |   2 +
 sycl-fusion/passes/SYCLFusionPasses.cpp       |   7 +
 .../SYCLSpecConstMaterializer.cpp             | 317 ++++++++++++++++++
 .../kernel-fusion/SYCLSpecConstMaterializer.h | 127 +++++++
 sycl-fusion/test/materializer/basic.ll        |  68 ++++
 sycl-fusion/test/materializer/debug_output.ll |  60 ++++
 sycl-fusion/test/materializer/multi_type.ll   |  82 +++++
 9 files changed, 718 insertions(+)
 create mode 100644 sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp
 create mode 100644 sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.h
 create mode 100644 sycl-fusion/test/materializer/basic.ll
 create mode 100644 sycl-fusion/test/materializer/debug_output.ll
 create mode 100644 sycl-fusion/test/materializer/multi_type.ll

diff --git a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp
index bf769b50e2f30..f9a0d1fd74b43 100644
--- a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp
+++ b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp
@@ -12,6 +12,7 @@
 #include "helper/ConfigHelper.h"
 #include "internalization/Internalization.h"
 #include "kernel-fusion/SYCLKernelFusion.h"
+#include "kernel-fusion/SYCLSpecConstMaterializer.h"
 #include "kernel-info/SYCLKernelInfo.h"
 #include "syclcp/SYCLCP.h"
 
@@ -141,3 +142,51 @@ FusionPipeline::runFusionPasses(Module &Mod, SYCLModuleInfo &InputInfo,
 
   return std::make_unique<SYCLModuleInfo>(std::move(*NewModInfo.ModuleInfo));
 }
+
+bool FusionPipeline::runMaterializerPasses(
+    llvm::Module &Mod, std::vector<unsigned char> &SpecConstBlob) {
+  PassBuilder PB;
+  LoopAnalysisManager LAM;
+  FunctionAnalysisManager FAM;
+  CGSCCAnalysisManager CGAM;
+  ModuleAnalysisManager MAM;
+  PB.registerModuleAnalyses(MAM);
+  PB.registerCGSCCAnalyses(CGAM);
+  PB.registerFunctionAnalyses(FAM);
+  PB.registerLoopAnalyses(LAM);
+  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
+
+  // Base the pipeline on O3 opt level.
+  ModulePassManager MPM =
+      PB.buildPerModuleDefaultPipeline(OptimizationLevel::O3);
+  // Register inserter and materializer passes.
+  {
+    FunctionPassManager FPM;
+    MPM.addPass(
+        SYCLSpecConstDataInserter{SpecConstBlob.data(), SpecConstBlob.size()});
+    FPM.addPass(SYCLSpecConstMaterializer{});
+    MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+  }
+  // Add generic optimizations,
+  {
+    FunctionPassManager FPM;
+    MPM.addPass(AlwaysInlinerPass{});
+    FPM.addPass(SROAPass{SROAOptions::ModifyCFG});
+    FPM.addPass(SCCPPass{});
+    FPM.addPass(ADCEPass{});
+    FPM.addPass(EarlyCSEPass{/*UseMemorySSA*/ true});
+    MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+  }
+  // followed by unrolling.
+  {
+    FunctionPassManager FPM;
+    FPM.addPass(createFunctionToLoopPassAdaptor(IndVarSimplifyPass{}));
+    LoopUnrollOptions UnrollOptions;
+    FPM.addPass(LoopUnrollPass{UnrollOptions});
+    MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+  }
+
+  MPM.run(Mod, MAM);
+
+  return true;
+}
diff --git a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.h b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.h
index 61ae8e0bcb274..28ad4dc1ce362 100644
--- a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.h
+++ b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.h
@@ -27,6 +27,12 @@ class FusionPipeline {
   static std::unique_ptr<SYCLModuleInfo>
   runFusionPasses(llvm::Module &Mod, SYCLModuleInfo &InputInfo,
                   BarrierFlags BarriersFlags);
+
+  ///
+  /// Run the necessary passes in a custom pass pipeline to perform
+  /// materialization of kernel specialization constants.
+  static bool runMaterializerPasses(llvm::Module &Mod,
+                                    std::vector<unsigned char> &SpecConstBlob);
 };
 } // namespace fusion
 } // namespace jit_compiler
diff --git a/sycl-fusion/passes/CMakeLists.txt b/sycl-fusion/passes/CMakeLists.txt
index e768044f6e297..e3e8af1518560 100644
--- a/sycl-fusion/passes/CMakeLists.txt
+++ b/sycl-fusion/passes/CMakeLists.txt
@@ -3,6 +3,7 @@ add_llvm_library(SYCLKernelFusion MODULE
   SYCLFusionPasses.cpp
   kernel-fusion/Builtins.cpp
   kernel-fusion/SYCLKernelFusion.cpp
+  kernel-fusion/SYCLSpecConstMaterializer.cpp
   kernel-info/SYCLKernelInfo.cpp
   internalization/Internalization.cpp
   syclcp/SYCLCP.cpp
@@ -50,6 +51,7 @@ add_llvm_library(SYCLKernelFusionPasses
   SYCLFusionPasses.cpp
   kernel-fusion/Builtins.cpp
   kernel-fusion/SYCLKernelFusion.cpp
+  kernel-fusion/SYCLSpecConstMaterializer.cpp
   kernel-info/SYCLKernelInfo.cpp
   internalization/Internalization.cpp
   syclcp/SYCLCP.cpp
diff --git a/sycl-fusion/passes/SYCLFusionPasses.cpp b/sycl-fusion/passes/SYCLFusionPasses.cpp
index 16a938ec991ce..c7959fd70ee0b 100644
--- a/sycl-fusion/passes/SYCLFusionPasses.cpp
+++ b/sycl-fusion/passes/SYCLFusionPasses.cpp
@@ -13,6 +13,7 @@
 
 #include "internalization/Internalization.h"
 #include "kernel-fusion/SYCLKernelFusion.h"
+#include "kernel-fusion/SYCLSpecConstMaterializer.h"
 #include "kernel-info/SYCLKernelInfo.h"
 #include "syclcp/SYCLCP.h"
 
@@ -49,6 +50,12 @@ llvm::PassPluginLibraryInfo getSYCLKernelFusionPluginInfo() {
                 MPM.addPass(SYCLModuleInfoPrinter());
                 return true;
               }
+              if (Name == "sycl-spec-const-materializer") {
+                FunctionPassManager FPM;
+                FPM.addPass(SYCLSpecConstMaterializer());
+                MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+                return true;
+              }
               return false;
             });
         PB.registerAnalysisRegistrationCallback([](ModuleAnalysisManager &MAM) {
diff --git a/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp b/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp
new file mode 100644
index 0000000000000..8cd6fe2a048b8
--- /dev/null
+++ b/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp
@@ -0,0 +1,317 @@
+//==-------------------- SYCLSpecConstMaterializer.cpp ---------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SYCLSpecConstMaterializer.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/SYCLLowerIR/TargetHelpers.h"
+
+#define DEBUG_TYPE "sycl-spec-const-materializer"
+
+using namespace llvm;
+
+cl::opt<size_t> UseTestConstValues(
+    "sycl-materializer-debug-value-size",
+    cl::desc("Size of the spec const blob, debug use only."));
+
+const bool SYCLSpecConstMaterializer::IsDebug =
+    getenv("SYCL_MATERIALIZER_DEBUG");
+
+// When run through the JIT pipeline we have no way of using this pass' debug
+// type, hence the introduction of the environment variable above and the macro
+// below.
+#define MATERIALIZER_DEBUG(X)                                                  \
+  do {                                                                         \
+    if (IsDebug)                                                               \
+      X;                                                                       \
+    else                                                                       \
+      LLVM_DEBUG(X);                                                           \
+  } while (false)
+
+#define SPEC_CONST_DATA_NODE_NAME "SYCL_SpecConst_data"
+
+PreservedAnalyses SYCLSpecConstDataInserter::run(Module &M,
+                                                 ModuleAnalysisManager &) {
+  if (M.getNamedMetadata(SPEC_CONST_DATA_NODE_NAME))
+    llvm_unreachable("Did not expecte the node to be present.");
+
+  auto &Context = M.getContext();
+  auto *SYCLMD = M.getOrInsertNamedMetadata(SPEC_CONST_DATA_NODE_NAME);
+  auto *StringMD = MDString::get(
+      Context, std::string{(const char *)SpecConstData, SpecConstDataSize});
+  auto *TupleMD = MDTuple::get(Context, {StringMD});
+  SYCLMD->addOperand(TupleMD);
+
+  return PreservedAnalyses::all();
+}
+
+static Constant *getScalarConstant(const unsigned char **ValPtr, Type *Ty) {
+  if (Ty->isIntegerTy()) {
+    unsigned NumBytes = Ty->getIntegerBitWidth() / 8;
+    uint64_t IntValue = 0;
+    std::memcpy(&IntValue, *ValPtr, NumBytes);
+    *ValPtr = *ValPtr + NumBytes;
+    return ConstantInt::get(Ty, IntValue);
+  }
+  if (Ty->isDoubleTy()) {
+    double DoubleValue = *(reinterpret_cast<const double *>(*ValPtr));
+    *ValPtr = *ValPtr + sizeof(double);
+    return ConstantFP::get(Ty, DoubleValue);
+  }
+  if (Ty->isFloatTy()) {
+    float FloatValue = *(reinterpret_cast<const float *>(*ValPtr));
+    *ValPtr = *ValPtr + sizeof(float);
+    return ConstantFP::get(Ty, FloatValue);
+  }
+  if (Ty->isHalfTy()) {
+    uint16_t HalfValue = *(reinterpret_cast<const uint16_t *>(*ValPtr));
+    *ValPtr = *ValPtr + sizeof(uint16_t);
+    return ConstantFP::get(Ty, HalfValue);
+  }
+
+  llvm_unreachable("Scalar type not found.");
+}
+
+static Constant *getConstantOfType(const unsigned char **ValPtr, Type *Ty) {
+  assert(ValPtr && Ty && "Invalid input.");
+  if (Ty->isIntegerTy() || Ty->isFloatTy() || Ty->isDoubleTy() ||
+      Ty->isHalfTy())
+    return getScalarConstant(ValPtr, Ty);
+  if (auto *ArrTy = dyn_cast<ArrayType>(Ty)) {
+    SmallVector<Constant *> Elems;
+    auto *ElemTy = ArrTy->getArrayElementType();
+    auto NumElems = ArrTy->getArrayNumElements();
+    for (uint64_t I = 0; I < NumElems; ++I)
+      Elems.push_back(getConstantOfType(ValPtr, ElemTy));
+    return ConstantArray::get(ArrayType::get(ElemTy, NumElems), Elems);
+  }
+  if (auto *StructTy = dyn_cast<StructType>(Ty)) {
+    SmallVector<Constant *> StructElems;
+    for (auto *ElemTy : StructTy->elements())
+      StructElems.push_back(getConstantOfType(ValPtr, ElemTy));
+    return ConstantStruct::get(StructTy, StructElems);
+  }
+
+  llvm_unreachable("Unknown type in getConstantOfType.");
+}
+
+void SYCLSpecConstMaterializer::allocateSpecConstant(StringRef KernelName) {
+  for (auto I : llvm::enumerate(TypesAndOffsets)) {
+    auto *const Ty = I.value().first;
+    assert(Ty->isSized());
+    const auto Offset = I.value().second;
+    assert(Offset < SpecConstDataSize && "Out of bounds access.");
+    const unsigned char *ValPtr = &SpecConstData[Offset];
+    auto *Initializer = getConstantOfType(&ValPtr, I.value().first);
+    // AMD's CONSTANT_ADDRESS and Nvidia's ADDRESS_SPACE_CONST happen to have
+    // the same value.
+    const unsigned AS = 4;
+    auto *SpecConstGlobal = new GlobalVariable(
+        *Mod, Ty, /*isConstant*/ true, GlobalValue::WeakODRLinkage, Initializer,
+        Twine("SpecConsBlob_" + std::string(KernelName) + "_" +
+              std::to_string(I.index())),
+        /*InsertBefore*/ nullptr, GlobalValue::NotThreadLocal, AS,
+        /*isExternallyInitialized*/ false);
+    TypesAndOffsetsToBlob[I.value()] = SpecConstGlobal;
+  }
+}
+
+void SYCLSpecConstMaterializer::fixupSpecConstantUses() {
+  IRBuilder B(Mod->getContext());
+  for (auto &LT : LoadsToTypes) {
+    auto *Load = LT.first;
+    auto &TyOff = LT.second;
+    auto *GV = TypesAndOffsetsToBlob[TyOff];
+    B.SetInsertPoint(Load);
+    auto *NewLoad = B.CreateLoad(TyOff.first, GV);
+    Load->replaceAllUsesWith(NewLoad);
+  }
+}
+
+SmallVector<LoadInst *>
+SYCLSpecConstMaterializer::collectGEPsLoads(GetElementPtrInst *GEP) {
+  SmallVector<LoadInst *> Loads;
+  SmallVector<Instruction *> WorkList;
+  WorkList.push_back(GEP);
+  while (!WorkList.empty()) {
+    Instruction *I = WorkList.pop_back_val();
+    for (auto *U : I->users()) {
+      auto *NewI = cast<Instruction>(&*U);
+      switch (NewI->getOpcode()) {
+      default: {
+        std::string Str;
+        raw_string_ostream Out(Str);
+        Out << "Unhandled instruction: ";
+        NewI->print(Out);
+        llvm_unreachable(Str.c_str());
+      }
+      case Instruction::BitCast: {
+        WorkList.push_back(NewI);
+        break;
+      }
+      case Instruction::Load: {
+        Loads.push_back(cast<LoadInst>(NewI));
+        break;
+      }
+      }
+    }
+  }
+
+  return Loads;
+}
+
+void SYCLSpecConstMaterializer::populateUses(Argument *A) {
+  SmallVector<AddrSpaceCastInst *> ASCasts;
+  for (auto *U : A->users()) {
+    auto *I = cast<Instruction>(&*U);
+    switch (I->getOpcode()) {
+    default: {
+      std::string Str;
+      raw_string_ostream Out(Str);
+      Out << "Unhandled instruction: ";
+      I->print(Out);
+      llvm_unreachable(Str.c_str());
+    }
+    case Instruction::AddrSpaceCast: {
+      ASCasts.push_back(cast<AddrSpaceCastInst>(I));
+      break;
+    }
+    }
+  }
+
+  const DataLayout &DL = Mod->getDataLayout();
+  for (auto *AS : ASCasts) {
+    for (auto *U : AS->users()) {
+      auto *I = cast<Instruction>(&*U);
+      switch (I->getOpcode()) {
+      default: {
+        MATERIALIZER_DEBUG(
+            dbgs()
+            << "Optimization opportunity missed, unhandled instruction: \n");
+        MATERIALIZER_DEBUG(I->dump());
+        MATERIALIZER_DEBUG(dbgs() << "Function:\n");
+        MATERIALIZER_DEBUG(I->getParent()->getParent()->dump());
+        break;
+      }
+      case Instruction::Load: {
+        TypeAtOffset TyO{
+            I->getType(),
+            /* Non GEP load starts at the beginnig of memory region */ 0};
+        TypesAndOffsets.insert(TyO);
+        LoadsToTypes[cast<LoadInst>(I)] = TyO;
+        break;
+      }
+      case Instruction::GetElementPtr: {
+        auto *GEP = cast<GetElementPtrInst>(I);
+        unsigned int ASL = GEP->getPointerAddressSpace();
+        unsigned OffsetBitWidth = DL.getIndexSizeInBits(ASL);
+        APInt Offset(OffsetBitWidth, 0);
+        bool FoundOffset = GEP->accumulateConstantOffset(DL, Offset);
+        if (!FoundOffset)
+          llvm_unreachable_internal("Offset unknown.");
+        auto Loads = collectGEPsLoads(GEP);
+        for (auto *Load : Loads) {
+          TypeAtOffset TyO{Load->getType(), Offset.getSExtValue()};
+          TypesAndOffsets.insert(TyO);
+          LoadsToTypes[Load] = TyO;
+        }
+        break;
+      }
+      }
+    }
+  }
+}
+
+void SYCLSpecConstMaterializer::reportAndReset() {
+  if (LoadsToTypes.empty()) {
+    MATERIALIZER_DEBUG(dbgs()
+                       << "Did not find any loads from spec const buffer.\n");
+  } else {
+    MATERIALIZER_DEBUG(dbgs() << "Replaced: " << LoadsToTypes.size()
+                              << " loads from spec const buffer.\n");
+    MATERIALIZER_DEBUG(dbgs() << "Load to global variable mappings:\n");
+    for (auto &LTT : LoadsToTypes) {
+      MATERIALIZER_DEBUG(dbgs() << "\tLoad:\n");
+      MATERIALIZER_DEBUG(LTT.first->dump());
+      MATERIALIZER_DEBUG(dbgs() << "\tGlobal Variable:\n");
+      MATERIALIZER_DEBUG(TypesAndOffsetsToBlob[LTT.second]->dump());
+      MATERIALIZER_DEBUG(dbgs() << "\n");
+    }
+  }
+  MATERIALIZER_DEBUG(dbgs() << "\n\n");
+
+  // Reset the state.
+  TypesAndOffsets.clear();
+  TypesAndOffsetsToBlob.clear();
+  LoadsToTypes.clear();
+}
+
+PreservedAnalyses
+SYCLSpecConstMaterializer::handleKernel(llvm::Function &Kernel) {
+  if (Kernel.arg_empty())
+    return PreservedAnalyses::all();
+  auto *SpecConstArg = std::prev(Kernel.arg_end());
+  if (!SpecConstArg || !SpecConstArg->hasName() ||
+      (SpecConstArg->getName() != "_arg__specialization_constants_buffer"))
+    return PreservedAnalyses::all();
+
+  if (!readMetadata())
+    return PreservedAnalyses::all();
+
+  if (!SpecConstData || SpecConstDataSize < 1)
+    llvm_unreachable("Specialisation constant data not found");
+
+  populateUses(SpecConstArg);
+
+  allocateSpecConstant(Kernel.getName());
+
+  fixupSpecConstantUses();
+
+  reportAndReset();
+
+  return PreservedAnalyses::none();
+}
+
+bool SYCLSpecConstMaterializer::readMetadata() {
+  auto *NamedMD = Mod->getNamedMetadata(SPEC_CONST_DATA_NODE_NAME);
+  if (!NamedMD || NamedMD->getNumOperands() != 1)
+    return false;
+
+  auto *MDN = dyn_cast<MDTuple>(NamedMD->getOperand(0));
+  if (!MDN || MDN->getNumOperands() != 1)
+    llvm_unreachable("Malformed data node.");
+
+  auto *MDS = dyn_cast<MDString>(MDN->getOperand(0));
+  if (!MDS)
+    llvm_unreachable("Malformed string node.");
+
+  SpecConstData = MDS->getString().bytes_begin();
+  SpecConstDataSize = MDS->getString().size();
+
+  return true;
+}
+
+PreservedAnalyses SYCLSpecConstMaterializer::run(Function &F,
+                                                 FunctionAnalysisManager &) {
+  Mod = F.getParent();
+  MATERIALIZER_DEBUG(dbgs() << "Working on function:\n==================\n"
+                            << (F.hasName() ? F.getName() : "unnamed kernel")
+                            << "\n\n");
+
+  // Invariant: This pass is only intended to operate on SYCL kernels being
+  // compiled to either `nvptx{,64}-nvidia-cuda`, or `amdgcn-amd-amdhsa`
+  // triples.
+  auto AT = TargetHelpers::getArchType(*Mod);
+  if (TargetHelpers::ArchType::Cuda != AT &&
+      TargetHelpers::ArchType::AMDHSA != AT) {
+    MATERIALIZER_DEBUG(dbgs() << "Unsupported architecture\n");
+    return PreservedAnalyses::all();
+  }
+
+  return handleKernel(F);
+}
diff --git a/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.h b/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.h
new file mode 100644
index 0000000000000..bf715543fb4ed
--- /dev/null
+++ b/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.h
@@ -0,0 +1,127 @@
+//==--------------------- SYCLSpecConstMaterializer.h ----------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SYCL_SPEC_CONST_MATERIALIZER_H
+#define SYCL_SPEC_CONST_MATERIALIZER_H
+
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/PassManager.h"
+#include <map>
+#include <set>
+
+namespace llvm {
+class Function;
+
+///
+/// Utility pass to insert specialization constants values into the module as a
+/// metadata node.
+class SYCLSpecConstDataInserter
+    : public PassInfoMixin<SYCLSpecConstDataInserter> {
+public:
+  SYCLSpecConstDataInserter(const unsigned char *SpecConstData,
+                            size_t SpecConstDataSize)
+      : SpecConstData(SpecConstData), SpecConstDataSize(SpecConstDataSize) {};
+
+  SYCLSpecConstDataInserter() {
+    SpecConstDataSize = 255;
+    for (unsigned i = 0; i < SpecConstDataSize; ++i)
+      DebugSpecConstData.push_back(i);
+
+    SpecConstData = DebugSpecConstData.data();
+  }
+
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &);
+
+private:
+  const unsigned char *SpecConstData;
+  size_t SpecConstDataSize;
+  SmallVector<unsigned char> DebugSpecConstData;
+};
+
+///
+/// Pass to materialize specialization constants. Specialization constants
+/// represent constants whose values can be set dynamically during execution of
+/// the SYCL application. The values of these constants are fixed when a SYCL
+/// kernel function is invoked, and they do not change during the execution of
+/// the kernel. This pass receives the values of all specialization constants
+/// used by a kernel and materializes them as concrete types. This is done in
+/// order to be able to enable other optimization opportunities (SCCP, SROA and
+/// CSE), we do not track instructions that can be removed as a result of
+/// materialization, as the pipeline runs DCE pass afterwords.
+class SYCLSpecConstMaterializer
+    : public PassInfoMixin<SYCLSpecConstMaterializer> {
+public:
+  SYCLSpecConstMaterializer() : SpecConstData(nullptr), SpecConstDataSize(0) {}
+
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &);
+
+private:
+  // Main entry point, checks for implicit specialization constant kernel
+  // argument and, if present, performs the optimizations.
+  PreservedAnalyses handleKernel(Function &Kernel);
+
+  bool readMetadata();
+
+  // Collects all the uses of the specialization constant kernel argument.
+  // This results with TypesAndOffsets and LoadsToType being populated.
+  void populateUses(Argument *A);
+
+  // Use TypesAndOffsets to allocate global variables of given types which get
+  // initialized with value taken from the specialization constant blob at a
+  // given offset.
+  void allocateSpecConstant(StringRef KernelName);
+
+  // Re-write uses of loads from the specialization constant kernel argument to
+  // the global variable.
+  void fixupSpecConstantUses();
+
+  // Walk down all uses of a given GEP instruction in order to find loads from
+  // the offsetted pointer.
+  SmallVector<LoadInst *> collectGEPsLoads(GetElementPtrInst *GEP);
+
+  // Helper to report debug message (if enabled) and reset the state.
+  void reportAndReset();
+
+private:
+  // Flag enabling debug output, guarded by: SYCL_MATERIALIZER_DEBUG environment
+  // variable.
+  static const bool IsDebug;
+
+  // Run time known values of specialization constants passed from SYCL rt,
+  // data pointer and size.
+  const unsigned char *SpecConstData;
+  size_t SpecConstDataSize;
+
+  // Module the current function belongs to.
+  Module *Mod{nullptr};
+
+  // Type of the specialization constant and the offset into the SpecConstBlob,
+  // at which the value is located.
+  using TypeAtOffset = std::pair<Type *, uint64_t>;
+  // Helper allowing sorting of TypeAtOffset containers.
+  struct TypeAtOffsetCmp {
+    bool operator()(const TypeAtOffset &LHS, const TypeAtOffset &RHS) const {
+      if (std::get<0>(LHS) == std::get<0>(RHS))
+        return std::get<1>(LHS) < std::get<1>(RHS);
+      return std::get<0>(LHS) < std::get<0>(RHS);
+    }
+  };
+
+  // Unique uses of spec const (type and offset).
+  std::set<TypeAtOffset, TypeAtOffsetCmp> TypesAndOffsets{};
+  // A map from type and offset to a specialization constant blob to a
+  // GlobalVariable containing its value.
+  std::map<TypeAtOffset, GlobalVariable *, TypeAtOffsetCmp>
+      TypesAndOffsetsToBlob{};
+  // A map of load instruction to its type and offset to a specialization
+  // constant blob.
+  std::map<LoadInst *, TypeAtOffset> LoadsToTypes{};
+};
+} // namespace llvm
+
+#endif // SYCL_SPEC_CONST_MATERIALIZER_H
diff --git a/sycl-fusion/test/materializer/basic.ll b/sycl-fusion/test/materializer/basic.ll
new file mode 100644
index 0000000000000..629b8f95e18bd
--- /dev/null
+++ b/sycl-fusion/test/materializer/basic.ll
@@ -0,0 +1,68 @@
+; RUN: %if hip_amd %{ opt -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext\
+; RUN: --mtriple amdgcn-amd-amdhsa -sycl-materializer-debug-value-size=256\
+; RUN: -passes=sycl-spec-const-materializer -S %s | FileCheck\
+; RUN: --check-prefix=CHECK-MATERIALIZER %s %}
+
+; RUN: %if cuda %{ opt -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext\
+; RUN: --mtriple nvptx64-nvidia-cuda -sycl-materializer-debug-value-size=256\
+; RUN: -passes=sycl-spec-const-materializer -S %s | FileCheck\
+; RUN: --check-prefix=CHECK-MATERIALIZER %s %}
+
+; RUN: %if hip_amd %{ opt -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext\
+; RUN: --mtriple amdgcn-amd-amdhsa -sycl-materializer-debug-value-size=256\
+; RUN: -passes=sycl-spec-const-materializer,early-cse,adce -S %s | FileCheck\
+; RUN: --check-prefix=CHECK-MATERIALIZER-CSE %s %}
+
+; RUN: %if cuda %{ opt -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext\
+; RUN: --mtriple nvptx64-nvidia-cuda -sycl-materializer-debug-value-size=256\
+; RUN: -passes=sycl-spec-const-materializer,early-cse,adce -S %s | FileCheck\
+; RUN: --check-prefix=CHECK-MATERIALIZER-CSE %s %}
+
+source_filename = "basic.ll"
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+
+; Check the basic replacement of specialization constant. We expect 2 global ;
+; variables (i32 and [2 x i32]), no loads from implicit kernel argument:
+; CHECK-MATERIALIZER.
+
+; For CHECK-MATERIALIZER-CSE also include early commons subexpression
+; elimination pass and expect the final literal to be stored to the output
+; pointer.
+
+;CHECK-MATERIALIZER: @SpecConsBlob___test_kernel_0 = weak_odr addrspace(4) constant i32 7
+;CHECK-MATERIALIZER: @SpecConsBlob___test_kernel_1 = weak_odr addrspace(4) constant [2 x i32] [i32 3, i32 1]
+
+
+;CHECK: __test_kernel
+define weak_odr protected amdgpu_kernel void @__test_kernel(ptr addrspace(1) noundef align 4 %out, ptr addrspace(1) noundef align 1 %_arg__specialization_constants_buffer) {
+entry:
+  ;CHECK-MATERIALIZER-CSE-NOT: addrspacecast ptr addrspace(1) %_arg__specialization_constants_buffer to ptr
+  ;CHECK-MATERIALIZER: [[REG1:%[0-9]+]] = load i32, ptr addrspace(4) @SpecConsBlob___test_kernel_0
+  ;CHECK-MATERIALIZER: [[REG2:%[0-9]+]] = load [2 x i32], ptr addrspace(4) @SpecConsBlob___test_kernel_1
+  %0 = addrspacecast ptr addrspace(1) %_arg__specialization_constants_buffer to ptr
+  %gep = getelementptr i8, ptr %0, i32 0
+  %bc = bitcast ptr %gep to ptr
+  ;CHECK-MATERIALIZER-CSE-NOT: load i32, ptr %bc
+  %load1 = load i32, ptr %bc, align 4
+  %gep1 = getelementptr i8, ptr %0, i32 4
+  %bc2 = bitcast ptr %gep1 to ptr
+  ;CHECK-MATERIALIZER-CSE-NOT: load [2 x i32], ptr %bc2
+  %load2 = load [2 x i32], ptr %bc2, align 4
+  ;CHECK-MATERIALIZER: load i32, ptr addrspace(4) @SpecConsBlob___test_kernel_0
+  %straight_load = load i32, ptr %0, align 4
+  %extract1 = extractvalue [2 x i32] %load2, 0
+  %extract2 = extractvalue [2 x i32] %load2, 1
+  %add1 = add nsw i32 %extract1, %load1
+  %add2 = add nsw i32 %add1, %extract2
+  %add3 = add nsw i32 %add2, %straight_load
+  ;CHECK-MATERIALIZER-CSE: store i32 18, ptr addrspace(1) %out,
+  ;CHECK-MATERIALIZER: %extract1 = extractvalue [2 x i32] [[REG2]], 0
+  ;CHECK-MATERIALIZER: %extract2 = extractvalue [2 x i32] [[REG2]], 1
+  ;CHECK-MATERIALIZER: %add1 = add nsw i32 %extract1, [[REG1]]
+
+  store i32 %add3, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+!SYCL_SpecConst_data = !{!1}
+!1 = !{!"\07\00\00\00\03\00\00\00\01\00\00\00"}
diff --git a/sycl-fusion/test/materializer/debug_output.ll b/sycl-fusion/test/materializer/debug_output.ll
new file mode 100644
index 0000000000000..6a94d86745b23
--- /dev/null
+++ b/sycl-fusion/test/materializer/debug_output.ll
@@ -0,0 +1,60 @@
+; RUN: %if hip_amd %{ env SYCL_MATERIALIZER_DEBUG=1 opt\
+; RUN: -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext\
+; RUN: --mtriple amdgcn-amd-amdhsa -sycl-materializer-debug-value-size=256\
+; RUN: -passes=sycl-spec-const-materializer,sccp -S %s 2> %t.stderr\
+; RUN: | FileCheck %s %}
+; RUN: %if hip_amd %{ FileCheck --input-file=%t.stderr --check-prefix=CHECK-DEBUG %s %}
+
+; RUN: %if cuda %{ env SYCL_MATERIALIZER_DEBUG=1 opt\
+; RUN: -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext\
+; RUN: --mtriple nvptx64-nvidia-cuda -sycl-materializer-debug-value-size=256\
+; RUN: -passes=sycl-spec-const-materializer,sccp -S %s 2> %t.stderr\
+; RUN: | FileCheck %s %}
+; RUN: %if hip_amd %{ FileCheck --input-file=%t.stderr --check-prefix=CHECK-DEBUG %s %}
+
+source_filename = "debug_output.ll"
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+
+; This is a derivative of basic.ll, which checks for the debug output of
+; specialization constant materializer pass.
+
+;CHECK-DEBUG: Working on function:
+;CHECK-DEBUG-NEXT: ==================
+;CHECK-DEBUG-NEXT: __test_kernel
+;CHECK-DEBUG: Replaced: 2 loads from spec const buffer.
+;CHECK-DEBUG-NEXT: Load to global variable mappings:
+;CHECK-DEBUG-NEXT: Load:
+;CHECK-DEBUG-NEXT: %load1 = load i32, ptr %bc, align 4
+;CHECK-DEBUG-NEXT: Global Variable:
+;CHECK-DEBUG-NEXT: @SpecConsBlob___test_kernel_0 = weak_odr addrspace(4) constant i32 7
+;CHECK-DEBUG: Load:
+;CHECK-DEBUG-NEXT: %load2 = load [2 x i32], ptr %bc2, align 4
+;CHECK-DEBUG-NEXT: Global Variable:
+;CHECK-DEBUG-NEXT: @SpecConsBlob___test_kernel_1 = weak_odr addrspace(4) constant [2 x i32] [i32 3, i32 1]
+
+;CHECK: @SpecConsBlob___test_kernel_0 = weak_odr addrspace(4) constant i32 7
+;CHECK: @SpecConsBlob___test_kernel_1 = weak_odr addrspace(4) constant [2 x i32] [i32 3, i32 1]
+
+;CHECK: __test_kernel
+define weak_odr protected amdgpu_kernel void @__test_kernel(ptr addrspace(1) noundef align 4 %out, ptr addrspace(1) noundef align 1 %_arg__specialization_constants_buffer) {
+entry:
+  %0 = addrspacecast ptr addrspace(1) %_arg__specialization_constants_buffer to ptr
+  %gep = getelementptr i8, ptr %0, i32 0
+  %bc = bitcast ptr %gep to ptr
+  %load1 = load i32, ptr %bc, align 4
+  %gep1 = getelementptr i8, ptr %0, i32 4
+  %bc2 = bitcast ptr %gep1 to ptr
+  %load2 = load [2 x i32], ptr %bc2, align 4
+  ;CHECK: extractvalue [2 x i32] [i32 3, i32 1], 0
+  ;CHECK: extractvalue [2 x i32] [i32 3, i32 1], 1
+  ;CHECK: add nsw i32 %extract1, 7
+  %extract1 = extractvalue [2 x i32] %load2, 0
+  %extract2 = extractvalue [2 x i32] %load2, 1
+  %add1 = add nsw i32 %extract1, %load1
+  %add2 = add nsw i32 %add1, %extract2
+  store i32 %add2, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+!SYCL_SpecConst_data = !{!1}
+!1 = !{!"\07\00\00\00\03\00\00\00\01\00\00\00"}
diff --git a/sycl-fusion/test/materializer/multi_type.ll b/sycl-fusion/test/materializer/multi_type.ll
new file mode 100644
index 0000000000000..a9263ee8c9b51
--- /dev/null
+++ b/sycl-fusion/test/materializer/multi_type.ll
@@ -0,0 +1,82 @@
+; RUN: %if hip_amd %{ opt -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext\
+; RUN: --mtriple amdgcn-amd-amdhsa -sycl-materializer-debug-value-size=256\
+; RUN: -passes=sycl-spec-const-materializer -S %s | FileCheck\
+; RUN: --check-prefix=CHECK-MATERIALIZER %s %}
+
+; RUN: %if cuda %{ opt -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext\
+; RUN: --mtriple nvptx64-nvidia-cuda -sycl-materializer-debug-value-size=256\
+; RUN: -passes=sycl-spec-const-materializer -S %s | FileCheck\
+; RUN: --check-prefix=CHECK-MATERIALIZER %s %}
+
+; RUN: %if hip_amd %{ opt -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext\
+; RUN: --mtriple amdgcn-amd-amdhsa -sycl-materializer-debug-value-size=256\
+; RUN: -passes=sycl-spec-const-materializer,early-cse -S %s | FileCheck\
+; RUN: --check-prefix=CHECK-MATERIALIZER-CSE %s %}
+
+; RUN: %if cuda %{ opt -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext\
+; RUN: --mtriple nvptx64-nvidia-cuda -sycl-materializer-debug-value-size=256\
+; RUN: -passes=sycl-spec-const-materializer,early-cse -S %s | FileCheck\
+; RUN: --check-prefix=CHECK-MATERIALIZER-CSE %s %}
+
+source_filename = "multi_type.ll"
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+
+; The same logic as in the basic.ll, but with more complicated types; an array
+; of struct with multiple members. This is important as the pass has to
+; correctly set the type of offsetted memory region corresponding to
+; specialization constant (see getConstantOfType in the pass).
+; For CHECK-MATERIALIZER-CSE expect literal store only.
+
+;CHECK-MATERIALIZER: @SpecConsBlob___test_kernel_0 = weak_odr addrspace(4) constant i32
+;CHECK-MATERIALIZER: @SpecConsBlob___test_kernel_1 = weak_odr addrspace(4) constant %"struct.std::array"
+;CHECK-MATERIALIZER: @SpecConsBlob___test_kernel_2 = weak_odr addrspace(4) constant [2 x i32]
+
+%"struct.std::array" = type { [5 x %struct.ThreePrimitives] }
+%struct.ThreePrimitives = type <{ double, i64, half }>
+
+;CHECK: __test_kernel
+define weak_odr protected amdgpu_kernel void @__test_kernel(ptr addrspace(1) noundef align 4 %out, ptr addrspace(1) noundef align 1 %_arg__specialization_constants_buffer) {
+entry:
+  ;CHECK-MATERIALIZER-CSE-NOT: load
+  ;CHECK-MATERIALIZER-CSE: store double 1.100000e+01, ptr addrspace(1) %out
+
+  ;CHECK-MATERIALIZER-CSE-NOT: addrspacecast ptr addrspace(1) %_arg__specialization_constants_buffer to ptr
+  %0 = addrspacecast ptr addrspace(1) %_arg__specialization_constants_buffer to ptr
+  %gep = getelementptr i8, ptr %0, i32 0
+  %bc = bitcast ptr %gep to ptr
+  ;CHECK-MATERIALIZER: load i32, ptr addrspace(4) @SpecConsBlob___test_kernel_0
+  %load = load i32, ptr %bc, align 4
+
+  %gep2 = getelementptr i8, ptr %0, i32 4
+  %bc2 = bitcast ptr %gep2 to ptr
+  ;CHECK-MATERIALIZER: load [2 x i32], ptr addrspace(4) @SpecConsBlob___test_kernel_2
+  %load2 = load [2 x i32], ptr %bc2, align 4
+  %extract1 = extractvalue [2 x i32] %load2, 0
+  %extract2 = extractvalue [2 x i32] %load2, 1
+
+  ; See the TODO above, this should be const propagated
+  ;CHECK-MATERIALIZER: load %"struct.std::array", ptr addrspace(4) @SpecConsBlob___test_kernel_1, align 1
+  %gep3 = getelementptr i8, ptr %0, i32 18
+  %bc3 = bitcast ptr %gep3 to ptr
+  %load3 = load %"struct.std::array", ptr %bc3, align 1
+  %D = extractvalue %"struct.std::array" %load3, 0, 2, 0
+  %L = extractvalue %"struct.std::array" %load3, 0, 2, 1
+  %H = extractvalue %"struct.std::array" %load3, 0, 2, 2
+
+  %add1 = add nsw i32 %extract1, %load
+  %add2 = add nsw i32 %add1, %extract2
+  %conv1 = sitofp i32 %add2 to double
+
+  %add3 = fadd double %D, %conv1
+  %conv2 = sitofp i64 %L to double
+  %add4 = fadd double %conv2, %add3
+  %conv3 = fpext half %H to double
+  %add5 = fadd double %conv3, %add4
+
+  store double %add5, ptr addrspace(1) %out, align 4
+
+  ret void
+}
+
+!SYCL_SpecConst_data = !{!1}
+!1 = !{!"\07\00\00\00\03\00\00\00\01\00\00\00\06\00\00\00\04\00\00\00"}

From 1aaf3e5ccbee3ee2ffe7c19b191a606112346422 Mon Sep 17 00:00:00 2001
From: Jakub Chlanda <jakub@codeplay.com>
Date: Thu, 20 Jun 2024 13:09:02 +0100
Subject: [PATCH 04/22] [SYCL] Add functionality to create/cache/retrieve
 materialized kernels

---
 sycl/source/detail/jit_compiler.cpp           | 23 +++++--
 .../program_manager/program_manager.cpp       | 65 +++++++++++++++++++
 .../program_manager/program_manager.hpp       | 13 ++++
 sycl/source/detail/scheduler/commands.cpp     |  9 ++-
 4 files changed, 101 insertions(+), 9 deletions(-)

diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp
index fafe37d0db14c..ac2c2e3489d65 100644
--- a/sycl/source/detail/jit_compiler.cpp
+++ b/sycl/source/detail/jit_compiler.cpp
@@ -702,6 +702,10 @@ sycl::detail::pi::PiKernel jit_compiler::materializeSpecConstants(
         "Cannot jit kernel with invalid kernel function name");
     return nullptr;
   }
+  auto &PM = detail::ProgramManager::getInstance();
+  if (auto CachedKernel =
+          PM.getCachedMaterializedKernel(KernelName, SpecConstBlob))
+    return CachedKernel;
 
   auto &RawDeviceImage = BinImage->getRawData();
   auto DeviceImageSize = static_cast<size_t>(RawDeviceImage.BinaryEnd -
@@ -741,10 +745,12 @@ sycl::detail::pi::PiKernel jit_compiler::materializeSpecConstants(
   }
 
   auto &MaterializerKernelInfo = MaterializerResult.getKernelInfo();
-  auto PIDeviceBinaries =
-      createPIDeviceBinary(MaterializerKernelInfo, TargetFormat);
-  auto &PM = detail::ProgramManager::getInstance();
-  PM.addImages(PIDeviceBinaries);
+  pi_device_binary_struct MaterializedRawDeviceImage{RawDeviceImage};
+  MaterializedRawDeviceImage.BinaryStart =
+      MaterializerKernelInfo.BinaryInfo.BinaryStart;
+  MaterializedRawDeviceImage.BinaryEnd =
+      MaterializerKernelInfo.BinaryInfo.BinaryStart +
+      MaterializerKernelInfo.BinaryInfo.BinarySize;
 
   const bool OrigCacheCfg = SYCLConfig<SYCL_CACHE_IN_MEM>::get();
   if (OrigCacheCfg) {
@@ -755,8 +761,13 @@ sycl::detail::pi::PiKernel jit_compiler::materializeSpecConstants(
     }
     SYCLConfig<SYCL_CACHE_IN_MEM>::reset();
   }
-  auto NewKernel = std::get<0>(PM.getOrCreateKernel(
-      Queue->getContextImplPtr(), Queue->getDeviceImplPtr(), KernelName));
+
+  RTDeviceBinaryImage MaterializedRTDevBinImage{&MaterializedRawDeviceImage};
+  const auto &Context = Queue->get_context();
+  const auto &Device = Queue->get_device();
+  auto NewKernel = PM.getOrCreateMaterializedKernel(
+      MaterializedRTDevBinImage, Context, Device, KernelName, SpecConstBlob);
+
   if (OrigCacheCfg) {
     if (0 != setenv("SYCL_CACHE_IN_MEM", "1", true)) {
       throw sycl::exception(
diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp
index 82246af25173d..2dfbab8272003 100644
--- a/sycl/source/detail/program_manager/program_manager.cpp
+++ b/sycl/source/detail/program_manager/program_manager.cpp
@@ -2404,6 +2404,71 @@ ProgramManager::getOrCreateKernel(const context &Context,
                          BuildResult->Val.second);
 }
 
+sycl::detail::pi::PiKernel ProgramManager::getCachedMaterializedKernel(
+    const std::string &KernelName,
+    const std::vector<unsigned char> &SpecializationConsts) {
+  if (DbgProgMgr > 0)
+    std::cerr << ">>> ProgramManager::getCachedMaterializedKernel\n"
+              << "KernelName: " << KernelName << "\n";
+
+  {
+    std::lock_guard<std::mutex> KernelIDsGuard(m_KernelIDsMutex);
+    if (auto KnownMaterializations = m_MaterializedKernels.find(KernelName);
+        KnownMaterializations != m_MaterializedKernels.end()) {
+      if (DbgProgMgr > 0)
+        std::cerr << ">>> There are:" << KnownMaterializations->second.size()
+                  << " materialized kernels.\n";
+      if (auto Kernel =
+              KnownMaterializations->second.find(SpecializationConsts);
+          Kernel != KnownMaterializations->second.end()) {
+        if (DbgProgMgr > 0)
+          std::cerr << ">>> Kernel in the chache\n";
+        return Kernel->second;
+      }
+    }
+  }
+
+  if (DbgProgMgr > 0)
+    std::cerr << ">>> Kernel not in the chache\n";
+
+  return nullptr;
+}
+
+sycl::detail::pi::PiKernel ProgramManager::getOrCreateMaterializedKernel(
+    const RTDeviceBinaryImage &Img, const context &Context,
+    const device &Device, const std::string &KernelName,
+    const std::vector<unsigned char> &SpecializationConsts) {
+  // Check if we already have the kernel in the cache.
+  if (DbgProgMgr > 0)
+    std::cerr << ">>> ProgramManager::getOrCreateMaterializedKernel\n"
+              << "KernelName: " << KernelName << "\n";
+
+  if (auto CachedKernel =
+          getCachedMaterializedKernel(KernelName, SpecializationConsts))
+    return CachedKernel;
+
+  if (DbgProgMgr > 0)
+    std::cerr << ">>> Adding the kernel to the cache.\n";
+  auto Program = createPIProgram(Img, Context, Device);
+  auto DeviceImpl = detail::getSyclObjImpl(Device);
+  auto &Plugin = DeviceImpl->getPlugin();
+  ProgramPtr ProgramManaged(
+      Program, Plugin->getPiPlugin().PiFunctionTable.piProgramRelease);
+  // TODO: JKB: Flags and zeros.
+  auto BuildProgram =
+      build(std::move(ProgramManaged), detail::getSyclObjImpl(Context), "", "",
+            DeviceImpl->getHandleRef(), 0);
+  sycl::detail::pi::PiKernel PiKernel{nullptr};
+  Plugin->call<errc::kernel_not_supported, PiApiKind::piKernelCreate>(
+      BuildProgram.get(), KernelName.c_str(), &PiKernel);
+  {
+    std::lock_guard<std::mutex> KernelIDsGuard(m_KernelIDsMutex);
+    m_MaterializedKernels[KernelName][SpecializationConsts] = PiKernel;
+  }
+
+  return PiKernel;
+}
+
 bool doesDevSupportDeviceRequirements(const device &Dev,
                                       const RTDeviceBinaryImage &Img) {
   return !checkDevSupportDeviceRequirements(Dev, Img).has_value();
diff --git a/sycl/source/detail/program_manager/program_manager.hpp b/sycl/source/detail/program_manager/program_manager.hpp
index 573e4ddfed284..64684a66a9df4 100644
--- a/sycl/source/detail/program_manager/program_manager.hpp
+++ b/sycl/source/detail/program_manager/program_manager.hpp
@@ -155,6 +155,15 @@ class ProgramManager {
                     const std::string &KernelName,
                     const NDRDescT &NDRDesc = {});
 
+  sycl::detail::pi::PiKernel getCachedMaterializedKernel(
+      const std::string &KernelName,
+      const std::vector<unsigned char> &SpecializationConsts);
+
+  sycl::detail::pi::PiKernel getOrCreateMaterializedKernel(
+      const RTDeviceBinaryImage &Img, const context &Context,
+      const device &Device, const std::string &KernelName,
+      const std::vector<unsigned char> &SpecializationConsts);
+
   sycl::detail::pi::PiProgram
   getPiProgramFromPiKernel(sycl::detail::pi::PiKernel Kernel,
                            const ContextImplPtr Context);
@@ -423,6 +432,10 @@ class ProgramManager {
 
   /// Protects m_HostPipes and m_Ptr2HostPipe.
   std::mutex m_HostPipesMutex;
+
+  using MaterializedEntries =
+      std::map<std::vector<unsigned char>, pi::PiKernel>;
+  std::unordered_map<std::string, MaterializedEntries> m_MaterializedKernels;
 };
 } // namespace detail
 } // namespace _V1
diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp
index 42338e57f17a1..d6c3540c33444 100644
--- a/sycl/source/detail/scheduler/commands.cpp
+++ b/sycl/source/detail/scheduler/commands.cpp
@@ -2419,11 +2419,14 @@ static pi_result SetKernelParamsAndLaunch(
     const std::string &KernelName) {
   const PluginPtr &Plugin = Queue->getPlugin();
 
-  auto &SpecConstBlob = DeviceImageImpl->get_spec_const_blob_ref();
-  if (SYCLConfig<SYCL_JIT_KERNELS>::get() && !SpecConstBlob.empty()) {
+  if (SYCLConfig<SYCL_JIT_KERNELS>::get()) {
+    std::vector<unsigned char> Empty;
     Kernel = Scheduler::getInstance().completeSpecConstMaterialization(
-        Queue, BinImage, KernelName, SpecConstBlob);
+        Queue, BinImage, KernelName,
+        DeviceImageImpl.get() ? DeviceImageImpl->get_spec_const_blob_ref()
+                              : Empty);
   }
+
   auto setFunc = [&Plugin, Kernel, &DeviceImageImpl, &getMemAllocationFunc,
                   &Queue](detail::ArgDesc &Arg, size_t NextTrueIndex) {
     SetArgBasedOnType(Plugin, Kernel, DeviceImageImpl, getMemAllocationFunc,

From 8221f0f1550b2f2148b07b81ad043c41536e4580 Mon Sep 17 00:00:00 2001
From: Jakub Chlanda <jakub@codeplay.com>
Date: Mon, 24 Jun 2024 14:38:10 +0100
Subject: [PATCH 05/22] [SYCL] Introduce SYCL_JIT_TARGET_{CPU,FEATURES} env
 variables

---
 .../jit-compiler/include/KernelFusion.h       |  8 +-
 sycl-fusion/jit-compiler/lib/KernelFusion.cpp | 11 +--
 .../lib/translation/KernelTranslation.cpp     | 86 +++++++++++--------
 .../lib/translation/KernelTranslation.h       | 11 ++-
 sycl/source/detail/config.def                 |  2 +
 sycl/source/detail/config.hpp                 | 57 ++++++++++++
 sycl/source/detail/jit_compiler.cpp           | 10 ++-
 .../program_manager/program_manager.cpp       | 10 ++-
 8 files changed, 141 insertions(+), 54 deletions(-)

diff --git a/sycl-fusion/jit-compiler/include/KernelFusion.h b/sycl-fusion/jit-compiler/include/KernelFusion.h
index 8b727e6ec03e5..e11aa2154b3f5 100644
--- a/sycl-fusion/jit-compiler/include/KernelFusion.h
+++ b/sycl-fusion/jit-compiler/include/KernelFusion.h
@@ -63,10 +63,10 @@ FusionResult fuseKernels(View<SYCLKernelInfo> KernelInformation,
                          View<ParameterInternalization> Internalization,
                          View<jit_compiler::JITConstant> JITConstants);
 
-FusionResult
-materializeSpecConstants(const char *KernelName,
-                         jit_compiler::SYCLKernelBinaryInfo &BinInfo,
-                         std::vector<unsigned char> &SpecConstBlob);
+FusionResult materializeSpecConstants(
+    const char *KernelName, jit_compiler::SYCLKernelBinaryInfo &BinInfo,
+    std::vector<unsigned char> &SpecConstBlob, const std::string &TargetCPU,
+    const std::string &TargetFeatures);
 
 /// Clear all previously set options.
 void resetJITConfiguration();
diff --git a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
index 72bac027b6095..c329c11c0f94d 100644
--- a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
+++ b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
@@ -70,10 +70,10 @@ static bool isTargetFormatSupported(BinaryFormat TargetFormat) {
   }
 }
 
-extern "C" FusionResult
-materializeSpecConstants(const char *KernelName,
-                         jit_compiler::SYCLKernelBinaryInfo &BinInfo,
-                         std::vector<unsigned char> &SpecConstBlob) {
+extern "C" FusionResult materializeSpecConstants(
+    const char *KernelName, jit_compiler::SYCLKernelBinaryInfo &BinInfo,
+    std::vector<unsigned char> &SpecConstBlob, const std::string &TargetCPU,
+    const std::string &TargetFeatures) {
   auto &JITCtx = JITContext::getInstance();
 
   TargetInfo TargetInfo = ConfigHelper::get<option::JITTargetInfo>();
@@ -105,7 +105,8 @@ materializeSpecConstants(const char *KernelName,
 
   SYCLKernelInfo &MaterializerKernelInfo = *ModuleInfo.getKernelFor(KernelName);
   if (auto Error = translation::KernelTranslator::translateKernel(
-          MaterializerKernelInfo, *NewMod, JITCtx, TargetFormat)) {
+          MaterializerKernelInfo, *NewMod, JITCtx, TargetFormat, TargetCPU,
+          TargetFeatures)) {
     return errorToFusionResult(std::move(Error),
                                "Translation to output format failed");
   }
diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
index 977d5a4a73eef..fe1e792cc2846 100644
--- a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
+++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
@@ -168,10 +168,11 @@ KernelTranslator::loadSPIRVKernel(llvm::LLVMContext &LLVMCtx,
   return SPIRVLLVMTranslator::loadSPIRVKernel(LLVMCtx, Kernel);
 }
 
-llvm::Error KernelTranslator::translateKernel(SYCLKernelInfo &Kernel,
-                                              llvm::Module &Mod,
-                                              JITContext &JITCtx,
-                                              BinaryFormat Format) {
+llvm::Error
+KernelTranslator::translateKernel(SYCLKernelInfo &Kernel, llvm::Module &Mod,
+                                  JITContext &JITCtx, BinaryFormat Format,
+                                  const std::string &TargetCPU,
+                                  const std::string &TargetFeatures) {
 
   KernelBinary *KernelBin = nullptr;
   switch (Format) {
@@ -186,7 +187,7 @@ llvm::Error KernelTranslator::translateKernel(SYCLKernelInfo &Kernel,
   }
   case BinaryFormat::PTX: {
     llvm::Expected<KernelBinary *> BinaryOrError =
-        translateToPTX(Kernel, Mod, JITCtx);
+        translateToPTX(Kernel, Mod, JITCtx, TargetCPU, TargetFeatures);
     if (auto Error = BinaryOrError.takeError()) {
       return Error;
     }
@@ -195,7 +196,7 @@ llvm::Error KernelTranslator::translateKernel(SYCLKernelInfo &Kernel,
   }
   case BinaryFormat::AMDGCN: {
     llvm::Expected<KernelBinary *> BinaryOrError =
-        translateToAMDGCN(Kernel, Mod, JITCtx);
+        translateToAMDGCN(Kernel, Mod, JITCtx, TargetCPU, TargetFeatures);
     if (auto Error = BinaryOrError.takeError())
       return Error;
     KernelBin = *BinaryOrError;
@@ -226,9 +227,9 @@ KernelTranslator::translateToSPIRV(llvm::Module &Mod, JITContext &JITCtx) {
   return SPIRVLLVMTranslator::translateLLVMtoSPIRV(Mod, JITCtx);
 }
 
-llvm::Expected<KernelBinary *>
-KernelTranslator::translateToPTX(SYCLKernelInfo &KernelInfo, llvm::Module &Mod,
-                                 JITContext &JITCtx) {
+llvm::Expected<KernelBinary *> KernelTranslator::translateToPTX(
+    SYCLKernelInfo &KernelInfo, llvm::Module &Mod, JITContext &JITCtx,
+    const std::string &TargetCPU, const std::string &TargetFeatures) {
 #ifndef FUSION_JIT_SUPPORT_PTX
   (void)KernelInfo;
   (void)Mod;
@@ -257,23 +258,32 @@ KernelTranslator::translateToPTX(SYCLKernelInfo &KernelInfo, llvm::Module &Mod,
         ErrorMessage.c_str());
   }
 
-  llvm::StringRef TargetCPU{"sm_50"};
-  llvm::StringRef TargetFeatures{"+sm_50,+ptx76"};
-  if (auto *KernelFunc = Mod.getFunction(KernelInfo.Name.c_str())) {
-    if (KernelFunc->hasFnAttribute(TARGET_CPU_ATTRIBUTE)) {
-      TargetCPU =
-          KernelFunc->getFnAttribute(TARGET_CPU_ATTRIBUTE).getValueAsString();
+  // Give priority to user specified values (through environment variables:
+  // SYCL_JIT_TARGET_CPU and SYCL_JIT_TARGET_FEATURES).
+  llvm::StringRef CPU{TargetCPU};
+  llvm::StringRef Features{TargetFeatures};
+
+  auto *KernelFunc = Mod.getFunction(KernelInfo.Name.c_str());
+  // If they were not set, use default and consult the module for alternatives
+  // (if present).
+  if (CPU.empty()) {
+    CPU = "sm_50";
+    if (KernelFunc && KernelFunc->hasFnAttribute(TARGET_CPU_ATTRIBUTE)) {
+      CPU = KernelFunc->getFnAttribute(TARGET_CPU_ATTRIBUTE).getValueAsString();
     }
-    if (KernelFunc->hasFnAttribute(TARGET_FEATURE_ATTRIBUTE)) {
-      TargetFeatures = KernelFunc->getFnAttribute(TARGET_FEATURE_ATTRIBUTE)
-                           .getValueAsString();
+  }
+  if (Features.empty()) {
+    Features = "+sm_50,+ptx76";
+    if (KernelFunc && KernelFunc->hasFnAttribute(TARGET_FEATURE_ATTRIBUTE)) {
+      Features = KernelFunc->getFnAttribute(TARGET_FEATURE_ATTRIBUTE)
+                     .getValueAsString();
     }
   }
 
   // FIXME: Check whether we can provide more accurate target information here
   auto *TargetMachine = Target->createTargetMachine(
-      TargetTriple, TargetCPU, TargetFeatures, {}, llvm::Reloc::PIC_,
-      std::nullopt, llvm::CodeGenOptLevel::Default);
+      TargetTriple, CPU, Features, {}, llvm::Reloc::PIC_, std::nullopt,
+      llvm::CodeGenOptLevel::Default);
 
   llvm::legacy::PassManager PM;
 
@@ -298,9 +308,9 @@ KernelTranslator::translateToPTX(SYCLKernelInfo &KernelInfo, llvm::Module &Mod,
 #endif // FUSION_JIT_SUPPORT_PTX
 }
 
-llvm::Expected<KernelBinary *>
-KernelTranslator::translateToAMDGCN(SYCLKernelInfo &KernelInfo,
-                                    llvm::Module &Mod, JITContext &JITCtx) {
+llvm::Expected<KernelBinary *> KernelTranslator::translateToAMDGCN(
+    SYCLKernelInfo &KernelInfo, llvm::Module &Mod, JITContext &JITCtx,
+    const std::string &TargetCPU, const std::string &TargetFeatures) {
 #ifndef FUSION_JIT_SUPPORT_AMDGCN
   (void)KernelInfo;
   (void)Mod;
@@ -329,25 +339,29 @@ KernelTranslator::translateToAMDGCN(SYCLKernelInfo &KernelInfo,
         "Failed to load and translate AMDGCN LLVM IR module with error %s",
         ErrorMessage.c_str());
 
-  // Set to the lowest tested target according to the GetStartedGuide, section
-  // "Build DPC++ toolchain with support for HIP AMD"
-  llvm::StringRef TargetCPU{"gfx906"};
-  llvm::StringRef TargetFeatures{""};
-  if (auto *KernelFunc = Mod.getFunction(KernelInfo.Name.c_str())) {
-    if (KernelFunc->hasFnAttribute(TARGET_CPU_ATTRIBUTE)) {
-      TargetCPU =
-          KernelFunc->getFnAttribute(TARGET_CPU_ATTRIBUTE).getValueAsString();
+  llvm::StringRef CPU{TargetCPU};
+  llvm::StringRef Features{TargetFeatures};
+
+  auto *KernelFunc = Mod.getFunction(KernelInfo.Name.c_str());
+  if (CPU.empty()) {
+    // Set to the lowest tested target according to the GetStartedGuide, section
+    // "Build DPC++ toolchain with support for HIP AMD"
+    CPU = "gfx906";
+    if (KernelFunc && KernelFunc->hasFnAttribute(TARGET_CPU_ATTRIBUTE)) {
+      CPU = KernelFunc->getFnAttribute(TARGET_CPU_ATTRIBUTE).getValueAsString();
     }
-    if (KernelFunc->hasFnAttribute(TARGET_FEATURE_ATTRIBUTE)) {
-      TargetFeatures = KernelFunc->getFnAttribute(TARGET_FEATURE_ATTRIBUTE)
-                           .getValueAsString();
+  }
+  if (Features.empty()) {
+    if (KernelFunc && KernelFunc->hasFnAttribute(TARGET_FEATURE_ATTRIBUTE)) {
+      Features = KernelFunc->getFnAttribute(TARGET_FEATURE_ATTRIBUTE)
+                     .getValueAsString();
     }
   }
 
   // FIXME: Check whether we can provide more accurate target information here
   auto *TargetMachine = Target->createTargetMachine(
-      TargetTriple, TargetCPU, TargetFeatures, {}, llvm::Reloc::PIC_,
-      std::nullopt, llvm::CodeGenOptLevel::Default);
+      TargetTriple, CPU, Features, {}, llvm::Reloc::PIC_, std::nullopt,
+      llvm::CodeGenOptLevel::Default);
 
   std::string AMDObj;
   {
diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h
index 809c8fab2e42f..a881dd6176b70 100644
--- a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h
+++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h
@@ -25,7 +25,9 @@ class KernelTranslator {
   loadKernels(llvm::LLVMContext &LLVMCtx, std::vector<SYCLKernelInfo> &Kernels);
 
   static llvm::Error translateKernel(SYCLKernelInfo &Kernel, llvm::Module &Mod,
-                                     JITContext &JITCtx, BinaryFormat Format);
+                                     JITContext &JITCtx, BinaryFormat Format,
+                                     const std::string &TargetCPU = {},
+                                     const std::string &TargetFeatures = {});
 
 private:
   ///
@@ -42,11 +44,14 @@ class KernelTranslator {
                                                          JITContext &JITCtx);
 
   static llvm::Expected<KernelBinary *>
-  translateToPTX(SYCLKernelInfo &Kernel, llvm::Module &Mod, JITContext &JITCtx);
+  translateToPTX(SYCLKernelInfo &Kernel, llvm::Module &Mod, JITContext &JITCtx,
+                 const std::string &TargetCPU = {},
+                 const std::string &TargetFeatures = {});
 
   static llvm::Expected<KernelBinary *>
   translateToAMDGCN(SYCLKernelInfo &KernelInfo, llvm::Module &Mod,
-                    JITContext &JITCtx);
+                    JITContext &JITCtx, const std::string &TargetCPU = {},
+                    const std::string &TargetFeatures = {});
 };
 } // namespace translation
 } // namespace jit_compiler
diff --git a/sycl/source/detail/config.def b/sycl/source/detail/config.def
index 3e6ca9e081369..9b9003795c27d 100644
--- a/sycl/source/detail/config.def
+++ b/sycl/source/detail/config.def
@@ -43,3 +43,5 @@ CONFIG(ONEAPI_DEVICE_SELECTOR, 1024, __ONEAPI_DEVICE_SELECTOR)
 CONFIG(SYCL_ENABLE_FUSION_CACHING, 1, __SYCL_ENABLE_FUSION_CACHING)
 CONFIG(SYCL_CACHE_IN_MEM, 1, __SYCL_CACHE_IN_MEM)
 CONFIG(SYCL_JIT_KERNELS, 1, __SYCL_JIT_KERNELS)
+CONFIG(SYCL_JIT_TARGET_CPU, 1024, __SYCL_JIT_TARGET_CPU)
+CONFIG(SYCL_JIT_TARGET_FEATURES, 1024, __SYCL_JIT_TARGET_FEATURES)
diff --git a/sycl/source/detail/config.hpp b/sycl/source/detail/config.hpp
index 46177036115fe..2b32f932fb96b 100644
--- a/sycl/source/detail/config.hpp
+++ b/sycl/source/detail/config.hpp
@@ -639,6 +639,63 @@ template <> class SYCLConfig<SYCL_JIT_KERNELS> {
     return ValStr;
   }
 };
+
+template <> class SYCLConfig<SYCL_JIT_TARGET_CPU> {
+  using BaseT = SYCLConfigBase<SYCL_CACHE_IN_MEM>;
+
+public:
+  static std::string get() {
+    const std::string DefaultValue{""};
+
+    const char *ValStr = getCachedValue();
+
+    if (!ValStr)
+      return DefaultValue;
+
+    return std::string{ValStr};
+  }
+
+  static void reset() { (void)getCachedValue(/*ResetCache=*/true); }
+
+  static const char *getName() { return BaseT::MConfigName; }
+
+private:
+  static const char *getCachedValue(bool ResetCache = false) {
+    static const char *ValStr = BaseT::getRawValue();
+    if (ResetCache)
+      ValStr = BaseT::getRawValue();
+    return ValStr;
+  }
+};
+
+template <> class SYCLConfig<SYCL_JIT_TARGET_FEATURES> {
+  using BaseT = SYCLConfigBase<SYCL_CACHE_IN_MEM>;
+
+public:
+  static std::string get() {
+    const std::string DefaultValue{""};
+
+    const char *ValStr = getCachedValue();
+
+    if (!ValStr)
+      return DefaultValue;
+
+    return std::string{ValStr};
+  }
+
+  static void reset() { (void)getCachedValue(/*ResetCache=*/true); }
+
+  static const char *getName() { return BaseT::MConfigName; }
+
+private:
+  static const char *getCachedValue(bool ResetCache = false) {
+    static const char *ValStr = BaseT::getRawValue();
+    if (ResetCache)
+      ValStr = BaseT::getRawValue();
+    return ValStr;
+  }
+};
+
 #undef INVALID_CONFIG_EXCEPTION
 
 } // namespace detail
diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp
index ac2c2e3489d65..a5b59f7f22d7b 100644
--- a/sycl/source/detail/jit_compiler.cpp
+++ b/sycl/source/detail/jit_compiler.cpp
@@ -722,7 +722,6 @@ sycl::detail::pi::PiKernel jit_compiler::materializeSpecConstants(
       BinaryImageFormat, 0, RawDeviceImage.BinaryStart, DeviceImageSize};
 
   ::jit_compiler::TargetInfo TargetInfo = getTargetInfo(Queue);
-  ::jit_compiler::BinaryFormat TargetFormat = TargetInfo.getFormat();
   AddToConfigHandle(
       ::jit_compiler::option::JITTargetInfo::set(std::move(TargetInfo)));
   bool DebugEnabled =
@@ -732,8 +731,13 @@ sycl::detail::pi::PiKernel jit_compiler::materializeSpecConstants(
   AddToConfigHandle(::jit_compiler::option::JITEnableCaching::set(
       detail::SYCLConfig<detail::SYCL_ENABLE_FUSION_CACHING>::get()));
 
-  auto MaterializerResult =
-      MaterializeSpecConstHandle(KernelName.c_str(), BinInfo, SpecConstBlob);
+  std::string TargetCPU =
+      detail::SYCLConfig<detail::SYCL_JIT_TARGET_CPU>::get();
+  std::string TargetFeatures =
+      detail::SYCLConfig<detail::SYCL_JIT_TARGET_FEATURES>::get();
+
+  auto MaterializerResult = MaterializeSpecConstHandle(
+      KernelName.c_str(), BinInfo, SpecConstBlob, TargetCPU, TargetFeatures);
   if (MaterializerResult.failed()) {
     std::string Message{"Compilation for kernel failed with message:\n"};
     Message.append(MaterializerResult.getErrorMessage());
diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp
index 2dfbab8272003..0442ae508d11b 100644
--- a/sycl/source/detail/program_manager/program_manager.cpp
+++ b/sycl/source/detail/program_manager/program_manager.cpp
@@ -2454,10 +2454,14 @@ sycl::detail::pi::PiKernel ProgramManager::getOrCreateMaterializedKernel(
   auto &Plugin = DeviceImpl->getPlugin();
   ProgramPtr ProgramManaged(
       Program, Plugin->getPiPlugin().PiFunctionTable.piProgramRelease);
-  // TODO: JKB: Flags and zeros.
+
+  std::string CompileOpts;
+  std::string LinkOpts;
+  applyOptionsFromEnvironment(CompileOpts, LinkOpts);
   auto BuildProgram =
-      build(std::move(ProgramManaged), detail::getSyclObjImpl(Context), "", "",
-            DeviceImpl->getHandleRef(), 0);
+      build(std::move(ProgramManaged), detail::getSyclObjImpl(Context),
+            CompileOpts, LinkOpts, DeviceImpl->getHandleRef(),
+            /*For non SPIR-V devices DeviceLibReqdMask is always 0*/ 0);
   sycl::detail::pi::PiKernel PiKernel{nullptr};
   Plugin->call<errc::kernel_not_supported, PiApiKind::piKernelCreate>(
       BuildProgram.get(), KernelName.c_str(), &PiKernel);

From 434bc9f52b97547f68016180837f4e05a8a1d09a Mon Sep 17 00:00:00 2001
From: Jakub Chlanda <jakub@codeplay.com>
Date: Thu, 27 Jun 2024 08:57:02 +0000
Subject: [PATCH 06/22] [SYCL] Document
 SYCL_JIT_{KERNELS,TARGET_CPU,TARGET_FEATURES} env vars

---
 sycl/doc/EnvironmentVariables.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md
index 455f4ddf0bd43..4cdb81f98af86 100644
--- a/sycl/doc/EnvironmentVariables.md
+++ b/sycl/doc/EnvironmentVariables.md
@@ -25,6 +25,9 @@ compiler and runtime.
 | `SYCL_EAGER_INIT` | Integer | Enable by specifying non-zero value. Tells the SYCL runtime to do as much as possible initialization at objects construction as opposed to doing lazy initialization on the fly. This may mean doing some redundant work at warmup but ensures fastest possible execution on the following hot and reportable paths. It also instructs PI plugins to do the same. Default is "0".  |
 | `SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE` | See [below](#sycl_reduction_preferred_workgroup_size) | Controls the preferred work-group size of reductions. |
 | `SYCL_ENABLE_FUSION_CACHING` | '1' or '0' | Enable ('1') or disable ('0') caching of JIT compilations for kernel fusion. Caching avoids repeatedly running the JIT compilation pipeline if the same sequence of kernels is fused multiple times. Default value is '1'. |
+| `SYCL_JIT_KERNELS` | '1' or '0' | Enable ('1') or disable ('0') JIT compilation of kernels. Only supported for Nvidia and AMD backends. When JIT-ing is enabled SYCL runtime will try to cache and reuse JIT-compiled kernels, furthermore if a kernel uses specialization constants the compiler will attempt to materialize the values in place, turning them to de-facto compile time constants. Default is '0'. |
+| `SYCL_JIT_TARGET_CPU` | Any(\*) | Allows setting the target CPU to be used when JIT-ing kernels. Examples include setting SM version for Nvidia, or target architecture for AMD. |
+| `SYCL_JIT_TARGET_FEATURES` | Any(\*) | Allows setting desired target features to be used when JIT-ing kernels. Examples include setting PTX version for Nvidia. |
 
 `(*) Note: Any means this environment variable is effective when set to any non-null value.`
 

From c3c9abc61305383d7dcdab49291736e91dbf619f Mon Sep 17 00:00:00 2001
From: Jakub Chlanda <jakub@codeplay.com>
Date: Fri, 5 Jul 2024 14:49:49 +0100
Subject: [PATCH 07/22] PR feedback

---
 .../jit-compiler/include/KernelFusion.h       | 42 +++++-----
 sycl-fusion/jit-compiler/lib/KernelFusion.cpp | 50 ++++++------
 .../lib/fusion/FusionPipeline.cpp             |  8 +-
 .../jit-compiler/lib/fusion/FusionPipeline.h  |  3 +-
 .../SYCLSpecConstMaterializer.cpp             | 26 +++---
 .../kernel-fusion/SYCLSpecConstMaterializer.h | 15 +---
 sycl-fusion/test/materializer/multi_type.ll   |  1 -
 sycl/include/sycl/detail/helpers.hpp          |  9 ++-
 sycl/source/detail/helpers.cpp                | 66 +++++++++++++++
 sycl/source/detail/jit_compiler.cpp           | 80 ++-----------------
 sycl/source/detail/jit_compiler.hpp           |  7 +-
 sycl/source/detail/scheduler/commands.cpp     | 42 +++-------
 sycl/source/detail/scheduler/commands.hpp     |  2 +-
 sycl/source/detail/scheduler/scheduler.cpp    | 10 +--
 sycl/source/detail/scheduler/scheduler.hpp    |  2 +-
 sycl/source/handler.cpp                       | 11 ++-
 16 files changed, 174 insertions(+), 200 deletions(-)

diff --git a/sycl-fusion/jit-compiler/include/KernelFusion.h b/sycl-fusion/jit-compiler/include/KernelFusion.h
index e11aa2154b3f5..d602c9388db0c 100644
--- a/sycl-fusion/jit-compiler/include/KernelFusion.h
+++ b/sycl-fusion/jit-compiler/include/KernelFusion.h
@@ -19,19 +19,18 @@
 
 namespace jit_compiler {
 
-class FusionResult {
+class JITResult {
 public:
-  explicit FusionResult(const char *ErrorMessage)
-      : Type{FusionResultType::FAILED}, KernelInfo{},
-        ErrorMessage{ErrorMessage} {}
+  explicit JITResult(const char *ErrorMessage)
+      : Type{JITResultType::FAILED}, KernelInfo{}, ErrorMessage{ErrorMessage} {}
 
-  explicit FusionResult(const SYCLKernelInfo &KernelInfo, bool Cached = false)
-      : Type{(Cached) ? FusionResultType::CACHED : FusionResultType::NEW},
+  explicit JITResult(const SYCLKernelInfo &KernelInfo, bool Cached = false)
+      : Type{(Cached) ? JITResultType::CACHED : JITResultType::NEW},
         KernelInfo(KernelInfo), ErrorMessage{} {}
 
-  bool failed() const { return Type == FusionResultType::FAILED; }
+  bool failed() const { return Type == JITResultType::FAILED; }
 
-  bool cached() const { return Type == FusionResultType::CACHED; }
+  bool cached() const { return Type == JITResultType::CACHED; }
 
   const char *getErrorMessage() const {
     assert(failed() && "No error message present");
@@ -44,9 +43,9 @@ class FusionResult {
   }
 
 private:
-  enum class FusionResultType { FAILED, CACHED, NEW };
+  enum class JITResultType { FAILED, CACHED, NEW };
 
-  FusionResultType Type;
+  JITResultType Type;
   SYCLKernelInfo KernelInfo;
   sycl::detail::string ErrorMessage;
 };
@@ -56,17 +55,18 @@ extern "C" {
 #ifdef __clang__
 #pragma clang diagnostic ignored "-Wreturn-type-c-linkage"
 #endif // __clang__
-FusionResult fuseKernels(View<SYCLKernelInfo> KernelInformation,
-                         const char *FusedKernelName,
-                         View<ParameterIdentity> Identities,
-                         BarrierFlags BarriersFlags,
-                         View<ParameterInternalization> Internalization,
-                         View<jit_compiler::JITConstant> JITConstants);
-
-FusionResult materializeSpecConstants(
-    const char *KernelName, jit_compiler::SYCLKernelBinaryInfo &BinInfo,
-    std::vector<unsigned char> &SpecConstBlob, const std::string &TargetCPU,
-    const std::string &TargetFeatures);
+JITResult fuseKernels(View<SYCLKernelInfo> KernelInformation,
+                      const char *FusedKernelName,
+                      View<ParameterIdentity> Identities,
+                      BarrierFlags BarriersFlags,
+                      View<ParameterInternalization> Internalization,
+                      View<jit_compiler::JITConstant> JITConstants);
+
+JITResult materializeSpecConstants(const char *KernelName,
+                                   jit_compiler::SYCLKernelBinaryInfo &BinInfo,
+                                   const View<unsigned char> &SpecConstBlob,
+                                   const char *TargetCPU,
+                                   const char *TargetFeatures);
 
 /// Clear all previously set options.
 void resetJITConfiguration();
diff --git a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
index c329c11c0f94d..2ae3f3fe46d64 100644
--- a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
+++ b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
@@ -24,8 +24,8 @@ using namespace jit_compiler;
 using FusedFunction = helper::FusionHelper::FusedFunction;
 using FusedFunctionList = std::vector<FusedFunction>;
 
-static FusionResult errorToFusionResult(llvm::Error &&Err,
-                                        const std::string &Msg) {
+static JITResult errorToFusionResult(llvm::Error &&Err,
+                                     const std::string &Msg) {
   std::stringstream ErrMsg;
   ErrMsg << Msg << "\nDetailed information:\n";
   llvm::handleAllErrors(std::move(Err),
@@ -34,7 +34,7 @@ static FusionResult errorToFusionResult(llvm::Error &&Err,
                           // compiled without exception support.
                           ErrMsg << "\t" << StrErr.getMessage() << "\n";
                         });
-  return FusionResult{ErrMsg.str().c_str()};
+  return JITResult{ErrMsg.str().c_str()};
 }
 
 static std::vector<jit_compiler::NDRange>
@@ -70,18 +70,19 @@ static bool isTargetFormatSupported(BinaryFormat TargetFormat) {
   }
 }
 
-extern "C" FusionResult materializeSpecConstants(
-    const char *KernelName, jit_compiler::SYCLKernelBinaryInfo &BinInfo,
-    std::vector<unsigned char> &SpecConstBlob, const std::string &TargetCPU,
-    const std::string &TargetFeatures) {
+extern "C" JITResult
+materializeSpecConstants(const char *KernelName,
+                         jit_compiler::SYCLKernelBinaryInfo &BinInfo,
+                         const View<unsigned char> &SpecConstBlob,
+                         const char *TargetCPU, const char *TargetFeatures) {
   auto &JITCtx = JITContext::getInstance();
 
   TargetInfo TargetInfo = ConfigHelper::get<option::JITTargetInfo>();
   BinaryFormat TargetFormat = TargetInfo.getFormat();
   if (TargetFormat != BinaryFormat::PTX &&
       TargetFormat != BinaryFormat::AMDGCN) {
-    return FusionResult(
-        "Fusion output target format not supported by this build");
+    return JITResult("Output target format not supported by this build. "
+                     "Available targets are: PTX or AMDGCN.");
   }
 
   ::jit_compiler::SYCLKernelInfo KernelInfo{
@@ -89,7 +90,7 @@ extern "C" FusionResult materializeSpecConstants(
       ::jit_compiler::NDRange{}, BinInfo};
   SYCLModuleInfo ModuleInfo;
   ModuleInfo.kernels().insert(ModuleInfo.kernels().end(), KernelInfo);
-  // Load all input kernels from their respective SPIR-V modules into a single
+  // Load all input kernels from their respective modules into a single
   // LLVM IR module.
   llvm::Expected<std::unique_ptr<llvm::Module>> ModOrError =
       translation::KernelTranslator::loadKernels(*JITCtx.getLLVMContext(),
@@ -98,9 +99,10 @@ extern "C" FusionResult materializeSpecConstants(
     return errorToFusionResult(std::move(Error), "Failed to load kernels");
   }
   std::unique_ptr<llvm::Module> NewMod = std::move(*ModOrError);
-  if (!fusion::FusionPipeline::runMaterializerPasses(*NewMod, SpecConstBlob) ||
+  if (!fusion::FusionPipeline::runMaterializerPasses(
+          *NewMod, SpecConstBlob.begin(), SpecConstBlob.size()) ||
       !NewMod->getFunction(KernelName)) {
-    return FusionResult{"Materializer passes should not fail"};
+    return JITResult{"Materializer passes should not fail"};
   }
 
   SYCLKernelInfo &MaterializerKernelInfo = *ModuleInfo.getKernelFor(KernelName);
@@ -111,14 +113,15 @@ extern "C" FusionResult materializeSpecConstants(
                                "Translation to output format failed");
   }
 
-  return FusionResult{MaterializerKernelInfo};
+  return JITResult{MaterializerKernelInfo};
 }
 
-extern "C" FusionResult
-fuseKernels(View<SYCLKernelInfo> KernelInformation, const char *FusedKernelName,
-            View<ParameterIdentity> Identities, BarrierFlags BarriersFlags,
-            View<ParameterInternalization> Internalization,
-            View<jit_compiler::JITConstant> Constants) {
+extern "C" JITResult fuseKernels(View<SYCLKernelInfo> KernelInformation,
+                                 const char *FusedKernelName,
+                                 View<ParameterIdentity> Identities,
+                                 BarrierFlags BarriersFlags,
+                                 View<ParameterInternalization> Internalization,
+                                 View<jit_compiler::JITConstant> Constants) {
 
   std::vector<std::string> KernelsToFuse;
   llvm::transform(KernelInformation, std::back_inserter(KernelsToFuse),
@@ -137,8 +140,7 @@ fuseKernels(View<SYCLKernelInfo> KernelInformation, const char *FusedKernelName,
   }
 
   if (!isTargetFormatSupported(TargetFormat)) {
-    return FusionResult(
-        "Fusion output target format not supported by this build");
+    return JITResult("Fusion output target format not supported by this build");
   }
 
   auto &JITCtx = JITContext::getInstance();
@@ -161,7 +163,7 @@ fuseKernels(View<SYCLKernelInfo> KernelInformation, const char *FusedKernelName,
         // before returning the kernel info to the runtime.
         CachedKernel->NDR = FusedNDR->getNDR();
       }
-      return FusionResult{*CachedKernel, /*Cached*/ true};
+      return JITResult{*CachedKernel, /*Cached*/ true};
     }
     helper::printDebugMessage(
         "Compiling new kernel, no suitable cached kernel found");
@@ -209,13 +211,13 @@ fuseKernels(View<SYCLKernelInfo> KernelInformation, const char *FusedKernelName,
                                               BarriersFlags);
 
   if (!NewMod->getFunction(FusedKernelName)) {
-    return FusionResult{"Kernel fusion failed"};
+    return JITResult{"Kernel fusion failed"};
   }
 
   // Get the updated kernel info for the fused kernel and add the information to
   // the existing KernelInfo.
   if (!NewModInfo->hasKernelFor(FusedKernelName)) {
-    return FusionResult{"No KernelInfo for fused kernel"};
+    return JITResult{"No KernelInfo for fused kernel"};
   }
 
   SYCLKernelInfo &FusedKernelInfo = *NewModInfo->getKernelFor(FusedKernelName);
@@ -232,7 +234,7 @@ fuseKernels(View<SYCLKernelInfo> KernelInformation, const char *FusedKernelName,
     JITCtx.addCacheEntry(CacheKey, FusedKernelInfo);
   }
 
-  return FusionResult{FusedKernelInfo};
+  return JITResult{FusedKernelInfo};
 }
 
 extern "C" void resetJITConfiguration() { ConfigHelper::reset(); }
diff --git a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp
index f9a0d1fd74b43..dc94eda2917e2 100644
--- a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp
+++ b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp
@@ -143,8 +143,9 @@ FusionPipeline::runFusionPasses(Module &Mod, SYCLModuleInfo &InputInfo,
   return std::make_unique<SYCLModuleInfo>(std::move(*NewModInfo.ModuleInfo));
 }
 
-bool FusionPipeline::runMaterializerPasses(
-    llvm::Module &Mod, std::vector<unsigned char> &SpecConstBlob) {
+bool FusionPipeline::runMaterializerPasses(llvm::Module &Mod,
+                                           const unsigned char *SpecConstData,
+                                           size_t SpecConstDataSize) {
   PassBuilder PB;
   LoopAnalysisManager LAM;
   FunctionAnalysisManager FAM;
@@ -162,8 +163,7 @@ bool FusionPipeline::runMaterializerPasses(
   // Register inserter and materializer passes.
   {
     FunctionPassManager FPM;
-    MPM.addPass(
-        SYCLSpecConstDataInserter{SpecConstBlob.data(), SpecConstBlob.size()});
+    MPM.addPass(SYCLSpecConstDataInserter{SpecConstData, SpecConstDataSize});
     FPM.addPass(SYCLSpecConstMaterializer{});
     MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
   }
diff --git a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.h b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.h
index 28ad4dc1ce362..e1284484f7494 100644
--- a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.h
+++ b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.h
@@ -32,7 +32,8 @@ class FusionPipeline {
   /// Run the necessary passes in a custom pass pipeline to perform
   /// materialization of kernel specialization constants.
   static bool runMaterializerPasses(llvm::Module &Mod,
-                                    std::vector<unsigned char> &SpecConstBlob);
+                                    const unsigned char *SpecConstData,
+                                    size_t SpecConstDataSize);
 };
 } // namespace fusion
 } // namespace jit_compiler
diff --git a/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp b/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp
index 8cd6fe2a048b8..1227e396eeefe 100644
--- a/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp
+++ b/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp
@@ -10,6 +10,7 @@
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/SYCLLowerIR/TargetHelpers.h"
+#include <llvm/ADT/StringRef.h>
 
 #define DEBUG_TYPE "sycl-spec-const-materializer"
 
@@ -33,19 +34,19 @@ const bool SYCLSpecConstMaterializer::IsDebug =
       LLVM_DEBUG(X);                                                           \
   } while (false)
 
-#define SPEC_CONST_DATA_NODE_NAME "SYCL_SpecConst_data"
+constexpr llvm::StringLiteral SPEC_CONST_DATA_NODE_NAME{"SYCL_SpecConst_data"};
 
 PreservedAnalyses SYCLSpecConstDataInserter::run(Module &M,
                                                  ModuleAnalysisManager &) {
   if (M.getNamedMetadata(SPEC_CONST_DATA_NODE_NAME))
-    llvm_unreachable("Did not expecte the node to be present.");
+    llvm_unreachable("Did not expect the node to be present.");
 
   auto &Context = M.getContext();
-  auto *SYCLMD = M.getOrInsertNamedMetadata(SPEC_CONST_DATA_NODE_NAME);
+  auto *SpecConstMD = M.getOrInsertNamedMetadata(SPEC_CONST_DATA_NODE_NAME);
   auto *StringMD = MDString::get(
-      Context, std::string{(const char *)SpecConstData, SpecConstDataSize});
+      Context, StringRef{(const char *)SpecConstData, SpecConstDataSize});
   auto *TupleMD = MDTuple::get(Context, {StringMD});
-  SYCLMD->addOperand(TupleMD);
+  SpecConstMD->addOperand(TupleMD);
 
   return PreservedAnalyses::all();
 }
@@ -110,11 +111,10 @@ void SYCLSpecConstMaterializer::allocateSpecConstant(StringRef KernelName) {
     auto *Initializer = getConstantOfType(&ValPtr, I.value().first);
     // AMD's CONSTANT_ADDRESS and Nvidia's ADDRESS_SPACE_CONST happen to have
     // the same value.
-    const unsigned AS = 4;
+    constexpr unsigned AS = 4;
     auto *SpecConstGlobal = new GlobalVariable(
         *Mod, Ty, /*isConstant*/ true, GlobalValue::WeakODRLinkage, Initializer,
-        Twine("SpecConsBlob_" + std::string(KernelName) + "_" +
-              std::to_string(I.index())),
+        Twine("SpecConsBlob_" + KernelName + "_" + Twine(I.index())),
         /*InsertBefore*/ nullptr, GlobalValue::NotThreadLocal, AS,
         /*isExternallyInitialized*/ false);
     TypesAndOffsetsToBlob[I.value()] = SpecConstGlobal;
@@ -282,13 +282,9 @@ bool SYCLSpecConstMaterializer::readMetadata() {
   if (!NamedMD || NamedMD->getNumOperands() != 1)
     return false;
 
-  auto *MDN = dyn_cast<MDTuple>(NamedMD->getOperand(0));
-  if (!MDN || MDN->getNumOperands() != 1)
-    llvm_unreachable("Malformed data node.");
-
-  auto *MDS = dyn_cast<MDString>(MDN->getOperand(0));
-  if (!MDS)
-    llvm_unreachable("Malformed string node.");
+  auto *MDN = cast<MDTuple>(NamedMD->getOperand(0));
+  assert(MDN->getNumOperands() != 1 && "Malformed data node.");
+  auto *MDS = cast<MDString>(MDN->getOperand(0));
 
   SpecConstData = MDS->getString().bytes_begin();
   SpecConstDataSize = MDS->getString().size();
diff --git a/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.h b/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.h
index bf715543fb4ed..138b6271466e9 100644
--- a/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.h
+++ b/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.h
@@ -25,7 +25,7 @@ class SYCLSpecConstDataInserter
 public:
   SYCLSpecConstDataInserter(const unsigned char *SpecConstData,
                             size_t SpecConstDataSize)
-      : SpecConstData(SpecConstData), SpecConstDataSize(SpecConstDataSize) {};
+      : SpecConstData(SpecConstData), SpecConstDataSize(SpecConstDataSize){};
 
   SYCLSpecConstDataInserter() {
     SpecConstDataSize = 255;
@@ -103,21 +103,12 @@ class SYCLSpecConstMaterializer
   // Type of the specialization constant and the offset into the SpecConstBlob,
   // at which the value is located.
   using TypeAtOffset = std::pair<Type *, uint64_t>;
-  // Helper allowing sorting of TypeAtOffset containers.
-  struct TypeAtOffsetCmp {
-    bool operator()(const TypeAtOffset &LHS, const TypeAtOffset &RHS) const {
-      if (std::get<0>(LHS) == std::get<0>(RHS))
-        return std::get<1>(LHS) < std::get<1>(RHS);
-      return std::get<0>(LHS) < std::get<0>(RHS);
-    }
-  };
 
   // Unique uses of spec const (type and offset).
-  std::set<TypeAtOffset, TypeAtOffsetCmp> TypesAndOffsets{};
+  std::set<TypeAtOffset> TypesAndOffsets{};
   // A map from type and offset to a specialization constant blob to a
   // GlobalVariable containing its value.
-  std::map<TypeAtOffset, GlobalVariable *, TypeAtOffsetCmp>
-      TypesAndOffsetsToBlob{};
+  std::map<TypeAtOffset, GlobalVariable *> TypesAndOffsetsToBlob{};
   // A map of load instruction to its type and offset to a specialization
   // constant blob.
   std::map<LoadInst *, TypeAtOffset> LoadsToTypes{};
diff --git a/sycl-fusion/test/materializer/multi_type.ll b/sycl-fusion/test/materializer/multi_type.ll
index a9263ee8c9b51..19972cbe3832d 100644
--- a/sycl-fusion/test/materializer/multi_type.ll
+++ b/sycl-fusion/test/materializer/multi_type.ll
@@ -54,7 +54,6 @@ entry:
   %extract1 = extractvalue [2 x i32] %load2, 0
   %extract2 = extractvalue [2 x i32] %load2, 1
 
-  ; See the TODO above, this should be const propagated
   ;CHECK-MATERIALIZER: load %"struct.std::array", ptr addrspace(4) @SpecConsBlob___test_kernel_1, align 1
   %gep3 = getelementptr i8, ptr %0, i32 18
   %bc3 = bitcast ptr %gep3 to ptr
diff --git a/sycl/include/sycl/detail/helpers.hpp b/sycl/include/sycl/detail/helpers.hpp
index 9c7d0eddd59e2..60780ed338814 100644
--- a/sycl/include/sycl/detail/helpers.hpp
+++ b/sycl/include/sycl/detail/helpers.hpp
@@ -39,9 +39,12 @@ template <typename Type, std::size_t NumElements> class marray;
 enum class memory_order;
 
 namespace detail {
-
+class CGExecKernel;
 class buffer_impl;
 class context_impl;
+class queue_impl;
+using QueueImplPtr = std::shared_ptr<sycl::detail::queue_impl>;
+class RTDeviceBinaryImage;
 // The function returns list of events that can be passed to OpenCL API as
 // dependency list and waits for others.
 __SYCL_EXPORT std::vector<sycl::detail::pi::PiEvent>
@@ -254,6 +257,10 @@ template <size_t count, class F> void loop(F &&f) {
   loop_impl(std::make_index_sequence<count>{}, std::forward<F>(f));
 }
 inline constexpr bool is_power_of_two(int x) { return (x & (x - 1)) == 0; }
+
+std::tuple<const RTDeviceBinaryImage *, sycl::detail::pi::PiProgram>
+retrieveKernelBinary(const QueueImplPtr &, const char *KernelName,
+                     CGExecKernel *CGKernel = nullptr);
 } // namespace detail
 
 } // namespace _V1
diff --git a/sycl/source/detail/helpers.cpp b/sycl/source/detail/helpers.cpp
index 1bdb2ddbd4697..1f097297dedd2 100644
--- a/sycl/source/detail/helpers.cpp
+++ b/sycl/source/detail/helpers.cpp
@@ -11,11 +11,14 @@
 
 #include <detail/buffer_impl.hpp>
 #include <detail/context_impl.hpp>
+#include <detail/device_binary_image.hpp>
 #include <detail/event_impl.hpp>
+#include <detail/kernel_bundle_impl.hpp>
 #include <detail/queue_impl.hpp>
 #include <sycl/event.hpp>
 
 #include <memory>
+#include <tuple>
 
 namespace sycl {
 inline namespace _V1 {
@@ -71,6 +74,69 @@ void markBufferAsInternal(const std::shared_ptr<buffer_impl> &BufImpl) {
   BufImpl->markAsInternal();
 }
 
+std::tuple<const RTDeviceBinaryImage *, sycl::detail::pi::PiProgram>
+retrieveKernelBinary(const QueueImplPtr &Queue, const char *KernelName,
+                     CGExecKernel *KernelCG) {
+  bool isNvidia =
+      Queue->getDeviceImplPtr()->getBackend() == backend::ext_oneapi_cuda;
+  bool isHIP =
+      Queue->getDeviceImplPtr()->getBackend() == backend::ext_oneapi_hip;
+  if (isNvidia || isHIP) {
+    auto KernelID = ProgramManager::getInstance().getSYCLKernelID(KernelName);
+    std::vector<kernel_id> KernelIds{KernelID};
+    auto DeviceImages =
+        ProgramManager::getInstance().getRawDeviceImages(KernelIds);
+    auto DeviceImage = std::find_if(
+        DeviceImages.begin(), DeviceImages.end(),
+        [isNvidia](RTDeviceBinaryImage *DI) {
+          const std::string &TargetSpec = isNvidia ? std::string("llvm_nvptx64")
+                                                   : std::string("llvm_amdgcn");
+          return DI->getFormat() == PI_DEVICE_BINARY_TYPE_LLVMIR_BITCODE &&
+                 DI->getRawData().DeviceTargetSpec == TargetSpec;
+        });
+    if (DeviceImage == DeviceImages.end()) {
+      return {nullptr, nullptr};
+    }
+    auto ContextImpl = Queue->getContextImplPtr();
+    auto Context = detail::createSyclObjFromImpl<context>(ContextImpl);
+    auto DeviceImpl = Queue->getDeviceImplPtr();
+    auto Device = detail::createSyclObjFromImpl<device>(DeviceImpl);
+    sycl::detail::pi::PiProgram Program =
+        detail::ProgramManager::getInstance().createPIProgram(**DeviceImage,
+                                                              Context, Device);
+    return {*DeviceImage, Program};
+  }
+
+  assert(KernelCG && "CGExecKernel must be provided.");
+  const RTDeviceBinaryImage *DeviceImage = nullptr;
+  sycl::detail::pi::PiProgram Program = nullptr;
+  if (KernelCG->getKernelBundle() != nullptr) {
+    // Retrieve the device image from the kernel bundle.
+    auto KernelBundle = KernelCG->getKernelBundle();
+    kernel_id KernelID =
+        detail::ProgramManager::getInstance().getSYCLKernelID(KernelName);
+
+    auto SyclKernel = detail::getSyclObjImpl(
+        KernelBundle->get_kernel(KernelID, KernelBundle));
+
+    DeviceImage = SyclKernel->getDeviceImage()->get_bin_image_ref();
+    Program = SyclKernel->getDeviceImage()->get_program_ref();
+  } else if (KernelCG->MSyclKernel != nullptr) {
+    DeviceImage = KernelCG->MSyclKernel->getDeviceImage()->get_bin_image_ref();
+    Program = KernelCG->MSyclKernel->getDeviceImage()->get_program_ref();
+  } else {
+    auto ContextImpl = Queue->getContextImplPtr();
+    auto Context = detail::createSyclObjFromImpl<context>(ContextImpl);
+    auto DeviceImpl = Queue->getDeviceImplPtr();
+    auto Device = detail::createSyclObjFromImpl<device>(DeviceImpl);
+    DeviceImage = &detail::ProgramManager::getInstance().getDeviceImage(
+        KernelName, Context, Device);
+    Program = detail::ProgramManager::getInstance().createPIProgram(
+        *DeviceImage, Context, Device);
+  }
+  return {DeviceImage, Program};
+}
+
 } // namespace detail
 } // namespace _V1
 } // namespace sycl
diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp
index a5b59f7f22d7b..db8e74959e805 100644
--- a/sycl/source/detail/jit_compiler.cpp
+++ b/sycl/source/detail/jit_compiler.cpp
@@ -116,69 +116,6 @@ ::jit_compiler::TargetInfo getTargetInfo(QueueImplPtr &Queue) {
                   Queue->getDeviceImplPtr()->getDeviceArch()));
 }
 
-std::pair<const RTDeviceBinaryImage *, sycl::detail::pi::PiProgram>
-retrieveKernelBinary(QueueImplPtr &Queue, CGExecKernel *KernelCG) {
-  auto KernelName = KernelCG->getKernelName();
-
-  bool isNvidia =
-      Queue->getDeviceImplPtr()->getBackend() == backend::ext_oneapi_cuda;
-  bool isHIP =
-      Queue->getDeviceImplPtr()->getBackend() == backend::ext_oneapi_hip;
-  if (isNvidia || isHIP) {
-    auto KernelID = ProgramManager::getInstance().getSYCLKernelID(KernelName);
-    std::vector<kernel_id> KernelIds{KernelID};
-    auto DeviceImages =
-        ProgramManager::getInstance().getRawDeviceImages(KernelIds);
-    auto DeviceImage = std::find_if(
-        DeviceImages.begin(), DeviceImages.end(),
-        [isNvidia](RTDeviceBinaryImage *DI) {
-          const std::string &TargetSpec = isNvidia ? std::string("llvm_nvptx64")
-                                                   : std::string("llvm_amdgcn");
-          return DI->getFormat() == PI_DEVICE_BINARY_TYPE_LLVMIR_BITCODE &&
-                 DI->getRawData().DeviceTargetSpec == TargetSpec;
-        });
-    if (DeviceImage == DeviceImages.end()) {
-      return {nullptr, nullptr};
-    }
-    auto ContextImpl = Queue->getContextImplPtr();
-    auto Context = detail::createSyclObjFromImpl<context>(ContextImpl);
-    auto DeviceImpl = Queue->getDeviceImplPtr();
-    auto Device = detail::createSyclObjFromImpl<device>(DeviceImpl);
-    sycl::detail::pi::PiProgram Program =
-        detail::ProgramManager::getInstance().createPIProgram(**DeviceImage,
-                                                              Context, Device);
-    return {*DeviceImage, Program};
-  }
-
-  const RTDeviceBinaryImage *DeviceImage = nullptr;
-  sycl::detail::pi::PiProgram Program = nullptr;
-  if (KernelCG->getKernelBundle() != nullptr) {
-    // Retrieve the device image from the kernel bundle.
-    auto KernelBundle = KernelCG->getKernelBundle();
-    kernel_id KernelID =
-        detail::ProgramManager::getInstance().getSYCLKernelID(KernelName);
-
-    auto SyclKernel = detail::getSyclObjImpl(
-        KernelBundle->get_kernel(KernelID, KernelBundle));
-
-    DeviceImage = SyclKernel->getDeviceImage()->get_bin_image_ref();
-    Program = SyclKernel->getDeviceImage()->get_program_ref();
-  } else if (KernelCG->MSyclKernel != nullptr) {
-    DeviceImage = KernelCG->MSyclKernel->getDeviceImage()->get_bin_image_ref();
-    Program = KernelCG->MSyclKernel->getDeviceImage()->get_program_ref();
-  } else {
-    auto ContextImpl = Queue->getContextImplPtr();
-    auto Context = detail::createSyclObjFromImpl<context>(ContextImpl);
-    auto DeviceImpl = Queue->getDeviceImplPtr();
-    auto Device = detail::createSyclObjFromImpl<device>(DeviceImpl);
-    DeviceImage = &detail::ProgramManager::getInstance().getDeviceImage(
-        KernelName, Context, Device);
-    Program = detail::ProgramManager::getInstance().createPIProgram(
-        *DeviceImage, Context, Device);
-  }
-  return {DeviceImage, Program};
-}
-
 static ::jit_compiler::ParameterKind
 translateArgType(kernel_param_kind_t Kind) {
   using PK = ::jit_compiler::ParameterKind;
@@ -689,7 +626,7 @@ updatePromotedArgs(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo,
 }
 
 sycl::detail::pi::PiKernel jit_compiler::materializeSpecConstants(
-    QueueImplPtr Queue, RTDeviceBinaryImage *BinImage,
+    QueueImplPtr Queue, const RTDeviceBinaryImage *BinImage,
     const std::string &KernelName, std::vector<unsigned char> &SpecConstBlob) {
   if (!BinImage) {
     throw sycl::exception(sycl::make_error_code(sycl::errc::invalid),
@@ -710,8 +647,8 @@ sycl::detail::pi::PiKernel jit_compiler::materializeSpecConstants(
   auto &RawDeviceImage = BinImage->getRawData();
   auto DeviceImageSize = static_cast<size_t>(RawDeviceImage.BinaryEnd -
                                              RawDeviceImage.BinaryStart);
-  // Set 0 as the number of address bits, because the JIT compiler can set
-  // this field based on information from SPIR-V/LLVM module's data-layout.
+  // Set 0 as the number of address bits, because the JIT compiler can set this
+  // field based on information from LLVM module's data-layout.
   auto BinaryImageFormat = translateBinaryImageFormat(BinImage->getFormat());
   if (BinaryImageFormat == ::jit_compiler::BinaryFormat::INVALID) {
     throw sycl::exception(sycl::make_error_code(sycl::errc::invalid),
@@ -728,16 +665,15 @@ sycl::detail::pi::PiKernel jit_compiler::materializeSpecConstants(
       detail::SYCLConfig<detail::SYCL_RT_WARNING_LEVEL>::get() > 0;
   AddToConfigHandle(
       ::jit_compiler::option::JITEnableVerbose::set(DebugEnabled));
-  AddToConfigHandle(::jit_compiler::option::JITEnableCaching::set(
-      detail::SYCLConfig<detail::SYCL_ENABLE_FUSION_CACHING>::get()));
 
   std::string TargetCPU =
       detail::SYCLConfig<detail::SYCL_JIT_TARGET_CPU>::get();
   std::string TargetFeatures =
       detail::SYCLConfig<detail::SYCL_JIT_TARGET_FEATURES>::get();
 
-  auto MaterializerResult = MaterializeSpecConstHandle(
-      KernelName.c_str(), BinInfo, SpecConstBlob, TargetCPU, TargetFeatures);
+  auto MaterializerResult =
+      MaterializeSpecConstHandle(KernelName.c_str(), BinInfo, SpecConstBlob,
+                                 TargetCPU.c_str(), TargetFeatures.c_str());
   if (MaterializerResult.failed()) {
     std::string Message{"Compilation for kernel failed with message:\n"};
     Message.append(MaterializerResult.getErrorMessage());
@@ -829,8 +765,8 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
       return nullptr;
     }
 
-    auto [DeviceImage, Program] = retrieveKernelBinary(Queue, KernelCG);
-
+    auto [DeviceImage, Program] =
+        retrieveKernelBinary(Queue, KernelName.c_str(), KernelCG);
     if (!DeviceImage || !Program) {
       printPerformanceWarning("No suitable IR available for fusion");
       return nullptr;
diff --git a/sycl/source/detail/jit_compiler.hpp b/sycl/source/detail/jit_compiler.hpp
index c283c5c19c223..d0df4b775cd46 100644
--- a/sycl/source/detail/jit_compiler.hpp
+++ b/sycl/source/detail/jit_compiler.hpp
@@ -40,10 +40,9 @@ class jit_compiler {
   std::unique_ptr<detail::CG>
   fuseKernels(QueueImplPtr Queue, std::vector<ExecCGCommand *> &InputKernels,
               const property_list &);
-  sycl::detail::pi::PiKernel
-  materializeSpecConstants(QueueImplPtr Queue, RTDeviceBinaryImage *BinImage,
-                           const std::string &KernelName,
-                           std::vector<unsigned char> &SpecConstBlob);
+  sycl::detail::pi::PiKernel materializeSpecConstants(
+      QueueImplPtr Queue, const RTDeviceBinaryImage *BinImage,
+      const std::string &KernelName, std::vector<unsigned char> &SpecConstBlob);
 
   bool isAvailable() { return Available; }
 
diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp
index d6c3540c33444..21817923de9a0 100644
--- a/sycl/source/detail/scheduler/commands.cpp
+++ b/sycl/source/detail/scheduler/commands.cpp
@@ -26,6 +26,7 @@
 #include <sycl/access/access.hpp>
 #include <sycl/backend_types.hpp>
 #include <sycl/detail/cg_types.hpp>
+#include <sycl/detail/helpers.hpp>
 #include <sycl/detail/kernel_desc.hpp>
 #include <sycl/sampler.hpp>
 
@@ -76,33 +77,6 @@ void emitInstrumentationGeneral(uint32_t StreamID, uint64_t InstanceID,
 }
 #endif
 
-RTDeviceBinaryImage *
-retrieveAMDGCNOrNVPTXKernelBinary(const DeviceImplPtr DeviceImpl,
-                                  const std::string &KernelName) {
-  const bool IsNvidia = DeviceImpl->getBackend() == backend::ext_oneapi_cuda;
-  const bool IsHIP = DeviceImpl->getBackend() == backend::ext_oneapi_hip;
-  if (!(IsNvidia || IsHIP))
-    throw sycl::exception(sycl::make_error_code(sycl::errc::runtime),
-                          "Unsupported backend" +
-                              codeToString(PI_ERROR_INVALID_VALUE));
-  auto &PM = ProgramManager::getInstance();
-  const auto KernelID = PM.getSYCLKernelID(KernelName);
-  std::vector<kernel_id> KernelIds{KernelID};
-  const auto DeviceImages = PM.getRawDeviceImages(KernelIds);
-  const auto DeviceImage = std::find_if(
-      DeviceImages.begin(), DeviceImages.end(),
-      [IsNvidia](RTDeviceBinaryImage *DI) {
-        const std::string &TargetSpec =
-            IsNvidia ? std::string("llvm_nvptx64") : std::string("llvm_amdgcn");
-        return DI->getFormat() == PI_DEVICE_BINARY_TYPE_LLVMIR_BITCODE &&
-               DI->getRawData().DeviceTargetSpec == TargetSpec;
-      });
-  if (DeviceImage == DeviceImages.end())
-    return nullptr;
-
-  return *DeviceImage;
-}
-
 #ifdef __SYCL_ENABLE_GNU_DEMANGLING
 struct DemangleHandle {
   char *p;
@@ -2415,7 +2389,7 @@ static pi_result SetKernelParamsAndLaunch(
     const detail::EventImplPtr &OutEventImpl,
     const KernelArgMask *EliminatedArgMask,
     const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
-    bool IsCooperative, RTDeviceBinaryImage *BinImage,
+    bool IsCooperative, const RTDeviceBinaryImage *BinImage,
     const std::string &KernelName) {
   const PluginPtr &Plugin = Queue->getPlugin();
 
@@ -2615,7 +2589,7 @@ pi_int32 enqueueImpKernel(
     const detail::EventImplPtr &OutEventImpl,
     const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
     sycl::detail::pi::PiKernelCacheConfig KernelCacheConfig,
-    const bool KernelIsCooperative, RTDeviceBinaryImage *BinImage) {
+    const bool KernelIsCooperative, const RTDeviceBinaryImage *BinImage) {
 
   // Run OpenCL kernel
   auto ContextImpl = Queue->getContextImplPtr();
@@ -3075,10 +3049,12 @@ pi_int32 ExecCGCommand::enqueueImpQueue() {
       }
     }
 
-    RTDeviceBinaryImage *BinImage = nullptr;
-    if (detail::SYCLConfig<detail::SYCL_JIT_KERNELS>::get())
-      BinImage = retrieveAMDGCNOrNVPTXKernelBinary(MQueue->getDeviceImplPtr(),
-                                                   KernelName);
+    const RTDeviceBinaryImage *BinImage = nullptr;
+    if (detail::SYCLConfig<detail::SYCL_JIT_KERNELS>::get()) {
+      std::tie(BinImage, std::ignore) =
+          retrieveKernelBinary(MQueue, KernelName.c_str());
+      assert(BinImage && "Failed to obtain a binary image.");
+    }
 
     return enqueueImpKernel(
         MQueue, NDRDesc, Args, ExecKernel->getKernelBundle(), SyclKernel,
diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp
index 58ed5b12bc097..6bf1dacba06d3 100644
--- a/sycl/source/detail/scheduler/commands.hpp
+++ b/sycl/source/detail/scheduler/commands.hpp
@@ -641,7 +641,7 @@ pi_int32 enqueueImpKernel(
     const detail::EventImplPtr &Event,
     const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
     sycl::detail::pi::PiKernelCacheConfig KernelCacheConfig,
-    bool KernelIsCooperative, RTDeviceBinaryImage *BinImage = nullptr);
+    bool KernelIsCooperative, const RTDeviceBinaryImage *BinImage = nullptr);
 
 class KernelFusionCommand;
 
diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp
index 1cf457e0a86cc..9b6b6cb2c6def 100644
--- a/sycl/source/detail/scheduler/scheduler.cpp
+++ b/sycl/source/detail/scheduler/scheduler.cpp
@@ -638,18 +638,16 @@ void Scheduler::cancelFusion(QueueImplPtr Queue) {
 }
 
 sycl::detail::pi::PiKernel Scheduler::completeSpecConstMaterialization(
-    QueueImplPtr Queue, RTDeviceBinaryImage *BinImage,
-    const std::string &KernelName, std::vector<unsigned char> &SpecConstBlob) {
+    [[maybe_unused]] QueueImplPtr Queue,
+    [[maybe_unused]] const RTDeviceBinaryImage *BinImage,
+    [[maybe_unused]] const std::string &KernelName,
+    [[maybe_unused]] std::vector<unsigned char> &SpecConstBlob) {
 #if SYCL_EXT_CODEPLAY_KERNEL_FUSION
   return detail::jit_compiler::get_instance().materializeSpecConstants(
       Queue, BinImage, KernelName, SpecConstBlob);
 #else  // SYCL_EXT_CODEPLAY_KERNEL_FUSION
   printFusionWarning(
       "Materialization of spec constants not supported by this build");
-  (void)Queue;
-  (void)BinImage;
-  (void)KernelName;
-  (void)SpecConstBlob;
   return nullptr;
 #endif // SYCL_EXT_CODEPLAY_KERNEL_FUSION
 }
diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp
index 01e7739e53e01..a9b85eba293fc 100644
--- a/sycl/source/detail/scheduler/scheduler.hpp
+++ b/sycl/source/detail/scheduler/scheduler.hpp
@@ -468,7 +468,7 @@ class Scheduler {
 
   EventImplPtr completeFusion(QueueImplPtr Queue, const property_list &);
   sycl::detail::pi::PiKernel completeSpecConstMaterialization(
-      QueueImplPtr Queue, RTDeviceBinaryImage *BinImage,
+      QueueImplPtr Queue, const RTDeviceBinaryImage *BinImage,
       const std::string &KernelName, std::vector<unsigned char> &SpecConstBlob);
 
   bool isInFusionMode(QueueIdT Queue);
diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp
index c286780bec9f6..4600def575613 100644
--- a/sycl/source/handler.cpp
+++ b/sycl/source/handler.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "sycl/detail/helpers.hpp"
 #include <algorithm>
 
 #include <detail/config.hpp>
@@ -293,10 +294,12 @@ event handler::finalize() {
                                           : nullptr);
           Result = PI_SUCCESS;
         } else {
-          detail::RTDeviceBinaryImage *BinImage = nullptr;
-          if (detail::SYCLConfig<detail::SYCL_JIT_KERNELS>::get())
-            BinImage = detail::retrieveAMDGCNOrNVPTXKernelBinary(
-                MQueue->getDeviceImplPtr(), MKernelName.c_str());
+          const detail::RTDeviceBinaryImage *BinImage = nullptr;
+          if (detail::SYCLConfig<detail::SYCL_JIT_KERNELS>::get()) {
+            std::tie(BinImage, std::ignore) =
+                detail::retrieveKernelBinary(MQueue, MKernelName.c_str());
+            assert(BinImage && "Failed to obtain a binary image.");
+          }
 
           Result = enqueueImpKernel(
               MQueue, MNDRDesc, MArgs, KernelBundleImpPtr, MKernel,

From df9133fddf5a70354b6d4d84a271c7df5fc4627d Mon Sep 17 00:00:00 2001
From: Jakub Chlanda <jakub@codeplay.com>
Date: Wed, 10 Jul 2024 12:54:34 +0100
Subject: [PATCH 08/22] PR feedback 2

---
 .../jit-compiler/include/KernelFusion.h       |  2 +-
 sycl-fusion/jit-compiler/lib/KernelFusion.cpp |  2 +-
 .../lib/translation/KernelTranslation.cpp     |  2 +-
 .../SYCLSpecConstMaterializer.cpp             | 65 ++++++++-----------
 sycl-fusion/test/materializer/debug_output.ll |  4 +-
 sycl/doc/EnvironmentVariables.md              |  7 +-
 sycl/source/detail/config.def                 |  6 +-
 sycl/source/detail/config.hpp                 | 12 ++--
 sycl/source/detail/jit_compiler.cpp           |  4 +-
 sycl/source/detail/scheduler/commands.cpp     |  4 +-
 sycl/source/handler.cpp                       |  2 +-
 11 files changed, 50 insertions(+), 60 deletions(-)

diff --git a/sycl-fusion/jit-compiler/include/KernelFusion.h b/sycl-fusion/jit-compiler/include/KernelFusion.h
index d602c9388db0c..37489b640a597 100644
--- a/sycl-fusion/jit-compiler/include/KernelFusion.h
+++ b/sycl-fusion/jit-compiler/include/KernelFusion.h
@@ -64,7 +64,7 @@ JITResult fuseKernels(View<SYCLKernelInfo> KernelInformation,
 
 JITResult materializeSpecConstants(const char *KernelName,
                                    jit_compiler::SYCLKernelBinaryInfo &BinInfo,
-                                   const View<unsigned char> &SpecConstBlob,
+                                   View<unsigned char> SpecConstBlob,
                                    const char *TargetCPU,
                                    const char *TargetFeatures);
 
diff --git a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
index 2ae3f3fe46d64..7abf51ddb5922 100644
--- a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
+++ b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
@@ -73,7 +73,7 @@ static bool isTargetFormatSupported(BinaryFormat TargetFormat) {
 extern "C" JITResult
 materializeSpecConstants(const char *KernelName,
                          jit_compiler::SYCLKernelBinaryInfo &BinInfo,
-                         const View<unsigned char> &SpecConstBlob,
+                         View<unsigned char> SpecConstBlob,
                          const char *TargetCPU, const char *TargetFeatures) {
   auto &JITCtx = JITContext::getInstance();
 
diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
index fe1e792cc2846..0d45a22c9e5a1 100644
--- a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
+++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
@@ -259,7 +259,7 @@ llvm::Expected<KernelBinary *> KernelTranslator::translateToPTX(
   }
 
   // Give priority to user specified values (through environment variables:
-  // SYCL_JIT_TARGET_CPU and SYCL_JIT_TARGET_FEATURES).
+  // SYCL_JIT_AMDGCN_PTX_TARGET_CPU and SYCL_JIT_AMDGCN_PTX_TARGET_FEATURES).
   llvm::StringRef CPU{TargetCPU};
   llvm::StringRef Features{TargetFeatures};
 
diff --git a/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp b/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp
index 1227e396eeefe..281b85aaa41e9 100644
--- a/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp
+++ b/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp
@@ -20,29 +20,15 @@ cl::opt<size_t> UseTestConstValues(
     "sycl-materializer-debug-value-size",
     cl::desc("Size of the spec const blob, debug use only."));
 
-const bool SYCLSpecConstMaterializer::IsDebug =
-    getenv("SYCL_MATERIALIZER_DEBUG");
-
-// When run through the JIT pipeline we have no way of using this pass' debug
-// type, hence the introduction of the environment variable above and the macro
-// below.
-#define MATERIALIZER_DEBUG(X)                                                  \
-  do {                                                                         \
-    if (IsDebug)                                                               \
-      X;                                                                       \
-    else                                                                       \
-      LLVM_DEBUG(X);                                                           \
-  } while (false)
-
-constexpr llvm::StringLiteral SPEC_CONST_DATA_NODE_NAME{"SYCL_SpecConst_data"};
+constexpr llvm::StringLiteral SpecConstDataNodeName{"SYCL_SpecConst_data"};
 
 PreservedAnalyses SYCLSpecConstDataInserter::run(Module &M,
                                                  ModuleAnalysisManager &) {
-  if (M.getNamedMetadata(SPEC_CONST_DATA_NODE_NAME))
+  if (M.getNamedMetadata(SpecConstDataNodeName))
     llvm_unreachable("Did not expect the node to be present.");
 
   auto &Context = M.getContext();
-  auto *SpecConstMD = M.getOrInsertNamedMetadata(SPEC_CONST_DATA_NODE_NAME);
+  auto *SpecConstMD = M.getOrInsertNamedMetadata(SpecConstDataNodeName);
   auto *StringMD = MDString::get(
       Context, StringRef{(const char *)SpecConstData, SpecConstDataSize});
   auto *TupleMD = MDTuple::get(Context, {StringMD});
@@ -190,12 +176,12 @@ void SYCLSpecConstMaterializer::populateUses(Argument *A) {
       auto *I = cast<Instruction>(&*U);
       switch (I->getOpcode()) {
       default: {
-        MATERIALIZER_DEBUG(
+        LLVM_DEBUG(
             dbgs()
             << "Optimization opportunity missed, unhandled instruction: \n");
-        MATERIALIZER_DEBUG(I->dump());
-        MATERIALIZER_DEBUG(dbgs() << "Function:\n");
-        MATERIALIZER_DEBUG(I->getParent()->getParent()->dump());
+        LLVM_DEBUG(I->dump());
+        LLVM_DEBUG(dbgs() << "Function:\n");
+        LLVM_DEBUG(I->getParent()->getParent()->dump());
         break;
       }
       case Instruction::Load: {
@@ -229,21 +215,20 @@ void SYCLSpecConstMaterializer::populateUses(Argument *A) {
 
 void SYCLSpecConstMaterializer::reportAndReset() {
   if (LoadsToTypes.empty()) {
-    MATERIALIZER_DEBUG(dbgs()
-                       << "Did not find any loads from spec const buffer.\n");
+    LLVM_DEBUG(dbgs() << "Did not find any loads from spec const buffer.\n");
   } else {
-    MATERIALIZER_DEBUG(dbgs() << "Replaced: " << LoadsToTypes.size()
-                              << " loads from spec const buffer.\n");
-    MATERIALIZER_DEBUG(dbgs() << "Load to global variable mappings:\n");
+    LLVM_DEBUG(dbgs() << "Replaced: " << LoadsToTypes.size()
+                      << " loads from spec const buffer.\n");
+    LLVM_DEBUG(dbgs() << "Load to global variable mappings:\n");
     for (auto &LTT : LoadsToTypes) {
-      MATERIALIZER_DEBUG(dbgs() << "\tLoad:\n");
-      MATERIALIZER_DEBUG(LTT.first->dump());
-      MATERIALIZER_DEBUG(dbgs() << "\tGlobal Variable:\n");
-      MATERIALIZER_DEBUG(TypesAndOffsetsToBlob[LTT.second]->dump());
-      MATERIALIZER_DEBUG(dbgs() << "\n");
+      LLVM_DEBUG(dbgs() << "\tLoad:\n");
+      LLVM_DEBUG(LTT.first->dump());
+      LLVM_DEBUG(dbgs() << "\tGlobal Variable:\n");
+      LLVM_DEBUG(TypesAndOffsetsToBlob[LTT.second]->dump());
+      LLVM_DEBUG(dbgs() << "\n");
     }
   }
-  MATERIALIZER_DEBUG(dbgs() << "\n\n");
+  LLVM_DEBUG(dbgs() << "\n\n");
 
   // Reset the state.
   TypesAndOffsets.clear();
@@ -278,12 +263,12 @@ SYCLSpecConstMaterializer::handleKernel(llvm::Function &Kernel) {
 }
 
 bool SYCLSpecConstMaterializer::readMetadata() {
-  auto *NamedMD = Mod->getNamedMetadata(SPEC_CONST_DATA_NODE_NAME);
+  auto *NamedMD = Mod->getNamedMetadata(SpecConstDataNodeName);
   if (!NamedMD || NamedMD->getNumOperands() != 1)
     return false;
 
   auto *MDN = cast<MDTuple>(NamedMD->getOperand(0));
-  assert(MDN->getNumOperands() != 1 && "Malformed data node.");
+  assert(MDN->getNumOperands() == 1 && "Malformed data node.");
   auto *MDS = cast<MDString>(MDN->getOperand(0));
 
   SpecConstData = MDS->getString().bytes_begin();
@@ -294,10 +279,14 @@ bool SYCLSpecConstMaterializer::readMetadata() {
 
 PreservedAnalyses SYCLSpecConstMaterializer::run(Function &F,
                                                  FunctionAnalysisManager &) {
+  if (const char *DebugEnv = std::getenv("SYCL_MATERIALIZER_DEBUG"))
+    if (0 == strcmp(DebugEnv, DEBUG_TYPE))
+      llvm::setCurrentDebugType(DEBUG_TYPE);
+
   Mod = F.getParent();
-  MATERIALIZER_DEBUG(dbgs() << "Working on function:\n==================\n"
-                            << (F.hasName() ? F.getName() : "unnamed kernel")
-                            << "\n\n");
+  LLVM_DEBUG(dbgs() << "Working on function:\n==================\n"
+                    << (F.hasName() ? F.getName() : "unnamed kernel")
+                    << "\n\n");
 
   // Invariant: This pass is only intended to operate on SYCL kernels being
   // compiled to either `nvptx{,64}-nvidia-cuda`, or `amdgcn-amd-amdhsa`
@@ -305,7 +294,7 @@ PreservedAnalyses SYCLSpecConstMaterializer::run(Function &F,
   auto AT = TargetHelpers::getArchType(*Mod);
   if (TargetHelpers::ArchType::Cuda != AT &&
       TargetHelpers::ArchType::AMDHSA != AT) {
-    MATERIALIZER_DEBUG(dbgs() << "Unsupported architecture\n");
+    LLVM_DEBUG(dbgs() << "Unsupported architecture\n");
     return PreservedAnalyses::all();
   }
 
diff --git a/sycl-fusion/test/materializer/debug_output.ll b/sycl-fusion/test/materializer/debug_output.ll
index 6a94d86745b23..bc971e2c987a3 100644
--- a/sycl-fusion/test/materializer/debug_output.ll
+++ b/sycl-fusion/test/materializer/debug_output.ll
@@ -1,11 +1,11 @@
-; RUN: %if hip_amd %{ env SYCL_MATERIALIZER_DEBUG=1 opt\
+; RUN: %if hip_amd %{ env SYCL_MATERIALIZER_DEBUG="sycl-spec-const-materializer" opt\
 ; RUN: -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext\
 ; RUN: --mtriple amdgcn-amd-amdhsa -sycl-materializer-debug-value-size=256\
 ; RUN: -passes=sycl-spec-const-materializer,sccp -S %s 2> %t.stderr\
 ; RUN: | FileCheck %s %}
 ; RUN: %if hip_amd %{ FileCheck --input-file=%t.stderr --check-prefix=CHECK-DEBUG %s %}
 
-; RUN: %if cuda %{ env SYCL_MATERIALIZER_DEBUG=1 opt\
+; RUN: %if cuda %{ env SYCL_MATERIALIZER_DEBUG="sycl-spec-const-materializer" opt\
 ; RUN: -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext\
 ; RUN: --mtriple nvptx64-nvidia-cuda -sycl-materializer-debug-value-size=256\
 ; RUN: -passes=sycl-spec-const-materializer,sccp -S %s 2> %t.stderr\
diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md
index 4cdb81f98af86..3b2cf2e6f9798 100644
--- a/sycl/doc/EnvironmentVariables.md
+++ b/sycl/doc/EnvironmentVariables.md
@@ -25,9 +25,10 @@ compiler and runtime.
 | `SYCL_EAGER_INIT` | Integer | Enable by specifying non-zero value. Tells the SYCL runtime to do as much as possible initialization at objects construction as opposed to doing lazy initialization on the fly. This may mean doing some redundant work at warmup but ensures fastest possible execution on the following hot and reportable paths. It also instructs PI plugins to do the same. Default is "0".  |
 | `SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE` | See [below](#sycl_reduction_preferred_workgroup_size) | Controls the preferred work-group size of reductions. |
 | `SYCL_ENABLE_FUSION_CACHING` | '1' or '0' | Enable ('1') or disable ('0') caching of JIT compilations for kernel fusion. Caching avoids repeatedly running the JIT compilation pipeline if the same sequence of kernels is fused multiple times. Default value is '1'. |
-| `SYCL_JIT_KERNELS` | '1' or '0' | Enable ('1') or disable ('0') JIT compilation of kernels. Only supported for Nvidia and AMD backends. When JIT-ing is enabled SYCL runtime will try to cache and reuse JIT-compiled kernels, furthermore if a kernel uses specialization constants the compiler will attempt to materialize the values in place, turning them to de-facto compile time constants. Default is '0'. |
-| `SYCL_JIT_TARGET_CPU` | Any(\*) | Allows setting the target CPU to be used when JIT-ing kernels. Examples include setting SM version for Nvidia, or target architecture for AMD. |
-| `SYCL_JIT_TARGET_FEATURES` | Any(\*) | Allows setting desired target features to be used when JIT-ing kernels. Examples include setting PTX version for Nvidia. |
+| `SYCL_JIT_AMDGCN_PTX_KERNELS` | '1' or '0' | Enable ('1') or disable ('0') JIT compilation of kernels. Only supported for Nvidia and AMD backends. When JIT-ing is enabled SYCL runtime will try to cache and reuse JIT-compiled kernels, furthermore if a kernel uses specialization constants the compiler will attempt to materialize the values in place, turning them to de-facto compile time constants. Default is '0'. |
+| `SYCL_JIT_AMDGCN_PTX_TARGET_CPU` | Any(\*) | Allows setting the target CPU to be used when JIT-ing kernels. Examples include setting SM version for Nvidia, or target architecture for AMD. |
+| `SYCL_JIT_AMDGCN_PTX_TARGET_FEATURES` | Any(\*) | Allows setting desired target features to be used when JIT-ing kernels. Examples include setting PTX version for Nvidia. |
+| `SYCL_MATERIALIZER_DEBUG` | `sycl-spec-const-materializer` | Enables debug output generation in specialization constants materialization pass. |
 
 `(*) Note: Any means this environment variable is effective when set to any non-null value.`
 
diff --git a/sycl/source/detail/config.def b/sycl/source/detail/config.def
index 9b9003795c27d..94424312c14d2 100644
--- a/sycl/source/detail/config.def
+++ b/sycl/source/detail/config.def
@@ -42,6 +42,6 @@ CONFIG(SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE, 16, __SYCL_REDUCTION_PREFERRED_W
 CONFIG(ONEAPI_DEVICE_SELECTOR, 1024, __ONEAPI_DEVICE_SELECTOR)
 CONFIG(SYCL_ENABLE_FUSION_CACHING, 1, __SYCL_ENABLE_FUSION_CACHING)
 CONFIG(SYCL_CACHE_IN_MEM, 1, __SYCL_CACHE_IN_MEM)
-CONFIG(SYCL_JIT_KERNELS, 1, __SYCL_JIT_KERNELS)
-CONFIG(SYCL_JIT_TARGET_CPU, 1024, __SYCL_JIT_TARGET_CPU)
-CONFIG(SYCL_JIT_TARGET_FEATURES, 1024, __SYCL_JIT_TARGET_FEATURES)
+CONFIG(SYCL_JIT_AMDGCN_PTX_KERNELS, 1, __SYCL_JIT_AMDGCN_PTX_KERNELS)
+CONFIG(SYCL_JIT_AMDGCN_PTX_TARGET_CPU, 1024, __SYCL_JIT_AMDGCN_PTX_TARGET_CPU)
+CONFIG(SYCL_JIT_AMDGCN_PTX_TARGET_FEATURES, 1024, __SYCL_JIT_AMDGCN_PTX_TARGET_FEATURES)
diff --git a/sycl/source/detail/config.hpp b/sycl/source/detail/config.hpp
index 2b32f932fb96b..7cfd3ba9d0d18 100644
--- a/sycl/source/detail/config.hpp
+++ b/sycl/source/detail/config.hpp
@@ -616,8 +616,8 @@ template <> class SYCLConfig<SYCL_CACHE_IN_MEM> {
   }
 };
 
-template <> class SYCLConfig<SYCL_JIT_KERNELS> {
-  using BaseT = SYCLConfigBase<SYCL_JIT_KERNELS>;
+template <> class SYCLConfig<SYCL_JIT_AMDGCN_PTX_KERNELS> {
+  using BaseT = SYCLConfigBase<SYCL_JIT_AMDGCN_PTX_KERNELS>;
 
 public:
   static bool get() {
@@ -640,8 +640,8 @@ template <> class SYCLConfig<SYCL_JIT_KERNELS> {
   }
 };
 
-template <> class SYCLConfig<SYCL_JIT_TARGET_CPU> {
-  using BaseT = SYCLConfigBase<SYCL_CACHE_IN_MEM>;
+template <> class SYCLConfig<SYCL_JIT_AMDGCN_PTX_TARGET_CPU> {
+  using BaseT = SYCLConfigBase<SYCL_JIT_AMDGCN_PTX_TARGET_CPU>;
 
 public:
   static std::string get() {
@@ -668,8 +668,8 @@ template <> class SYCLConfig<SYCL_JIT_TARGET_CPU> {
   }
 };
 
-template <> class SYCLConfig<SYCL_JIT_TARGET_FEATURES> {
-  using BaseT = SYCLConfigBase<SYCL_CACHE_IN_MEM>;
+template <> class SYCLConfig<SYCL_JIT_AMDGCN_PTX_TARGET_FEATURES> {
+  using BaseT = SYCLConfigBase<SYCL_JIT_AMDGCN_PTX_TARGET_FEATURES>;
 
 public:
   static std::string get() {
diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp
index db8e74959e805..f18d2dd4a5a1e 100644
--- a/sycl/source/detail/jit_compiler.cpp
+++ b/sycl/source/detail/jit_compiler.cpp
@@ -667,9 +667,9 @@ sycl::detail::pi::PiKernel jit_compiler::materializeSpecConstants(
       ::jit_compiler::option::JITEnableVerbose::set(DebugEnabled));
 
   std::string TargetCPU =
-      detail::SYCLConfig<detail::SYCL_JIT_TARGET_CPU>::get();
+      detail::SYCLConfig<detail::SYCL_JIT_AMDGCN_PTX_TARGET_CPU>::get();
   std::string TargetFeatures =
-      detail::SYCLConfig<detail::SYCL_JIT_TARGET_FEATURES>::get();
+      detail::SYCLConfig<detail::SYCL_JIT_AMDGCN_PTX_TARGET_FEATURES>::get();
 
   auto MaterializerResult =
       MaterializeSpecConstHandle(KernelName.c_str(), BinInfo, SpecConstBlob,
diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp
index 21817923de9a0..0f6208ec96c7a 100644
--- a/sycl/source/detail/scheduler/commands.cpp
+++ b/sycl/source/detail/scheduler/commands.cpp
@@ -2393,7 +2393,7 @@ static pi_result SetKernelParamsAndLaunch(
     const std::string &KernelName) {
   const PluginPtr &Plugin = Queue->getPlugin();
 
-  if (SYCLConfig<SYCL_JIT_KERNELS>::get()) {
+  if (SYCLConfig<SYCL_JIT_AMDGCN_PTX_KERNELS>::get()) {
     std::vector<unsigned char> Empty;
     Kernel = Scheduler::getInstance().completeSpecConstMaterialization(
         Queue, BinImage, KernelName,
@@ -3050,7 +3050,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() {
     }
 
     const RTDeviceBinaryImage *BinImage = nullptr;
-    if (detail::SYCLConfig<detail::SYCL_JIT_KERNELS>::get()) {
+    if (detail::SYCLConfig<detail::SYCL_JIT_AMDGCN_PTX_KERNELS>::get()) {
       std::tie(BinImage, std::ignore) =
           retrieveKernelBinary(MQueue, KernelName.c_str());
       assert(BinImage && "Failed to obtain a binary image.");
diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp
index 4600def575613..0d0ff780c04b9 100644
--- a/sycl/source/handler.cpp
+++ b/sycl/source/handler.cpp
@@ -295,7 +295,7 @@ event handler::finalize() {
           Result = PI_SUCCESS;
         } else {
           const detail::RTDeviceBinaryImage *BinImage = nullptr;
-          if (detail::SYCLConfig<detail::SYCL_JIT_KERNELS>::get()) {
+          if (detail::SYCLConfig<detail::SYCL_JIT_AMDGCN_PTX_KERNELS>::get()) {
             std::tie(BinImage, std::ignore) =
                 detail::retrieveKernelBinary(MQueue, MKernelName.c_str());
             assert(BinImage && "Failed to obtain a binary image.");

From c23a986ebf98092a9cce4e9e8ce82d4c697ca00f Mon Sep 17 00:00:00 2001
From: Jakub Chlanda <jakub@codeplay.com>
Date: Wed, 10 Jul 2024 14:04:38 +0100
Subject: [PATCH 09/22] Merge fixes

---
 sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp | 3 +++
 sycl/source/handler.cpp                                        | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp b/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp
index 281b85aaa41e9..fc942f2817bb9 100644
--- a/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp
+++ b/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp
@@ -7,10 +7,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "SYCLSpecConstMaterializer.h"
+
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/SYCLLowerIR/TargetHelpers.h"
 #include <llvm/ADT/StringRef.h>
+#include <llvm/Support/CommandLine.h>
+#include <llvm/Support/Debug.h>
 
 #define DEBUG_TYPE "sycl-spec-const-materializer"
 
diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp
index d5d2acb45d369..d55efa194456e 100644
--- a/sycl/source/handler.cpp
+++ b/sycl/source/handler.cpp
@@ -283,7 +283,7 @@ event handler::finalize() {
         enqueueImpKernel(MQueue, MNDRDesc, MArgs, KernelBundleImpPtr, MKernel,
                          MKernelName.c_str(), RawEvents, NewEvent, nullptr,
                          MImpl->MKernelCacheConfig, MImpl->MKernelIsCooperative,
-                         MImpl->MKernelUsesClusterLaunch, BInImage);
+                         MImpl->MKernelUsesClusterLaunch, BinImage);
 #ifdef XPTI_ENABLE_INSTRUMENTATION
         // Emit signal only when event is created
         if (NewEvent != nullptr) {

From 93b07a7dc0ee015f4dfe50381699a5e3c165476b Mon Sep 17 00:00:00 2001
From: Jakub Chlanda <jakub@codeplay.com>
Date: Thu, 11 Jul 2024 10:10:45 +0100
Subject: [PATCH 10/22] Merge fixes

---
 .../kernel-fusion/SYCLSpecConstMaterializer.h |  2 +-
 sycl/source/detail/config.hpp                 | 27 +++++++++----------
 .../program_manager/program_manager.cpp       |  5 +++-
 3 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.h b/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.h
index 138b6271466e9..47a92da0754ba 100644
--- a/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.h
+++ b/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.h
@@ -25,7 +25,7 @@ class SYCLSpecConstDataInserter
 public:
   SYCLSpecConstDataInserter(const unsigned char *SpecConstData,
                             size_t SpecConstDataSize)
-      : SpecConstData(SpecConstData), SpecConstDataSize(SpecConstDataSize){};
+      : SpecConstData(SpecConstData), SpecConstDataSize(SpecConstDataSize) {};
 
   SYCLSpecConstDataInserter() {
     SpecConstDataSize = 255;
diff --git a/sycl/source/detail/config.hpp b/sycl/source/detail/config.hpp
index 441d676fea34f..c8a079f94ae66 100644
--- a/sycl/source/detail/config.hpp
+++ b/sycl/source/detail/config.hpp
@@ -590,11 +590,14 @@ template <> class SYCLConfig<SYCL_CACHE_IN_MEM> {
   using BaseT = SYCLConfigBase<SYCL_CACHE_IN_MEM>;
 
 public:
-  static bool get() {
-    constexpr bool DefaultValue = true;
-
-    const char *ValStr = getCachedValue();
+  static constexpr bool Default = true; // default is true
+  static bool get() { return getCachedValue(); }
+  static const char *getName() { return BaseT::MConfigName; }
+  static void reset() { (void)getCachedValue(/*ResetCache=*/true); }
 
+private:
+  static bool parseValue() {
+    const char *ValStr = BaseT::getRawValue();
     if (!ValStr)
       return Default;
     if (strlen(ValStr) != 1 || (ValStr[0] != '0' && ValStr[0] != '1')) {
@@ -606,16 +609,12 @@ template <> class SYCLConfig<SYCL_CACHE_IN_MEM> {
     return ValStr[0] == '1';
   }
 
-  static void reset() { (void)getCachedValue(/*ResetCache=*/true); }
-
-  static const char *getName() { return BaseT::MConfigName; }
-
-private:
-  static const char *getCachedValue(bool ResetCache = false) {
-    static const char *ValStr = BaseT::getRawValue();
-    if (ResetCache)
-      ValStr = BaseT::getRawValue();
-    return ValStr;
+  static bool getCachedValue(bool ResetCache = false) {
+    static bool Val = parseValue();
+    if (ResetCache) {
+      Val = BaseT::getRawValue();
+    }
+    return Val;
   }
 };
 
diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp
index d1674b185b094..3682102ae0fef 100644
--- a/sycl/source/detail/program_manager/program_manager.cpp
+++ b/sycl/source/detail/program_manager/program_manager.cpp
@@ -2567,10 +2567,13 @@ sycl::detail::pi::PiKernel ProgramManager::getOrCreateMaterializedKernel(
   std::string CompileOpts;
   std::string LinkOpts;
   applyOptionsFromEnvironment(CompileOpts, LinkOpts);
+  // No linking of extra programs reqruired.
+  std::vector<sycl::detail::pi::PiProgram> ExtraProgramsToLink;
   auto BuildProgram =
       build(std::move(ProgramManaged), detail::getSyclObjImpl(Context),
             CompileOpts, LinkOpts, DeviceImpl->getHandleRef(),
-            /*For non SPIR-V devices DeviceLibReqdMask is always 0*/ 0);
+            /*For non SPIR-V devices DeviceLibReqdMask is always 0*/ 0,
+            ExtraProgramsToLink);
   sycl::detail::pi::PiKernel PiKernel{nullptr};
   Plugin->call<errc::kernel_not_supported, PiApiKind::piKernelCreate>(
       BuildProgram.get(), KernelName.c_str(), &PiKernel);

From cf4ec3688a4b05b28e90440f443e386201fdcfa4 Mon Sep 17 00:00:00 2001
From: Jakub Chlanda <jakub@codeplay.com>
Date: Thu, 11 Jul 2024 12:22:41 +0100
Subject: [PATCH 11/22] Debug printout fix

---
 .../passes/kernel-fusion/SYCLSpecConstMaterializer.cpp        | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp b/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp
index fc942f2817bb9..fe70406dff39b 100644
--- a/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp
+++ b/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp
@@ -283,8 +283,10 @@ bool SYCLSpecConstMaterializer::readMetadata() {
 PreservedAnalyses SYCLSpecConstMaterializer::run(Function &F,
                                                  FunctionAnalysisManager &) {
   if (const char *DebugEnv = std::getenv("SYCL_MATERIALIZER_DEBUG"))
-    if (0 == strcmp(DebugEnv, DEBUG_TYPE))
+    if (0 == strcmp(DebugEnv, DEBUG_TYPE)) {
+      DebugFlag = true;
       llvm::setCurrentDebugType(DEBUG_TYPE);
+    }
 
   Mod = F.getParent();
   LLVM_DEBUG(dbgs() << "Working on function:\n==================\n"

From f86d9985b4cae2b3e24649cb0ff00f75a8016d5e Mon Sep 17 00:00:00 2001
From: Jakub Chlanda <jakub@codeplay.com>
Date: Thu, 11 Jul 2024 13:33:58 +0100
Subject: [PATCH 12/22] PR feedback 3

---
 sycl-fusion/jit-compiler/lib/KernelFusion.cpp |  2 +-
 .../lib/fusion/FusionPipeline.cpp             |  7 +++---
 .../jit-compiler/lib/fusion/FusionPipeline.h  |  6 ++---
 .../SYCLSpecConstMaterializer.cpp             | 22 +++++++++----------
 .../kernel-fusion/SYCLSpecConstMaterializer.h | 21 +++---------------
 sycl-fusion/test/materializer/basic.ll        | 20 +++++++----------
 sycl-fusion/test/materializer/debug_output.ll | 10 ++++-----
 sycl-fusion/test/materializer/multi_type.ll   | 20 +++++++----------
 sycl/doc/EnvironmentVariables.md              |  2 +-
 sycl/source/detail/jit_compiler.cpp           |  7 ++----
 sycl/source/detail/jit_compiler.hpp           |  8 ++++---
 11 files changed, 48 insertions(+), 77 deletions(-)

diff --git a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
index 7abf51ddb5922..2277c99374919 100644
--- a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
+++ b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp
@@ -100,7 +100,7 @@ materializeSpecConstants(const char *KernelName,
   }
   std::unique_ptr<llvm::Module> NewMod = std::move(*ModOrError);
   if (!fusion::FusionPipeline::runMaterializerPasses(
-          *NewMod, SpecConstBlob.begin(), SpecConstBlob.size()) ||
+          *NewMod, SpecConstBlob.to<llvm::ArrayRef>()) ||
       !NewMod->getFunction(KernelName)) {
     return JITResult{"Materializer passes should not fail"};
   }
diff --git a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp
index dc94eda2917e2..dee3c3933d3be 100644
--- a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp
+++ b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp
@@ -143,9 +143,8 @@ FusionPipeline::runFusionPasses(Module &Mod, SYCLModuleInfo &InputInfo,
   return std::make_unique<SYCLModuleInfo>(std::move(*NewModInfo.ModuleInfo));
 }
 
-bool FusionPipeline::runMaterializerPasses(llvm::Module &Mod,
-                                           const unsigned char *SpecConstData,
-                                           size_t SpecConstDataSize) {
+bool FusionPipeline::runMaterializerPasses(
+    llvm::Module &Mod, llvm::ArrayRef<unsigned char> SpecConstData) {
   PassBuilder PB;
   LoopAnalysisManager LAM;
   FunctionAnalysisManager FAM;
@@ -163,7 +162,7 @@ bool FusionPipeline::runMaterializerPasses(llvm::Module &Mod,
   // Register inserter and materializer passes.
   {
     FunctionPassManager FPM;
-    MPM.addPass(SYCLSpecConstDataInserter{SpecConstData, SpecConstDataSize});
+    MPM.addPass(SYCLSpecConstDataInserter{SpecConstData});
     FPM.addPass(SYCLSpecConstMaterializer{});
     MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
   }
diff --git a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.h b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.h
index e1284484f7494..22d71cc16187e 100644
--- a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.h
+++ b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.h
@@ -31,9 +31,9 @@ class FusionPipeline {
   ///
   /// Run the necessary passes in a custom pass pipeline to perform
   /// materialization of kernel specialization constants.
-  static bool runMaterializerPasses(llvm::Module &Mod,
-                                    const unsigned char *SpecConstData,
-                                    size_t SpecConstDataSize);
+  static bool
+  runMaterializerPasses(llvm::Module &Mod,
+                        llvm::ArrayRef<unsigned char> SpecConstData);
 };
 } // namespace fusion
 } // namespace jit_compiler
diff --git a/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp b/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp
index fe70406dff39b..4e78cbd96d2b2 100644
--- a/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp
+++ b/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp
@@ -19,21 +19,18 @@
 
 using namespace llvm;
 
-cl::opt<size_t> UseTestConstValues(
-    "sycl-materializer-debug-value-size",
-    cl::desc("Size of the spec const blob, debug use only."));
-
 constexpr llvm::StringLiteral SpecConstDataNodeName{"SYCL_SpecConst_data"};
 
 PreservedAnalyses SYCLSpecConstDataInserter::run(Module &M,
                                                  ModuleAnalysisManager &) {
-  if (M.getNamedMetadata(SpecConstDataNodeName))
-    llvm_unreachable("Did not expect the node to be present.");
+  assert(!M.getNamedMetadata(SpecConstDataNodeName) &&
+         "Did not expect the node to be present.");
 
   auto &Context = M.getContext();
   auto *SpecConstMD = M.getOrInsertNamedMetadata(SpecConstDataNodeName);
   auto *StringMD = MDString::get(
-      Context, StringRef{(const char *)SpecConstData, SpecConstDataSize});
+      Context, StringRef{reinterpret_cast<const char *>(SpecConstData.data()),
+                         SpecConstData.size()});
   auto *TupleMD = MDTuple::get(Context, {StringMD});
   SpecConstMD->addOperand(TupleMD);
 
@@ -103,7 +100,7 @@ void SYCLSpecConstMaterializer::allocateSpecConstant(StringRef KernelName) {
     constexpr unsigned AS = 4;
     auto *SpecConstGlobal = new GlobalVariable(
         *Mod, Ty, /*isConstant*/ true, GlobalValue::WeakODRLinkage, Initializer,
-        Twine("SpecConsBlob_" + KernelName + "_" + Twine(I.index())),
+        Twine("SpecConsBlob_") + KernelName + "_" + Twine(I.index()),
         /*InsertBefore*/ nullptr, GlobalValue::NotThreadLocal, AS,
         /*isExternallyInitialized*/ false);
     TypesAndOffsetsToBlob[I.value()] = SpecConstGlobal;
@@ -251,8 +248,9 @@ SYCLSpecConstMaterializer::handleKernel(llvm::Function &Kernel) {
   if (!readMetadata())
     return PreservedAnalyses::all();
 
-  if (!SpecConstData || SpecConstDataSize < 1)
-    llvm_unreachable("Specialisation constant data not found");
+  // Make sure that the data was in an expected format.
+  assert((!SpecConstData || SpecConstDataSize < 1) &&
+         "Specialisation constant data not found");
 
   populateUses(SpecConstArg);
 
@@ -282,8 +280,8 @@ bool SYCLSpecConstMaterializer::readMetadata() {
 
 PreservedAnalyses SYCLSpecConstMaterializer::run(Function &F,
                                                  FunctionAnalysisManager &) {
-  if (const char *DebugEnv = std::getenv("SYCL_MATERIALIZER_DEBUG"))
-    if (0 == strcmp(DebugEnv, DEBUG_TYPE)) {
+  if (const char *DebugEnv = std::getenv("SYCL_JIT_COMPILER_DEBUG"))
+    if (0 == strstr(DebugEnv, DEBUG_TYPE)) {
       DebugFlag = true;
       llvm::setCurrentDebugType(DEBUG_TYPE);
     }
diff --git a/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.h b/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.h
index 47a92da0754ba..e35361b24ba2a 100644
--- a/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.h
+++ b/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.h
@@ -23,24 +23,13 @@ class Function;
 class SYCLSpecConstDataInserter
     : public PassInfoMixin<SYCLSpecConstDataInserter> {
 public:
-  SYCLSpecConstDataInserter(const unsigned char *SpecConstData,
-                            size_t SpecConstDataSize)
-      : SpecConstData(SpecConstData), SpecConstDataSize(SpecConstDataSize) {};
-
-  SYCLSpecConstDataInserter() {
-    SpecConstDataSize = 255;
-    for (unsigned i = 0; i < SpecConstDataSize; ++i)
-      DebugSpecConstData.push_back(i);
-
-    SpecConstData = DebugSpecConstData.data();
-  }
+  SYCLSpecConstDataInserter(ArrayRef<unsigned char> SpecConstData)
+      : SpecConstData(SpecConstData) {};
 
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &);
 
 private:
-  const unsigned char *SpecConstData;
-  size_t SpecConstDataSize;
-  SmallVector<unsigned char> DebugSpecConstData;
+  ArrayRef<unsigned char> SpecConstData;
 };
 
 ///
@@ -88,10 +77,6 @@ class SYCLSpecConstMaterializer
   void reportAndReset();
 
 private:
-  // Flag enabling debug output, guarded by: SYCL_MATERIALIZER_DEBUG environment
-  // variable.
-  static const bool IsDebug;
-
   // Run time known values of specialization constants passed from SYCL rt,
   // data pointer and size.
   const unsigned char *SpecConstData;
diff --git a/sycl-fusion/test/materializer/basic.ll b/sycl-fusion/test/materializer/basic.ll
index 629b8f95e18bd..19bab11479671 100644
--- a/sycl-fusion/test/materializer/basic.ll
+++ b/sycl-fusion/test/materializer/basic.ll
@@ -1,22 +1,18 @@
 ; RUN: %if hip_amd %{ opt -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext\
-; RUN: --mtriple amdgcn-amd-amdhsa -sycl-materializer-debug-value-size=256\
-; RUN: -passes=sycl-spec-const-materializer -S %s | FileCheck\
-; RUN: --check-prefix=CHECK-MATERIALIZER %s %}
+; RUN: --mtriple amdgcn-amd-amdhsa -passes=sycl-spec-const-materializer -S %s |\
+; RUN: FileCheck --check-prefix=CHECK-MATERIALIZER %s %}
 
 ; RUN: %if cuda %{ opt -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext\
-; RUN: --mtriple nvptx64-nvidia-cuda -sycl-materializer-debug-value-size=256\
-; RUN: -passes=sycl-spec-const-materializer -S %s | FileCheck\
-; RUN: --check-prefix=CHECK-MATERIALIZER %s %}
+; RUN: --mtriple nvptx64-nvidia-cuda -passes=sycl-spec-const-materializer -S %s |\
+; RUN: FileCheck --check-prefix=CHECK-MATERIALIZER %s %}
 
 ; RUN: %if hip_amd %{ opt -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext\
-; RUN: --mtriple amdgcn-amd-amdhsa -sycl-materializer-debug-value-size=256\
-; RUN: -passes=sycl-spec-const-materializer,early-cse,adce -S %s | FileCheck\
-; RUN: --check-prefix=CHECK-MATERIALIZER-CSE %s %}
+; RUN: --mtriple amdgcn-amd-amdhsa -passes=sycl-spec-const-materializer,early-cse,adce -S %s |\
+; RUN: FileCheck --check-prefix=CHECK-MATERIALIZER-CSE %s %}
 
 ; RUN: %if cuda %{ opt -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext\
-; RUN: --mtriple nvptx64-nvidia-cuda -sycl-materializer-debug-value-size=256\
-; RUN: -passes=sycl-spec-const-materializer,early-cse,adce -S %s | FileCheck\
-; RUN: --check-prefix=CHECK-MATERIALIZER-CSE %s %}
+; RUN: --mtriple nvptx64-nvidia-cuda -passes=sycl-spec-const-materializer,early-cse,adce -S %s |\
+; RUN: FileCheck --check-prefix=CHECK-MATERIALIZER-CSE %s %}
 
 source_filename = "basic.ll"
 target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
diff --git a/sycl-fusion/test/materializer/debug_output.ll b/sycl-fusion/test/materializer/debug_output.ll
index bc971e2c987a3..b7949e7d3f401 100644
--- a/sycl-fusion/test/materializer/debug_output.ll
+++ b/sycl-fusion/test/materializer/debug_output.ll
@@ -1,14 +1,12 @@
-; RUN: %if hip_amd %{ env SYCL_MATERIALIZER_DEBUG="sycl-spec-const-materializer" opt\
-; RUN: -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext\
-; RUN: --mtriple amdgcn-amd-amdhsa -sycl-materializer-debug-value-size=256\
+; RUN: %if hip_amd %{ env SYCL_JIT_COMPILER_DEBUG="sycl-spec-const-materializer" opt\
+; RUN: -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext --mtriple amdgcn-amd-amdhsa\
 ; RUN: -passes=sycl-spec-const-materializer,sccp -S %s 2> %t.stderr\
 ; RUN: | FileCheck %s %}
 ; RUN: %if hip_amd %{ FileCheck --input-file=%t.stderr --check-prefix=CHECK-DEBUG %s %}
 
-; RUN: %if cuda %{ env SYCL_MATERIALIZER_DEBUG="sycl-spec-const-materializer" opt\
+; RUN: %if cuda %{ env SYCL_JIT_COMPILER_DEBUG="sycl-spec-const-materializer" opt\
 ; RUN: -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext\
-; RUN: --mtriple nvptx64-nvidia-cuda -sycl-materializer-debug-value-size=256\
-; RUN: -passes=sycl-spec-const-materializer,sccp -S %s 2> %t.stderr\
+; RUN: --mtriple nvptx64-nvidia-cuda -passes=sycl-spec-const-materializer,sccp -S %s 2> %t.stderr\
 ; RUN: | FileCheck %s %}
 ; RUN: %if hip_amd %{ FileCheck --input-file=%t.stderr --check-prefix=CHECK-DEBUG %s %}
 
diff --git a/sycl-fusion/test/materializer/multi_type.ll b/sycl-fusion/test/materializer/multi_type.ll
index 19972cbe3832d..112b685b959b5 100644
--- a/sycl-fusion/test/materializer/multi_type.ll
+++ b/sycl-fusion/test/materializer/multi_type.ll
@@ -1,22 +1,18 @@
 ; RUN: %if hip_amd %{ opt -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext\
-; RUN: --mtriple amdgcn-amd-amdhsa -sycl-materializer-debug-value-size=256\
-; RUN: -passes=sycl-spec-const-materializer -S %s | FileCheck\
-; RUN: --check-prefix=CHECK-MATERIALIZER %s %}
+; RUN: --mtriple amdgcn-amd-amdhsa -passes=sycl-spec-const-materializer -S %s |\
+; RUN: FileCheck --check-prefix=CHECK-MATERIALIZER %s %}
 
 ; RUN: %if cuda %{ opt -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext\
-; RUN: --mtriple nvptx64-nvidia-cuda -sycl-materializer-debug-value-size=256\
-; RUN: -passes=sycl-spec-const-materializer -S %s | FileCheck\
-; RUN: --check-prefix=CHECK-MATERIALIZER %s %}
+; RUN: --mtriple nvptx64-nvidia-cuda -passes=sycl-spec-const-materializer -S %s |\
+; RUN: FileCheck --check-prefix=CHECK-MATERIALIZER %s %}
 
 ; RUN: %if hip_amd %{ opt -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext\
-; RUN: --mtriple amdgcn-amd-amdhsa -sycl-materializer-debug-value-size=256\
-; RUN: -passes=sycl-spec-const-materializer,early-cse -S %s | FileCheck\
-; RUN: --check-prefix=CHECK-MATERIALIZER-CSE %s %}
+; RUN: --mtriple amdgcn-amd-amdhsa -passes=sycl-spec-const-materializer,early-cse -S %s |\
+; RUN: FileCheck --check-prefix=CHECK-MATERIALIZER-CSE %s %}
 
 ; RUN: %if cuda %{ opt -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext\
-; RUN: --mtriple nvptx64-nvidia-cuda -sycl-materializer-debug-value-size=256\
-; RUN: -passes=sycl-spec-const-materializer,early-cse -S %s | FileCheck\
-; RUN: --check-prefix=CHECK-MATERIALIZER-CSE %s %}
+; RUN: --mtriple nvptx64-nvidia-cuda -passes=sycl-spec-const-materializer,early-cse -S %s |\
+; RUN: FileCheck --check-prefix=CHECK-MATERIALIZER-CSE %s %}
 
 source_filename = "multi_type.ll"
 target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md
index 3b2cf2e6f9798..f0188b127d7f4 100644
--- a/sycl/doc/EnvironmentVariables.md
+++ b/sycl/doc/EnvironmentVariables.md
@@ -28,7 +28,7 @@ compiler and runtime.
 | `SYCL_JIT_AMDGCN_PTX_KERNELS` | '1' or '0' | Enable ('1') or disable ('0') JIT compilation of kernels. Only supported for Nvidia and AMD backends. When JIT-ing is enabled SYCL runtime will try to cache and reuse JIT-compiled kernels, furthermore if a kernel uses specialization constants the compiler will attempt to materialize the values in place, turning them to de-facto compile time constants. Default is '0'. |
 | `SYCL_JIT_AMDGCN_PTX_TARGET_CPU` | Any(\*) | Allows setting the target CPU to be used when JIT-ing kernels. Examples include setting SM version for Nvidia, or target architecture for AMD. |
 | `SYCL_JIT_AMDGCN_PTX_TARGET_FEATURES` | Any(\*) | Allows setting desired target features to be used when JIT-ing kernels. Examples include setting PTX version for Nvidia. |
-| `SYCL_MATERIALIZER_DEBUG` | `sycl-spec-const-materializer` | Enables debug output generation in specialization constants materialization pass. |
+| `SYCL_JIT_COMPILER_DEBUG` | Any(\*) | Passes can specify their own debug types, `sycl-spec-const-materializer` enables debug output generation in specialization constants materialization pass. |
 
 `(*) Note: Any means this environment variable is effective when set to any non-null value.`
 
diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp
index bb628a1c70eae..2816eeab843c1 100644
--- a/sycl/source/detail/jit_compiler.cpp
+++ b/sycl/source/detail/jit_compiler.cpp
@@ -627,17 +627,16 @@ updatePromotedArgs(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo,
 
 sycl::detail::pi::PiKernel jit_compiler::materializeSpecConstants(
     QueueImplPtr Queue, const RTDeviceBinaryImage *BinImage,
-    const std::string &KernelName, std::vector<unsigned char> &SpecConstBlob) {
+    const std::string &KernelName,
+    const std::vector<unsigned char> &SpecConstBlob) {
   if (!BinImage) {
     throw sycl::exception(sycl::make_error_code(sycl::errc::invalid),
                           "No suitable IR available for materializing");
-    return nullptr;
   }
   if (KernelName.empty()) {
     throw sycl::exception(
         sycl::make_error_code(sycl::errc::invalid),
         "Cannot jit kernel with invalid kernel function name");
-    return nullptr;
   }
   auto &PM = detail::ProgramManager::getInstance();
   if (auto CachedKernel =
@@ -653,7 +652,6 @@ sycl::detail::pi::PiKernel jit_compiler::materializeSpecConstants(
   if (BinaryImageFormat == ::jit_compiler::BinaryFormat::INVALID) {
     throw sycl::exception(sycl::make_error_code(sycl::errc::invalid),
                           "No suitable IR available for materializing");
-    return nullptr;
   }
   ::jit_compiler::SYCLKernelBinaryInfo BinInfo{
       BinaryImageFormat, 0, RawDeviceImage.BinaryStart, DeviceImageSize};
@@ -681,7 +679,6 @@ sycl::detail::pi::PiKernel jit_compiler::materializeSpecConstants(
       std::cerr << Message << "\n";
     }
     throw sycl::exception(sycl::make_error_code(sycl::errc::invalid), Message);
-    return nullptr;
   }
 
   auto &MaterializerKernelInfo = MaterializerResult.getKernelInfo();
diff --git a/sycl/source/detail/jit_compiler.hpp b/sycl/source/detail/jit_compiler.hpp
index d0df4b775cd46..656dc548e17e0 100644
--- a/sycl/source/detail/jit_compiler.hpp
+++ b/sycl/source/detail/jit_compiler.hpp
@@ -40,9 +40,11 @@ class jit_compiler {
   std::unique_ptr<detail::CG>
   fuseKernels(QueueImplPtr Queue, std::vector<ExecCGCommand *> &InputKernels,
               const property_list &);
-  sycl::detail::pi::PiKernel materializeSpecConstants(
-      QueueImplPtr Queue, const RTDeviceBinaryImage *BinImage,
-      const std::string &KernelName, std::vector<unsigned char> &SpecConstBlob);
+  sycl::detail::pi::PiKernel
+  materializeSpecConstants(QueueImplPtr Queue,
+                           const RTDeviceBinaryImage *BinImage,
+                           const std::string &KernelName,
+                           const std::vector<unsigned char> &SpecConstBlob);
 
   bool isAvailable() { return Available; }
 

From e6169ce16fad789a7306cba063187ad23d4edba8 Mon Sep 17 00:00:00 2001
From: Jakub Chlanda <jakub@codeplay.com>
Date: Fri, 12 Jul 2024 08:33:42 +0100
Subject: [PATCH 13/22] Correct assert

---
 sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp b/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp
index 4e78cbd96d2b2..39ef24ff3af6a 100644
--- a/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp
+++ b/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp
@@ -249,7 +249,7 @@ SYCLSpecConstMaterializer::handleKernel(llvm::Function &Kernel) {
     return PreservedAnalyses::all();
 
   // Make sure that the data was in an expected format.
-  assert((!SpecConstData || SpecConstDataSize < 1) &&
+  assert(SpecConstData && SpecConstDataSize > 0 &&
          "Specialisation constant data not found");
 
   populateUses(SpecConstArg);

From 5385ad530f2f60e86e76c8f480bcbcbdb05d45be Mon Sep 17 00:00:00 2001
From: Jakub Chlanda <jakub@codeplay.com>
Date: Fri, 12 Jul 2024 04:49:08 -0400
Subject: [PATCH 14/22] strstr returns a pointer on success

---
 sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp b/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp
index 39ef24ff3af6a..ffcb81a97ca18 100644
--- a/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp
+++ b/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp
@@ -281,7 +281,7 @@ bool SYCLSpecConstMaterializer::readMetadata() {
 PreservedAnalyses SYCLSpecConstMaterializer::run(Function &F,
                                                  FunctionAnalysisManager &) {
   if (const char *DebugEnv = std::getenv("SYCL_JIT_COMPILER_DEBUG"))
-    if (0 == strstr(DebugEnv, DEBUG_TYPE)) {
+    if (strstr(DebugEnv, DEBUG_TYPE)) {
       DebugFlag = true;
       llvm::setCurrentDebugType(DEBUG_TYPE);
     }

From 0a3ecf1b450a38ca0ed795e80132f0160ddda406 Mon Sep 17 00:00:00 2001
From: Jakub Chlanda <jakub@codeplay.com>
Date: Mon, 15 Jul 2024 08:24:22 +0100
Subject: [PATCH 15/22] Use default pipeline

---
 sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp
index dee3c3933d3be..c1a5041d17c57 100644
--- a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp
+++ b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp
@@ -156,9 +156,7 @@ bool FusionPipeline::runMaterializerPasses(
   PB.registerLoopAnalyses(LAM);
   PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
 
-  // Base the pipeline on O3 opt level.
-  ModulePassManager MPM =
-      PB.buildPerModuleDefaultPipeline(OptimizationLevel::O3);
+  ModulePassManager MPM;
   // Register inserter and materializer passes.
   {
     FunctionPassManager FPM;

From 8bef1d44a50b1e277e4dbf9b8661816628ad6cc8 Mon Sep 17 00:00:00 2001
From: Jakub Chlanda <jakub@codeplay.com>
Date: Tue, 16 Jul 2024 06:26:06 +0100
Subject: [PATCH 16/22] Docs tidy-up

---
 sycl/doc/EnvironmentVariables.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md
index f0188b127d7f4..e3d68eed3528b 100644
--- a/sycl/doc/EnvironmentVariables.md
+++ b/sycl/doc/EnvironmentVariables.md
@@ -25,10 +25,9 @@ compiler and runtime.
 | `SYCL_EAGER_INIT` | Integer | Enable by specifying non-zero value. Tells the SYCL runtime to do as much as possible initialization at objects construction as opposed to doing lazy initialization on the fly. This may mean doing some redundant work at warmup but ensures fastest possible execution on the following hot and reportable paths. It also instructs PI plugins to do the same. Default is "0".  |
 | `SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE` | See [below](#sycl_reduction_preferred_workgroup_size) | Controls the preferred work-group size of reductions. |
 | `SYCL_ENABLE_FUSION_CACHING` | '1' or '0' | Enable ('1') or disable ('0') caching of JIT compilations for kernel fusion. Caching avoids repeatedly running the JIT compilation pipeline if the same sequence of kernels is fused multiple times. Default value is '1'. |
-| `SYCL_JIT_AMDGCN_PTX_KERNELS` | '1' or '0' | Enable ('1') or disable ('0') JIT compilation of kernels. Only supported for Nvidia and AMD backends. When JIT-ing is enabled SYCL runtime will try to cache and reuse JIT-compiled kernels, furthermore if a kernel uses specialization constants the compiler will attempt to materialize the values in place, turning them to de-facto compile time constants. Default is '0'. |
-| `SYCL_JIT_AMDGCN_PTX_TARGET_CPU` | Any(\*) | Allows setting the target CPU to be used when JIT-ing kernels. Examples include setting SM version for Nvidia, or target architecture for AMD. |
+| `SYCL_JIT_AMDGCN_PTX_KERNELS` | '1' or '0' | Enable ('1') or disable ('0') JIT compilation of kernels. Only supported for Nvidia and AMD backends. Note, that it is required to have a valid binary for the desired backend (AMD or CUDA), that was compiled with `-fsycl-embed-ir` in order to use JIT-ing. When JIT-ing is enabled SYCL runtime will try to cache and reuse JIT-compiled kernels, furthermore if a kernel uses specialization constants the compiler will attempt to materialize the values in place, turning them to de-facto compile time constants. Default is '0'. |
+| `SYCL_JIT_AMDGCN_PTX_TARGET_CPU` | Any(\*) | Allows setting the target architecture to be used when JIT-ing kernels. Examples include setting SM version for Nvidia, or target architecture for AMD. |
 | `SYCL_JIT_AMDGCN_PTX_TARGET_FEATURES` | Any(\*) | Allows setting desired target features to be used when JIT-ing kernels. Examples include setting PTX version for Nvidia. |
-| `SYCL_JIT_COMPILER_DEBUG` | Any(\*) | Passes can specify their own debug types, `sycl-spec-const-materializer` enables debug output generation in specialization constants materialization pass. |
 
 `(*) Note: Any means this environment variable is effective when set to any non-null value.`
 
@@ -214,6 +213,7 @@ variables in production code.</span>
 | `SYCL_CACHE_TRACE` | Any(\*) | If the variable is set, messages are sent to std::cerr when caching events or non-blocking failures happen (e.g. unable to access cache item file). |
 | `SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE` | Any(\*) | Enables tracing of `parallel_for` invocations with rounded-up ranges. |
 | `SYCL_PI_SUPPRESS_ERROR_MESSAGE` | Any(\*) | Suppress printing of error message, only used for CI in order not to interrupt errors generated by underlying toolchains; note that the variable only modifies the printing of the error message (error value, name, description and location), the handling of error return code and aborting/throwing behaviour remains unchanged. |
+| `SYCL_JIT_COMPILER_DEBUG` | Any(\*) | Passes can specify their own debug types, `sycl-spec-const-materializer` enables debug output generation in specialization constants materialization pass. |
 
 `(*) Note: Any means this environment variable is effective when set to any non-null value.`
 

From 94b5ad500d0e8461ac532b8d113aabdeddd5d449 Mon Sep 17 00:00:00 2001
From: Jakub Chlanda <jakub@codeplay.com>
Date: Wed, 24 Jul 2024 08:58:27 +0100
Subject: [PATCH 17/22] Constexpr debug output in program manager

---
 .../program_manager/program_manager.cpp       | 30 +++++++++----------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp
index 4cc4cee5f3c3e..4d07c28584faa 100644
--- a/sycl/source/detail/program_manager/program_manager.cpp
+++ b/sycl/source/detail/program_manager/program_manager.cpp
@@ -175,7 +175,7 @@ static const char *getFormatStr(sycl::detail::pi::PiDeviceBinaryType Format) {
 sycl::detail::pi::PiProgram
 ProgramManager::createPIProgram(const RTDeviceBinaryImage &Img,
                                 const context &Context, const device &Device) {
-  if (DbgProgMgr > 0)
+  if constexpr (DbgProgMgr > 0)
     std::cerr << ">>> ProgramManager::createPIProgram(" << &Img << ", "
               << getSyclObjImpl(Context).get() << ", "
               << getSyclObjImpl(Device).get() << ")\n";
@@ -231,7 +231,7 @@ ProgramManager::createPIProgram(const RTDeviceBinaryImage &Img,
 
   Ctx->addDeviceGlobalInitializer(Res, {Device}, &Img);
 
-  if (DbgProgMgr > 1)
+  if constexpr (DbgProgMgr > 1)
     std::cerr << "created program: " << Res
               << "; image format: " << getFormatStr(Format) << "\n";
 
@@ -791,7 +791,7 @@ ProgramManager::getOrCreateKernel(const ContextImplPtr &ContextImpl,
                                   const DeviceImplPtr &DeviceImpl,
                                   const std::string &KernelName,
                                   const NDRDescT &NDRDesc) {
-  if (DbgProgMgr > 0) {
+  if constexpr (DbgProgMgr > 0) {
     std::cerr << ">>> ProgramManager::getOrCreateKernel(" << ContextImpl.get()
               << ", " << DeviceImpl.get() << ", " << KernelName << ")\n";
   }
@@ -1096,7 +1096,7 @@ ProgramManager::ProgramManager() : m_AsanFoundInImage(false) {
     m_SpvFileImage =
         std::make_unique<DynRTDeviceBinaryImage>(std::move(Data), Size);
 
-    if (DbgProgMgr > 0) {
+    if constexpr (DbgProgMgr > 0) {
       std::cerr << "loaded device image binary from " << SpvFile << "\n";
       std::cerr << "format: " << getFormatStr(m_SpvFileImage->getFormat())
                 << "\n";
@@ -1150,7 +1150,7 @@ RTDeviceBinaryImage &
 ProgramManager::getDeviceImage(const std::string &KernelName,
                                const context &Context, const device &Device,
                                bool JITCompilationIsRequired) {
-  if (DbgProgMgr > 0) {
+  if constexpr (DbgProgMgr > 0) {
     std::cerr << ">>> ProgramManager::getDeviceImage(\"" << KernelName << "\", "
               << getSyclObjImpl(Context).get() << ", "
               << getSyclObjImpl(Device).get() << ", "
@@ -1184,7 +1184,7 @@ ProgramManager::getDeviceImage(const std::string &KernelName,
   if (Img) {
     CheckJITCompilationForImage(Img, JITCompilationIsRequired);
 
-    if (DbgProgMgr > 0) {
+    if constexpr (DbgProgMgr > 0) {
       std::cerr << "selected device image: " << &Img->getRawData() << "\n";
       Img->print();
     }
@@ -1201,7 +1201,7 @@ RTDeviceBinaryImage &ProgramManager::getDeviceImage(
     bool JITCompilationIsRequired) {
   assert(ImageSet.size() > 0);
 
-  if (DbgProgMgr > 0) {
+  if constexpr (DbgProgMgr > 0) {
     std::cerr << ">>> ProgramManager::getDeviceImage(Custom SPV file "
               << getSyclObjImpl(Context).get() << ", "
               << getSyclObjImpl(Device).get() << ", "
@@ -1230,7 +1230,7 @@ RTDeviceBinaryImage &ProgramManager::getDeviceImage(
 
   CheckJITCompilationForImage(*ImageIterator, JITCompilationIsRequired);
 
-  if (DbgProgMgr > 0) {
+  if constexpr (DbgProgMgr > 0) {
     std::cerr << "selected device image: " << &(*ImageIterator)->getRawData()
               << "\n";
     (*ImageIterator)->print();
@@ -1322,7 +1322,7 @@ ProgramManager::ProgramPtr ProgramManager::build(
     const sycl::detail::pi::PiDevice &Device, uint32_t DeviceLibReqMask,
     const std::vector<sycl::detail::pi::PiProgram> &ExtraProgramsToLink) {
 
-  if (DbgProgMgr > 0) {
+  if constexpr (DbgProgMgr > 0) {
     std::cerr << ">>> ProgramManager::build(" << Program.get() << ", "
               << CompileOptions << ", " << LinkOptions << ", ... " << Device
               << ")\n";
@@ -2518,7 +2518,7 @@ ProgramManager::getOrCreateKernel(const context &Context,
 sycl::detail::pi::PiKernel ProgramManager::getCachedMaterializedKernel(
     const std::string &KernelName,
     const std::vector<unsigned char> &SpecializationConsts) {
-  if (DbgProgMgr > 0)
+  if constexpr (DbgProgMgr > 0)
     std::cerr << ">>> ProgramManager::getCachedMaterializedKernel\n"
               << "KernelName: " << KernelName << "\n";
 
@@ -2526,20 +2526,20 @@ sycl::detail::pi::PiKernel ProgramManager::getCachedMaterializedKernel(
     std::lock_guard<std::mutex> KernelIDsGuard(m_KernelIDsMutex);
     if (auto KnownMaterializations = m_MaterializedKernels.find(KernelName);
         KnownMaterializations != m_MaterializedKernels.end()) {
-      if (DbgProgMgr > 0)
+      if constexpr (DbgProgMgr > 0)
         std::cerr << ">>> There are:" << KnownMaterializations->second.size()
                   << " materialized kernels.\n";
       if (auto Kernel =
               KnownMaterializations->second.find(SpecializationConsts);
           Kernel != KnownMaterializations->second.end()) {
-        if (DbgProgMgr > 0)
+        if constexpr (DbgProgMgr > 0)
           std::cerr << ">>> Kernel in the chache\n";
         return Kernel->second;
       }
     }
   }
 
-  if (DbgProgMgr > 0)
+  if constexpr (DbgProgMgr > 0)
     std::cerr << ">>> Kernel not in the chache\n";
 
   return nullptr;
@@ -2550,7 +2550,7 @@ sycl::detail::pi::PiKernel ProgramManager::getOrCreateMaterializedKernel(
     const device &Device, const std::string &KernelName,
     const std::vector<unsigned char> &SpecializationConsts) {
   // Check if we already have the kernel in the cache.
-  if (DbgProgMgr > 0)
+  if constexpr (DbgProgMgr > 0)
     std::cerr << ">>> ProgramManager::getOrCreateMaterializedKernel\n"
               << "KernelName: " << KernelName << "\n";
 
@@ -2558,7 +2558,7 @@ sycl::detail::pi::PiKernel ProgramManager::getOrCreateMaterializedKernel(
           getCachedMaterializedKernel(KernelName, SpecializationConsts))
     return CachedKernel;
 
-  if (DbgProgMgr > 0)
+  if constexpr (DbgProgMgr > 0)
     std::cerr << ">>> Adding the kernel to the cache.\n";
   auto Program = createPIProgram(Img, Context, Device);
   auto DeviceImpl = detail::getSyclObjImpl(Device);

From 21c814f7d247a7b6371024068c9b865f4d70e081 Mon Sep 17 00:00:00 2001
From: Jakub Chlanda <jakub@codeplay.com>
Date: Wed, 24 Jul 2024 11:36:27 +0100
Subject: [PATCH 18/22] build fix

---
 sycl/include/sycl/detail/helpers.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sycl/include/sycl/detail/helpers.hpp b/sycl/include/sycl/detail/helpers.hpp
index d0303a0f2d2cb..4d632ada7c767 100644
--- a/sycl/include/sycl/detail/helpers.hpp
+++ b/sycl/include/sycl/detail/helpers.hpp
@@ -11,6 +11,7 @@
 #include <CL/__spirv/spirv_types.hpp> // for MemorySemanticsMask
 #include <sycl/access/access.hpp>     // for fence_space
 #include <sycl/detail/export.hpp>     // for __SYCL_EXPORT
+#include <sycl/detail/pi.hpp>         // for PiProgram
 #include <sycl/memory_enums.hpp>      // for memory_order
 
 #ifdef __SYCL_DEVICE_ONLY__

From 87f1a874d1d7064c57ea024626e49ae0fee3c0dd Mon Sep 17 00:00:00 2001
From: Jakub Chlanda <jakub@codeplay.com>
Date: Wed, 24 Jul 2024 12:08:47 +0100
Subject: [PATCH 19/22] orfer of includes

---
 sycl/test/include_deps/sycl_accessor.hpp.cpp    | 7 ++++---
 sycl/test/include_deps/sycl_buffer.hpp.cpp      | 3 ++-
 sycl/test/include_deps/sycl_detail_core.hpp.cpp | 8 ++++----
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/sycl/test/include_deps/sycl_accessor.hpp.cpp b/sycl/test/include_deps/sycl_accessor.hpp.cpp
index 37bc9bf84559d..85b2543205279 100644
--- a/sycl/test/include_deps/sycl_accessor.hpp.cpp
+++ b/sycl/test/include_deps/sycl_accessor.hpp.cpp
@@ -21,10 +21,13 @@
 // CHECK-NEXT: info/aspects_deprecated.def
 // CHECK-NEXT: atomic.hpp
 // CHECK-NEXT: detail/helpers.hpp
-// CHECK-NEXT: memory_enums.hpp
+// CHECK-NEXT: detail/pi.hpp
+// CHECK-NEXT: backend_types.hpp
+// CHECK-NEXT: detail/os_util.hpp
 // CHECK-NEXT: detail/pi.h
 // CHECK-NEXT: detail/pi_error.def
 // CHECK-NEXT: detail/pi.def
+// CHECK-NEXT: memory_enums.hpp
 // CHECK-NEXT: CL/__spirv/spirv_vars.hpp
 // CHECK-NEXT: multi_ptr.hpp
 // CHECK-NEXT: detail/type_traits.hpp
@@ -58,7 +61,6 @@
 // CHECK-NEXT: detail/boost/mp11/detail/mp_with_index.hpp
 // CHECK-NEXT: detail/boost/mp11/integer_sequence.hpp
 // CHECK-NEXT: buffer.hpp
-// CHECK-NEXT: backend_types.hpp
 // CHECK-NEXT: detail/array.hpp
 // CHECK-NEXT: exception.hpp
 // CHECK-NEXT: detail/cl.h
@@ -76,7 +78,6 @@
 // CHECK-NEXT: detail/stl_type_traits.hpp
 // CHECK-NEXT: detail/sycl_mem_obj_allocator.hpp
 // CHECK-NEXT: detail/aligned_allocator.hpp
-// CHECK-NEXT: detail/os_util.hpp
 // CHECK-NEXT: ext/oneapi/accessor_property_list.hpp
 // CHECK-NEXT: detail/property_list_base.hpp
 // CHECK-NEXT: property_list.hpp
diff --git a/sycl/test/include_deps/sycl_buffer.hpp.cpp b/sycl/test/include_deps/sycl_buffer.hpp.cpp
index 4bda4fe9cbe5f..ad5a5cafbf22c 100644
--- a/sycl/test/include_deps/sycl_buffer.hpp.cpp
+++ b/sycl/test/include_deps/sycl_buffer.hpp.cpp
@@ -25,6 +25,8 @@
 // CHECK-NEXT: detail/string.hpp
 // CHECK-NEXT: detail/common.hpp
 // CHECK-NEXT: detail/helpers.hpp
+// CHECK-NEXT: detail/pi.hpp
+// CHECK-NEXT: detail/os_util.hpp
 // CHECK-NEXT: memory_enums.hpp
 // CHECK-NEXT: CL/__spirv/spirv_vars.hpp
 // CHECK-NEXT: detail/iostream_proxy.hpp
@@ -36,7 +38,6 @@
 // CHECK-NEXT: detail/stl_type_traits.hpp
 // CHECK-NEXT: detail/sycl_mem_obj_allocator.hpp
 // CHECK-NEXT: detail/aligned_allocator.hpp
-// CHECK-NEXT: detail/os_util.hpp
 // CHECK-NEXT: ext/oneapi/accessor_property_list.hpp
 // CHECK-NEXT: detail/property_list_base.hpp
 // CHECK-NEXT: property_list.hpp
diff --git a/sycl/test/include_deps/sycl_detail_core.hpp.cpp b/sycl/test/include_deps/sycl_detail_core.hpp.cpp
index fc4961262b09c..2ec31465dcc7c 100644
--- a/sycl/test/include_deps/sycl_detail_core.hpp.cpp
+++ b/sycl/test/include_deps/sycl_detail_core.hpp.cpp
@@ -22,10 +22,13 @@
 // CHECK-NEXT: info/aspects_deprecated.def
 // CHECK-NEXT: atomic.hpp
 // CHECK-NEXT: detail/helpers.hpp
-// CHECK-NEXT: memory_enums.hpp
+// CHECK-NEXT: detail/pi.hpp
+// CHECK-NEXT: backend_types.hpp
+// CHECK-NEXT: detail/os_util.hpp
 // CHECK-NEXT: detail/pi.h
 // CHECK-NEXT: detail/pi_error.def
 // CHECK-NEXT: detail/pi.def
+// CHECK-NEXT: memory_enums.hpp
 // CHECK-NEXT: CL/__spirv/spirv_vars.hpp
 // CHECK-NEXT: multi_ptr.hpp
 // CHECK-NEXT: detail/type_traits.hpp
@@ -59,7 +62,6 @@
 // CHECK-NEXT: detail/boost/mp11/detail/mp_with_index.hpp
 // CHECK-NEXT: detail/boost/mp11/integer_sequence.hpp
 // CHECK-NEXT: buffer.hpp
-// CHECK-NEXT: backend_types.hpp
 // CHECK-NEXT: detail/array.hpp
 // CHECK-NEXT: exception.hpp
 // CHECK-NEXT: detail/cl.h
@@ -77,7 +79,6 @@
 // CHECK-NEXT: detail/stl_type_traits.hpp
 // CHECK-NEXT: detail/sycl_mem_obj_allocator.hpp
 // CHECK-NEXT: detail/aligned_allocator.hpp
-// CHECK-NEXT: detail/os_util.hpp
 // CHECK-NEXT: ext/oneapi/accessor_property_list.hpp
 // CHECK-NEXT: detail/property_list_base.hpp
 // CHECK-NEXT: property_list.hpp
@@ -144,7 +145,6 @@
 // CHECK-NEXT: ext/oneapi/properties/properties.hpp
 // CHECK-NEXT: ext/oneapi/experimental/graph.hpp
 // CHECK-NEXT: handler.hpp
-// CHECK-NEXT: detail/pi.hpp
 // CHECK-NEXT: detail/reduction_forward.hpp
 // CHECK-NEXT: ext/intel/experimental/fp_control_kernel_properties.hpp
 // CHECK-NEXT: ext/intel/experimental/kernel_execution_properties.hpp

From 509e3e6eb13913d86e66e1b586c5cc7f50e2f765 Mon Sep 17 00:00:00 2001
From: Jakub Chlanda <jakub@codeplay.com>
Date: Wed, 24 Jul 2024 13:42:14 +0100
Subject: [PATCH 20/22] JIT e2e test

---
 .../AmdNvidiaJIT/kernel_and_bundle.cpp        | 108 ++++++++++++++++++
 1 file changed, 108 insertions(+)
 create mode 100644 sycl/test-e2e/AmdNvidiaJIT/kernel_and_bundle.cpp

diff --git a/sycl/test-e2e/AmdNvidiaJIT/kernel_and_bundle.cpp b/sycl/test-e2e/AmdNvidiaJIT/kernel_and_bundle.cpp
new file mode 100644
index 0000000000000..e86fbdebedd79
--- /dev/null
+++ b/sycl/test-e2e/AmdNvidiaJIT/kernel_and_bundle.cpp
@@ -0,0 +1,108 @@
+// REQUIRES: cuda || hip
+
+// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: env SYCL_JIT_AMDGCN_PTX_KERNELS=1 env SYCL_JIT_COMPILER_DEBUG="sycl-spec-const-materializer" %{run} %t.out &> %t.txt ; FileCheck %s --input-file %t.txt
+
+#include <sycl/sycl.hpp>
+
+constexpr size_t Size = 16;
+constexpr int SeedKernel = 3;
+constexpr int SeedKernelBundle = 5;
+
+constexpr int ValInt = 11;
+constexpr std::array<int, 2> ValArr{13, 17};
+const static sycl::specialization_id<int> SpecConstInt;
+const static sycl::specialization_id<std::array<int, 2>> SpecConstArr;
+
+int validate(int Seed, std::vector<int> &Input, std::vector<int> &Output) {
+  for (int i = 0; i < Size; ++i) {
+    int Expected = ValInt + ValArr[0] + ValArr[1] + Input[i] + Seed;
+    if (Expected != Output[i]) {
+      return -1;
+    }
+  }
+  return 0;
+}
+
+// CHECK: Working on function:
+// CHECK: ==================
+// CHECK: _ZTSZ15runKernelBundleN4sycl3_V15queueERSt6vectorIiSaIiEES5_E10WoofBundle
+int runKernelBundle(sycl::queue Queue, std::vector<int> &Input, std::vector<int> &Output) {
+  for (int i = 0; i < Size; ++i) {
+    Output[i] = 42;
+    Input[i] = i * i;
+  }
+
+  sycl::device Device;
+  sycl::context Context = Queue.get_context();
+
+  auto InputBundle =
+      sycl::get_kernel_bundle<class WoofBundle, sycl::bundle_state::input>(
+          Context, {Device});
+  InputBundle.set_specialization_constant<SpecConstInt>(ValInt);
+  InputBundle.set_specialization_constant<SpecConstArr>(ValArr);
+
+  auto ExecBundle = sycl::build(InputBundle);
+
+  {
+    sycl::buffer<int> OutBuff(Output.data(), Output.size());
+    sycl::buffer<int> InBuff(Input.data(), Input.size());
+    Queue.submit([&](sycl::handler &cgh) {
+      sycl::accessor OutAcc(OutBuff, cgh, sycl::write_only);
+      sycl::accessor InAcc(InBuff, cgh, sycl::read_only);
+      cgh.use_kernel_bundle(ExecBundle);
+      cgh.template parallel_for<class WoofBundle>(
+          sycl::range<1>{Size}, [=](sycl::id<1> i, sycl::kernel_handler kh) {
+            const auto KernelSpecConst =
+                kh.get_specialization_constant<SpecConstInt>();
+            const auto KernelSpecConstArr =
+                kh.get_specialization_constant<SpecConstArr>();
+            OutAcc[i] = KernelSpecConst + KernelSpecConstArr[0] +
+                        KernelSpecConstArr[1] + InAcc[i] + SeedKernelBundle;
+          });
+    });
+    Queue.wait_and_throw();
+  }
+
+  return validate(SeedKernelBundle, Input, Output);
+}
+
+// CHECK: Working on function:
+// CHECK: ==================
+// CHECK: _ZTSZZ9runKernelN4sycl3_V15queueERSt6vectorIiSaIiEES5_ENKUlRT_E_clINS0_7handlerEEEDaS7_E10WoofKernel
+int runKernel(sycl::queue Queue, std::vector<int> &Input, std::vector<int> &Output) {
+  for (int i = 0; i < Size; ++i) {
+    Output[i] = 42;
+    Input[i] = i * i;
+  }
+  {
+    sycl::buffer<int> OutBuff(Output.data(), Output.size());
+    sycl::buffer<int> InBuff(Input.data(), Input.size());
+    Queue.submit([&](auto &CGH) {
+      sycl::accessor OutAcc(OutBuff, CGH, sycl::write_only);
+      sycl::accessor InAcc(InBuff, CGH, sycl::read_only);
+      CGH.template set_specialization_constant<SpecConstInt>(ValInt);
+      CGH.template set_specialization_constant<SpecConstArr>(ValArr);
+      CGH.template parallel_for<class WoofKernel>(
+          sycl::range<1>{Size}, [=](sycl::id<1> i, sycl::kernel_handler KH) {
+            const auto KernelSpecConst =
+                KH.get_specialization_constant<SpecConstInt>();
+            const auto KernelSpecConstArr =
+                KH.get_specialization_constant<SpecConstArr>();
+            OutAcc[i] = KernelSpecConst + KernelSpecConstArr[0] +
+                        KernelSpecConstArr[1] + InAcc[i] + SeedKernel;
+          });
+    });
+    Queue.wait_and_throw();
+  }
+
+  return validate(SeedKernel, Input, Output);
+}
+
+int main() {
+  std::vector<int> Input(Size);
+  std::vector<int> Output(Size);
+  sycl::queue Queue;
+  return runKernel(Queue, Input, Output) | runKernelBundle(Queue, Input, Output);
+}
+

From d8c64990222a0a60c3604904358663116b8fdea4 Mon Sep 17 00:00:00 2001
From: Jakub Chlanda <jakub@codeplay.com>
Date: Wed, 24 Jul 2024 13:46:33 +0100
Subject: [PATCH 21/22] clang format the test

---
 sycl/test-e2e/AmdNvidiaJIT/kernel_and_bundle.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/sycl/test-e2e/AmdNvidiaJIT/kernel_and_bundle.cpp b/sycl/test-e2e/AmdNvidiaJIT/kernel_and_bundle.cpp
index e86fbdebedd79..1607c8ff9a113 100644
--- a/sycl/test-e2e/AmdNvidiaJIT/kernel_and_bundle.cpp
+++ b/sycl/test-e2e/AmdNvidiaJIT/kernel_and_bundle.cpp
@@ -27,7 +27,8 @@ int validate(int Seed, std::vector<int> &Input, std::vector<int> &Output) {
 // CHECK: Working on function:
 // CHECK: ==================
 // CHECK: _ZTSZ15runKernelBundleN4sycl3_V15queueERSt6vectorIiSaIiEES5_E10WoofBundle
-int runKernelBundle(sycl::queue Queue, std::vector<int> &Input, std::vector<int> &Output) {
+int runKernelBundle(sycl::queue Queue, std::vector<int> &Input,
+                    std::vector<int> &Output) {
   for (int i = 0; i < Size; ++i) {
     Output[i] = 42;
     Input[i] = i * i;
@@ -70,7 +71,8 @@ int runKernelBundle(sycl::queue Queue, std::vector<int> &Input, std::vector<int>
 // CHECK: Working on function:
 // CHECK: ==================
 // CHECK: _ZTSZZ9runKernelN4sycl3_V15queueERSt6vectorIiSaIiEES5_ENKUlRT_E_clINS0_7handlerEEEDaS7_E10WoofKernel
-int runKernel(sycl::queue Queue, std::vector<int> &Input, std::vector<int> &Output) {
+int runKernel(sycl::queue Queue, std::vector<int> &Input,
+              std::vector<int> &Output) {
   for (int i = 0; i < Size; ++i) {
     Output[i] = 42;
     Input[i] = i * i;
@@ -103,6 +105,6 @@ int main() {
   std::vector<int> Input(Size);
   std::vector<int> Output(Size);
   sycl::queue Queue;
-  return runKernel(Queue, Input, Output) | runKernelBundle(Queue, Input, Output);
+  return runKernel(Queue, Input, Output) |
+         runKernelBundle(Queue, Input, Output);
 }
-

From 66bd1101ea1ab0367ed1229f9d6a67235ba6d83e Mon Sep 17 00:00:00 2001
From: Jakub Chlanda <jakub@codeplay.com>
Date: Wed, 24 Jul 2024 14:39:29 +0100
Subject: [PATCH 22/22] include fix in the test

---
 sycl/test-e2e/AmdNvidiaJIT/kernel_and_bundle.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sycl/test-e2e/AmdNvidiaJIT/kernel_and_bundle.cpp b/sycl/test-e2e/AmdNvidiaJIT/kernel_and_bundle.cpp
index 1607c8ff9a113..21eccd983c7cb 100644
--- a/sycl/test-e2e/AmdNvidiaJIT/kernel_and_bundle.cpp
+++ b/sycl/test-e2e/AmdNvidiaJIT/kernel_and_bundle.cpp
@@ -3,7 +3,8 @@
 // RUN: %{build} -fsycl-embed-ir -o %t.out
 // RUN: env SYCL_JIT_AMDGCN_PTX_KERNELS=1 env SYCL_JIT_COMPILER_DEBUG="sycl-spec-const-materializer" %{run} %t.out &> %t.txt ; FileCheck %s --input-file %t.txt
 
-#include <sycl/sycl.hpp>
+#include <sycl/detail/core.hpp>
+#include <sycl/specialization_id.hpp>
 
 constexpr size_t Size = 16;
 constexpr int SeedKernel = 3;