diff --git a/sycl-fusion/jit-compiler/include/KernelFusion.h b/sycl-fusion/jit-compiler/include/KernelFusion.h index dd1865e2f8193..37489b640a597 100644 --- a/sycl-fusion/jit-compiler/include/KernelFusion.h +++ b/sycl-fusion/jit-compiler/include/KernelFusion.h @@ -19,19 +19,18 @@ namespace jit_compiler { -class FusionResult { +class JITResult { public: - explicit FusionResult(const char *ErrorMessage) - : Type{FusionResultType::FAILED}, KernelInfo{}, - ErrorMessage{ErrorMessage} {} + explicit JITResult(const char *ErrorMessage) + : Type{JITResultType::FAILED}, KernelInfo{}, ErrorMessage{ErrorMessage} {} - explicit FusionResult(const SYCLKernelInfo &KernelInfo, bool Cached = false) - : Type{(Cached) ? FusionResultType::CACHED : FusionResultType::NEW}, + explicit JITResult(const SYCLKernelInfo &KernelInfo, bool Cached = false) + : Type{(Cached) ? JITResultType::CACHED : JITResultType::NEW}, KernelInfo(KernelInfo), ErrorMessage{} {} - bool failed() const { return Type == FusionResultType::FAILED; } + bool failed() const { return Type == JITResultType::FAILED; } - bool cached() const { return Type == FusionResultType::CACHED; } + bool cached() const { return Type == JITResultType::CACHED; } const char *getErrorMessage() const { assert(failed() && "No error message present"); @@ -44,9 +43,9 @@ class FusionResult { } private: - enum class FusionResultType { FAILED, CACHED, NEW }; + enum class JITResultType { FAILED, CACHED, NEW }; - FusionResultType Type; + JITResultType Type; SYCLKernelInfo KernelInfo; sycl::detail::string ErrorMessage; }; @@ -56,12 +55,18 @@ extern "C" { #ifdef __clang__ #pragma clang diagnostic ignored "-Wreturn-type-c-linkage" #endif // __clang__ -FusionResult fuseKernels(View KernelInformation, - const char *FusedKernelName, - View Identities, - BarrierFlags BarriersFlags, - View Internalization, - View JITConstants); +JITResult fuseKernels(View KernelInformation, + const char *FusedKernelName, + View Identities, + BarrierFlags BarriersFlags, + View 
Internalization, + View JITConstants); + +JITResult materializeSpecConstants(const char *KernelName, + jit_compiler::SYCLKernelBinaryInfo &BinInfo, + View SpecConstBlob, + const char *TargetCPU, + const char *TargetFeatures); /// Clear all previously set options. void resetJITConfiguration(); diff --git a/sycl-fusion/jit-compiler/ld-version-script.txt b/sycl-fusion/jit-compiler/ld-version-script.txt index 101355ab10c1a..eb7892fdfec9c 100644 --- a/sycl-fusion/jit-compiler/ld-version-script.txt +++ b/sycl-fusion/jit-compiler/ld-version-script.txt @@ -2,6 +2,7 @@ global: /* Export the library entry points */ fuseKernels; + materializeSpecConstants; resetJITConfiguration; addToJITConfiguration; diff --git a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp index 28f099c537a2a..2277c99374919 100644 --- a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp +++ b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp @@ -24,8 +24,8 @@ using namespace jit_compiler; using FusedFunction = helper::FusionHelper::FusedFunction; using FusedFunctionList = std::vector; -static FusionResult errorToFusionResult(llvm::Error &&Err, - const std::string &Msg) { +static JITResult errorToFusionResult(llvm::Error &&Err, + const std::string &Msg) { std::stringstream ErrMsg; ErrMsg << Msg << "\nDetailed information:\n"; llvm::handleAllErrors(std::move(Err), @@ -34,7 +34,7 @@ static FusionResult errorToFusionResult(llvm::Error &&Err, // compiled without exception support. 
ErrMsg << "\t" << StrErr.getMessage() << "\n"; }); - return FusionResult{ErrMsg.str().c_str()}; + return JITResult{ErrMsg.str().c_str()}; } static std::vector @@ -70,11 +70,58 @@ static bool isTargetFormatSupported(BinaryFormat TargetFormat) { } } -extern "C" FusionResult -fuseKernels(View KernelInformation, const char *FusedKernelName, - View Identities, BarrierFlags BarriersFlags, - View Internalization, - View Constants) { +extern "C" JITResult +materializeSpecConstants(const char *KernelName, + jit_compiler::SYCLKernelBinaryInfo &BinInfo, + View SpecConstBlob, + const char *TargetCPU, const char *TargetFeatures) { + auto &JITCtx = JITContext::getInstance(); + + TargetInfo TargetInfo = ConfigHelper::get(); + BinaryFormat TargetFormat = TargetInfo.getFormat(); + if (TargetFormat != BinaryFormat::PTX && + TargetFormat != BinaryFormat::AMDGCN) { + return JITResult("Output target format not supported by this build. " + "Available targets are: PTX or AMDGCN."); + } + + ::jit_compiler::SYCLKernelInfo KernelInfo{ + KernelName, ::jit_compiler::SYCLArgumentDescriptor{}, + ::jit_compiler::NDRange{}, BinInfo}; + SYCLModuleInfo ModuleInfo; + ModuleInfo.kernels().insert(ModuleInfo.kernels().end(), KernelInfo); + // Load all input kernels from their respective modules into a single + // LLVM IR module. 
+ llvm::Expected> ModOrError = + translation::KernelTranslator::loadKernels(*JITCtx.getLLVMContext(), + ModuleInfo.kernels()); + if (auto Error = ModOrError.takeError()) { + return errorToFusionResult(std::move(Error), "Failed to load kernels"); + } + std::unique_ptr NewMod = std::move(*ModOrError); + if (!fusion::FusionPipeline::runMaterializerPasses( + *NewMod, SpecConstBlob.to()) || + !NewMod->getFunction(KernelName)) { + return JITResult{"Materializer passes should not fail"}; + } + + SYCLKernelInfo &MaterializerKernelInfo = *ModuleInfo.getKernelFor(KernelName); + if (auto Error = translation::KernelTranslator::translateKernel( + MaterializerKernelInfo, *NewMod, JITCtx, TargetFormat, TargetCPU, + TargetFeatures)) { + return errorToFusionResult(std::move(Error), + "Translation to output format failed"); + } + + return JITResult{MaterializerKernelInfo}; +} + +extern "C" JITResult fuseKernels(View KernelInformation, + const char *FusedKernelName, + View Identities, + BarrierFlags BarriersFlags, + View Internalization, + View Constants) { std::vector KernelsToFuse; llvm::transform(KernelInformation, std::back_inserter(KernelsToFuse), @@ -93,8 +140,7 @@ fuseKernels(View KernelInformation, const char *FusedKernelName, } if (!isTargetFormatSupported(TargetFormat)) { - return FusionResult( - "Fusion output target format not supported by this build"); + return JITResult("Fusion output target format not supported by this build"); } auto &JITCtx = JITContext::getInstance(); @@ -117,7 +163,7 @@ fuseKernels(View KernelInformation, const char *FusedKernelName, // before returning the kernel info to the runtime. 
CachedKernel->NDR = FusedNDR->getNDR(); } - return FusionResult{*CachedKernel, /*Cached*/ true}; + return JITResult{*CachedKernel, /*Cached*/ true}; } helper::printDebugMessage( "Compiling new kernel, no suitable cached kernel found"); @@ -165,13 +211,13 @@ fuseKernels(View KernelInformation, const char *FusedKernelName, BarriersFlags); if (!NewMod->getFunction(FusedKernelName)) { - return FusionResult{"Kernel fusion failed"}; + return JITResult{"Kernel fusion failed"}; } // Get the updated kernel info for the fused kernel and add the information to // the existing KernelInfo. if (!NewModInfo->hasKernelFor(FusedKernelName)) { - return FusionResult{"No KernelInfo for fused kernel"}; + return JITResult{"No KernelInfo for fused kernel"}; } SYCLKernelInfo &FusedKernelInfo = *NewModInfo->getKernelFor(FusedKernelName); @@ -188,7 +234,7 @@ fuseKernels(View KernelInformation, const char *FusedKernelName, JITCtx.addCacheEntry(CacheKey, FusedKernelInfo); } - return FusionResult{FusedKernelInfo}; + return JITResult{FusedKernelInfo}; } extern "C" void resetJITConfiguration() { ConfigHelper::reset(); } diff --git a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp index bf769b50e2f30..c1a5041d17c57 100644 --- a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp +++ b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp @@ -12,6 +12,7 @@ #include "helper/ConfigHelper.h" #include "internalization/Internalization.h" #include "kernel-fusion/SYCLKernelFusion.h" +#include "kernel-fusion/SYCLSpecConstMaterializer.h" #include "kernel-info/SYCLKernelInfo.h" #include "syclcp/SYCLCP.h" @@ -141,3 +142,48 @@ FusionPipeline::runFusionPasses(Module &Mod, SYCLModuleInfo &InputInfo, return std::make_unique(std::move(*NewModInfo.ModuleInfo)); } + +bool FusionPipeline::runMaterializerPasses( + llvm::Module &Mod, llvm::ArrayRef SpecConstData) { + PassBuilder PB; + LoopAnalysisManager LAM; + FunctionAnalysisManager FAM; + 
CGSCCAnalysisManager CGAM; + ModuleAnalysisManager MAM; + PB.registerModuleAnalyses(MAM); + PB.registerCGSCCAnalyses(CGAM); + PB.registerFunctionAnalyses(FAM); + PB.registerLoopAnalyses(LAM); + PB.crossRegisterProxies(LAM, FAM, CGAM, MAM); + + ModulePassManager MPM; + // Register inserter and materializer passes. + { + FunctionPassManager FPM; + MPM.addPass(SYCLSpecConstDataInserter{SpecConstData}); + FPM.addPass(SYCLSpecConstMaterializer{}); + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + } + // Add generic optimizations, + { + FunctionPassManager FPM; + MPM.addPass(AlwaysInlinerPass{}); + FPM.addPass(SROAPass{SROAOptions::ModifyCFG}); + FPM.addPass(SCCPPass{}); + FPM.addPass(ADCEPass{}); + FPM.addPass(EarlyCSEPass{/*UseMemorySSA*/ true}); + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + } + // followed by unrolling. + { + FunctionPassManager FPM; + FPM.addPass(createFunctionToLoopPassAdaptor(IndVarSimplifyPass{})); + LoopUnrollOptions UnrollOptions; + FPM.addPass(LoopUnrollPass{UnrollOptions}); + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + } + + MPM.run(Mod, MAM); + + return true; +} diff --git a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.h b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.h index 61ae8e0bcb274..22d71cc16187e 100644 --- a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.h +++ b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.h @@ -27,6 +27,13 @@ class FusionPipeline { static std::unique_ptr runFusionPasses(llvm::Module &Mod, SYCLModuleInfo &InputInfo, BarrierFlags BarriersFlags); + + /// + /// Run the necessary passes in a custom pass pipeline to perform + /// materialization of kernel specialization constants. 
+ static bool + runMaterializerPasses(llvm::Module &Mod, + llvm::ArrayRef SpecConstData); }; } // namespace fusion } // namespace jit_compiler diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp index 977d5a4a73eef..0d45a22c9e5a1 100644 --- a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp +++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp @@ -168,10 +168,11 @@ KernelTranslator::loadSPIRVKernel(llvm::LLVMContext &LLVMCtx, return SPIRVLLVMTranslator::loadSPIRVKernel(LLVMCtx, Kernel); } -llvm::Error KernelTranslator::translateKernel(SYCLKernelInfo &Kernel, - llvm::Module &Mod, - JITContext &JITCtx, - BinaryFormat Format) { +llvm::Error +KernelTranslator::translateKernel(SYCLKernelInfo &Kernel, llvm::Module &Mod, + JITContext &JITCtx, BinaryFormat Format, + const std::string &TargetCPU, + const std::string &TargetFeatures) { KernelBinary *KernelBin = nullptr; switch (Format) { @@ -186,7 +187,7 @@ llvm::Error KernelTranslator::translateKernel(SYCLKernelInfo &Kernel, } case BinaryFormat::PTX: { llvm::Expected BinaryOrError = - translateToPTX(Kernel, Mod, JITCtx); + translateToPTX(Kernel, Mod, JITCtx, TargetCPU, TargetFeatures); if (auto Error = BinaryOrError.takeError()) { return Error; } @@ -195,7 +196,7 @@ llvm::Error KernelTranslator::translateKernel(SYCLKernelInfo &Kernel, } case BinaryFormat::AMDGCN: { llvm::Expected BinaryOrError = - translateToAMDGCN(Kernel, Mod, JITCtx); + translateToAMDGCN(Kernel, Mod, JITCtx, TargetCPU, TargetFeatures); if (auto Error = BinaryOrError.takeError()) return Error; KernelBin = *BinaryOrError; @@ -226,9 +227,9 @@ KernelTranslator::translateToSPIRV(llvm::Module &Mod, JITContext &JITCtx) { return SPIRVLLVMTranslator::translateLLVMtoSPIRV(Mod, JITCtx); } -llvm::Expected -KernelTranslator::translateToPTX(SYCLKernelInfo &KernelInfo, llvm::Module &Mod, - JITContext &JITCtx) { +llvm::Expected 
KernelTranslator::translateToPTX( + SYCLKernelInfo &KernelInfo, llvm::Module &Mod, JITContext &JITCtx, + const std::string &TargetCPU, const std::string &TargetFeatures) { #ifndef FUSION_JIT_SUPPORT_PTX (void)KernelInfo; (void)Mod; @@ -257,23 +258,32 @@ KernelTranslator::translateToPTX(SYCLKernelInfo &KernelInfo, llvm::Module &Mod, ErrorMessage.c_str()); } - llvm::StringRef TargetCPU{"sm_50"}; - llvm::StringRef TargetFeatures{"+sm_50,+ptx76"}; - if (auto *KernelFunc = Mod.getFunction(KernelInfo.Name.c_str())) { - if (KernelFunc->hasFnAttribute(TARGET_CPU_ATTRIBUTE)) { - TargetCPU = - KernelFunc->getFnAttribute(TARGET_CPU_ATTRIBUTE).getValueAsString(); + // Give priority to user specified values (through environment variables: + // SYCL_JIT_AMDGCN_PTX_TARGET_CPU and SYCL_JIT_AMDGCN_PTX_TARGET_FEATURES). + llvm::StringRef CPU{TargetCPU}; + llvm::StringRef Features{TargetFeatures}; + + auto *KernelFunc = Mod.getFunction(KernelInfo.Name.c_str()); + // If they were not set, use default and consult the module for alternatives + // (if present). 
+ if (CPU.empty()) { + CPU = "sm_50"; + if (KernelFunc && KernelFunc->hasFnAttribute(TARGET_CPU_ATTRIBUTE)) { + CPU = KernelFunc->getFnAttribute(TARGET_CPU_ATTRIBUTE).getValueAsString(); } - if (KernelFunc->hasFnAttribute(TARGET_FEATURE_ATTRIBUTE)) { - TargetFeatures = KernelFunc->getFnAttribute(TARGET_FEATURE_ATTRIBUTE) - .getValueAsString(); + } + if (Features.empty()) { + Features = "+sm_50,+ptx76"; + if (KernelFunc && KernelFunc->hasFnAttribute(TARGET_FEATURE_ATTRIBUTE)) { + Features = KernelFunc->getFnAttribute(TARGET_FEATURE_ATTRIBUTE) + .getValueAsString(); } } // FIXME: Check whether we can provide more accurate target information here auto *TargetMachine = Target->createTargetMachine( - TargetTriple, TargetCPU, TargetFeatures, {}, llvm::Reloc::PIC_, - std::nullopt, llvm::CodeGenOptLevel::Default); + TargetTriple, CPU, Features, {}, llvm::Reloc::PIC_, std::nullopt, + llvm::CodeGenOptLevel::Default); llvm::legacy::PassManager PM; @@ -298,9 +308,9 @@ KernelTranslator::translateToPTX(SYCLKernelInfo &KernelInfo, llvm::Module &Mod, #endif // FUSION_JIT_SUPPORT_PTX } -llvm::Expected -KernelTranslator::translateToAMDGCN(SYCLKernelInfo &KernelInfo, - llvm::Module &Mod, JITContext &JITCtx) { +llvm::Expected KernelTranslator::translateToAMDGCN( + SYCLKernelInfo &KernelInfo, llvm::Module &Mod, JITContext &JITCtx, + const std::string &TargetCPU, const std::string &TargetFeatures) { #ifndef FUSION_JIT_SUPPORT_AMDGCN (void)KernelInfo; (void)Mod; @@ -329,25 +339,29 @@ KernelTranslator::translateToAMDGCN(SYCLKernelInfo &KernelInfo, "Failed to load and translate AMDGCN LLVM IR module with error %s", ErrorMessage.c_str()); - // Set to the lowest tested target according to the GetStartedGuide, section - // "Build DPC++ toolchain with support for HIP AMD" - llvm::StringRef TargetCPU{"gfx906"}; - llvm::StringRef TargetFeatures{""}; - if (auto *KernelFunc = Mod.getFunction(KernelInfo.Name.c_str())) { - if (KernelFunc->hasFnAttribute(TARGET_CPU_ATTRIBUTE)) { - TargetCPU = - 
KernelFunc->getFnAttribute(TARGET_CPU_ATTRIBUTE).getValueAsString(); + llvm::StringRef CPU{TargetCPU}; + llvm::StringRef Features{TargetFeatures}; + + auto *KernelFunc = Mod.getFunction(KernelInfo.Name.c_str()); + if (CPU.empty()) { + // Set to the lowest tested target according to the GetStartedGuide, section + // "Build DPC++ toolchain with support for HIP AMD" + CPU = "gfx906"; + if (KernelFunc && KernelFunc->hasFnAttribute(TARGET_CPU_ATTRIBUTE)) { + CPU = KernelFunc->getFnAttribute(TARGET_CPU_ATTRIBUTE).getValueAsString(); } - if (KernelFunc->hasFnAttribute(TARGET_FEATURE_ATTRIBUTE)) { - TargetFeatures = KernelFunc->getFnAttribute(TARGET_FEATURE_ATTRIBUTE) - .getValueAsString(); + } + if (Features.empty()) { + if (KernelFunc && KernelFunc->hasFnAttribute(TARGET_FEATURE_ATTRIBUTE)) { + Features = KernelFunc->getFnAttribute(TARGET_FEATURE_ATTRIBUTE) + .getValueAsString(); } } // FIXME: Check whether we can provide more accurate target information here auto *TargetMachine = Target->createTargetMachine( - TargetTriple, TargetCPU, TargetFeatures, {}, llvm::Reloc::PIC_, - std::nullopt, llvm::CodeGenOptLevel::Default); + TargetTriple, CPU, Features, {}, llvm::Reloc::PIC_, std::nullopt, + llvm::CodeGenOptLevel::Default); std::string AMDObj; { diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h index 809c8fab2e42f..a881dd6176b70 100644 --- a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h +++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h @@ -25,7 +25,9 @@ class KernelTranslator { loadKernels(llvm::LLVMContext &LLVMCtx, std::vector &Kernels); static llvm::Error translateKernel(SYCLKernelInfo &Kernel, llvm::Module &Mod, - JITContext &JITCtx, BinaryFormat Format); + JITContext &JITCtx, BinaryFormat Format, + const std::string &TargetCPU = {}, + const std::string &TargetFeatures = {}); private: /// @@ -42,11 +44,14 @@ class KernelTranslator { JITContext 
&JITCtx); static llvm::Expected - translateToPTX(SYCLKernelInfo &Kernel, llvm::Module &Mod, JITContext &JITCtx); + translateToPTX(SYCLKernelInfo &Kernel, llvm::Module &Mod, JITContext &JITCtx, + const std::string &TargetCPU = {}, + const std::string &TargetFeatures = {}); static llvm::Expected translateToAMDGCN(SYCLKernelInfo &KernelInfo, llvm::Module &Mod, - JITContext &JITCtx); + JITContext &JITCtx, const std::string &TargetCPU = {}, + const std::string &TargetFeatures = {}); }; } // namespace translation } // namespace jit_compiler diff --git a/sycl-fusion/passes/CMakeLists.txt b/sycl-fusion/passes/CMakeLists.txt index e768044f6e297..e3e8af1518560 100644 --- a/sycl-fusion/passes/CMakeLists.txt +++ b/sycl-fusion/passes/CMakeLists.txt @@ -3,6 +3,7 @@ add_llvm_library(SYCLKernelFusion MODULE SYCLFusionPasses.cpp kernel-fusion/Builtins.cpp kernel-fusion/SYCLKernelFusion.cpp + kernel-fusion/SYCLSpecConstMaterializer.cpp kernel-info/SYCLKernelInfo.cpp internalization/Internalization.cpp syclcp/SYCLCP.cpp @@ -50,6 +51,7 @@ add_llvm_library(SYCLKernelFusionPasses SYCLFusionPasses.cpp kernel-fusion/Builtins.cpp kernel-fusion/SYCLKernelFusion.cpp + kernel-fusion/SYCLSpecConstMaterializer.cpp kernel-info/SYCLKernelInfo.cpp internalization/Internalization.cpp syclcp/SYCLCP.cpp diff --git a/sycl-fusion/passes/SYCLFusionPasses.cpp b/sycl-fusion/passes/SYCLFusionPasses.cpp index 16a938ec991ce..c7959fd70ee0b 100644 --- a/sycl-fusion/passes/SYCLFusionPasses.cpp +++ b/sycl-fusion/passes/SYCLFusionPasses.cpp @@ -13,6 +13,7 @@ #include "internalization/Internalization.h" #include "kernel-fusion/SYCLKernelFusion.h" +#include "kernel-fusion/SYCLSpecConstMaterializer.h" #include "kernel-info/SYCLKernelInfo.h" #include "syclcp/SYCLCP.h" @@ -49,6 +50,12 @@ llvm::PassPluginLibraryInfo getSYCLKernelFusionPluginInfo() { MPM.addPass(SYCLModuleInfoPrinter()); return true; } + if (Name == "sycl-spec-const-materializer") { + FunctionPassManager FPM; + FPM.addPass(SYCLSpecConstMaterializer()); 
+ MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + return true; + } return false; }); PB.registerAnalysisRegistrationCallback([](ModuleAnalysisManager &MAM) { diff --git a/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp b/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp new file mode 100644 index 0000000000000..ffcb81a97ca18 --- /dev/null +++ b/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp @@ -0,0 +1,305 @@ +//==-------------------- SYCLSpecConstMaterializer.cpp ---------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "SYCLSpecConstMaterializer.h" + +#include "llvm/IR/Constant.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/SYCLLowerIR/TargetHelpers.h" +#include +#include +#include + +#define DEBUG_TYPE "sycl-spec-const-materializer" + +using namespace llvm; + +constexpr llvm::StringLiteral SpecConstDataNodeName{"SYCL_SpecConst_data"}; + +PreservedAnalyses SYCLSpecConstDataInserter::run(Module &M, + ModuleAnalysisManager &) { + assert(!M.getNamedMetadata(SpecConstDataNodeName) && + "Did not expect the node to be present."); + + auto &Context = M.getContext(); + auto *SpecConstMD = M.getOrInsertNamedMetadata(SpecConstDataNodeName); + auto *StringMD = MDString::get( + Context, StringRef{reinterpret_cast(SpecConstData.data()), + SpecConstData.size()}); + auto *TupleMD = MDTuple::get(Context, {StringMD}); + SpecConstMD->addOperand(TupleMD); + + return PreservedAnalyses::all(); +} + +static Constant *getScalarConstant(const unsigned char **ValPtr, Type *Ty) { + if (Ty->isIntegerTy()) { + unsigned NumBytes = Ty->getIntegerBitWidth() / 8; + uint64_t IntValue = 0; + std::memcpy(&IntValue, *ValPtr, NumBytes); + *ValPtr = *ValPtr + 
NumBytes; + return ConstantInt::get(Ty, IntValue); + } + if (Ty->isDoubleTy()) { + double DoubleValue = *(reinterpret_cast(*ValPtr)); + *ValPtr = *ValPtr + sizeof(double); + return ConstantFP::get(Ty, DoubleValue); + } + if (Ty->isFloatTy()) { + float FloatValue = *(reinterpret_cast(*ValPtr)); + *ValPtr = *ValPtr + sizeof(float); + return ConstantFP::get(Ty, FloatValue); + } + if (Ty->isHalfTy()) { + uint16_t HalfValue = *(reinterpret_cast(*ValPtr)); + *ValPtr = *ValPtr + sizeof(uint16_t); + return ConstantFP::get(Ty, HalfValue); + } + + llvm_unreachable("Scalar type not found."); +} + +static Constant *getConstantOfType(const unsigned char **ValPtr, Type *Ty) { + assert(ValPtr && Ty && "Invalid input."); + if (Ty->isIntegerTy() || Ty->isFloatTy() || Ty->isDoubleTy() || + Ty->isHalfTy()) + return getScalarConstant(ValPtr, Ty); + if (auto *ArrTy = dyn_cast(Ty)) { + SmallVector Elems; + auto *ElemTy = ArrTy->getArrayElementType(); + auto NumElems = ArrTy->getArrayNumElements(); + for (uint64_t I = 0; I < NumElems; ++I) + Elems.push_back(getConstantOfType(ValPtr, ElemTy)); + return ConstantArray::get(ArrayType::get(ElemTy, NumElems), Elems); + } + if (auto *StructTy = dyn_cast(Ty)) { + SmallVector StructElems; + for (auto *ElemTy : StructTy->elements()) + StructElems.push_back(getConstantOfType(ValPtr, ElemTy)); + return ConstantStruct::get(StructTy, StructElems); + } + + llvm_unreachable("Unknown type in getConstantOfType."); +} + +void SYCLSpecConstMaterializer::allocateSpecConstant(StringRef KernelName) { + for (auto I : llvm::enumerate(TypesAndOffsets)) { + auto *const Ty = I.value().first; + assert(Ty->isSized()); + const auto Offset = I.value().second; + assert(Offset < SpecConstDataSize && "Out of bounds access."); + const unsigned char *ValPtr = &SpecConstData[Offset]; + auto *Initializer = getConstantOfType(&ValPtr, I.value().first); + // AMD's CONSTANT_ADDRESS and Nvidia's ADDRESS_SPACE_CONST happen to have + // the same value. 
+ constexpr unsigned AS = 4; + auto *SpecConstGlobal = new GlobalVariable( + *Mod, Ty, /*isConstant*/ true, GlobalValue::WeakODRLinkage, Initializer, + Twine("SpecConsBlob_") + KernelName + "_" + Twine(I.index()), + /*InsertBefore*/ nullptr, GlobalValue::NotThreadLocal, AS, + /*isExternallyInitialized*/ false); + TypesAndOffsetsToBlob[I.value()] = SpecConstGlobal; + } +} + +void SYCLSpecConstMaterializer::fixupSpecConstantUses() { + IRBuilder B(Mod->getContext()); + for (auto < : LoadsToTypes) { + auto *Load = LT.first; + auto &TyOff = LT.second; + auto *GV = TypesAndOffsetsToBlob[TyOff]; + B.SetInsertPoint(Load); + auto *NewLoad = B.CreateLoad(TyOff.first, GV); + Load->replaceAllUsesWith(NewLoad); + } +} + +SmallVector +SYCLSpecConstMaterializer::collectGEPsLoads(GetElementPtrInst *GEP) { + SmallVector Loads; + SmallVector WorkList; + WorkList.push_back(GEP); + while (!WorkList.empty()) { + Instruction *I = WorkList.pop_back_val(); + for (auto *U : I->users()) { + auto *NewI = cast(&*U); + switch (NewI->getOpcode()) { + default: { + std::string Str; + raw_string_ostream Out(Str); + Out << "Unhandled instruction: "; + NewI->print(Out); + llvm_unreachable(Str.c_str()); + } + case Instruction::BitCast: { + WorkList.push_back(NewI); + break; + } + case Instruction::Load: { + Loads.push_back(cast(NewI)); + break; + } + } + } + } + + return Loads; +} + +void SYCLSpecConstMaterializer::populateUses(Argument *A) { + SmallVector ASCasts; + for (auto *U : A->users()) { + auto *I = cast(&*U); + switch (I->getOpcode()) { + default: { + std::string Str; + raw_string_ostream Out(Str); + Out << "Unhandled instruction: "; + I->print(Out); + llvm_unreachable(Str.c_str()); + } + case Instruction::AddrSpaceCast: { + ASCasts.push_back(cast(I)); + break; + } + } + } + + const DataLayout &DL = Mod->getDataLayout(); + for (auto *AS : ASCasts) { + for (auto *U : AS->users()) { + auto *I = cast(&*U); + switch (I->getOpcode()) { + default: { + LLVM_DEBUG( + dbgs() + << "Optimization 
opportunity missed, unhandled instruction: \n"); + LLVM_DEBUG(I->dump()); + LLVM_DEBUG(dbgs() << "Function:\n"); + LLVM_DEBUG(I->getParent()->getParent()->dump()); + break; + } + case Instruction::Load: { + TypeAtOffset TyO{ + I->getType(), + /* Non GEP load starts at the beginnig of memory region */ 0}; + TypesAndOffsets.insert(TyO); + LoadsToTypes[cast(I)] = TyO; + break; + } + case Instruction::GetElementPtr: { + auto *GEP = cast(I); + unsigned int ASL = GEP->getPointerAddressSpace(); + unsigned OffsetBitWidth = DL.getIndexSizeInBits(ASL); + APInt Offset(OffsetBitWidth, 0); + bool FoundOffset = GEP->accumulateConstantOffset(DL, Offset); + if (!FoundOffset) + llvm_unreachable_internal("Offset unknown."); + auto Loads = collectGEPsLoads(GEP); + for (auto *Load : Loads) { + TypeAtOffset TyO{Load->getType(), Offset.getSExtValue()}; + TypesAndOffsets.insert(TyO); + LoadsToTypes[Load] = TyO; + } + break; + } + } + } + } +} + +void SYCLSpecConstMaterializer::reportAndReset() { + if (LoadsToTypes.empty()) { + LLVM_DEBUG(dbgs() << "Did not find any loads from spec const buffer.\n"); + } else { + LLVM_DEBUG(dbgs() << "Replaced: " << LoadsToTypes.size() + << " loads from spec const buffer.\n"); + LLVM_DEBUG(dbgs() << "Load to global variable mappings:\n"); + for (auto <T : LoadsToTypes) { + LLVM_DEBUG(dbgs() << "\tLoad:\n"); + LLVM_DEBUG(LTT.first->dump()); + LLVM_DEBUG(dbgs() << "\tGlobal Variable:\n"); + LLVM_DEBUG(TypesAndOffsetsToBlob[LTT.second]->dump()); + LLVM_DEBUG(dbgs() << "\n"); + } + } + LLVM_DEBUG(dbgs() << "\n\n"); + + // Reset the state. 
+ TypesAndOffsets.clear(); + TypesAndOffsetsToBlob.clear(); + LoadsToTypes.clear(); +} + +PreservedAnalyses +SYCLSpecConstMaterializer::handleKernel(llvm::Function &Kernel) { + if (Kernel.arg_empty()) + return PreservedAnalyses::all(); + auto *SpecConstArg = std::prev(Kernel.arg_end()); + if (!SpecConstArg || !SpecConstArg->hasName() || + (SpecConstArg->getName() != "_arg__specialization_constants_buffer")) + return PreservedAnalyses::all(); + + if (!readMetadata()) + return PreservedAnalyses::all(); + + // Make sure that the data was in an expected format. + assert(SpecConstData && SpecConstDataSize > 0 && + "Specialisation constant data not found"); + + populateUses(SpecConstArg); + + allocateSpecConstant(Kernel.getName()); + + fixupSpecConstantUses(); + + reportAndReset(); + + return PreservedAnalyses::none(); +} + +bool SYCLSpecConstMaterializer::readMetadata() { + auto *NamedMD = Mod->getNamedMetadata(SpecConstDataNodeName); + if (!NamedMD || NamedMD->getNumOperands() != 1) + return false; + + auto *MDN = cast(NamedMD->getOperand(0)); + assert(MDN->getNumOperands() == 1 && "Malformed data node."); + auto *MDS = cast(MDN->getOperand(0)); + + SpecConstData = MDS->getString().bytes_begin(); + SpecConstDataSize = MDS->getString().size(); + + return true; +} + +PreservedAnalyses SYCLSpecConstMaterializer::run(Function &F, + FunctionAnalysisManager &) { + if (const char *DebugEnv = std::getenv("SYCL_JIT_COMPILER_DEBUG")) + if (strstr(DebugEnv, DEBUG_TYPE)) { + DebugFlag = true; + llvm::setCurrentDebugType(DEBUG_TYPE); + } + + Mod = F.getParent(); + LLVM_DEBUG(dbgs() << "Working on function:\n==================\n" + << (F.hasName() ? F.getName() : "unnamed kernel") + << "\n\n"); + + // Invariant: This pass is only intended to operate on SYCL kernels being + // compiled to either `nvptx{,64}-nvidia-cuda`, or `amdgcn-amd-amdhsa` + // triples. 
+ auto AT = TargetHelpers::getArchType(*Mod); + if (TargetHelpers::ArchType::Cuda != AT && + TargetHelpers::ArchType::AMDHSA != AT) { + LLVM_DEBUG(dbgs() << "Unsupported architecture\n"); + return PreservedAnalyses::all(); + } + + return handleKernel(F); +} diff --git a/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.h b/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.h new file mode 100644 index 0000000000000..e35361b24ba2a --- /dev/null +++ b/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.h @@ -0,0 +1,103 @@ +//==--------------------- SYCLSpecConstMaterializer.h ----------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef SYCL_SPEC_CONST_MATERIALIZER_H +#define SYCL_SPEC_CONST_MATERIALIZER_H + +#include "llvm/IR/Instructions.h" +#include "llvm/IR/PassManager.h" +#include +#include + +namespace llvm { +class Function; + +/// +/// Utility pass to insert specialization constants values into the module as a +/// metadata node. +class SYCLSpecConstDataInserter + : public PassInfoMixin { +public: + SYCLSpecConstDataInserter(ArrayRef SpecConstData) + : SpecConstData(SpecConstData) {}; + + PreservedAnalyses run(Module &M, ModuleAnalysisManager &); + +private: + ArrayRef SpecConstData; +}; + +/// +/// Pass to materialize specialization constants. Specialization constants +/// represent constants whose values can be set dynamically during execution of +/// the SYCL application. The values of these constants are fixed when a SYCL +/// kernel function is invoked, and they do not change during the execution of +/// the kernel. This pass receives the values of all specialization constants +/// used by a kernel and materializes them as concrete types. 
This is done in +/// order to be able to enable other optimization opportunities (SCCP, SROA and +/// CSE), we do not track instructions that can be removed as a result of +/// materialization, as the pipeline runs DCE pass afterwords. +class SYCLSpecConstMaterializer + : public PassInfoMixin { +public: + SYCLSpecConstMaterializer() : SpecConstData(nullptr), SpecConstDataSize(0) {} + + PreservedAnalyses run(Function &F, FunctionAnalysisManager &); + +private: + // Main entry point, checks for implicit specialization constant kernel + // argument and, if present, performs the optimizations. + PreservedAnalyses handleKernel(Function &Kernel); + + bool readMetadata(); + + // Collects all the uses of the specialization constant kernel argument. + // This results with TypesAndOffsets and LoadsToType being populated. + void populateUses(Argument *A); + + // Use TypesAndOffsets to allocate global variables of given types which get + // initialized with value taken from the specialization constant blob at a + // given offset. + void allocateSpecConstant(StringRef KernelName); + + // Re-write uses of loads from the specialization constant kernel argument to + // the global variable. + void fixupSpecConstantUses(); + + // Walk down all uses of a given GEP instruction in order to find loads from + // the offsetted pointer. + SmallVector collectGEPsLoads(GetElementPtrInst *GEP); + + // Helper to report debug message (if enabled) and reset the state. + void reportAndReset(); + +private: + // Run time known values of specialization constants passed from SYCL rt, + // data pointer and size. + const unsigned char *SpecConstData; + size_t SpecConstDataSize; + + // Module the current function belongs to. + Module *Mod{nullptr}; + + // Type of the specialization constant and the offset into the SpecConstBlob, + // at which the value is located. + using TypeAtOffset = std::pair; + + // Unique uses of spec const (type and offset). 
+ std::set TypesAndOffsets{}; + // A map from type and offset to a specialization constant blob to a + // GlobalVariable containing its value. + std::map TypesAndOffsetsToBlob{}; + // A map of load instruction to its type and offset to a specialization + // constant blob. + std::map LoadsToTypes{}; +}; +} // namespace llvm + +#endif // SYCL_SPEC_CONST_MATERIALIZER_H diff --git a/sycl-fusion/test/materializer/basic.ll b/sycl-fusion/test/materializer/basic.ll new file mode 100644 index 0000000000000..19bab11479671 --- /dev/null +++ b/sycl-fusion/test/materializer/basic.ll @@ -0,0 +1,64 @@ +; RUN: %if hip_amd %{ opt -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext\ +; RUN: --mtriple amdgcn-amd-amdhsa -passes=sycl-spec-const-materializer -S %s |\ +; RUN: FileCheck --check-prefix=CHECK-MATERIALIZER %s %} + +; RUN: %if cuda %{ opt -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext\ +; RUN: --mtriple nvptx64-nvidia-cuda -passes=sycl-spec-const-materializer -S %s |\ +; RUN: FileCheck --check-prefix=CHECK-MATERIALIZER %s %} + +; RUN: %if hip_amd %{ opt -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext\ +; RUN: --mtriple amdgcn-amd-amdhsa -passes=sycl-spec-const-materializer,early-cse,adce -S %s |\ +; RUN: FileCheck --check-prefix=CHECK-MATERIALIZER-CSE %s %} + +; RUN: %if cuda %{ opt -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext\ +; RUN: --mtriple nvptx64-nvidia-cuda -passes=sycl-spec-const-materializer,early-cse,adce -S %s |\ +; RUN: FileCheck --check-prefix=CHECK-MATERIALIZER-CSE %s %} + +source_filename = "basic.ll" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +; Check the basic replacement of specialization constant. We expect 2 global ; +; variables (i32 and [2 x i32]), no loads from implicit kernel argument: +; CHECK-MATERIALIZER. + +; For CHECK-MATERIALIZER-CSE also include early common subexpression +; elimination pass and expect the final literal to be stored to the output +; pointer. 
+ +;CHECK-MATERIALIZER: @SpecConsBlob___test_kernel_0 = weak_odr addrspace(4) constant i32 7 +;CHECK-MATERIALIZER: @SpecConsBlob___test_kernel_1 = weak_odr addrspace(4) constant [2 x i32] [i32 3, i32 1] + + +;CHECK: __test_kernel +define weak_odr protected amdgpu_kernel void @__test_kernel(ptr addrspace(1) noundef align 4 %out, ptr addrspace(1) noundef align 1 %_arg__specialization_constants_buffer) { +entry: + ;CHECK-MATERIALIZER-CSE-NOT: addrspacecast ptr addrspace(1) %_arg__specialization_constants_buffer to ptr + ;CHECK-MATERIALIZER: [[REG1:%[0-9]+]] = load i32, ptr addrspace(4) @SpecConsBlob___test_kernel_0 + ;CHECK-MATERIALIZER: [[REG2:%[0-9]+]] = load [2 x i32], ptr addrspace(4) @SpecConsBlob___test_kernel_1 + %0 = addrspacecast ptr addrspace(1) %_arg__specialization_constants_buffer to ptr + %gep = getelementptr i8, ptr %0, i32 0 + %bc = bitcast ptr %gep to ptr + ;CHECK-MATERIALIZER-CSE-NOT: load i32, ptr %bc + %load1 = load i32, ptr %bc, align 4 + %gep1 = getelementptr i8, ptr %0, i32 4 + %bc2 = bitcast ptr %gep1 to ptr + ;CHECK-MATERIALIZER-CSE-NOT: load [2 x i32], ptr %bc2 + %load2 = load [2 x i32], ptr %bc2, align 4 + ;CHECK-MATERIALIZER: load i32, ptr addrspace(4) @SpecConsBlob___test_kernel_0 + %straight_load = load i32, ptr %0, align 4 + %extract1 = extractvalue [2 x i32] %load2, 0 + %extract2 = extractvalue [2 x i32] %load2, 1 + %add1 = add nsw i32 %extract1, %load1 + %add2 = add nsw i32 %add1, %extract2 + %add3 = add nsw i32 %add2, %straight_load + ;CHECK-MATERIALIZER-CSE: store i32 18, ptr addrspace(1) %out, + ;CHECK-MATERIALIZER: %extract1 = extractvalue [2 x i32] [[REG2]], 0 + ;CHECK-MATERIALIZER: %extract2 = extractvalue [2 x i32] [[REG2]], 1 + ;CHECK-MATERIALIZER: %add1 = add nsw i32 %extract1, [[REG1]] + + store i32 %add3, ptr addrspace(1) %out, align 4 + ret void +} + +!SYCL_SpecConst_data = !{!1} +!1 = !{!"\07\00\00\00\03\00\00\00\01\00\00\00"} diff --git a/sycl-fusion/test/materializer/debug_output.ll 
b/sycl-fusion/test/materializer/debug_output.ll new file mode 100644 index 0000000000000..b7949e7d3f401 --- /dev/null +++ b/sycl-fusion/test/materializer/debug_output.ll @@ -0,0 +1,58 @@ +; RUN: %if hip_amd %{ env SYCL_JIT_COMPILER_DEBUG="sycl-spec-const-materializer" opt\ +; RUN: -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext --mtriple amdgcn-amd-amdhsa\ +; RUN: -passes=sycl-spec-const-materializer,sccp -S %s 2> %t.stderr\ +; RUN: | FileCheck %s %} +; RUN: %if hip_amd %{ FileCheck --input-file=%t.stderr --check-prefix=CHECK-DEBUG %s %} + +; RUN: %if cuda %{ env SYCL_JIT_COMPILER_DEBUG="sycl-spec-const-materializer" opt\ +; RUN: -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext\ +; RUN: --mtriple nvptx64-nvidia-cuda -passes=sycl-spec-const-materializer,sccp -S %s 2> %t.stderr\ +; RUN: | FileCheck %s %} +; RUN: %if cuda %{ FileCheck --input-file=%t.stderr --check-prefix=CHECK-DEBUG %s %} + +source_filename = "debug_output.ll" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +; This is a derivative of basic.ll, which checks for the debug output of +; specialization constant materializer pass. + +;CHECK-DEBUG: Working on function: +;CHECK-DEBUG-NEXT: ================== +;CHECK-DEBUG-NEXT: __test_kernel +;CHECK-DEBUG: Replaced: 2 loads from spec const buffer. 
+;CHECK-DEBUG-NEXT: Load to global variable mappings: +;CHECK-DEBUG-NEXT: Load: +;CHECK-DEBUG-NEXT: %load1 = load i32, ptr %bc, align 4 +;CHECK-DEBUG-NEXT: Global Variable: +;CHECK-DEBUG-NEXT: @SpecConsBlob___test_kernel_0 = weak_odr addrspace(4) constant i32 7 +;CHECK-DEBUG: Load: +;CHECK-DEBUG-NEXT: %load2 = load [2 x i32], ptr %bc2, align 4 +;CHECK-DEBUG-NEXT: Global Variable: +;CHECK-DEBUG-NEXT: @SpecConsBlob___test_kernel_1 = weak_odr addrspace(4) constant [2 x i32] [i32 3, i32 1] + +;CHECK: @SpecConsBlob___test_kernel_0 = weak_odr addrspace(4) constant i32 7 +;CHECK: @SpecConsBlob___test_kernel_1 = weak_odr addrspace(4) constant [2 x i32] [i32 3, i32 1] + +;CHECK: __test_kernel +define weak_odr protected amdgpu_kernel void @__test_kernel(ptr addrspace(1) noundef align 4 %out, ptr addrspace(1) noundef align 1 %_arg__specialization_constants_buffer) { +entry: + %0 = addrspacecast ptr addrspace(1) %_arg__specialization_constants_buffer to ptr + %gep = getelementptr i8, ptr %0, i32 0 + %bc = bitcast ptr %gep to ptr + %load1 = load i32, ptr %bc, align 4 + %gep1 = getelementptr i8, ptr %0, i32 4 + %bc2 = bitcast ptr %gep1 to ptr + %load2 = load [2 x i32], ptr %bc2, align 4 + ;CHECK: extractvalue [2 x i32] [i32 3, i32 1], 0 + ;CHECK: extractvalue [2 x i32] [i32 3, i32 1], 1 + ;CHECK: add nsw i32 %extract1, 7 + %extract1 = extractvalue [2 x i32] %load2, 0 + %extract2 = extractvalue [2 x i32] %load2, 1 + %add1 = add nsw i32 %extract1, %load1 + %add2 = add nsw i32 %add1, %extract2 + store i32 %add2, ptr addrspace(1) %out, align 4 + ret void +} + +!SYCL_SpecConst_data = !{!1} +!1 = !{!"\07\00\00\00\03\00\00\00\01\00\00\00"} diff --git a/sycl-fusion/test/materializer/multi_type.ll b/sycl-fusion/test/materializer/multi_type.ll new file mode 100644 index 0000000000000..112b685b959b5 --- /dev/null +++ b/sycl-fusion/test/materializer/multi_type.ll @@ -0,0 +1,77 @@ +; RUN: %if hip_amd %{ opt -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext\ +; RUN: --mtriple 
amdgcn-amd-amdhsa -passes=sycl-spec-const-materializer -S %s |\ +; RUN: FileCheck --check-prefix=CHECK-MATERIALIZER %s %} + +; RUN: %if cuda %{ opt -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext\ +; RUN: --mtriple nvptx64-nvidia-cuda -passes=sycl-spec-const-materializer -S %s |\ +; RUN: FileCheck --check-prefix=CHECK-MATERIALIZER %s %} + +; RUN: %if hip_amd %{ opt -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext\ +; RUN: --mtriple amdgcn-amd-amdhsa -passes=sycl-spec-const-materializer,early-cse -S %s |\ +; RUN: FileCheck --check-prefix=CHECK-MATERIALIZER-CSE %s %} + +; RUN: %if cuda %{ opt -load-pass-plugin %shlibdir/SYCLKernelFusion%shlibext\ +; RUN: --mtriple nvptx64-nvidia-cuda -passes=sycl-spec-const-materializer,early-cse -S %s |\ +; RUN: FileCheck --check-prefix=CHECK-MATERIALIZER-CSE %s %} + +source_filename = "multi_type.ll" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +; The same logic as in the basic.ll, but with more complicated types; an array +; of struct with multiple members. This is important as the pass has to +; correctly set the type of offsetted memory region corresponding to +; specialization constant (see getConstantOfType in the pass). +; For CHECK-MATERIALIZER-CSE expect literal store only. 
+ +;CHECK-MATERIALIZER: @SpecConsBlob___test_kernel_0 = weak_odr addrspace(4) constant i32 +;CHECK-MATERIALIZER: @SpecConsBlob___test_kernel_1 = weak_odr addrspace(4) constant %"struct.std::array" +;CHECK-MATERIALIZER: @SpecConsBlob___test_kernel_2 = weak_odr addrspace(4) constant [2 x i32] + +%"struct.std::array" = type { [5 x %struct.ThreePrimitives] } +%struct.ThreePrimitives = type <{ double, i64, half }> + +;CHECK: __test_kernel +define weak_odr protected amdgpu_kernel void @__test_kernel(ptr addrspace(1) noundef align 4 %out, ptr addrspace(1) noundef align 1 %_arg__specialization_constants_buffer) { +entry: + ;CHECK-MATERIALIZER-CSE-NOT: load + ;CHECK-MATERIALIZER-CSE: store double 1.100000e+01, ptr addrspace(1) %out + + ;CHECK-MATERIALIZER-CSE-NOT: addrspacecast ptr addrspace(1) %_arg__specialization_constants_buffer to ptr + %0 = addrspacecast ptr addrspace(1) %_arg__specialization_constants_buffer to ptr + %gep = getelementptr i8, ptr %0, i32 0 + %bc = bitcast ptr %gep to ptr + ;CHECK-MATERIALIZER: load i32, ptr addrspace(4) @SpecConsBlob___test_kernel_0 + %load = load i32, ptr %bc, align 4 + + %gep2 = getelementptr i8, ptr %0, i32 4 + %bc2 = bitcast ptr %gep2 to ptr + ;CHECK-MATERIALIZER: load [2 x i32], ptr addrspace(4) @SpecConsBlob___test_kernel_2 + %load2 = load [2 x i32], ptr %bc2, align 4 + %extract1 = extractvalue [2 x i32] %load2, 0 + %extract2 = extractvalue [2 x i32] %load2, 1 + + ;CHECK-MATERIALIZER: load %"struct.std::array", ptr addrspace(4) @SpecConsBlob___test_kernel_1, align 1 + %gep3 = getelementptr i8, ptr %0, i32 18 + %bc3 = bitcast ptr %gep3 to ptr + %load3 = load %"struct.std::array", ptr %bc3, align 1 + %D = extractvalue %"struct.std::array" %load3, 0, 2, 0 + %L = extractvalue %"struct.std::array" %load3, 0, 2, 1 + %H = extractvalue %"struct.std::array" %load3, 0, 2, 2 + + %add1 = add nsw i32 %extract1, %load + %add2 = add nsw i32 %add1, %extract2 + %conv1 = sitofp i32 %add2 to double + + %add3 = fadd double %D, %conv1 + %conv2 = 
sitofp i64 %L to double + %add4 = fadd double %conv2, %add3 + %conv3 = fpext half %H to double + %add5 = fadd double %conv3, %add4 + + store double %add5, ptr addrspace(1) %out, align 4 + + ret void +} + +!SYCL_SpecConst_data = !{!1} +!1 = !{!"\07\00\00\00\03\00\00\00\01\00\00\00\06\00\00\00\04\00\00\00"} diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md index 455f4ddf0bd43..e3d68eed3528b 100644 --- a/sycl/doc/EnvironmentVariables.md +++ b/sycl/doc/EnvironmentVariables.md @@ -25,6 +25,9 @@ compiler and runtime. | `SYCL_EAGER_INIT` | Integer | Enable by specifying non-zero value. Tells the SYCL runtime to do as much as possible initialization at objects construction as opposed to doing lazy initialization on the fly. This may mean doing some redundant work at warmup but ensures fastest possible execution on the following hot and reportable paths. It also instructs PI plugins to do the same. Default is "0". | | `SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE` | See [below](#sycl_reduction_preferred_workgroup_size) | Controls the preferred work-group size of reductions. | | `SYCL_ENABLE_FUSION_CACHING` | '1' or '0' | Enable ('1') or disable ('0') caching of JIT compilations for kernel fusion. Caching avoids repeatedly running the JIT compilation pipeline if the same sequence of kernels is fused multiple times. Default value is '1'. | +| `SYCL_JIT_AMDGCN_PTX_KERNELS` | '1' or '0' | Enable ('1') or disable ('0') JIT compilation of kernels. Only supported for Nvidia and AMD backends. Note, that it is required to have a valid binary for the desired backend (AMD or CUDA), that was compiled with `-fsycl-embed-ir` in order to use JIT-ing. When JIT-ing is enabled SYCL runtime will try to cache and reuse JIT-compiled kernels, furthermore if a kernel uses specialization constants the compiler will attempt to materialize the values in place, turning them to de-facto compile time constants. Default is '0'. 
| +| `SYCL_JIT_AMDGCN_PTX_TARGET_CPU` | Any(\*) | Allows setting the target architecture to be used when JIT-ing kernels. Examples include setting SM version for Nvidia, or target architecture for AMD. | +| `SYCL_JIT_AMDGCN_PTX_TARGET_FEATURES` | Any(\*) | Allows setting desired target features to be used when JIT-ing kernels. Examples include setting PTX version for Nvidia. | `(*) Note: Any means this environment variable is effective when set to any non-null value.` @@ -210,6 +213,7 @@ variables in production code. | `SYCL_CACHE_TRACE` | Any(\*) | If the variable is set, messages are sent to std::cerr when caching events or non-blocking failures happen (e.g. unable to access cache item file). | | `SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE` | Any(\*) | Enables tracing of `parallel_for` invocations with rounded-up ranges. | | `SYCL_PI_SUPPRESS_ERROR_MESSAGE` | Any(\*) | Suppress printing of error message, only used for CI in order not to interrupt errors generated by underlying toolchains; note that the variable only modifies the printing of the error message (error value, name, description and location), the handling of error return code and aborting/throwing behaviour remains unchanged. | +| `SYCL_JIT_COMPILER_DEBUG` | Any(\*) | Passes can specify their own debug types, `sycl-spec-const-materializer` enables debug output generation in specialization constants materialization pass. 
| `(*) Note: Any means this environment variable is effective when set to any non-null value.` diff --git a/sycl/include/sycl/detail/helpers.hpp b/sycl/include/sycl/detail/helpers.hpp index adcac95c04c45..4d632ada7c767 100644 --- a/sycl/include/sycl/detail/helpers.hpp +++ b/sycl/include/sycl/detail/helpers.hpp @@ -11,6 +11,7 @@ #include // for MemorySemanticsMask #include // for fence_space #include // for __SYCL_EXPORT +#include // for PiProgram #include // for memory_order #ifdef __SYCL_DEVICE_ONLY__ @@ -38,9 +39,13 @@ template class marray; enum class memory_order; namespace detail { - +class CGExecKernel; class buffer_impl; class context_impl; +class queue_impl; +using QueueImplPtr = std::shared_ptr; +class RTDeviceBinaryImage; + __SYCL_EXPORT void waitEvents(std::vector DepEvents); __SYCL_EXPORT void @@ -247,6 +252,10 @@ template void loop(F &&f) { loop_impl(std::make_index_sequence{}, std::forward(f)); } inline constexpr bool is_power_of_two(int x) { return (x & (x - 1)) == 0; } + +std::tuple +retrieveKernelBinary(const QueueImplPtr &, const char *KernelName, + CGExecKernel *CGKernel = nullptr); } // namespace detail } // namespace _V1 diff --git a/sycl/source/detail/config.def b/sycl/source/detail/config.def index 04744c5c6841a..94424312c14d2 100644 --- a/sycl/source/detail/config.def +++ b/sycl/source/detail/config.def @@ -42,3 +42,6 @@ CONFIG(SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE, 16, __SYCL_REDUCTION_PREFERRED_W CONFIG(ONEAPI_DEVICE_SELECTOR, 1024, __ONEAPI_DEVICE_SELECTOR) CONFIG(SYCL_ENABLE_FUSION_CACHING, 1, __SYCL_ENABLE_FUSION_CACHING) CONFIG(SYCL_CACHE_IN_MEM, 1, __SYCL_CACHE_IN_MEM) +CONFIG(SYCL_JIT_AMDGCN_PTX_KERNELS, 1, __SYCL_JIT_AMDGCN_PTX_KERNELS) +CONFIG(SYCL_JIT_AMDGCN_PTX_TARGET_CPU, 1024, __SYCL_JIT_AMDGCN_PTX_TARGET_CPU) +CONFIG(SYCL_JIT_AMDGCN_PTX_TARGET_FEATURES, 1024, __SYCL_JIT_AMDGCN_PTX_TARGET_FEATURES) diff --git a/sycl/source/detail/config.hpp b/sycl/source/detail/config.hpp index 4659805ca76fa..c8a079f94ae66 100644 --- 
a/sycl/source/detail/config.hpp +++ b/sycl/source/detail/config.hpp @@ -593,6 +593,7 @@ template <> class SYCLConfig { static constexpr bool Default = true; // default is true static bool get() { return getCachedValue(); } static const char *getName() { return BaseT::MConfigName; } + static void reset() { (void)getCachedValue(/*ResetCache=*/true); } private: static bool parseValue() { @@ -608,12 +609,95 @@ template <> class SYCLConfig { return ValStr[0] == '1'; } - static bool getCachedValue() { + static bool getCachedValue(bool ResetCache = false) { static bool Val = parseValue(); + if (ResetCache) { + Val = BaseT::getRawValue(); + } return Val; } }; +template <> class SYCLConfig { + using BaseT = SYCLConfigBase; + +public: + static bool get() { + constexpr bool DefaultValue = false; + const char *ValStr = getCachedValue(); + if (!ValStr) + return DefaultValue; + + return ValStr[0] == '1'; + } + + static const char *getName() { return BaseT::MConfigName; } + +private: + static const char *getCachedValue(bool ResetCache = false) { + static const char *ValStr = BaseT::getRawValue(); + if (ResetCache) + ValStr = BaseT::getRawValue(); + return ValStr; + } +}; + +template <> class SYCLConfig { + using BaseT = SYCLConfigBase; + +public: + static std::string get() { + const std::string DefaultValue{""}; + + const char *ValStr = getCachedValue(); + + if (!ValStr) + return DefaultValue; + + return std::string{ValStr}; + } + + static void reset() { (void)getCachedValue(/*ResetCache=*/true); } + + static const char *getName() { return BaseT::MConfigName; } + +private: + static const char *getCachedValue(bool ResetCache = false) { + static const char *ValStr = BaseT::getRawValue(); + if (ResetCache) + ValStr = BaseT::getRawValue(); + return ValStr; + } +}; + +template <> class SYCLConfig { + using BaseT = SYCLConfigBase; + +public: + static std::string get() { + const std::string DefaultValue{""}; + + const char *ValStr = getCachedValue(); + + if (!ValStr) + return 
DefaultValue; + + return std::string{ValStr}; + } + + static void reset() { (void)getCachedValue(/*ResetCache=*/true); } + + static const char *getName() { return BaseT::MConfigName; } + +private: + static const char *getCachedValue(bool ResetCache = false) { + static const char *ValStr = BaseT::getRawValue(); + if (ResetCache) + ValStr = BaseT::getRawValue(); + return ValStr; + } +}; + #undef INVALID_CONFIG_EXCEPTION } // namespace detail diff --git a/sycl/source/detail/helpers.cpp b/sycl/source/detail/helpers.cpp index 59625db039d07..a35b9c159b6f9 100644 --- a/sycl/source/detail/helpers.cpp +++ b/sycl/source/detail/helpers.cpp @@ -11,11 +11,14 @@ #include #include +#include #include +#include #include #include #include +#include namespace sycl { inline namespace _V1 { @@ -31,6 +34,69 @@ void markBufferAsInternal(const std::shared_ptr &BufImpl) { BufImpl->markAsInternal(); } +std::tuple +retrieveKernelBinary(const QueueImplPtr &Queue, const char *KernelName, + CGExecKernel *KernelCG) { + bool isNvidia = + Queue->getDeviceImplPtr()->getBackend() == backend::ext_oneapi_cuda; + bool isHIP = + Queue->getDeviceImplPtr()->getBackend() == backend::ext_oneapi_hip; + if (isNvidia || isHIP) { + auto KernelID = ProgramManager::getInstance().getSYCLKernelID(KernelName); + std::vector KernelIds{KernelID}; + auto DeviceImages = + ProgramManager::getInstance().getRawDeviceImages(KernelIds); + auto DeviceImage = std::find_if( + DeviceImages.begin(), DeviceImages.end(), + [isNvidia](RTDeviceBinaryImage *DI) { + const std::string &TargetSpec = isNvidia ? 
std::string("llvm_nvptx64") + : std::string("llvm_amdgcn"); + return DI->getFormat() == PI_DEVICE_BINARY_TYPE_LLVMIR_BITCODE && + DI->getRawData().DeviceTargetSpec == TargetSpec; + }); + if (DeviceImage == DeviceImages.end()) { + return {nullptr, nullptr}; + } + auto ContextImpl = Queue->getContextImplPtr(); + auto Context = detail::createSyclObjFromImpl(ContextImpl); + auto DeviceImpl = Queue->getDeviceImplPtr(); + auto Device = detail::createSyclObjFromImpl(DeviceImpl); + sycl::detail::pi::PiProgram Program = + detail::ProgramManager::getInstance().createPIProgram(**DeviceImage, + Context, Device); + return {*DeviceImage, Program}; + } + + assert(KernelCG && "CGExecKernel must be provided."); + const RTDeviceBinaryImage *DeviceImage = nullptr; + sycl::detail::pi::PiProgram Program = nullptr; + if (KernelCG->getKernelBundle() != nullptr) { + // Retrieve the device image from the kernel bundle. + auto KernelBundle = KernelCG->getKernelBundle(); + kernel_id KernelID = + detail::ProgramManager::getInstance().getSYCLKernelID(KernelName); + + auto SyclKernel = detail::getSyclObjImpl( + KernelBundle->get_kernel(KernelID, KernelBundle)); + + DeviceImage = SyclKernel->getDeviceImage()->get_bin_image_ref(); + Program = SyclKernel->getDeviceImage()->get_program_ref(); + } else if (KernelCG->MSyclKernel != nullptr) { + DeviceImage = KernelCG->MSyclKernel->getDeviceImage()->get_bin_image_ref(); + Program = KernelCG->MSyclKernel->getDeviceImage()->get_program_ref(); + } else { + auto ContextImpl = Queue->getContextImplPtr(); + auto Context = detail::createSyclObjFromImpl(ContextImpl); + auto DeviceImpl = Queue->getDeviceImplPtr(); + auto Device = detail::createSyclObjFromImpl(DeviceImpl); + DeviceImage = &detail::ProgramManager::getInstance().getDeviceImage( + KernelName, Context, Device); + Program = detail::ProgramManager::getInstance().createPIProgram( + *DeviceImage, Context, Device); + } + return {DeviceImage, Program}; +} + } // namespace detail } // namespace _V1 } // 
namespace sycl diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp index 179f11b97b492..48d1d0c3e6ba7 100644 --- a/sycl/source/detail/jit_compiler.cpp +++ b/sycl/source/detail/jit_compiler.cpp @@ -64,6 +64,16 @@ jit_compiler::jit_compiler() { return false; } + this->MaterializeSpecConstHandle = + reinterpret_cast( + sycl::detail::pi::getOsLibraryFuncAddress( + LibraryPtr, "materializeSpecConstants")); + if (!this->MaterializeSpecConstHandle) { + printPerformanceWarning( + "Cannot resolve JIT library function entry point"); + return false; + } + return true; }; Available = checkJITLibrary(); @@ -106,69 +116,6 @@ ::jit_compiler::TargetInfo getTargetInfo(QueueImplPtr &Queue) { Queue->getDeviceImplPtr()->getDeviceArch())); } -std::pair -retrieveKernelBinary(QueueImplPtr &Queue, CGExecKernel *KernelCG) { - auto KernelName = KernelCG->getKernelName(); - - bool isNvidia = - Queue->getDeviceImplPtr()->getBackend() == backend::ext_oneapi_cuda; - bool isHIP = - Queue->getDeviceImplPtr()->getBackend() == backend::ext_oneapi_hip; - if (isNvidia || isHIP) { - auto KernelID = ProgramManager::getInstance().getSYCLKernelID(KernelName); - std::vector KernelIds{KernelID}; - auto DeviceImages = - ProgramManager::getInstance().getRawDeviceImages(KernelIds); - auto DeviceImage = std::find_if( - DeviceImages.begin(), DeviceImages.end(), - [isNvidia](RTDeviceBinaryImage *DI) { - const std::string &TargetSpec = isNvidia ? 
std::string("llvm_nvptx64") - : std::string("llvm_amdgcn"); - return DI->getFormat() == PI_DEVICE_BINARY_TYPE_LLVMIR_BITCODE && - DI->getRawData().DeviceTargetSpec == TargetSpec; - }); - if (DeviceImage == DeviceImages.end()) { - return {nullptr, nullptr}; - } - auto ContextImpl = Queue->getContextImplPtr(); - auto Context = detail::createSyclObjFromImpl(ContextImpl); - auto DeviceImpl = Queue->getDeviceImplPtr(); - auto Device = detail::createSyclObjFromImpl(DeviceImpl); - sycl::detail::pi::PiProgram Program = - detail::ProgramManager::getInstance().createPIProgram(**DeviceImage, - Context, Device); - return {*DeviceImage, Program}; - } - - const RTDeviceBinaryImage *DeviceImage = nullptr; - sycl::detail::pi::PiProgram Program = nullptr; - if (KernelCG->getKernelBundle() != nullptr) { - // Retrieve the device image from the kernel bundle. - auto KernelBundle = KernelCG->getKernelBundle(); - kernel_id KernelID = - detail::ProgramManager::getInstance().getSYCLKernelID(KernelName); - - auto SyclKernel = detail::getSyclObjImpl( - KernelBundle->get_kernel(KernelID, KernelBundle)); - - DeviceImage = SyclKernel->getDeviceImage()->get_bin_image_ref(); - Program = SyclKernel->getDeviceImage()->get_program_ref(); - } else if (KernelCG->MSyclKernel != nullptr) { - DeviceImage = KernelCG->MSyclKernel->getDeviceImage()->get_bin_image_ref(); - Program = KernelCG->MSyclKernel->getDeviceImage()->get_program_ref(); - } else { - auto ContextImpl = Queue->getContextImplPtr(); - auto Context = detail::createSyclObjFromImpl(ContextImpl); - auto DeviceImpl = Queue->getDeviceImplPtr(); - auto Device = detail::createSyclObjFromImpl(DeviceImpl); - DeviceImage = &detail::ProgramManager::getInstance().getDeviceImage( - KernelName, Context, Device); - Program = detail::ProgramManager::getInstance().createPIProgram( - *DeviceImage, Context, Device); - } - return {DeviceImage, Program}; -} - static ::jit_compiler::ParameterKind translateArgType(kernel_param_kind_t Kind) { using PK = 
::jit_compiler::ParameterKind; @@ -678,6 +625,98 @@ updatePromotedArgs(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo, } } +sycl::detail::pi::PiKernel jit_compiler::materializeSpecConstants( + QueueImplPtr Queue, const RTDeviceBinaryImage *BinImage, + const std::string &KernelName, + const std::vector &SpecConstBlob) { + if (!BinImage) { + throw sycl::exception(sycl::make_error_code(sycl::errc::invalid), + "No suitable IR available for materializing"); + } + if (KernelName.empty()) { + throw sycl::exception( + sycl::make_error_code(sycl::errc::invalid), + "Cannot jit kernel with invalid kernel function name"); + } + auto &PM = detail::ProgramManager::getInstance(); + if (auto CachedKernel = + PM.getCachedMaterializedKernel(KernelName, SpecConstBlob)) + return CachedKernel; + + auto &RawDeviceImage = BinImage->getRawData(); + auto DeviceImageSize = static_cast(RawDeviceImage.BinaryEnd - + RawDeviceImage.BinaryStart); + // Set 0 as the number of address bits, because the JIT compiler can set this + // field based on information from LLVM module's data-layout. 
+ auto BinaryImageFormat = translateBinaryImageFormat(BinImage->getFormat()); + if (BinaryImageFormat == ::jit_compiler::BinaryFormat::INVALID) { + throw sycl::exception(sycl::make_error_code(sycl::errc::invalid), + "No suitable IR available for materializing"); + } + ::jit_compiler::SYCLKernelBinaryInfo BinInfo{ + BinaryImageFormat, 0, RawDeviceImage.BinaryStart, DeviceImageSize}; + + ::jit_compiler::TargetInfo TargetInfo = getTargetInfo(Queue); + AddToConfigHandle( + ::jit_compiler::option::JITTargetInfo::set(std::move(TargetInfo))); + bool DebugEnabled = + detail::SYCLConfig::get() > 0; + AddToConfigHandle( + ::jit_compiler::option::JITEnableVerbose::set(DebugEnabled)); + + std::string TargetCPU = + detail::SYCLConfig::get(); + std::string TargetFeatures = + detail::SYCLConfig::get(); + + auto MaterializerResult = + MaterializeSpecConstHandle(KernelName.c_str(), BinInfo, SpecConstBlob, + TargetCPU.c_str(), TargetFeatures.c_str()); + if (MaterializerResult.failed()) { + std::string Message{"Compilation for kernel failed with message:\n"}; + Message.append(MaterializerResult.getErrorMessage()); + if (DebugEnabled) { + std::cerr << Message << "\n"; + } + throw sycl::exception(sycl::make_error_code(sycl::errc::invalid), Message); + } + + auto &MaterializerKernelInfo = MaterializerResult.getKernelInfo(); + pi_device_binary_struct MaterializedRawDeviceImage{RawDeviceImage}; + MaterializedRawDeviceImage.BinaryStart = + MaterializerKernelInfo.BinaryInfo.BinaryStart; + MaterializedRawDeviceImage.BinaryEnd = + MaterializerKernelInfo.BinaryInfo.BinaryStart + + MaterializerKernelInfo.BinaryInfo.BinarySize; + + const bool OrigCacheCfg = SYCLConfig::get(); + if (OrigCacheCfg) { + if (0 != setenv("SYCL_CACHE_IN_MEM", "0", true)) { + throw sycl::exception( + sycl::make_error_code(sycl::errc::invalid), + "Failed to set env variable in materialize spec constel."); + } + SYCLConfig::reset(); + } + + RTDeviceBinaryImage MaterializedRTDevBinImage{&MaterializedRawDeviceImage}; + 
const auto &Context = Queue->get_context(); + const auto &Device = Queue->get_device(); + auto NewKernel = PM.getOrCreateMaterializedKernel( + MaterializedRTDevBinImage, Context, Device, KernelName, SpecConstBlob); + + if (OrigCacheCfg) { + if (0 != setenv("SYCL_CACHE_IN_MEM", "1", true)) { + throw sycl::exception( + sycl::make_error_code(sycl::errc::invalid), + "Failed to set env variable in materialize spec const."); + } + SYCLConfig::reset(); + } + + return NewKernel; +} + std::unique_ptr jit_compiler::fuseKernels(QueueImplPtr Queue, std::vector &InputKernels, @@ -723,8 +762,8 @@ jit_compiler::fuseKernels(QueueImplPtr Queue, return nullptr; } - auto [DeviceImage, Program] = retrieveKernelBinary(Queue, KernelCG); - + auto [DeviceImage, Program] = + retrieveKernelBinary(Queue, KernelName.c_str(), KernelCG); if (!DeviceImage || !Program) { printPerformanceWarning("No suitable IR available for fusion"); return nullptr; diff --git a/sycl/source/detail/jit_compiler.hpp b/sycl/source/detail/jit_compiler.hpp index 2612c26943c94..656dc548e17e0 100644 --- a/sycl/source/detail/jit_compiler.hpp +++ b/sycl/source/detail/jit_compiler.hpp @@ -40,6 +40,11 @@ class jit_compiler { std::unique_ptr fuseKernels(QueueImplPtr Queue, std::vector &InputKernels, const property_list &); + sycl::detail::pi::PiKernel + materializeSpecConstants(QueueImplPtr Queue, + const RTDeviceBinaryImage *BinImage, + const std::string &KernelName, + const std::vector &SpecConstBlob); bool isAvailable() { return Available; } @@ -75,9 +80,12 @@ class jit_compiler { #if SYCL_EXT_CODEPLAY_KERNEL_FUSION // Handles to the entry points of the lazily loaded JIT library. 
using FuseKernelsFuncT = decltype(::jit_compiler::fuseKernels) *; + using MaterializeSpecConstFuncT = + decltype(::jit_compiler::materializeSpecConstants) *; using ResetConfigFuncT = decltype(::jit_compiler::resetJITConfiguration) *; using AddToConfigFuncT = decltype(::jit_compiler::addToJITConfiguration) *; FuseKernelsFuncT FuseKernelsHandle = nullptr; + MaterializeSpecConstFuncT MaterializeSpecConstHandle = nullptr; ResetConfigFuncT ResetConfigHandle = nullptr; AddToConfigFuncT AddToConfigHandle = nullptr; #endif // SYCL_EXT_CODEPLAY_KERNEL_FUSION diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp index 7e659c83cc0f6..1b875d6bc0675 100644 --- a/sycl/source/detail/program_manager/program_manager.cpp +++ b/sycl/source/detail/program_manager/program_manager.cpp @@ -175,7 +175,7 @@ static const char *getFormatStr(sycl::detail::pi::PiDeviceBinaryType Format) { sycl::detail::pi::PiProgram ProgramManager::createPIProgram(const RTDeviceBinaryImage &Img, const context &Context, const device &Device) { - if (DbgProgMgr > 0) + if constexpr (DbgProgMgr > 0) std::cerr << ">>> ProgramManager::createPIProgram(" << &Img << ", " << getSyclObjImpl(Context).get() << ", " << getSyclObjImpl(Device).get() << ")\n"; @@ -231,7 +231,7 @@ ProgramManager::createPIProgram(const RTDeviceBinaryImage &Img, Ctx->addDeviceGlobalInitializer(Res, {Device}, &Img); - if (DbgProgMgr > 1) + if constexpr (DbgProgMgr > 1) std::cerr << "created program: " << Res << "; image format: " << getFormatStr(Format) << "\n"; @@ -871,7 +871,7 @@ ProgramManager::getOrCreateKernel(const ContextImplPtr &ContextImpl, const DeviceImplPtr &DeviceImpl, const std::string &KernelName, const NDRDescT &NDRDesc) { - if (DbgProgMgr > 0) { + if constexpr (DbgProgMgr > 0) { std::cerr << ">>> ProgramManager::getOrCreateKernel(" << ContextImpl.get() << ", " << DeviceImpl.get() << ", " << KernelName << ")\n"; } @@ -1176,7 +1176,7 @@ 
ProgramManager::ProgramManager() : m_AsanFoundInImage(false) { m_SpvFileImage = std::make_unique(std::move(Data), Size); - if (DbgProgMgr > 0) { + if constexpr (DbgProgMgr > 0) { std::cerr << "loaded device image binary from " << SpvFile << "\n"; std::cerr << "format: " << getFormatStr(m_SpvFileImage->getFormat()) << "\n"; @@ -1230,7 +1230,7 @@ RTDeviceBinaryImage & ProgramManager::getDeviceImage(const std::string &KernelName, const context &Context, const device &Device, bool JITCompilationIsRequired) { - if (DbgProgMgr > 0) { + if constexpr (DbgProgMgr > 0) { std::cerr << ">>> ProgramManager::getDeviceImage(\"" << KernelName << "\", " << getSyclObjImpl(Context).get() << ", " << getSyclObjImpl(Device).get() << ", " @@ -1264,7 +1264,7 @@ ProgramManager::getDeviceImage(const std::string &KernelName, if (Img) { CheckJITCompilationForImage(Img, JITCompilationIsRequired); - if (DbgProgMgr > 0) { + if constexpr (DbgProgMgr > 0) { std::cerr << "selected device image: " << &Img->getRawData() << "\n"; Img->print(); } @@ -1281,7 +1281,7 @@ RTDeviceBinaryImage &ProgramManager::getDeviceImage( bool JITCompilationIsRequired) { assert(ImageSet.size() > 0); - if (DbgProgMgr > 0) { + if constexpr (DbgProgMgr > 0) { std::cerr << ">>> ProgramManager::getDeviceImage(Custom SPV file " << getSyclObjImpl(Context).get() << ", " << getSyclObjImpl(Device).get() << ", " @@ -1310,7 +1310,7 @@ RTDeviceBinaryImage &ProgramManager::getDeviceImage( CheckJITCompilationForImage(*ImageIterator, JITCompilationIsRequired); - if (DbgProgMgr > 0) { + if constexpr (DbgProgMgr > 0) { std::cerr << "selected device image: " << &(*ImageIterator)->getRawData() << "\n"; (*ImageIterator)->print(); @@ -1402,7 +1402,7 @@ ProgramManager::ProgramPtr ProgramManager::build( const sycl::detail::pi::PiDevice &Device, uint32_t DeviceLibReqMask, const std::vector &ExtraProgramsToLink) { - if (DbgProgMgr > 0) { + if constexpr (DbgProgMgr > 0) { std::cerr << ">>> ProgramManager::build(" << Program.get() << ", " << 
CompileOptions << ", " << LinkOptions << ", ... " << Device << ")\n"; @@ -2573,6 +2573,78 @@ ProgramManager::getOrCreateKernel(const context &Context, BuildResult->Val.second); } +sycl::detail::pi::PiKernel ProgramManager::getCachedMaterializedKernel( + const std::string &KernelName, + const std::vector &SpecializationConsts) { + if constexpr (DbgProgMgr > 0) + std::cerr << ">>> ProgramManager::getCachedMaterializedKernel\n" + << "KernelName: " << KernelName << "\n"; + + { + std::lock_guard KernelIDsGuard(m_KernelIDsMutex); + if (auto KnownMaterializations = m_MaterializedKernels.find(KernelName); + KnownMaterializations != m_MaterializedKernels.end()) { + if constexpr (DbgProgMgr > 0) + std::cerr << ">>> There are:" << KnownMaterializations->second.size() + << " materialized kernels.\n"; + if (auto Kernel = + KnownMaterializations->second.find(SpecializationConsts); + Kernel != KnownMaterializations->second.end()) { + if constexpr (DbgProgMgr > 0) + std::cerr << ">>> Kernel in the cache\n"; + return Kernel->second; + } + } + } + + if constexpr (DbgProgMgr > 0) + std::cerr << ">>> Kernel not in the cache\n"; + + return nullptr; +} + +sycl::detail::pi::PiKernel ProgramManager::getOrCreateMaterializedKernel( + const RTDeviceBinaryImage &Img, const context &Context, + const device &Device, const std::string &KernelName, + const std::vector &SpecializationConsts) { + // Check if we already have the kernel in the cache. 
+ if constexpr (DbgProgMgr > 0) + std::cerr << ">>> ProgramManager::getOrCreateMaterializedKernel\n" + << "KernelName: " << KernelName << "\n"; + + if (auto CachedKernel = + getCachedMaterializedKernel(KernelName, SpecializationConsts)) + return CachedKernel; + + if constexpr (DbgProgMgr > 0) + std::cerr << ">>> Adding the kernel to the cache.\n"; + auto Program = createPIProgram(Img, Context, Device); + auto DeviceImpl = detail::getSyclObjImpl(Device); + auto &Plugin = DeviceImpl->getPlugin(); + ProgramPtr ProgramManaged( + Program, Plugin->getPiPlugin().PiFunctionTable.piProgramRelease); + + std::string CompileOpts; + std::string LinkOpts; + applyOptionsFromEnvironment(CompileOpts, LinkOpts); + // No linking of extra programs required. + std::vector ExtraProgramsToLink; + auto BuildProgram = + build(std::move(ProgramManaged), detail::getSyclObjImpl(Context), + CompileOpts, LinkOpts, DeviceImpl->getHandleRef(), + /*For non SPIR-V devices DeviceLibReqMask is always 0*/ 0, + ExtraProgramsToLink); + sycl::detail::pi::PiKernel PiKernel{nullptr}; + Plugin->call( + BuildProgram.get(), KernelName.c_str(), &PiKernel); + { + std::lock_guard KernelIDsGuard(m_KernelIDsMutex); + m_MaterializedKernels[KernelName][SpecializationConsts] = PiKernel; + } + + return PiKernel; +} + bool doesDevSupportDeviceRequirements(const device &Dev, const RTDeviceBinaryImage &Img) { return !checkDevSupportDeviceRequirements(Dev, Img).has_value(); diff --git a/sycl/source/detail/program_manager/program_manager.hpp b/sycl/source/detail/program_manager/program_manager.hpp index cb0b9f28a74f1..f5a08f0262bb6 100644 --- a/sycl/source/detail/program_manager/program_manager.hpp +++ b/sycl/source/detail/program_manager/program_manager.hpp @@ -156,6 +156,15 @@ class ProgramManager { const std::string &KernelName, const NDRDescT &NDRDesc = {}); + sycl::detail::pi::PiKernel getCachedMaterializedKernel( + const std::string &KernelName, + const std::vector &SpecializationConsts); + + 
sycl::detail::pi::PiKernel getOrCreateMaterializedKernel( + const RTDeviceBinaryImage &Img, const context &Context, + const device &Device, const std::string &KernelName, + const std::vector &SpecializationConsts); + sycl::detail::pi::PiProgram getPiProgramFromPiKernel(sycl::detail::pi::PiKernel Kernel, const ContextImplPtr Context); @@ -425,6 +434,10 @@ class ProgramManager { /// Protects m_HostPipes and m_Ptr2HostPipe. std::mutex m_HostPipesMutex; + + using MaterializedEntries = + std::map, pi::PiKernel>; + std::unordered_map m_MaterializedKernels; }; } // namespace detail } // namespace _V1 diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 33eade0cadb70..76320c2833891 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -122,7 +123,7 @@ static std::string demangleKernelName(std::string Name) { return Name; } void applyFuncOnFilteredArgs( const KernelArgMask *EliminatedArgMask, std::vector &Args, std::function Func) { - if (!EliminatedArgMask) { + if (!EliminatedArgMask || EliminatedArgMask->size() == 0) { for (ArgDesc &Arg : Args) { Func(Arg, Arg.MIndex); } @@ -2373,10 +2374,19 @@ static pi_result SetKernelParamsAndLaunch( const detail::EventImplPtr &OutEventImpl, const KernelArgMask *EliminatedArgMask, const std::function &getMemAllocationFunc, - bool IsCooperative, bool KernelUsesClusterLaunch) { + bool IsCooperative, bool KernelUsesClusterLaunch, + const RTDeviceBinaryImage *BinImage, const std::string &KernelName) { assert(Queue && "Kernel submissions should have an associated queue"); const PluginPtr &Plugin = Queue->getPlugin(); + if (SYCLConfig::get()) { + std::vector Empty; + Kernel = Scheduler::getInstance().completeSpecConstMaterialization( + Queue, BinImage, KernelName, + DeviceImageImpl.get() ? 
DeviceImageImpl->get_spec_const_blob_ref() + : Empty); + } + auto setFunc = [&Plugin, Kernel, &DeviceImageImpl, &getMemAllocationFunc, &Queue](detail::ArgDesc &Arg, size_t NextTrueIndex) { SetArgBasedOnType(Plugin, Kernel, DeviceImageImpl, getMemAllocationFunc, @@ -2568,7 +2578,8 @@ void enqueueImpKernel( const detail::EventImplPtr &OutEventImpl, const std::function &getMemAllocationFunc, sycl::detail::pi::PiKernelCacheConfig KernelCacheConfig, - const bool KernelIsCooperative, const bool KernelUsesClusterLaunch) { + const bool KernelIsCooperative, const bool KernelUsesClusterLaunch, + const RTDeviceBinaryImage *BinImage) { assert(Queue && "Kernel submissions should have an associated queue"); // Run OpenCL kernel auto ContextImpl = Queue->getContextImplPtr(); @@ -2660,7 +2671,7 @@ void enqueueImpKernel( Error = SetKernelParamsAndLaunch( Queue, Args, DeviceImageImpl, Kernel, NDRDesc, EventsWaitList, OutEventImpl, EliminatedArgMask, getMemAllocationFunc, - KernelIsCooperative, KernelUsesClusterLaunch); + KernelIsCooperative, KernelUsesClusterLaunch, BinImage, KernelName); const PluginPtr &Plugin = Queue->getPlugin(); if (!SyclKernelImpl && !MSyclKernel) { @@ -3008,11 +3019,17 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { } } + const RTDeviceBinaryImage *BinImage = nullptr; + if (detail::SYCLConfig::get()) { + std::tie(BinImage, std::ignore) = + retrieveKernelBinary(MQueue, KernelName.c_str()); + assert(BinImage && "Failed to obtain a binary image."); + } enqueueImpKernel(MQueue, NDRDesc, Args, ExecKernel->getKernelBundle(), SyclKernel, KernelName, RawEvents, EventImpl, getMemAllocationFunc, ExecKernel->MKernelCacheConfig, ExecKernel->MKernelIsCooperative, - ExecKernel->MKernelUsesClusterLaunch); + ExecKernel->MKernelUsesClusterLaunch, BinImage); return PI_SUCCESS; } diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index 8944444c44ed2..463ddecd63b77 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ 
b/sycl/source/detail/scheduler/commands.hpp @@ -37,6 +37,9 @@ void emitInstrumentationGeneral(uint32_t StreamID, uint64_t InstanceID, xpti_td *TraceEvent, uint16_t Type, const void *Addr); #endif +RTDeviceBinaryImage * +retrieveAMDGCNOrNVPTXKernelBinary(const DeviceImplPtr DeviceImpl, + const std::string &KernelName); class queue_impl; class event_impl; @@ -633,7 +636,8 @@ void enqueueImpKernel( const detail::EventImplPtr &Event, const std::function &getMemAllocationFunc, sycl::detail::pi::PiKernelCacheConfig KernelCacheConfig, - bool KernelIsCooperative, const bool KernelUsesClusterLaunch); + bool KernelIsCooperative, const bool KernelUsesClusterLaunch, + const RTDeviceBinaryImage *BinImage = nullptr); class KernelFusionCommand; diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index 5fd4b17f746d1..9b8c6b358cc3f 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -9,6 +9,10 @@ #include "detail/sycl_mem_obj_i.hpp" #include #include +#include +#if SYCL_EXT_CODEPLAY_KERNEL_FUSION +#include +#endif #include #include #include @@ -603,6 +607,21 @@ void Scheduler::cancelFusion(QueueImplPtr Queue) { enqueueCommandForCG(nullptr, ToEnqueue); } +sycl::detail::pi::PiKernel Scheduler::completeSpecConstMaterialization( + [[maybe_unused]] QueueImplPtr Queue, + [[maybe_unused]] const RTDeviceBinaryImage *BinImage, + [[maybe_unused]] const std::string &KernelName, + [[maybe_unused]] std::vector &SpecConstBlob) { +#if SYCL_EXT_CODEPLAY_KERNEL_FUSION + return detail::jit_compiler::get_instance().materializeSpecConstants( + Queue, BinImage, KernelName, SpecConstBlob); +#else // SYCL_EXT_CODEPLAY_KERNEL_FUSION + printFusionWarning( + "Materialization of spec constants not supported by this build"); + return nullptr; +#endif // SYCL_EXT_CODEPLAY_KERNEL_FUSION +} + EventImplPtr Scheduler::completeFusion(QueueImplPtr Queue, const property_list &PropList) { std::vector ToEnqueue; 
diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index 0cdf4ae9ec254..8907a9ee1fe93 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -462,6 +462,9 @@ class Scheduler { void cancelFusion(QueueImplPtr Queue); EventImplPtr completeFusion(QueueImplPtr Queue, const property_list &); + sycl::detail::pi::PiKernel completeSpecConstMaterialization( + QueueImplPtr Queue, const RTDeviceBinaryImage *BinImage, + const std::string &KernelName, std::vector &SpecConstBlob); bool isInFusionMode(QueueIdT Queue); diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index ed92bd270d379..99f22eca198d1 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "sycl/detail/helpers.hpp" #include #include @@ -276,11 +277,17 @@ event handler::finalize() { detail::emitInstrumentationGeneral(StreamID, InstanceID, CmdTraceEvent, xpti::trace_task_begin, nullptr); #endif + const detail::RTDeviceBinaryImage *BinImage = nullptr; + if (detail::SYCLConfig::get()) { + std::tie(BinImage, std::ignore) = + detail::retrieveKernelBinary(MQueue, MKernelName.c_str()); + assert(BinImage && "Failed to obtain a binary image."); + } enqueueImpKernel(MQueue, impl->MNDRDesc, impl->MArgs, KernelBundleImpPtr, MKernel, MKernelName.c_str(), RawEvents, NewEvent, nullptr, impl->MKernelCacheConfig, impl->MKernelIsCooperative, - impl->MKernelUsesClusterLaunch); + impl->MKernelUsesClusterLaunch, BinImage); #ifdef XPTI_ENABLE_INSTRUMENTATION // Emit signal only when event is created if (NewEvent != nullptr) { diff --git a/sycl/test-e2e/AmdNvidiaJIT/kernel_and_bundle.cpp b/sycl/test-e2e/AmdNvidiaJIT/kernel_and_bundle.cpp new file mode 100644 index 0000000000000..21eccd983c7cb --- /dev/null +++ b/sycl/test-e2e/AmdNvidiaJIT/kernel_and_bundle.cpp @@ -0,0 +1,111 @@ +// REQUIRES: cuda || hip + 
+// RUN: %{build} -fsycl-embed-ir -o %t.out +// RUN: env SYCL_JIT_AMDGCN_PTX_KERNELS=1 env SYCL_JIT_COMPILER_DEBUG="sycl-spec-const-materializer" %{run} %t.out &> %t.txt ; FileCheck %s --input-file %t.txt + +#include +#include + +constexpr size_t Size = 16; +constexpr int SeedKernel = 3; +constexpr int SeedKernelBundle = 5; + +constexpr int ValInt = 11; +constexpr std::array ValArr{13, 17}; +const static sycl::specialization_id SpecConstInt; +const static sycl::specialization_id> SpecConstArr; + +int validate(int Seed, std::vector &Input, std::vector &Output) { + for (int i = 0; i < Size; ++i) { + int Expected = ValInt + ValArr[0] + ValArr[1] + Input[i] + Seed; + if (Expected != Output[i]) { + return -1; + } + } + return 0; +} + +// CHECK: Working on function: +// CHECK: ================== +// CHECK: _ZTSZ15runKernelBundleN4sycl3_V15queueERSt6vectorIiSaIiEES5_E10WoofBundle +int runKernelBundle(sycl::queue Queue, std::vector &Input, + std::vector &Output) { + for (int i = 0; i < Size; ++i) { + Output[i] = 42; + Input[i] = i * i; + } + + sycl::device Device; + sycl::context Context = Queue.get_context(); + + auto InputBundle = + sycl::get_kernel_bundle( + Context, {Device}); + InputBundle.set_specialization_constant(ValInt); + InputBundle.set_specialization_constant(ValArr); + + auto ExecBundle = sycl::build(InputBundle); + + { + sycl::buffer OutBuff(Output.data(), Output.size()); + sycl::buffer InBuff(Input.data(), Input.size()); + Queue.submit([&](sycl::handler &cgh) { + sycl::accessor OutAcc(OutBuff, cgh, sycl::write_only); + sycl::accessor InAcc(InBuff, cgh, sycl::read_only); + cgh.use_kernel_bundle(ExecBundle); + cgh.template parallel_for( + sycl::range<1>{Size}, [=](sycl::id<1> i, sycl::kernel_handler kh) { + const auto KernelSpecConst = + kh.get_specialization_constant(); + const auto KernelSpecConstArr = + kh.get_specialization_constant(); + OutAcc[i] = KernelSpecConst + KernelSpecConstArr[0] + + KernelSpecConstArr[1] + InAcc[i] + SeedKernelBundle; + }); + 
}); + Queue.wait_and_throw(); + } + + return validate(SeedKernelBundle, Input, Output); +} + +// CHECK: Working on function: +// CHECK: ================== +// CHECK: _ZTSZZ9runKernelN4sycl3_V15queueERSt6vectorIiSaIiEES5_ENKUlRT_E_clINS0_7handlerEEEDaS7_E10WoofKernel +int runKernel(sycl::queue Queue, std::vector &Input, + std::vector &Output) { + for (int i = 0; i < Size; ++i) { + Output[i] = 42; + Input[i] = i * i; + } + { + sycl::buffer OutBuff(Output.data(), Output.size()); + sycl::buffer InBuff(Input.data(), Input.size()); + Queue.submit([&](auto &CGH) { + sycl::accessor OutAcc(OutBuff, CGH, sycl::write_only); + sycl::accessor InAcc(InBuff, CGH, sycl::read_only); + CGH.template set_specialization_constant(ValInt); + CGH.template set_specialization_constant(ValArr); + CGH.template parallel_for( + sycl::range<1>{Size}, [=](sycl::id<1> i, sycl::kernel_handler KH) { + const auto KernelSpecConst = + KH.get_specialization_constant(); + const auto KernelSpecConstArr = + KH.get_specialization_constant(); + OutAcc[i] = KernelSpecConst + KernelSpecConstArr[0] + + KernelSpecConstArr[1] + InAcc[i] + SeedKernel; + }); + }); + Queue.wait_and_throw(); + } + + return validate(SeedKernel, Input, Output); +} + +int main() { + std::vector Input(Size); + std::vector Output(Size); + sycl::queue Queue; + return runKernel(Queue, Input, Output) | + runKernelBundle(Queue, Input, Output); +} diff --git a/sycl/test/include_deps/sycl_accessor.hpp.cpp b/sycl/test/include_deps/sycl_accessor.hpp.cpp index 37bc9bf84559d..85b2543205279 100644 --- a/sycl/test/include_deps/sycl_accessor.hpp.cpp +++ b/sycl/test/include_deps/sycl_accessor.hpp.cpp @@ -21,10 +21,13 @@ // CHECK-NEXT: info/aspects_deprecated.def // CHECK-NEXT: atomic.hpp // CHECK-NEXT: detail/helpers.hpp -// CHECK-NEXT: memory_enums.hpp +// CHECK-NEXT: detail/pi.hpp +// CHECK-NEXT: backend_types.hpp +// CHECK-NEXT: detail/os_util.hpp // CHECK-NEXT: detail/pi.h // CHECK-NEXT: detail/pi_error.def // CHECK-NEXT: detail/pi.def +// 
CHECK-NEXT: memory_enums.hpp // CHECK-NEXT: CL/__spirv/spirv_vars.hpp // CHECK-NEXT: multi_ptr.hpp // CHECK-NEXT: detail/type_traits.hpp @@ -58,7 +61,6 @@ // CHECK-NEXT: detail/boost/mp11/detail/mp_with_index.hpp // CHECK-NEXT: detail/boost/mp11/integer_sequence.hpp // CHECK-NEXT: buffer.hpp -// CHECK-NEXT: backend_types.hpp // CHECK-NEXT: detail/array.hpp // CHECK-NEXT: exception.hpp // CHECK-NEXT: detail/cl.h @@ -76,7 +78,6 @@ // CHECK-NEXT: detail/stl_type_traits.hpp // CHECK-NEXT: detail/sycl_mem_obj_allocator.hpp // CHECK-NEXT: detail/aligned_allocator.hpp -// CHECK-NEXT: detail/os_util.hpp // CHECK-NEXT: ext/oneapi/accessor_property_list.hpp // CHECK-NEXT: detail/property_list_base.hpp // CHECK-NEXT: property_list.hpp diff --git a/sycl/test/include_deps/sycl_buffer.hpp.cpp b/sycl/test/include_deps/sycl_buffer.hpp.cpp index 4bda4fe9cbe5f..ad5a5cafbf22c 100644 --- a/sycl/test/include_deps/sycl_buffer.hpp.cpp +++ b/sycl/test/include_deps/sycl_buffer.hpp.cpp @@ -25,6 +25,8 @@ // CHECK-NEXT: detail/string.hpp // CHECK-NEXT: detail/common.hpp // CHECK-NEXT: detail/helpers.hpp +// CHECK-NEXT: detail/pi.hpp +// CHECK-NEXT: detail/os_util.hpp // CHECK-NEXT: memory_enums.hpp // CHECK-NEXT: CL/__spirv/spirv_vars.hpp // CHECK-NEXT: detail/iostream_proxy.hpp @@ -36,7 +38,6 @@ // CHECK-NEXT: detail/stl_type_traits.hpp // CHECK-NEXT: detail/sycl_mem_obj_allocator.hpp // CHECK-NEXT: detail/aligned_allocator.hpp -// CHECK-NEXT: detail/os_util.hpp // CHECK-NEXT: ext/oneapi/accessor_property_list.hpp // CHECK-NEXT: detail/property_list_base.hpp // CHECK-NEXT: property_list.hpp diff --git a/sycl/test/include_deps/sycl_detail_core.hpp.cpp b/sycl/test/include_deps/sycl_detail_core.hpp.cpp index fc4961262b09c..2ec31465dcc7c 100644 --- a/sycl/test/include_deps/sycl_detail_core.hpp.cpp +++ b/sycl/test/include_deps/sycl_detail_core.hpp.cpp @@ -22,10 +22,13 @@ // CHECK-NEXT: info/aspects_deprecated.def // CHECK-NEXT: atomic.hpp // CHECK-NEXT: detail/helpers.hpp -// CHECK-NEXT: 
memory_enums.hpp +// CHECK-NEXT: detail/pi.hpp +// CHECK-NEXT: backend_types.hpp +// CHECK-NEXT: detail/os_util.hpp // CHECK-NEXT: detail/pi.h // CHECK-NEXT: detail/pi_error.def // CHECK-NEXT: detail/pi.def +// CHECK-NEXT: memory_enums.hpp // CHECK-NEXT: CL/__spirv/spirv_vars.hpp // CHECK-NEXT: multi_ptr.hpp // CHECK-NEXT: detail/type_traits.hpp @@ -59,7 +62,6 @@ // CHECK-NEXT: detail/boost/mp11/detail/mp_with_index.hpp // CHECK-NEXT: detail/boost/mp11/integer_sequence.hpp // CHECK-NEXT: buffer.hpp -// CHECK-NEXT: backend_types.hpp // CHECK-NEXT: detail/array.hpp // CHECK-NEXT: exception.hpp // CHECK-NEXT: detail/cl.h @@ -77,7 +79,6 @@ // CHECK-NEXT: detail/stl_type_traits.hpp // CHECK-NEXT: detail/sycl_mem_obj_allocator.hpp // CHECK-NEXT: detail/aligned_allocator.hpp -// CHECK-NEXT: detail/os_util.hpp // CHECK-NEXT: ext/oneapi/accessor_property_list.hpp // CHECK-NEXT: detail/property_list_base.hpp // CHECK-NEXT: property_list.hpp @@ -144,7 +145,6 @@ // CHECK-NEXT: ext/oneapi/properties/properties.hpp // CHECK-NEXT: ext/oneapi/experimental/graph.hpp // CHECK-NEXT: handler.hpp -// CHECK-NEXT: detail/pi.hpp // CHECK-NEXT: detail/reduction_forward.hpp // CHECK-NEXT: ext/intel/experimental/fp_control_kernel_properties.hpp // CHECK-NEXT: ext/intel/experimental/kernel_execution_properties.hpp