diff --git a/applications/dual_gemm/collective/xe_dual_gemm_mma.hpp b/applications/dual_gemm/collective/xe_dual_gemm_mma.hpp index 64ca0c4a91..70c9061d25 100644 --- a/applications/dual_gemm/collective/xe_dual_gemm_mma.hpp +++ b/applications/dual_gemm/collective/xe_dual_gemm_mma.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -169,7 +170,7 @@ struct DualGemmMma, TileShape_, ElementA_ TiledMma tiled_mma; // TODO(Codeplay): see if we can make this nicer // To make all work items in a subgroup have the same global tensors pass in the index of work item 0 in each subgroup - auto sg = syclcompat::get_nd_item<1>().get_sub_group(); + auto sg = compat::get_nd_item<1>().get_sub_group(); auto first_thread_in_sg_idx = sg.get_group_linear_id() * DispatchPolicy::SubgroupSize; auto thr_mma = tiled_mma.get_slice(first_thread_in_sg_idx); diff --git a/applications/flash_attention_v2/collective/fmha_fusion.hpp b/applications/flash_attention_v2/collective/fmha_fusion.hpp index a87752588f..d943228538 100644 --- a/applications/flash_attention_v2/collective/fmha_fusion.hpp +++ b/applications/flash_attention_v2/collective/fmha_fusion.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** -* Copyright (c) 2025 - 2025 Codeplay Software Ltd. All rights reserved. + * Copyright (c) 2025 - 2025 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -39,6 +40,7 @@ using namespace cute; struct VariableLength { int max_length; + int total_length = 0; int* cumulative_length = nullptr; CUTE_HOST_DEVICE operator int() const { diff --git a/applications/flash_attention_v2/collective/xe_flash_attn_chunk_prefill_epilogue.hpp b/applications/flash_attention_v2/collective/xe_flash_attn_chunk_prefill_epilogue.hpp new file mode 100644 index 0000000000..dfaf883274 --- /dev/null +++ b/applications/flash_attention_v2/collective/xe_flash_attn_chunk_prefill_epilogue.hpp @@ -0,0 +1,255 @@ +/*************************************************************************************************** + * Copyright (C) 2025 Intel Corporation, All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Functor performing elementwise operations used by epilogues. +*/ + +#pragma once + +#include +#include "cutlass/cutlass.h" +#include "cutlass/epilogue/dispatch_policy.hpp" +#include "cutlass/epilogue/collective/collective_epilogue.hpp" +#include "cutlass/epilogue/collective/detail.hpp" +#include "cutlass/detail/layout.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// +namespace cutlass { +namespace flash_attention { +namespace collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template class FlashChunkPrefillEpilogue { + static_assert(cutlass::detail::dependent_false, "Could not find an epilogue specialization."); +}; + +template +class FlashChunkPrefillEpilogue { +public: + // + // Type Aliases + // + using DispatchPolicy = epilogue::IntelXeXMX16; + using ElementO = ElementO_; + using StrideO = StrideO_; + using ElementLSE = ElementLSE_; + using CopyOpO = CopyOpO_; + using SubgroupLayout = SubgroupLayout_; + using TileShapeOutput = TileShapeOutput_; + using TiledMmaOutput = typename TiledMMAHelper, Layout, SubgroupLayout>::TiledMMA; + using GmemTiledCopyO = CopyOpO; + using ElementOutput = ElementO_; + using ElementCompute = ElementCompute_; + using ElementAccumulator = ElementCompute_; + using SubgroupTileShape = decltype(cute::shape_div(TileShapeOutput{}, (SubgroupLayout{}.shape()))); + + static constexpr int SubgroupSize = DispatchPolicy::SubgroupSize; + + static_assert(cute::rank(TileShapeOutput{}) == 3, "TileShapeOutput must be rank-3: [CTA_M_QO, CTA_N_VO, CTA_K_PV]"); + static_assert(cute::rank(StrideO{}) == 3, "StrideO must be rank-3: [seq_len_qo, head_size_vo, batch * num_heads]"); + + using CopyThreadShape = Shape<_1, Int>; + + using traits_store_O = Copy_Traits; + using atom_load_O = Copy_Atom; + using val_layout_load_O = decltype(make_layout(shape_div(typename traits_store_O::BlockShape{}, CopyThreadShape{}))); + using XE_Copy_O = decltype(make_tiled_copy(atom_load_O{}, Layout{}, val_layout_load_O{})); + +private: + constexpr static bool is_destination_supported = not cute::is_void_v; + +public: + using EmptyType = cute::tuple<>; + + struct TensorStorageImpl : cute::tuple {}; + + struct SharedStorage { + using TensorStorage = TensorStorageImpl; + + TensorStorage tensors; + }; + using TensorStorage = typename SharedStorage::TensorStorage; + + // Host side epilogue arguments + struct Arguments { + ElementO const *ptr_O; + StrideO dO; + }; + + // Device side epilogue params + struct Params { + XE_Copy_O xe_store_o; + }; + + // + // Methods + // + 
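  // Illustrative sketch, not part of this patch: operator() below rescales the attention
  // accumulator by the reciprocal of the softmax row sum before storing O, guarding against
  // a zero or NaN denominator exactly as the device code does. A minimal scalar reference of
  // that normalization, assuming `acc` holds sum_j exp(s_ij - m_i) * v_j per output element
  // and `row_sum` holds the denominator l_i = sum_j exp(s_ij - m_i):
  //
  //   inline void normalize_rows(std::vector<float>& acc, const std::vector<float>& row_sum,
  //                              int head_size) {
  //     for (std::size_t row = 0; row < row_sum.size(); ++row) {
  //       float l = row_sum[row];
  //       float scale = (l == 0.f || l != l) ? 1.0f : 1.0f / l;   // same guard as below
  //       for (int d = 0; d < head_size; ++d) acc[row * head_size + d] *= scale;
  //     }
  //   }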
template + CUTLASS_DEVICE auto convert_type(Tensor const &tensor) { + using From_type = typename Engine::value_type; + constexpr int numel = decltype(size(tensor))::value; + cutlass::NumericArrayConverter convert_op; + auto frag = + convert_op(*reinterpret_cast *>( + tensor.data())); + return make_tensor(make_rmem_ptr(&frag), tensor.layout()); + } + + template + static constexpr Params to_underlying_arguments(ProblemShape const &problem_shape, Arguments const &args, + [[maybe_unused]] void *workspace) { + auto [batch, num_heads_q, num_heads_kv, seq_len_qo, seq_len_kv, seq_len_kv_cache, head_size_qk, head_size_vo] = problem_shape; + auto tensorO = make_tensor(make_gmem_ptr(static_cast(args.ptr_O)), + make_layout(make_shape(seq_len_qo, num_heads_q * head_size_vo, batch), + args.dO)); + XE_Copy_O xe_store_o{XE_Copy_O{}.with(tensorO)}; + return { + xe_store_o, + }; + } + + template + static size_t get_workspace_size(ProblemShape const &problem_shape, Arguments const &args) { + return 0; + } + + template + static cutlass::Status initialize_workspace(ProblemShape const &problem_shape, Arguments const &args, void *workspace, + cudaStream_t stream, CudaHostAdapter *cuda_adapter = nullptr) { + return Status::kSuccess; + } + + template + CUTLASS_HOST_DEVICE static bool can_implement(ProblemShape const &problem_shape, + [[maybe_unused]] Arguments const &args) { + return true; + } + + CUTLASS_HOST_DEVICE + FlashChunkPrefillEpilogue(Params const ¶ms_, TensorStorage const &) : params(params_) {} + + template + CUTLASS_DEVICE void operator()(ProblemShape problem_shape, SequenceLengthShape sequence_length_shape, TileCoord tile_coord, FragOut &out, + FragMax const &max, FragSum &sum) { + + using namespace cute; + + static constexpr bool is_var_len = cutlass::fmha::collective::is_variable_length_v>; + + using FragOutLayout = typename FragOut::layout_type; + + constexpr int Vec = shape<0>(FragOutLayout{}); + constexpr int FragsM = shape<1>(FragOutLayout{}); + constexpr int FragsN = size(select<2,3>(shape(FragOutLayout{}))); + + auto sg = compat::get_nd_item<1>().get_sub_group(); + auto out_reg = make_tensor(static_cast(out).data() , Shape, Int, Int>{}); + + CUTLASS_PRAGMA_UNROLL + for (int y = 0; y < FragsM; y++) { + CUTLASS_PRAGMA_UNROLL + for (int x = 0; x < Vec; x++) { + int indx = y * Vec + x; + auto cur_sum = reduce_over_group(sg, sum(indx), sycl::plus<>()); + auto cur_scale = (cur_sum == 0.f || cur_sum != cur_sum) ? 
1.0f : sycl::native::recip(cur_sum); + CUTLASS_PRAGMA_UNROLL + for (int z = 0; z < FragsN; z++) { + out_reg(x, y, z) *= cur_scale; + } + } + } + + // Indexing variables + auto [batch, num_heads_q, num_heads_kv, head_size_vo] = select<0, 1, 2, 7>(problem_shape); + auto [seq_len_qo] = select<0>(sequence_length_shape); + // Represent the full output tensor + Tensor mO_mnl = cute::get_xe_tensor(make_shape(seq_len_qo, head_size_vo, 1)); + + auto [m_coord, n_coord, k_coord, l_coord] = tile_coord; + // Tile the output tensor per WG + Tensor g_wg_O = local_tile(mO_mnl, select<0,1>(TileShapeOutput{}), make_coord(m_coord,n_coord,0)); // (BLK_M,BLK_N,m,n,l) + static constexpr auto ATOM_N = get<2>(typename TiledMmaOutput::ThrLayoutVMNK{}.shape()); + auto m_sg = get_sub_group_id() / ATOM_N; + auto n_sg = get_sub_group_id() % ATOM_N; + // Tile the output tensor per SG + Tensor gO = local_tile(g_wg_O, SubgroupTileShape{}, make_coord(m_sg,n_sg,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) + auto thread_xe_store_o = params.xe_store_o.get_thread_slice(ThreadIdxX()); + Tensor tOgO = thread_xe_store_o.partition_D(gO); + + Tensor final_out_reg = make_fragment_like(out_reg); + // iff ElementOutput == ElementAccumulator, then convert_type doesn't do the right conversion + // so we call copy() which internally performs a static_cast op on the data. + // for ElementOutput == bf16 | fp16, convert_type calls relevant NumericConverter specialization. + if constexpr (cute::is_same_v) { + copy(out_reg, final_out_reg); + } else { + Tensor temp = convert_type(out_reg); + copy(temp, final_out_reg); + } + copy(params.xe_store_o, final_out_reg, tOgO); + } + + // SequenceLengthShapeType = Shape + // For Fixed Sequence Length, ProblemShapeType = Shape + // For Variable Sequence Length, ProblemShapeType = Shape + template + CUTLASS_DEVICE static constexpr Params get_updated_copies(Params const& params, ProblemShapeType const& problem_shape, + SequenceLengthShapeType const& sequence_length_shape, int const& l_coord, int const& q_head_coord) { + auto [num_heads_q, num_heads_kv, head_size_vo] = select<1, 2, 7>(problem_shape); + auto [seq_len_qo] = select<0>(sequence_length_shape); + int offset_o = 0; + if constexpr (VarLen) { + auto qo_cumulative_length = get<3>(problem_shape).cumulative_length; + offset_o = num_heads_q * head_size_vo * qo_cumulative_length[l_coord] + q_head_coord * head_size_vo; + } else { + offset_o = num_heads_q * head_size_vo * seq_len_qo * l_coord + q_head_coord * head_size_vo; + } + auto store_traits = static_cast(params.xe_store_o); + ElementO* base_ptr = (ElementO*)store_traits.base_ptr; + auto shape_o = make_shape(static_cast(seq_len_qo), num_heads_q * head_size_vo, 1); + StrideO stride_o = cutlass::make_cute_packed_stride(StrideO{}, shape_o); + auto tensorO = make_tensor(make_gmem_ptr(base_ptr + offset_o), make_layout(shape_o, stride_o)); + XE_Copy_O xe_store_o{XE_Copy_O{}.with(tensorO)}; + return Params{xe_store_o}; + } + +private: + Params const ¶ms; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace collective +} // namespace flash_attention +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/applications/flash_attention_v2/collective/xe_flash_attn_chunk_prefill_mma.hpp b/applications/flash_attention_v2/collective/xe_flash_attn_chunk_prefill_mma.hpp new file mode 100644 index 0000000000..98b5a2f074 --- /dev/null +++ 
b/applications/flash_attention_v2/collective/xe_flash_attn_chunk_prefill_mma.hpp @@ -0,0 +1,615 @@ +/*************************************************************************************************** + * Copyright (C) 2025 Intel Corporation, All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/fp8_to_fp16.h" + +#include "cute/algorithm/functional.hpp" +#include "cute/atom/mma_atom.hpp" +#include "cute/algorithm/gemm.hpp" +#include "fmha_fusion.hpp" + + +//////////////////////////////////////////////////////////// +namespace { + +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::flash_attention::collective { +using namespace cute; +//////////////////////////////////////////////////////////////////////////////////////////////////// + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct FlashChunkPrefillMma { + static_assert(cutlass::detail::dependent_false, + "Could not find a mainloop specialization."); +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct FlashChunkPrefillMma< + gemm::MainloopIntelXeXMX16, ProblemShapeType_, ElementQ_, StrideQ_, + ElementK_, StrideK_, ElementV_, StrideV_, MMAOperation_, TileShapeQK_, + TileShapePV_, SubgroupLayout_, GmemTiledCopyQ_, GmemTiledCopyK_, + GmemTiledCopyV_, CausalMask_, LocalMask_, PagedKV_> { + // + // Type Aliases + // + using DispatchPolicy = gemm::MainloopIntelXeXMX16; + using TileShapeQK = TileShapeQK_; + using TileShapePV = TileShapePV_; + using SubgroupLayout = SubgroupLayout_; + using ProblemShapeType = ProblemShapeType_; + using ElementQ = ElementQ_; + using StrideQ = StrideQ_; + using ElementK = ElementK_; + using StrideK = StrideK_; + using ElementV = ElementV_; + using StrideV = StrideV_; + using GmemTiledCopyQ = GmemTiledCopyQ_; + using GmemTiledCopyK = GmemTiledCopyK_; + using GmemTiledCopyV = GmemTiledCopyV_; + using ArchTag = typename DispatchPolicy::ArchTag; + using MmaAtom = MMA_Atom; + + using TiledMmaQK = typename TiledMMAHelper, + SubgroupLayout>::TiledMMA; + + using TiledMmaPV = typename TiledMMAHelper, + SubgroupLayout>::TiledMMA; + using ElementAccumulator = typename TiledMmaQK::ValTypeC; + static constexpr bool CausalMask = CausalMask_; + static constexpr bool LocalMask = LocalMask_; + static constexpr bool PagedKV = PagedKV_; + + static constexpr int SubgroupSize = DispatchPolicy::SubgroupSize; + + using MmaAtomShape = typename MmaAtom::Shape_MNK; + + static constexpr auto PV_ATOM_M = + decltype(get<0>(SubgroupLayout{}.shape()))::value; + static constexpr auto PV_ATOM_N = + decltype(get<1>(SubgroupLayout{}.shape()))::value; + static constexpr auto PV_ATOM_K = + decltype(get<2>(SubgroupLayout{}.shape()))::value; + + using SubgroupTileShapePV = + decltype(cute::shape_div(TileShapePV{}, (SubgroupLayout{}.shape()))); + static constexpr auto QK_BLK_M = get<0>(TileShapeQK{}); + static constexpr auto QK_BLK_N = get<1>(TileShapeQK{}); + static constexpr auto QK_BLK_K = get<2>(TileShapeQK{}); + + // This TiledMma is only required to serve the specific tiling requirements + // for matrix K. This is due to the consumption of matrix K by all subgroups + // within a workgroup. 
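  // Note (drawn from the code below, for orientation): in mmaQK() every subgroup partitions
  // the same K tile (thread_mma_k is taken from slice 0 of the tiled MMA), while Q is
  // partitioned per subgroup via first_thread_in_sg_idx, so only the Q operand varies
  // across subgroups within a workgroup.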
+ static constexpr auto QK_ATOM_M = PV_ATOM_M; // 8 + static constexpr auto QK_ATOM_N = PV_ATOM_N; // 1 + static constexpr auto QK_ATOM_K = PV_ATOM_K; // 1 + + using SubgroupTileShapeQK = decltype(cute::shape_div( + TileShapeQK{}, + SubgroupLayout{}.shape())); // 128, 64, 32 / 16, 1, 1 = (8, 64, 32 ) + + static constexpr auto QK_SG_M = get<0>(SubgroupTileShapeQK{}); + static constexpr auto QK_SG_N = get<1>(SubgroupTileShapeQK{}); + static constexpr auto QK_SG_K = get<2>(SubgroupTileShapeQK{}); + + static constexpr bool is_var_len = + cutlass::fmha::collective::is_variable_length_v< + tuple_element_t<3, ProblemShapeType>>; + + using FragsShapeS = decltype(cute::shape_div( + take<0, 2>(SubgroupTileShapeQK{}), + take<0, 2>(MmaAtomShape()))); // 8, 64, 32 / 8, 16, 16 (1, 4) + static constexpr int Vec = + (get<0>(MmaAtomShape()) * get<1>(MmaAtomShape())) / SubgroupSize; // 8 + static constexpr int FragsM = get<0>(FragsShapeS{}); + static constexpr int FragsNS = get<1>(FragsShapeS{}); // 4 + + static constexpr uint32_t MaxThreadsPerBlock = + size(SubgroupLayout{}) * SubgroupSize; + using CopyThreadShape = Shape<_1, Int>; + + using traits_load_Q = Copy_Traits; + using atom_load_Q = Copy_Atom; + using val_layout_load_Q = decltype(make_layout( + shape_div(typename traits_load_Q::BlockShape{}, CopyThreadShape{}))); + using XE_Copy_Q = decltype(make_tiled_copy( + atom_load_Q{}, Layout{}, val_layout_load_Q{})); + + using traits_load_K = Copy_Traits; + using atom_load_K = Copy_Atom; + using val_layout_load_K = decltype(make_layout( + shape_div(typename traits_load_K::BlockShape{}, CopyThreadShape{}))); + using XE_Copy_K = decltype(make_tiled_copy( + atom_load_K{}, Layout{}, val_layout_load_K{})); + + using traits_load_V = Copy_Traits; + using atom_load_V = Copy_Atom; + using val_layout_load_V = decltype(make_layout( + shape_div(typename traits_load_V::BlockShape{}, CopyThreadShape{}))); + using XE_Copy_V = decltype(make_tiled_copy( + atom_load_V{}, Layout{}, val_layout_load_V{})); + + template + static constexpr bool is_fp8_v = cute::is_same_v || cute::is_same_v; + + // Host side kernel arguments + struct Arguments { + ElementQ const *ptr_Q; + StrideQ dQ; + ElementK const *ptr_K; + StrideK dK; + ElementV const *ptr_V; + StrideV dV; + float const *ptr_q_scale; + float const *ptr_k_scale; + float const *ptr_v_scale; + ElementK const *ptr_K_cache; + StrideK dK_cache; + ElementV const *ptr_V_cache; + StrideV dV_cache; + // Paged KV Cache + int const *ptr_page_table; + int page_size; + int const *num_pages_per_seq; + int window_left; + int window_right; + }; + + struct Params { + XE_Copy_Q gmem_tiled_copy_q; + XE_Copy_K gmem_tiled_copy_k; + XE_Copy_V gmem_tiled_copy_v; + float const *ptr_q_scale; + float const *ptr_k_scale; + float const *ptr_v_scale; + XE_Copy_K gmem_tiled_copy_k_cache; + XE_Copy_V gmem_tiled_copy_v_cache; + // Paged KV Cache + int const *ptr_page_table; + int page_size; + int const *num_pages_per_seq; + int window_left; + int window_right; + }; + + // + // Methods + // + + FlashChunkPrefillMma() = default; + + static constexpr Params + to_underlying_arguments(ProblemShapeType const &problem_shape, + Arguments const &args, void *workspace) { + (void)workspace; + + auto [batch, num_heads_q, num_heads_kv, seq_len_qo, seq_len_kv, + seq_len_kv_cache, head_size_qk, head_size_vo] = problem_shape; + + auto tensorQ = make_tensor( + make_gmem_ptr(args.ptr_Q), + make_layout(make_shape(seq_len_qo, num_heads_q * head_size_qk, batch), + args.dQ)); + auto tensorK = make_tensor( + 
make_gmem_ptr(args.ptr_K), + make_layout(make_shape(seq_len_kv, num_heads_kv * head_size_qk, batch), + args.dK)); + auto tensorV = make_tensor( + make_gmem_ptr(args.ptr_V), + make_layout(make_shape(num_heads_kv * head_size_vo, seq_len_kv, batch), + args.dV)); + auto tensorK_cache = + make_tensor(make_gmem_ptr(args.ptr_K_cache), + make_layout(make_shape(seq_len_kv_cache, + num_heads_kv * head_size_qk, batch), + args.dK_cache)); + auto tensorV_cache = make_tensor( + make_gmem_ptr(args.ptr_V_cache), + make_layout( + make_shape(num_heads_kv * head_size_vo, seq_len_kv_cache, batch), + args.dV_cache)); + + XE_Copy_Q copyQ{XE_Copy_Q{}.with(tensorQ)}; + XE_Copy_K copyK{XE_Copy_K{}.with(tensorK)}; + XE_Copy_V copyV{XE_Copy_V{}.with(tensorV)}; + XE_Copy_K copyK_cache{XE_Copy_K{}.with(tensorK_cache)}; + XE_Copy_V copyV_cache{XE_Copy_V{}.with(tensorV_cache)}; + + return Params{copyQ, + copyK, + copyV, + args.ptr_q_scale, + args.ptr_k_scale, + args.ptr_v_scale, + copyK_cache, + copyV_cache, + args.ptr_page_table, + args.page_size, + args.num_pages_per_seq, + args.window_left, + args.window_right}; + } + + template + CUTLASS_DEVICE void mmaQK(FragQccum &accum, TensorQ gQ, TensorK gK, + FragSrc const &frag_src, int const &k_tile_count, + Params const ¶ms, bool is_KV_cache, + int const& q_head_coord, int const& kv_head_coord) { + + auto &gmem_tiled_copy_k = + is_KV_cache ? params.gmem_tiled_copy_k_cache : params.gmem_tiled_copy_k; + + int thread_idx = static_cast(ThreadIdxX()); + auto thr_copy_Q = params.gmem_tiled_copy_q.get_slice(thread_idx); + auto thr_copy_K = gmem_tiled_copy_k.get_slice(thread_idx); + // Instantiate the MMA object + TiledMmaQK tiled_mma; + // To make all threads in a warp have the same global tensors pass in the + // index of thread 0 in each warp + auto sg = compat::get_nd_item<1>().get_sub_group(); + auto first_thread_in_sg_idx = + sg.get_group_id()[0] * DispatchPolicy::SubgroupSize; + auto thread_mma_q = tiled_mma.get_slice(first_thread_in_sg_idx); + auto thread_mma_k = tiled_mma.get_slice(0); + + Tensor tCgQ = thread_mma_q.partition_A(gQ); + Tensor tCgK = thread_mma_k.partition_B(gK); + + // Create fragments + // TODO(Codeplay): fix this, this is probably not general + using TCrQ_Type = cute::conditional_t, uint8_t, ElementQ>; + using TCrK_Type = cute::conditional_t, uint8_t, ElementK>; + Tensor tCrQ = make_tensor(make_fragment_layout(params.gmem_tiled_copy_q, take<0,3>(tCgQ.shape()))); + Tensor tCrK = make_tensor(make_fragment_layout(gmem_tiled_copy_k, take<0,3>(tCgK.shape()))); + + // Retile registers for copies + Tensor tQrQ = thr_copy_Q.retile_D(tCrQ); + Tensor tKrK = thr_copy_K.retile_D(tCrK); + + // Retile global tile for copies + Tensor tQgQ = thr_copy_Q.retile_S(tCgQ); + Tensor tKgK = thr_copy_K.retile_S(tCgK); + + float q_scale = params.ptr_q_scale[q_head_coord]; + float k_scale = params.ptr_k_scale[kv_head_coord]; + + // + // Mainloop + // + for (int k_tile = 0; k_tile < k_tile_count; ++k_tile) { + copy(params.gmem_tiled_copy_q, tQgQ(_, _, _, k_tile), tQrQ); + copy(gmem_tiled_copy_k, tKgK(_, _, _, k_tile), tKrK); + + // FP8 path: Convert FP8 fragments to FP16 IN-PLACE to avoid register spilling. + if constexpr (is_fp8_v || is_fp8_v) { + // Recast the memory region of the FP8 tensors as FP16 tensors. + // This does NOT allocate new registers. It reuses the existing ones. + auto tCrQ_fp16 = cute::recast(tCrQ); + auto tCrK_fp16 = cute::recast(tCrK); + + // Perform the conversion, writing the FP16 results directly into the + // reused register space. 
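        // Note (assumption, not taken from this patch): convert_and_descale is read here as an
        // element-wise dequantization, roughly x_fp16 = half(float(x_fp8) * scale), where `scale`
        // is the per-head factor loaded from ptr_q_scale / ptr_k_scale above; the FP16 results
        // land in the recast views tCrQ_fp16 / tCrK_fp16 declared above.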
+ if constexpr (is_fp8_v) { + convert_and_descale(tCrQ, tCrQ_fp16, q_scale); + } else { + // If Q is already FP16, just copy it to the correctly-named variable. + copy(tCrQ, tCrQ_fp16); + } + + if constexpr (is_fp8_v) { + convert_and_descale(tCrK, tCrK_fp16, k_scale); + } else { + copy(tCrK, tCrK_fp16); + } + + // Now, gemm is called on the FP16 tensors which occupy the same + // register space as the original FP8 tensors did. Register pressure is not increased. + cute::gemm(tiled_mma, accum, tCrQ_fp16, tCrK_fp16, frag_src); + } else { + // FP16 path (already fast) + cute::gemm(tiled_mma, accum, tCrQ, tCrK, frag_src); + } + +#if 0 +#define PRINT(x) \ + print(#x ": "); \ + print(x); \ + print("\n"); + if (cute::thread(0, 0)) { + print("======================= Q: \n"); + PRINT(gQ); + PRINT(tCrQ); + PRINT(tCgQ); + PRINT(tQrQ); + PRINT(tQgQ); + + print("===================== K :\n"); + PRINT(gK); + PRINT(tCrK); + PRINT(tCgK); + PRINT(tKrK); + PRINT(tKgK); + + print("===================== Config: \n"); + PRINT(MaxThreadsPerBlock); + PRINT(SubgroupTileShapeQK{}); + } +#undef PRINT +#endif + } + } + + template + CUTLASS_DEVICE auto convert_type(Tensor const &tensor) { + using From_type = typename Engine::value_type; + constexpr int numel = decltype(size(tensor))::value; + cutlass::NumericArrayConverter convert_op; + auto frag = + convert_op(*reinterpret_cast *>( + tensor.data())); + return make_tensor(make_rmem_ptr(&frag), tensor.layout()); + } + + template + CUTLASS_DEVICE void mmaPV(FragQccum &accum, FragS const &tSr, TensorV gV, + FragSrc const &frag_src, Params const ¶ms, + bool is_KV_cache, int const& kv_head_coord) { + + auto &gmem_tiled_copy_v = + is_KV_cache ? params.gmem_tiled_copy_v_cache : params.gmem_tiled_copy_v; + + float v_scale = params.ptr_v_scale[kv_head_coord]; + + int thread_idx = static_cast(ThreadIdxX()); + // Instantiate the MMA object + TiledMmaPV tiled_mma; + // Tile GV to the shape of <64,64> and loop over the HeadSize/64 to avoid + // Register spill + Tensor gV_ = take<0, 3>( + local_tile(gV, select<1, 2>(TileShapePV{}), make_coord(_, _))); + auto sg = compat::get_nd_item<1>().get_sub_group(); + auto first_thread_in_sg_idx = + sg.get_group_id()[0] * DispatchPolicy::SubgroupSize; + auto thread_mma = tiled_mma.get_slice(first_thread_in_sg_idx); + Tensor tCgV = thread_mma.partition_B(gV_); + using TCrV_Type = cute::conditional_t, uint8_t, ElementV>; + Tensor tCrV = make_tensor(make_fragment_layout(gmem_tiled_copy_v, take<0,3>(tCgV.shape()))); + + // Partition the copying of A and B tiles across the threads + auto gmem_thr_copy_V = gmem_tiled_copy_v.get_slice(thread_idx); + Tensor tVrV = gmem_thr_copy_V.retile_D(tCrV); + Tensor tVgV = gmem_thr_copy_V.retile_S(tCgV); + +#if CUTLASS_ENABLE_DEBUG_PRINTS +#define PRINT(x) \ + print(#x ": "); \ + print(x); \ + print("\n"); + if (cute::thread(LOG_THREAD, LOG_GROUP)) { + print("===================== V :\n"); + PRINT(gV); + PRINT(tCrV); + PRINT(tCgV); + PRINT(tVrV); + PRINT(tVgV); + + print("===================== Config: \n"); + PRINT(MaxThreadsPerBlock); + PRINT(SubgroupTileShapePV{}); + } +#undef PRINT +#endif + + // 7) Convert S to P (FP32 -> BF16) + Tensor tPr = convert_type(tSr); + // + // Mainloop + // + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < tile_count; i++) { + copy(gmem_tiled_copy_v, tVgV(_, _, _, i), tVrV); + + if constexpr (is_fp8_v) { + // Correctly reuse the registers of tCrV for the new FP16 tensor. + // This avoids doubling the register pressure. 
+ auto tCrV_fp16 = cute::recast(tCrV); + + // Perform the conversion in-place, overwriting the old FP8 data + // with the new FP16 data in the same register space. + convert_and_descale(tCrV, tCrV_fp16, v_scale); + + // The GEMM now operates on an FP16 tensor that is in registers, + // preventing a catastrophic performance drop from register spilling. + cute::gemm(tiled_mma, accum(_,_,_,i), tPr, tCrV_fp16, frag_src(_,_,_,i)); + } else { + // Native FP16 path (already fast) + cute::gemm(tiled_mma, accum(_,_,_,i), tPr, tCrV, frag_src(_,_,_,i)); + } + } + } + + // SequenceLengthShape = Shape + // For Fixed Sequence Length, ProblemShape = Shape For Variable Sequence Length, ProblemShape = Shape + template + CUTLASS_DEVICE static constexpr Params + get_updated_copies(Params const ¶ms, ProblemShape const &problem_shape, + SequenceLengthShape const &sequence_length_shape, + int const &l_coord, int const &q_head_coord = 0) { + auto [batch, num_heads_q, num_heads_kv, head_size_qk, head_size_vo] = + select<0, 1, 2, 6, 7>(problem_shape); + auto [seq_len_qo, seq_len_kv, seq_len_kv_cache] = sequence_length_shape; + auto q_group_size = num_heads_q / num_heads_kv; + auto kv_head_coord = q_head_coord / q_group_size; + int offset_q = 0, offset_k = 0, offset_v = 0, offset_k_cache = 0, + offset_v_cache = 0; + int total_seq_len_kv_cache = 0; + if constexpr (is_var_len) { + auto qo_cumulative_length = get<3>(problem_shape).cumulative_length; + auto kv_cumulative_length = get<4>(problem_shape).cumulative_length; + auto kv_cached_cumulative_length = + get<5>(problem_shape).cumulative_length; + + offset_q = num_heads_q * head_size_qk * qo_cumulative_length[l_coord] + + q_head_coord * head_size_qk; + + offset_k = num_heads_kv * head_size_qk * kv_cumulative_length[l_coord] + + kv_head_coord * head_size_qk; + offset_v = num_heads_kv * head_size_vo * kv_cumulative_length[l_coord] + + kv_head_coord * head_size_vo; + offset_k_cache = seq_len_kv_cache == 0 + ? 0 + : PagedKV? // For page_kv, there is no batch dimension. + kv_head_coord * head_size_qk + : num_heads_kv * head_size_qk * kv_cached_cumulative_length[l_coord] + kv_head_coord * head_size_qk; + offset_v_cache = seq_len_kv_cache == 0 + ? 0 + : PagedKV? // For page_kv, there is no batch dimension. + kv_head_coord * head_size_vo + : num_heads_kv * head_size_vo * kv_cached_cumulative_length[l_coord] + kv_head_coord * head_size_vo; + total_seq_len_kv_cache = get<5>(problem_shape).total_length; + } else { + offset_q = num_heads_q * head_size_qk * seq_len_qo * l_coord + + q_head_coord * head_size_qk; + + offset_k = num_heads_kv * head_size_qk * seq_len_kv * l_coord + + kv_head_coord * head_size_qk; + offset_v = num_heads_kv * head_size_vo * seq_len_kv * l_coord + + kv_head_coord * head_size_vo; + offset_k_cache = + seq_len_kv_cache == 0 + ? 0 : + PagedKV? + kv_head_coord * head_size_qk + : num_heads_kv * head_size_qk * seq_len_kv_cache * l_coord + kv_head_coord * head_size_qk; + offset_v_cache = + seq_len_kv_cache == 0 + ? 0 : + PagedKV? 
+ kv_head_coord * head_size_vo + : num_heads_kv * head_size_vo * seq_len_kv_cache * l_coord + kv_head_coord * head_size_vo; + total_seq_len_kv_cache = batch * seq_len_kv_cache; + } + + auto q_traits = + static_cast(params.gmem_tiled_copy_q); + const ElementQ *q_ptr = (const ElementQ *)q_traits.base_ptr; + auto k_traits = + static_cast(params.gmem_tiled_copy_k); + const ElementK *k_ptr = (const ElementK *)k_traits.base_ptr; + auto v_traits = + static_cast(params.gmem_tiled_copy_v); + const ElementV *v_ptr = (const ElementV *)v_traits.base_ptr; + auto k_traits_cache = + static_cast(params.gmem_tiled_copy_k_cache); + const ElementK *k_cache_ptr = (const ElementK *)k_traits_cache.base_ptr; + auto v_traits_cache = + static_cast(params.gmem_tiled_copy_v_cache); + const ElementV *v_cache_ptr = (const ElementV *)v_traits_cache.base_ptr; + // NHD format{batch, seq_len, head, dim_head} + // stride {seq_len*head*dim_head, head*dim_head, dim_head, 1} + auto shape_q = + make_shape(static_cast(seq_len_qo), head_size_qk * num_heads_q, 1); + StrideQ stride_q = cutlass::make_cute_packed_stride(StrideQ{}, shape_q); + auto shape_k = make_shape(static_cast(seq_len_kv), + num_heads_kv * head_size_qk, 1); + StrideK stride_k = cutlass::make_cute_packed_stride(StrideK{}, shape_k); + + auto shape_v = make_shape(head_size_vo * num_heads_kv, + static_cast(seq_len_kv), 1); + StrideV stride_v = cutlass::make_cute_packed_stride(StrideV{}, shape_v); + + auto shape_k_cache = make_shape(static_cast(PagedKV? total_seq_len_kv_cache : seq_len_kv_cache), + head_size_qk * num_heads_kv, 1); + StrideK stride_k_cache = + cutlass::make_cute_packed_stride(StrideK{}, shape_k_cache); + auto shape_v_cache = make_shape(head_size_vo * num_heads_kv, + static_cast(PagedKV? total_seq_len_kv_cache : seq_len_kv_cache), 1); + StrideV stride_v_cache = + cutlass::make_cute_packed_stride(StrideV{}, shape_v_cache); + auto tensorQ = make_tensor(make_gmem_ptr(q_ptr + offset_q), + make_layout(shape_q, stride_q)); + auto tensorK = make_tensor(make_gmem_ptr(k_ptr + offset_k), + make_layout(shape_k, stride_k)); + auto tensorV = make_tensor(make_gmem_ptr(v_ptr + offset_v), + make_layout(shape_v, stride_v)); + auto tensorK_cache = + make_tensor(make_gmem_ptr(k_cache_ptr + offset_k_cache), + make_layout(shape_k_cache, stride_k_cache)); + auto tensorV_cache = + make_tensor(make_gmem_ptr(v_cache_ptr + offset_v_cache), + make_layout(shape_v_cache, stride_v_cache)); + XE_Copy_Q copyQ{XE_Copy_Q{}.with(tensorQ)}; + XE_Copy_K copyK{XE_Copy_K{}.with(tensorK)}; + XE_Copy_V copyV{XE_Copy_V{}.with(tensorV)}; + XE_Copy_K copyK_cache{XE_Copy_K{}.with(tensorK_cache)}; + XE_Copy_V copyV_cache{XE_Copy_V{}.with(tensorV_cache)}; + + return Params{copyQ, + copyK, + copyV, + params.ptr_q_scale, + params.ptr_k_scale, + params.ptr_v_scale, + copyK_cache, + copyV_cache, + params.ptr_page_table, + params.page_size, + params.num_pages_per_seq, + params.window_left, + params.window_right}; + } +}; + +} // namespace cutlass::flash_attention::collective + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/applications/flash_attention_v2/collective/xe_flash_attn_chunk_prefill_softmax_epilogue.hpp b/applications/flash_attention_v2/collective/xe_flash_attn_chunk_prefill_softmax_epilogue.hpp new file mode 100644 index 0000000000..849f65971b --- /dev/null +++ b/applications/flash_attention_v2/collective/xe_flash_attn_chunk_prefill_softmax_epilogue.hpp @@ -0,0 +1,222 @@ 
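// Illustrative sketch, not part of this patch: the softmax epilogue added below implements the
// standard online-softmax block update, with the softmax scale pre-multiplied by log2(e) in
// to_underlying_arguments so the device code can use exp2 instead of exp:
//   exp((s - m) * scale) == exp2(s * scale' - m * scale'),  where scale' = scale * log2(e).
// A minimal scalar reference of one block update (hypothetical helper, for illustration only):

#include <algorithm>
#include <cmath>
#include <vector>

struct OnlineSoftmaxRow {
  float m = -INFINITY;  // running max of the scaled scores
  float l = 0.f;        // running sum of exp2(scaled score - m)
};

inline void update_row(OnlineSoftmaxRow& row, std::vector<float>& scores, float scale_log2e) {
  float m_new = row.m;
  for (float& s : scores) { s *= scale_log2e; m_new = std::max(m_new, s); }
  float correction = std::exp2(row.m - m_new);  // also rescales the O accumulator in the kernel
  row.l *= correction;
  for (float& s : scores) { s = std::exp2(s - m_new); row.l += s; }
  row.m = m_new;
}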
+/*************************************************************************************************** + * Copyright (C) 2025 Intel Corporation, All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Functor performing online softmax. 
+*/ + +#pragma once + +#include +#include "cutlass/cutlass.h" +#include "cutlass/epilogue/dispatch_policy.hpp" +#include "cutlass/epilogue/collective/collective_epilogue.hpp" +#include "cutlass/epilogue/collective/detail.hpp" +#include "cutlass/detail/layout.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace flash_attention { +namespace collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template class FlashChunkPrefillSoftmaxEpilogue { + static_assert(cutlass::detail::dependent_false, "Could not find an epilogue specialization."); +}; + + +template +class FlashChunkPrefillSoftmaxEpilogue { +public: + + // + // Type Aliases + // + using DispatchPolicy = epilogue::IntelXeXMX16; + using Element = Element_; + + static constexpr bool CausalMask = CausalMask_; + static constexpr bool LocalMask = LocalMask_; + + using GmemTiledCopyOut = void; + + // Host side epilogue arguments + struct Arguments { + Element const scale; + }; + + // Device side epilogue params + using Params = Arguments; + + // + // Methods + // + + static constexpr Params to_underlying_arguments(Arguments const &args) { + constexpr double kLog2e = 1.4426950408889634074; // log_2(e) = M_LOG2E + Element val = args.scale * static_cast(kLog2e); + return Params{val}; + } + + template + static size_t get_workspace_size() { + return 0; + } + + template + static cutlass::Status initialize_workspace() { + return Status::kSuccess; + } + + template + CUTLASS_HOST_DEVICE static bool can_implement() { + return true; + } + + CUTLASS_HOST_DEVICE + FlashChunkPrefillSoftmaxEpilogue(Params const ¶ms_) : params(params_) {} + + template + CUTLASS_DEVICE void scale_exp_log2(FragAcc &frag_s, FragMax const &max, FragSum &sum) { + auto g = compat::get_nd_item<1>().get_sub_group(); + const auto max_scale = max * params.scale; + CUTLASS_PRAGMA_UNROLL + for (int indx = 0; indx < Vec * FragsM; indx++) { + const auto max_scale_bcast = group_broadcast(g, max_scale, indx); + CUTLASS_PRAGMA_UNROLL + for (int z = 0; z < FragsN; z++) { + auto base_indx = indx + (z * Vec * FragsM); + if constexpr (LocalMask) { + if ((std::isinf(max_scale_bcast) && max_scale_bcast < 0) || + (std::isinf(frag_s(base_indx)) && frag_s(base_indx) < 0)) { + frag_s(base_indx) = 0.f; + // continue; + } else { + Element eq = frag_s(base_indx) - max_scale_bcast; + frag_s(base_indx) = sycl::native::exp2(eq); + } + } else { + Element eq = frag_s(base_indx) - max_scale_bcast; + frag_s(base_indx) = sycl::native::exp2(eq); + } + sum(indx) += frag_s(base_indx); + } + } + } + + template + CUTLASS_DEVICE void reduce_max(FragSrc &src, FragMax &max) { + auto sg = compat::get_nd_item<1>().get_sub_group(); + CUTLASS_PRAGMA_UNROLL + for (int indx = 0; indx < Vec * FragsM; indx++) { + auto maxptr = group_broadcast(sg, max, indx); + CUTLASS_PRAGMA_UNROLL + for (int z = 0; z < FragsN; z++) { + auto base_indx = indx + (z * Vec * FragsM); + maxptr = sycl::max(maxptr, src(base_indx)); + src(base_indx) *= params.scale; + } + maxptr = reduce_over_group(sg, maxptr, sycl::maximum<>()); + if (indx == sg.get_local_id()[0]) { + max = maxptr; + } + } + } + + template + CUTLASS_DEVICE void operator()(bool is_first, FragAcc &frag_s, FragMax &max, FragSum &sum, FragOut &out) { + auto max_prev = max; + using FragAccLayout = typename FragAcc::layout_type; + using FragOutLayout = typename FragOut::layout_type; + constexpr int Vec = get<0>(FragAccLayout{}.shape()); + 
constexpr int FragsM = get<1>(FragAccLayout{}.shape()); + constexpr int FragsNAcc = get<2>(FragAccLayout{}.shape()); + constexpr int FragsNOut = size(select<2,3>(FragOutLayout{}.shape())); + reduce_max(frag_s, max); + static_assert(Vec * FragsM % 8 == 0, " No. of attention rows per subgroup should be >= 1 MMA Atom worth of rows."); + if (!is_first) { + auto sg = compat::get_nd_item<1>().get_sub_group(); + Element max_scale{max * params.scale}; + Element exp_scale; + if constexpr (LocalMask) { + if ((std::isinf(max_scale) && max_scale < 0) || (std::isinf(max_prev) && max_prev < 0)) { + exp_scale = 0.f; + } else { + exp_scale = sycl::native::exp2(max_prev * params.scale - max_scale); + } + } else { + exp_scale = sycl::native::exp2(max_prev * params.scale - max_scale); + } + + CUTLASS_PRAGMA_UNROLL + for (int indx = 0; indx < Vec * FragsM; indx++) { + auto max_scale_bcast = group_broadcast(sg, max_scale, indx); + auto exp_scale_bcast = group_broadcast(sg, exp_scale, indx); + sum(indx) *= exp_scale_bcast; + CUTLASS_PRAGMA_UNROLL + for (int z = 0; z < FragsNAcc; z++) { + auto base_indx = indx + (z * Vec * FragsM); + if constexpr (LocalMask) { + if ((std::isinf(max_scale_bcast) && max_scale_bcast < 0) || + (std::isinf(frag_s(base_indx)) && frag_s(base_indx) < 0)) { + frag_s(base_indx) = 0.f; + // continue; + } else { + Element eq = frag_s(base_indx) - max_scale_bcast; + frag_s(base_indx) = sycl::native::exp2(eq); + } + } else { + Element eq = frag_s(base_indx) - max_scale_bcast; + frag_s(base_indx) = sycl::native::exp2(eq); + } + sum(indx) += frag_s(base_indx); + } + CUTLASS_PRAGMA_UNROLL + for (int z = 0; z < FragsNOut; z++) { + auto base_indx = indx + (z * Vec * FragsM); + out(base_indx) *= exp_scale_bcast; + } + } + } else { + scale_exp_log2(frag_s, max, sum); + } + } + Params params; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace collective +} // namespace flash_attention +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/applications/flash_attention_v2/collective/xe_flash_attn_decode_epilogue.hpp b/applications/flash_attention_v2/collective/xe_flash_attn_decode_epilogue.hpp index 6cf23be44d..f9b998e03d 100644 --- a/applications/flash_attention_v2/collective/xe_flash_attn_decode_epilogue.hpp +++ b/applications/flash_attention_v2/collective/xe_flash_attn_decode_epilogue.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -167,8 +168,8 @@ class FlashDecodeEpilogue>; - auto sg = syclcompat::get_nd_item<1>().get_sub_group(); - auto group = syclcompat::get_nd_item<1>().get_group(); + auto sg = compat::get_nd_item<1>().get_sub_group(); + auto group = compat::get_nd_item<1>().get_group(); const int sg_local_id = sg.get_local_id()[0]; const int sg_group_id = sg.get_group_id()[0]; diff --git a/applications/flash_attention_v2/collective/xe_flash_attn_decode_mma.hpp b/applications/flash_attention_v2/collective/xe_flash_attn_decode_mma.hpp index f840021033..6f83b63b1b 100644 --- a/applications/flash_attention_v2/collective/xe_flash_attn_decode_mma.hpp +++ b/applications/flash_attention_v2/collective/xe_flash_attn_decode_mma.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -226,7 +227,7 @@ struct FlashDecodeMma, ProblemShapeType_, Ele auto thr_copy_K = gmem_tiled_copy_k.get_slice(thread_idx); // Instantiate the MMA object TiledMmaQK tiled_mma; - auto sg = syclcompat::get_nd_item<1>().get_sub_group(); + auto sg = compat::get_nd_item<1>().get_sub_group(); auto first_thread_in_sg_idx = sg.get_group_id()[0] * DispatchPolicy::SubgroupSize; // For Normal Attention, K matrix tile_id = subgroup_id (cache and new both) // For Paged Attention, K matrix tile_id = page_table[subgroup_id] (cache, new keys follow normal attention) @@ -315,7 +316,7 @@ struct FlashDecodeMma, ProblemShapeType_, Ele int thread_idx = static_cast(ThreadIdxX()); // Instantiate the MMA object TiledMmaPV tiled_mma; - auto sg = syclcompat::get_nd_item<1>().get_sub_group(); + auto sg = compat::get_nd_item<1>().get_sub_group(); auto thread_mma = tiled_mma.get_slice(0); // convert X*512|1024 to 32*64*x*8|16 and use (_, sg.get_group_id()[0] / ATOM_N) to index in the (x,8|16) coordinate Tensor gV_ = take<0,3>(local_tile(gV, select<1,2>(SubgroupTileShapePV{}), make_coord(_, kv_tile_idx))); diff --git a/applications/flash_attention_v2/collective/xe_flash_attn_decode_softmax_epilogue.hpp b/applications/flash_attention_v2/collective/xe_flash_attn_decode_softmax_epilogue.hpp index af4b4277e6..60d0ad4b88 100644 --- a/applications/flash_attention_v2/collective/xe_flash_attn_decode_softmax_epilogue.hpp +++ b/applications/flash_attention_v2/collective/xe_flash_attn_decode_softmax_epilogue.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -106,7 +107,7 @@ class FlashDecodeSoftmaxEpilogue template CUTLASS_DEVICE void scale_exp_log2(FragAcc &frag_s, FragMax const &max, FragSum &sum) { - auto sg = syclcompat::get_nd_item<1>().get_sub_group(); + auto sg = compat::get_nd_item<1>().get_sub_group(); const auto max_scale = max * params.scale; const auto max_scale_bcast = group_broadcast(sg, max_scale, 0); CUTLASS_PRAGMA_UNROLL @@ -119,8 +120,8 @@ class FlashDecodeSoftmaxEpilogue template CUTLASS_DEVICE void reduce_max(FragSrc &src, STensorMax &stensor_max, Element& max_val) { - auto sg = syclcompat::get_nd_item<1>().get_sub_group(); - auto group = syclcompat::get_nd_item<1>().get_group(); + auto sg = compat::get_nd_item<1>().get_sub_group(); + auto group = compat::get_nd_item<1>().get_group(); const int sg_group_id = sg.get_group_id()[0]; const int sg_local_id = sg.get_local_id()[0]; @@ -162,7 +163,7 @@ class FlashDecodeSoftmaxEpilogue reduce_max(frag_s, shmem_tensor_max, max_val); if (!is_first) { - auto sg = syclcompat::get_nd_item<1>().get_sub_group(); + auto sg = compat::get_nd_item<1>().get_sub_group(); const int sg_group_id = sg.get_group_id()[0]; const int sg_local_id = sg.get_local_id()[0]; const int sg_size = sg.get_local_range()[0]; diff --git a/applications/flash_attention_v2/collective/xe_flash_attn_prefill_epilogue.hpp b/applications/flash_attention_v2/collective/xe_flash_attn_prefill_epilogue.hpp index a6a3a0e6f1..e8acb77509 100644 --- a/applications/flash_attention_v2/collective/xe_flash_attn_prefill_epilogue.hpp +++ b/applications/flash_attention_v2/collective/xe_flash_attn_prefill_epilogue.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -162,7 +163,7 @@ class FlashPrefillEpilogue(FragOutLayout{}); constexpr int FragsN = size(select<2,3>(shape(FragOutLayout{}))); - auto g = syclcompat::get_nd_item<1>().get_sub_group(); + auto g = compat::get_nd_item<1>().get_sub_group(); auto out_reg = make_tensor(static_cast(out).data() , Shape, Int, Int>{}); CUTLASS_PRAGMA_UNROLL diff --git a/applications/flash_attention_v2/collective/xe_flash_attn_prefill_epilogue_cachedKV.hpp b/applications/flash_attention_v2/collective/xe_flash_attn_prefill_epilogue_cachedKV.hpp index f115bf6005..0a91165712 100644 --- a/applications/flash_attention_v2/collective/xe_flash_attn_prefill_epilogue_cachedKV.hpp +++ b/applications/flash_attention_v2/collective/xe_flash_attn_prefill_epilogue_cachedKV.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -163,7 +164,7 @@ class FlashPrefillCachedEpilogue(FragOutLayout{}); constexpr int FragsN = size(select<2,3>(shape(FragOutLayout{}))); - auto g = syclcompat::get_nd_item<1>().get_sub_group(); + auto g = compat::get_nd_item<1>().get_sub_group(); auto out_reg = make_tensor(static_cast(out).data() , Shape, Int, Int>{}); CUTLASS_PRAGMA_UNROLL diff --git a/applications/flash_attention_v2/collective/xe_flash_attn_prefill_mma.hpp b/applications/flash_attention_v2/collective/xe_flash_attn_prefill_mma.hpp index fb0049622f..6dcfe4bfa8 100644 --- a/applications/flash_attention_v2/collective/xe_flash_attn_prefill_mma.hpp +++ b/applications/flash_attention_v2/collective/xe_flash_attn_prefill_mma.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -197,7 +198,7 @@ struct FlashPrefillMma, ProblemShapeType_, El // Instantiate the MMA object TiledMmaQK tiled_mma; // To make all threads in a warp have the same global tensors pass in the index of thread 0 in each warp - auto sg = syclcompat::get_nd_item<1>().get_sub_group(); + auto sg = compat::get_nd_item<1>().get_sub_group(); auto first_thread_in_sg_idx = sg.get_group_id()[0] * DispatchPolicy::SubgroupSize; auto thread_mma_k = tiled_mma.get_slice(0); auto thread_mma_q = tiled_mma.get_slice(first_thread_in_sg_idx); @@ -282,7 +283,7 @@ struct FlashPrefillMma, ProblemShapeType_, El TiledMmaPV tiled_mma; // Tile GV to the shape of <64,64> and loop over the HeadSize/64 to avoid Register spill Tensor gV_ = take<0,3>(local_tile(gV, select<1,2>(TileShapePV{}), make_coord(_, _))); - auto sg = syclcompat::get_nd_item<1>().get_sub_group(); + auto sg = compat::get_nd_item<1>().get_sub_group(); auto first_thread_in_sg_idx = sg.get_group_id()[0] * DispatchPolicy::SubgroupSize; auto thread_mma = tiled_mma.get_slice(first_thread_in_sg_idx); Tensor tCgV = thread_mma.partition_B(gV_); diff --git a/applications/flash_attention_v2/collective/xe_flash_attn_prefill_mma_cachedKV.hpp b/applications/flash_attention_v2/collective/xe_flash_attn_prefill_mma_cachedKV.hpp index 75ecdc9359..f42fd204dc 100644 --- a/applications/flash_attention_v2/collective/xe_flash_attn_prefill_mma_cachedKV.hpp +++ b/applications/flash_attention_v2/collective/xe_flash_attn_prefill_mma_cachedKV.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -217,7 +218,7 @@ struct FlashPrefillCachedMma, ProblemShapeTyp // Instantiate the MMA object TiledMmaQK tiled_mma; // To make all threads in a warp have the same global tensors pass in the index of thread 0 in each warp - auto sg = syclcompat::get_nd_item<1>().get_sub_group(); + auto sg = compat::get_nd_item<1>().get_sub_group(); auto first_thread_in_sg_idx = sg.get_group_id()[0] * DispatchPolicy::SubgroupSize; auto thread_mma_k = tiled_mma.get_slice(0); auto thread_mma_q = tiled_mma.get_slice(first_thread_in_sg_idx); @@ -285,7 +286,7 @@ struct FlashPrefillCachedMma, ProblemShapeTyp TiledMmaPV tiled_mma; // Tile GV to the shape of <64,64> and loop over the HeadSize/64 to avoid Register spill Tensor gV_ = take<0,3>(local_tile(gV, select<1,2>(TileShapePV{}), make_coord(_, _))); - auto sg = syclcompat::get_nd_item<1>().get_sub_group(); + auto sg = compat::get_nd_item<1>().get_sub_group(); auto first_thread_in_sg_idx = sg.get_group_id()[0] * DispatchPolicy::SubgroupSize; auto thread_mma = tiled_mma.get_slice(first_thread_in_sg_idx); Tensor tCgV = thread_mma.partition_B(gV_); diff --git a/applications/flash_attention_v2/collective/xe_flash_attn_prefill_softmax_epilogue.hpp b/applications/flash_attention_v2/collective/xe_flash_attn_prefill_softmax_epilogue.hpp index 4e5694acb8..e1fd8086d3 100644 --- a/applications/flash_attention_v2/collective/xe_flash_attn_prefill_softmax_epilogue.hpp +++ b/applications/flash_attention_v2/collective/xe_flash_attn_prefill_softmax_epilogue.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -106,7 +107,7 @@ class FlashPrefillSoftmaxEpilogue template CUTLASS_DEVICE void scale_exp_log2(FragAcc &frag_s, FragMax const &max, FragSum &sum) { - auto g = syclcompat::get_nd_item<1>().get_sub_group(); + auto g = compat::get_nd_item<1>().get_sub_group(); const auto max_scale = max * params.scale; CUTLASS_PRAGMA_UNROLL for (int indx = 0; indx < Vec * FragsM; indx++) { @@ -123,7 +124,7 @@ class FlashPrefillSoftmaxEpilogue template CUTLASS_DEVICE void reduce_max(FragSrc &src, FragMax &max) { - auto g = syclcompat::get_nd_item<1>().get_sub_group(); + auto g = compat::get_nd_item<1>().get_sub_group(); CUTLASS_PRAGMA_UNROLL for (int indx = 0; indx < Vec * FragsM; indx++) { auto maxptr = group_broadcast(g, max, indx); @@ -152,7 +153,7 @@ class FlashPrefillSoftmaxEpilogue reduce_max(frag_s, max); static_assert(Vec * FragsM % 8 ==0, " No. 
of attention rows per subgroup should be >= 1 MMA Atom worth of rows."); if (!is_first) { - auto g = syclcompat::get_nd_item<1>().get_sub_group(); + auto g = compat::get_nd_item<1>().get_sub_group(); Element max_scale{max * params.scale}; Element exp_scale{sycl::native::exp2(max_prev * params.scale - max_scale)}; CUTLASS_PRAGMA_UNROLL diff --git a/applications/flash_attention_v2/kernel/tile_scheduler.hpp b/applications/flash_attention_v2/kernel/tile_scheduler.hpp index 951d1784bf..478e9c3de1 100644 --- a/applications/flash_attention_v2/kernel/tile_scheduler.hpp +++ b/applications/flash_attention_v2/kernel/tile_scheduler.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -192,7 +193,7 @@ struct XeFlashPersistentTileScheduler { template static dim3 get_grid_shape(Params const& params) { - auto queue = syclcompat::get_default_queue(); + auto queue = compat::get_default_queue(); auto dev = queue.get_device(); const size_t maxSubgroups = dev.template get_info(); diff --git a/applications/flash_attention_v2/kernel/tile_scheduler_cachedKV.hpp b/applications/flash_attention_v2/kernel/tile_scheduler_cachedKV.hpp index f664a349c5..f7fc450b4b 100644 --- a/applications/flash_attention_v2/kernel/tile_scheduler_cachedKV.hpp +++ b/applications/flash_attention_v2/kernel/tile_scheduler_cachedKV.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -141,7 +142,7 @@ struct XeFlashPersistentTileScheduler { template static dim3 get_grid_shape(Params const& params) { - auto queue = syclcompat::get_default_queue(); + auto queue = compat::get_default_queue(); auto dev = queue.get_device(); const size_t maxSubgroups = dev.template get_info(); diff --git a/applications/flash_attention_v2/kernel/tile_scheduler_chunk_prefill.hpp b/applications/flash_attention_v2/kernel/tile_scheduler_chunk_prefill.hpp new file mode 100644 index 0000000000..6d429d52bc --- /dev/null +++ b/applications/flash_attention_v2/kernel/tile_scheduler_chunk_prefill.hpp @@ -0,0 +1,238 @@ +/*************************************************************************************************** + * Copyright (C) 2025 Intel Corporation, All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/fast_math.h" +#include "cutlass/kernel_hardware_info.h" + +namespace cutlass::flash_attention { + +namespace kernel { + +struct XeFlashIndividualTileScheduler { + + struct Params { + dim3 grid; + // FastDivmod divmod_num_heads; + }; + + bool valid_ = true; + Params params; + + CUTLASS_DEVICE + XeFlashIndividualTileScheduler(Params const ¶ms) : params(params) {} + + template + static Params to_underlying_arguments(ProblemSize const &problem_size, + KernelHardwareInfo hw_info, + TileShape const &tile_shape) { + using namespace cute; + // problem_size = [batch, num_heads_q , num_heads_kv, seq_len_qo, + // seq_len_kv, seq_len_kv_cache, head_size_qk, head_size_vo] + + // dim3 grid(size(ceil_div(shape<7>(problem_size), shape<1>(tile_shape))), + // size(ceil_div(shape<3>(problem_size), shape<0>(tile_shape))), + // size(shape<0>(problem_size) * shape<1>(problem_size))); + + int batch = size<0>(problem_size); + int num_heads_q = size<1>(problem_size); + int num_heads_kv = size<2>(problem_size); + int seq_len_qo = + size<3>(problem_size); // if varlen seq_len_qo = max_seq_len + int seq_len_kv = + size<4>(problem_size); // if varlen seq_len_qo = max_seq_len + int seq_len_kv_cache = size<5>(problem_size); + int head_size_qk = size<6>(problem_size); + int head_size_vo = size<7>(problem_size); + auto group_heads_q = num_heads_q / num_heads_kv; + + dim3 grid(size(ceil_div(shape<3>(problem_size), shape<0>(tile_shape))), + size(shape<1>(problem_size)), size(shape<0>(problem_size))); + return Params{grid}; + } + + + template static dim3 get_grid_shape(Params const ¶ms) { + return params.grid; + } + + CUTLASS_DEVICE + bool is_valid() { return valid_; } + + CUTLASS_DEVICE + auto get_block_coord() { + using namespace cute; + return make_coord(BlockIdxX(), BlockIdxY(), BlockIdxZ()); + } + + CUTLASS_DEVICE + XeFlashIndividualTileScheduler &operator++() { + valid_ = false; + return *this; + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +struct XeFlashPersistentTileScheduler { + + struct Params { + int num_blocks; + FastDivmod divmod_seq_len_block; + FastDivmod divmod_head_size_block; + FastDivmod divmod_num_heads; + + KernelHardwareInfo hw_info; + }; + + int block_idx = 0; + Params params; + + CUTLASS_DEVICE + XeFlashPersistentTileScheduler(Params const ¶ms) + : block_idx(BlockIdxX()), params(params) {} + + template + static Params to_underlying_arguments(ProblemSize const &problem_size, + KernelHardwareInfo hw_info, + TileShape const &tile_shape) { + using namespace cute; + // Get SM count if needed, 
otherwise use user supplied SM count + int sm_count = hw_info.sm_count; + if (sm_count <= 0) { + CUTLASS_TRACE_HOST( + " WARNING: Arguments do not include a valid SM count.\n" + " For optimal performance, populate the arguments " + "KernelHardwareInfo struct with the SM count."); + sm_count = KernelHardwareInfo::query_device_multiprocessor_count( + hw_info.device_id); + } + + CUTLASS_TRACE_HOST( + "to_underlying_arguments(): Setting persistent grid SM count to " + << sm_count); + hw_info.sm_count = sm_count; + + // problem_size = [batch, num_heads_q, numhead_kv, seq_len_qo, seq_len_kv, + // seq_len_kv_cache, head_size_qk, head_size_vo] + int num_head_size_blocks = + size(ceil_div(shape<7>(problem_size), shape<1>(tile_shape))); + int num_seq_len_blocks = + size(ceil_div(shape<3>(problem_size), shape<0>(tile_shape))); + int num_blocks = num_seq_len_blocks * num_head_size_blocks * + size(shape<0>(problem_size) * shape<1>(problem_size)); + + return Params{num_blocks, + {num_seq_len_blocks}, + {num_head_size_blocks}, + {shape<1>(problem_size)}, + hw_info}; + } + + template static dim3 get_grid_shape(Params const ¶ms) { + auto queue = compat::get_default_queue(); + auto dev = queue.get_device(); + const size_t maxSubgroups = + dev.template get_info(); + // TODO (Codeplay): revert this back to std::min(params.num_blocks, + // params.hw_info.sm_count) once performance issue is fixed. + dim3 grid( + std::min(params.num_blocks, + ceil_div(params.hw_info.sm_count * maxSubgroups, Num_SGs)), + 1, 1); + return grid; + } + + CUTLASS_DEVICE + bool is_valid() { return block_idx < params.num_blocks; } + + CUTLASS_DEVICE + auto get_block_coord() { + using namespace cute; + int block_decode = block_idx; + int seq_len_block, head_size_block, bidh; + params.divmod_head_size_block(block_decode, head_size_block, block_decode); + params.divmod_seq_len_block(block_decode, seq_len_block, block_decode); + params.divmod_num_heads(block_decode, bidh, block_decode); + return make_coord(head_size_block, seq_len_block, block_decode, bidh); + } + + CUTLASS_DEVICE + XeFlashPersistentTileScheduler &operator++() { + block_idx += GridDimX(); + return *this; + } +}; + +//////////////////////////////////////////////////////////////////////////////// +} // namespace kernel + +struct IndividualScheduler {}; +struct PersistentScheduler {}; + +namespace detail { + +template +struct TileSchedulerSelector { + static_assert(cutlass::detail::dependent_false, + "Could not select a tile scheduler for given parameters."); +}; + +// Default (void) maps to XeFlashIndividualTileScheduler +template +struct TileSchedulerSelector< + void, ArchTag, + cute::enable_if_t>> { + using Scheduler = + typename TileSchedulerSelector::Scheduler; +}; + +template +struct TileSchedulerSelector< + IndividualScheduler, ArchTag, + cute::enable_if_t>> { + using Scheduler = kernel::XeFlashIndividualTileScheduler; +}; + +template +struct TileSchedulerSelector< + PersistentScheduler, ArchTag, + cute::enable_if_t>> { + using Scheduler = kernel::XeFlashPersistentTileScheduler; +}; +} // namespace detail + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::flash_attention diff --git a/applications/flash_attention_v2/kernel/xe_chunk_prefill.hpp b/applications/flash_attention_v2/kernel/xe_chunk_prefill.hpp new file mode 100644 index 0000000000..9e6ba3c8f5 --- /dev/null +++ b/applications/flash_attention_v2/kernel/xe_chunk_prefill.hpp @@ -0,0 +1,677 @@ 
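A note on the persistent scheduler added above, before the new kernel file introduced by the next hunk: get_block_coord() peels the head-size, sequence-length, head, and batch coordinates off one flattened block index with successive divmods. The standalone sketch below mirrors that decomposition with plain integer division standing in for cutlass::FastDivmod; the tile counts are illustrative example values, not taken from this change.

#include <cstdio>

// Hypothetical host-side walk of the flat block index used by the persistent
// scheduler above; '/' and '%' stand in for cutlass::FastDivmod.
int main() {
  const int num_head_size_blocks = 2;  // e.g. ceil_div(head_size_vo, tile_N), example value
  const int num_seq_len_blocks   = 4;  // e.g. ceil_div(seq_len_qo, tile_M), example value
  const int num_heads_q          = 8;  // example value
  const int batch                = 3;  // example value
  const int num_blocks = num_head_size_blocks * num_seq_len_blocks * num_heads_q * batch;

  for (int block_idx = 0; block_idx < num_blocks; block_idx += num_blocks / 4) {
    int rem = block_idx;
    const int head_size_block = rem % num_head_size_blocks; rem /= num_head_size_blocks;
    const int seq_len_block   = rem % num_seq_len_blocks;   rem /= num_seq_len_blocks;
    const int head            = rem % num_heads_q;          rem /= num_heads_q;
    const int batch_idx       = rem;  // whatever remains indexes the batch
    std::printf("block %3d -> (head_size=%d, seq=%d, head=%d, batch=%d)\n",
                block_idx, head_size_block, seq_len_block, head, batch_idx);
  }
  return 0;
}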
+/*************************************************************************************************** + * Copyright (C) 2025 Intel Corporation, All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice,this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once +#include "cutlass/cutlass.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/gemm/gemm.h" +#include "cutlass/kernel_hardware_info.hpp" + +#include "flash_attention_v2/collective/xe_flash_attn_chunk_prefill_mma.hpp" +namespace cutlass::flash_attention::kernel { + +template +class FMHAPrefillChunk; +/////////////////////////////////////////////////////////////////////////////// +template +class FMHAPrefillChunk { + +public: + // + // Type Aliases + // + using ProblemShape = ProblemShape_; + + // ProblemShape: + static_assert( + rank(ProblemShape{}) == 8, + "ProblemShape{} should be "); + // Mainloop derived types + using CollectiveMainloop = CollectiveMainloop_; + using TileShapeQK = typename CollectiveMainloop::TileShapeQK; + using TileShapePV = typename CollectiveMainloop::TileShapePV; + using TiledMmaQK = typename CollectiveMainloop::TiledMmaQK; + using TiledMmaPV = typename CollectiveMainloop::TiledMmaPV; + using ArchTag = typename CollectiveMainloop::ArchTag; + using ElementQ = typename CollectiveMainloop::ElementQ; + using StrideQ = typename CollectiveMainloop::StrideQ; + using ElementK = typename CollectiveMainloop::ElementK; + using StrideK = typename CollectiveMainloop::StrideK; + using ElementV = typename CollectiveMainloop::ElementV; + using StrideV = typename CollectiveMainloop::StrideV; + using DispatchPolicy = typename CollectiveMainloop::DispatchPolicy; + using ElementAccumulator = typename CollectiveMainloop::ElementAccumulator; + using MainloopArguments = typename CollectiveMainloop::Arguments; + using MainloopParams = typename CollectiveMainloop::Params; + + using CollectiveSoftmaxEpilogue = CollectiveSoftmaxEpilogue_; + using 
SoftmaxArguments = typename CollectiveSoftmaxEpilogue::Arguments; + using SoftmaxParams = typename CollectiveSoftmaxEpilogue::Params; + + static_assert(cute::is_void_v or + cute::is_same_v or + cute::is_same_v, + "Unsupported TileScheduler for Intel Xe."); + using TileSchedulerTag = TileScheduler_; + using TileScheduler = + typename detail::TileSchedulerSelector::Scheduler; + using TileSchedulerParams = typename TileScheduler::Params; + + // Epilogue derived types + using CollectiveEpilogue = CollectiveEpilogue_; + using ElementO = typename CollectiveEpilogue::ElementO; + using StrideO = typename CollectiveEpilogue::StrideO; + using ElementLSE = typename CollectiveEpilogue::ElementLSE; + using EpilogueArguments = typename CollectiveEpilogue::Arguments; + using EpilogueParams = typename CollectiveEpilogue::Params; + using TileShapeOutput = typename CollectiveEpilogue::TileShapeOutput; + using TiledMmaOutput = typename CollectiveEpilogue::TiledMmaOutput; + + static_assert( + cute::is_same_v, + "Mainloop and epilogue do not agree on accumulator value type."); + // MSVC requires the cast to fix a warning-as-error. + static constexpr int SharedStorageSize = 0; + + static constexpr bool CausalMask = CollectiveMainloop::CausalMask; + static constexpr bool LocalMask = CollectiveMainloop::LocalMask; + + static_assert(!(CausalMask && LocalMask), "Cannot be both causal and local"); + static constexpr bool PagedKV = CollectiveMainloop::PagedKV; + + + static constexpr int SubgroupSize = + CollectiveMainloop::SubgroupSize; // sub_group size + static constexpr uint32_t MaxThreadsPerBlock = + CollectiveMainloop::MaxThreadsPerBlock; + using MmaAtomShape = typename CollectiveMainloop::MmaAtomShape; // 8,16,16 + + static constexpr int QK_BLK_M = CollectiveMainloop::QK_BLK_M; + static constexpr int QK_BLK_N = CollectiveMainloop::QK_BLK_N; + static constexpr int QK_BLK_K = CollectiveMainloop::QK_BLK_K; + + static constexpr int QK_ATOM_N = CollectiveMainloop::QK_ATOM_N; + static constexpr int QK_ATOM_K = CollectiveMainloop::QK_ATOM_K; + + static constexpr int QK_SG_M = CollectiveMainloop::QK_SG_M; + + static constexpr int Epilogue_BLK_N = get<1>(TileShapeOutput{}); + static constexpr int Epilogue_BLK_K = get<2>(TileShapeOutput{}); + + static constexpr int PV_ATOM_M = CollectiveMainloop::PV_ATOM_M; + static constexpr int PV_ATOM_N = CollectiveMainloop::PV_ATOM_N; + static constexpr int PV_ATOM_K = CollectiveMainloop::PV_ATOM_K; + + static constexpr auto Num_SGs = PV_ATOM_N * PV_ATOM_M * PV_ATOM_K; + static constexpr int Vec = CollectiveMainloop::Vec; + static constexpr int FragsM = CollectiveMainloop::FragsM; + // The FragsN here used for Creation of S matrix so we use the FragsN for S + // shape + static constexpr int FragsN = CollectiveMainloop::FragsNS; + + static constexpr int VSlicer = + get<1>(TileShapeOutput{}) / + (get<1>(TileShapePV{}) * PV_ATOM_N); // ceil_div(FragsNOut,FragsNS); + using AccumeShape = decltype(make_shape( + Int{}, Int{}, get<1>(TileShapePV{}) / get<1>(MmaAtomShape()), + Int{})); + + static constexpr bool is_var_len = CollectiveMainloop::is_var_len; + // Kernel level shared memory storage + struct SharedStorage { + using EpilogueTensorStorage = typename CollectiveEpilogue::TensorStorage; + EpilogueTensorStorage epilogue; + }; + + // Device side arguments + struct Arguments { + gemm::GemmUniversalMode mode{}; + ProblemShape problem_shape{}; + MainloopArguments mainloop{}; + SoftmaxArguments softmax{}; + EpilogueArguments epilogue{}; + KernelHardwareInfo hw_info{}; + }; + + // Kernel 
entry point API + struct Params { + gemm::GemmUniversalMode mode; + ProblemShape problem_shape; + MainloopParams mainloop; + SoftmaxParams softmax; + EpilogueParams epilogue; + TileSchedulerParams scheduler; + }; + + // + // Methods + // + + // Convert to underlying arguments. In this case, a simple copy for the + // aliased type. + static Params to_underlying_arguments(Arguments const &args, + void *workspace) { + (void)workspace; + return {args.mode, + args.problem_shape, + CollectiveMainloop::to_underlying_arguments( + args.problem_shape, args.mainloop, workspace), + CollectiveSoftmaxEpilogue::to_underlying_arguments(args.softmax), + CollectiveEpilogue::to_underlying_arguments( + args.problem_shape, args.epilogue, workspace), + TileScheduler::to_underlying_arguments( + args.problem_shape, args.hw_info, TileShapeOutput{})}; + } + + static bool can_implement(Arguments const &args) { + bool mode_implementable = args.mode == gemm::GemmUniversalMode::kGemm or + (args.mode == gemm::GemmUniversalMode::kBatched && + rank(ProblemShape{}) == 4); + return mode_implementable; + } + + static int get_workspace_size(Arguments const &args) { return 0; } + + static cutlass::Status + initialize_workspace(Arguments const &args, void *workspace = nullptr, + cudaStream_t stream = nullptr, + CudaHostAdapter *cuda_adapter = nullptr) { + return Status::kSuccess; + } + + static dim3 get_grid_shape(Params const ¶ms) { + return TileScheduler::template get_grid_shape(params.scheduler); + } + + static dim3 get_block_shape() { return dim3(MaxThreadsPerBlock, 1, 1); } + + CUTLASS_DEVICE + Shape + get_sequence_length_shape(ProblemShape const &problem_shape, + int const &batch) { + if constexpr (is_var_len) { + return cutlass::fmha::collective::apply_variable_length( + select<3, 4, 5>(problem_shape), batch); + } else { + return select<3, 4, 5>(problem_shape); + } + } + + CUTLASS_DEVICE + void operator()(Params const ¶ms, char *smem_buf) { + SharedStorage &shared_storage = + *reinterpret_cast(smem_buf); + // Preconditions + CUTE_STATIC_ASSERT(is_static::value); + CUTE_STATIC_ASSERT(is_static::value); + // Separate out problem shape for convenience + + // "ProblemShape{} should be "); + auto batch = get<0>(params.problem_shape); + auto num_heads_q = get<1>(params.problem_shape); + auto num_heads_kv = get<2>(params.problem_shape); + + auto &head_size_qk = get<6>(params.problem_shape); + auto &head_size_vo = get<7>(params.problem_shape); + // Preconditions + static_assert(cute::rank(StrideQ{}) == 3, + "StrideQ must be rank-3: [seq_len_qo, head_size_qk, batch * " + "num_heads_q]."); + static_assert(cute::rank(StrideK{}) == 3, + "StrideK must be rank-3: [head_size_qk, seq_len_kv, batch * " + "num_heads_kv]."); + static_assert(cute::rank(StrideV{}) == 3, + "StrideV must be rank-3: [seq_len_kv, head_size_vo, batch * " + "num_heads_kv]."); + + int thread_idx = int(ThreadIdxX()); + int sub_group_id = thread_idx / SubgroupSize; + + TileScheduler tile_scheduler{params.scheduler}; + CUTLASS_PRAGMA_NO_UNROLL + for (; tile_scheduler.is_valid(); ++tile_scheduler) { + auto blk_coord = + tile_scheduler + .get_block_coord(); // head_size_blk_idx, seq_len_blk_idx, + // batch_blk_idx, num_heads_blk_idx + + auto blk_m_coord = get<0>(blk_coord); // seq_len_blk_idx + auto blk_n_coord = 0; // nums_head_blk_idx + auto q_head_coord = get<1>(blk_coord); // q_heads_idx + auto batch_coord = get<2>(blk_coord); // batch_blk_idx + + // For variable sequence length case, batch is considered to be 1 (same + // as group gemm). 
For fixed sequence length case, the l_coord is the + // weighted sum of both batch_coord and num_heads_coord. Flash Attention + // implementation combines batch and num_heads to calculate the total + // batch_size. iff is_var_len: batch_size = num_heads (as each batch + // would have it's own seq_len_qo and seq_len_kv) iff !is_var_len: + // batch_size = batch * num_heads + // auto blk_l_coord = q_head_coord; + + // Get problem shape for the current batch_blk_idx. For variable + // sequence length, it loads the sequence length from Global memory for + // the given batch_blk_idx and returns the appropriate problem_shape. + // For fixed sequence length, sequence_length_shape == select<3, 4, + // 5>(params.problem_shape). sequence_length_shape = [batch, + // num_heads_q, num_heads_kv, seq_len_qo, seq_len_kv, seq_len_kv_cache, + // head_size_qk, head_size_vo] + auto sequence_length_shape = + get_sequence_length_shape(params.problem_shape, batch_coord); + + auto [seq_len_qo, seq_len_kv, seq_len_kv_cache] = sequence_length_shape; + // int seq_len_kv_total = seq_len_kv_cache + seq_len_kv; + // For variable sequence length case, batch is considered to be 1 (same + // as group gemm). For fixed sequence length case, the l_coord is the + // weighted sum of both batch_coord and num_heads_coord. Flash Attention + // implementation combines batch and num_heads to calculate the total + // batch_size. iff is_var_len: batch_size = num_heads (as each batch + // would have it's own seq_len_qo and seq_len_kv) iff !is_var_len: + // batch_size = batch * num_heads + + // Calculate the seq_len_idx (blk_m_coord * get<0>(TileShapeOutput{})) + // and check if it is still within bounds of the actual seq_len_qo + // (get<0>(sequence_length_shape)). + if (blk_m_coord * get<0>(TileShapeOutput{}) >= + seq_len_qo) { + continue; + } + + const int seq_coord = + cute::min(seq_len_qo, (blk_m_coord * QK_BLK_M + (sub_group_id / PV_ATOM_N) * QK_SG_M) % + seq_len_qo); + auto offset = cute::min(seq_len_qo, seq_len_kv); //(2048, 1024) + auto discard_seq_coord = seq_len_qo - offset; // 1024 + auto full_tile_offset = seq_len_kv - offset; // 0 + + const int seq_len = + CausalMask + ? full_tile_offset + + cute::min(seq_len_kv, seq_coord - discard_seq_coord) + + QK_SG_M + : seq_len_kv; + + const int kv_splits_new = cute::ceil_div(seq_len, QK_BLK_N); + const int kv_splits_cache = cute::ceil_div(seq_len_kv_cache, QK_BLK_N); + const int kv_splits = kv_splits_cache + kv_splits_new; + + int tiles_per_page = params.mainloop.page_size / QK_BLK_N; + + if (CausalMask && seq_coord < discard_seq_coord) { // 1024 =0 + continue; + } + + Tensor mQ_mkl = cute::get_xe_tensor( + make_shape(seq_len_qo, head_size_qk, 1)); //(m,k,l) + + Tensor mK_nkl = cute::get_xe_tensor( + make_shape(seq_len_kv, head_size_qk, 1)); //(n,k,l) + Tensor mV_nkl = cute::get_xe_tensor( + make_shape(head_size_vo, seq_len_kv, 1)); //(n,k,l) + Tensor mK_cache_nkl = cute::get_xe_tensor( + make_shape(seq_len_kv_cache, head_size_qk, 1)); // (n_cache,k,l) + Tensor mV_cache_nkl = cute::get_xe_tensor( + make_shape(head_size_vo, seq_len_kv_cache, 1)); // (n_cache,k,l) + + // block_size and head_size are the same size. So no coord is needed. 
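The split bookkeeping above decides how many KV tiles each work-group walks: cached tiles first, then the newly appended tiles, with the causal case trimming the new-KV span to roughly the diagonal. A small arithmetic sketch of those formulas follows; QK_BLK_N, QK_SG_M and the sequence lengths are illustrative values, not the kernel's actual configuration, and the kernel body continues after it.

#include <algorithm>
#include <cstdio>

// Worked example of the kv-split bookkeeping in the mainloop above, for one
// work-group tile under a causal mask. All sizes are illustrative.
int main() {
  const int QK_BLK_N = 64, QK_SG_M = 16;         // example tile sizes
  const int seq_len_qo = 512, seq_len_kv = 1024; // example sequence lengths
  const int seq_len_kv_cache = 256;              // example cached KV length
  const int seq_coord = 128;                     // row offset handled by this subgroup

  auto ceil_div = [](int a, int b) { return (a + b - 1) / b; };

  // Causal bookkeeping: only KV columns up to (roughly) the diagonal are needed.
  const int offset            = std::min(seq_len_qo, seq_len_kv);  // 512
  const int discard_seq_coord = seq_len_qo - offset;               // 0
  const int full_tile_offset  = seq_len_kv - offset;               // 512
  const int seq_len = full_tile_offset +
                      std::min(seq_len_kv, seq_coord - discard_seq_coord) +
                      QK_SG_M;                                     // 512 + 128 + 16 = 656

  const int kv_splits_new   = ceil_div(seq_len, QK_BLK_N);          // 11
  const int kv_splits_cache = ceil_div(seq_len_kv_cache, QK_BLK_N); //  4
  std::printf("new=%d cached=%d total=%d\n",
              kv_splits_new, kv_splits_cache, kv_splits_new + kv_splits_cache);
  return 0;
}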
+ Tensor mQ_mk = mQ_mkl(_, _, 0); + + Tensor mK_nk = mK_nkl(_, _, 0); // (n,k) + Tensor mV_nk = mV_nkl(_, _, 0); + + Tensor mK_cache_nk = mK_cache_nkl(_, _, 0); // (n_cache, k) + Tensor mV_cache_nk = mV_cache_nkl(_, _, 0); // (n_cache, k) + + auto gQ = local_tile(mQ_mk, TileShapeQK{}, make_coord(blk_m_coord, _, _), + Step<_1, X, _1>{}); + auto gK = local_tile(mK_nk, TileShapeQK{}, make_coord(_, _, _), + Step{}); + + auto gV = local_tile(mV_nk, TileShapeOutput{}, + make_coord(_, blk_n_coord, _), Step{}); + auto gK_cache = local_tile(mK_cache_nk, TileShapeQK{}, + make_coord(_, _, _), Step{}); + auto gV_cache = + local_tile(mV_cache_nk, TileShapeOutput{}, + make_coord(_, blk_n_coord, _), Step{}); + + auto mainloop_params = CollectiveMainloop::get_updated_copies( + params.mainloop, params.problem_shape, sequence_length_shape, + batch_coord, q_head_coord); + + + // we limit the horizontal size to two subgroups; the empirical results + // show that reading the two cachelines side by side gives better + // performance and anything after that does not have an effect on + // performance. // (64 here for float b float when possible and loop over + // to cover all the data needed) + auto tiled_prefetch_q = cute::prefetch_selector< + Shape, Int>, + Num_SGs>(mainloop_params.gmem_tiled_copy_q); + auto tiled_prefetch_k = cute::prefetch_selector< + Shape, Int>, + Num_SGs>(mainloop_params.gmem_tiled_copy_k); + auto tiled_prefetch_v = cute::prefetch_selector< + Shape, + Int>, + Num_SGs>(mainloop_params.gmem_tiled_copy_v); + auto tiled_prefetch_k_cache = cute::prefetch_selector< + Shape, Int>, + Num_SGs>(mainloop_params.gmem_tiled_copy_k_cache); + auto tiled_prefetch_v_cache = cute::prefetch_selector< + Shape, + Int>, + Num_SGs>(mainloop_params.gmem_tiled_copy_v_cache); + auto thr_prefetch_Q = tiled_prefetch_q.get_slice(thread_idx); + auto thr_prefetch_K = tiled_prefetch_k.get_slice(thread_idx); + auto thr_prefetch_V = tiled_prefetch_v.get_slice(thread_idx); + auto pQgQ = thr_prefetch_Q.partition_S(gQ); + auto pKgK = thr_prefetch_K.partition_S(gK); + auto pVgV = thr_prefetch_V.partition_S(gV); + // assuming the copy function is the same, otherwise this needs to have its + // own tile_prefetch + auto pKgK_cache = thr_prefetch_K.partition_S(gK_cache); + auto pVgV_cache = thr_prefetch_V.partition_S(gV_cache); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size<3>(pQgQ); i++) { + prefetch(tiled_prefetch_q, pQgQ(_, _, _, i)); + } + auto &prefetch_K = + (seq_len_kv_cache == 0) ? tiled_prefetch_k : tiled_prefetch_k_cache; + auto &pKgK1_ = (seq_len_kv_cache == 0) ? pKgK : pKgK_cache; + + int cached_nblock = 0; + if constexpr (PagedKV) { + int curr_batch_pages = ceil_div(seq_len_kv_cache, mainloop_params.page_size); + int batch_offset = + is_var_len ?
mainloop_params.num_pages_per_seq[batch_coord] + : batch_coord * curr_batch_pages; + cached_nblock = + mainloop_params + .ptr_page_table[batch_offset // page table for this batch + ] * tiles_per_page; // base block idx of physical page + } + // The head size for both the cached and non-cached versions is the same + for (int j = 0; j < size<4>(pKgK1_); j++) { + CUTLASS_PRAGMA_UNROLL + for (int i = cached_nblock; i < cached_nblock + DispatchPolicy::Stages; + i++) { + prefetch(prefetch_K, pKgK1_(_, _, _, i, j)); + } + } + + // Allocate the tiled_mma and the accumulators for the (M,N) + // workgroup_shape + Tensor out_reg = make_tensor(AccumeShape{}); + + // There are 16 work items and 16 max values per subgroup; each work item holds 1 + // max and, cumulatively, they compute the max per subgroup + ElementAccumulator max_reg{-INFINITY}; + // Each sum reg contains a 2d tensor of 8 x 2; this is the number of + // sequence-length rows processed per subgroup + Tensor sum_reg = + make_tensor(Shape, Int>{}); + + clear(sum_reg); + clear(out_reg); + // Perform the collective scoped MMA + CollectiveMainloop collective_mma; + + auto q_group_size = num_heads_q / num_heads_kv; + auto kv_head_coord = q_head_coord / q_group_size; + + // When the causal mask is true, it is not possible to set the scope + // of the barrier to workgroup level, as the number of n blocks is + // different for each subgroup due to the triangular nature of the causal + // operation + static constexpr int barrier_scope = CausalMask ? 3 : 2; + CUTLASS_PRAGMA_UNROLL + for (int split = 0; split < kv_splits - static_cast(CausalMask); split++) { + barrier_arrive(barrier_scope); + + bool is_KV_cache = split < kv_splits_cache; + // 1) Load KV (performed inside mmaQK) + auto gK_ = is_KV_cache ? gK_cache(_, _, cached_nblock, _) + : gK(_, _, split - kv_splits_cache, _); + auto gV_ = is_KV_cache ?
gV_cache(_, _, cached_nblock) + : gV(_, _, split - kv_splits_cache); + // 2) Create Tensor S + Tensor tSr = make_tensor( + Shape, Int, Int>{}); + clear(tSr); + // 3) Perform GEMM S = Q*K + // Then modify layout to LayoutQ = ((seq_leq_q, group_head_q), + // head_size_qk, batch* num_heads_q / group_head_q), which can be merged + // into one gemm for (int i = 0; i < q_group_size; ++i) { + + collective_mma.mmaQK(tSr, gQ, gK_, tSr, + ceil_div(head_size_qk, QK_BLK_K), mainloop_params, + is_KV_cache, q_head_coord, kv_head_coord); + + if constexpr (LocalMask) { + // Sliding windows + // mask the elements of each tile where j - left > i || j + right < i + const int item_id = thread_idx % SubgroupSize; + int col_idx; + if (split < kv_splits_cache) { + col_idx = item_id + split * cute::min(QK_BLK_N, seq_len_kv_cache) ; + } else { + col_idx = item_id + seq_len_kv_cache + (split - kv_splits_cache) * cute::min(QK_BLK_N, seq_len_kv); + } + + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < FragsN; + n++, col_idx += get<1>(MmaAtomShape())) { // 4 + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < FragsM; m++) { // 2 + int row_idx = m * Vec + seq_coord; + int col_ref = seq_len_kv_cache + seq_len_kv - seq_len_qo; + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < Vec; row++) { // 8 + bool left_mask = col_idx < cute::max(0, row + row_idx + col_ref - mainloop_params.window_left); + bool right_mask = col_idx > cute::min(seq_len_kv_cache + seq_len_kv, row + row_idx + col_ref + mainloop_params.window_right); + if (left_mask || right_mask) { + tSr(row, m, n) = ElementAccumulator{-INFINITY}; + } + } + } + } + } + + if constexpr(!(CausalMask || LocalMask) && PagedKV) { + // Processing Not divisible, mask padding + const int item_id = thread_idx % SubgroupSize; + int col_idx = item_id + split * cute::min(QK_BLK_N, seq_len_kv_cache + seq_len_kv); + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < FragsN; n++, col_idx += get<1>(MmaAtomShape())) { // 4 + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < FragsM; m++) { // 2 + int row_idx = m * Vec + seq_coord; + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < Vec; row++) { // 8 + if (col_idx >= seq_len_kv_cache + seq_len_kv || row_idx + row >= seq_len_qo) { + tSr(row, m, n) = ElementAccumulator{-INFINITY}; + } + } + } + } + } + auto &tiled_prefetch_v_ = + is_KV_cache ? tiled_prefetch_v_cache + : tiled_prefetch_v; + auto &pVgV_ = is_KV_cache ? pVgV_cache : pVgV; + int v_prefetch_idx = is_KV_cache ? PagedKV ? cached_nblock : split + : split - kv_splits_cache; + for (int i = 0; i < size<1>(pVgV_); i++) { + prefetch(tiled_prefetch_v_, pVgV_(_, i, _, v_prefetch_idx)); + } + int next_cached_nblock = split + 1; + bool is_next_KV_cache = next_cached_nblock < kv_splits_cache; + if constexpr (PagedKV) { + if (is_next_KV_cache) { + int curr_batch_pages = ceil_div(seq_len_kv_cache, mainloop_params.page_size); + int next_page_logical_idx = + next_cached_nblock * QK_BLK_N / params.mainloop.page_size; + int batch_offset = + is_var_len ? 
mainloop_params.num_pages_per_seq[batch_coord] + : batch_coord * curr_batch_pages; + bool valid_page = next_page_logical_idx < curr_batch_pages; + // get physical page idx from page table + if (valid_page) { + next_cached_nblock = + params.mainloop.ptr_page_table + [batch_offset + // page table for this batch + next_page_logical_idx // split (tile idx) to logical + // page idx + ] * tiles_per_page + // base block idx of physical page + next_cached_nblock % tiles_per_page; // offset within page + } else { + next_cached_nblock = + curr_batch_pages * + tiles_per_page; // push idx out of bounds to respect the + // boundary between batches + } + } + } + + // 4) Fused softmax + CollectiveSoftmaxEpilogue softmax(params.softmax); + softmax(split == 0, tSr, max_reg, sum_reg, out_reg); + + // 5) Perform GEMM O = S*V + collective_mma.template mmaPV(out_reg, tSr, gV_, out_reg, + mainloop_params, is_KV_cache, kv_head_coord); + + // ... prefetch next tile ... + // Prefetch the next Q tile + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size<3>(pQgQ); i++) { + prefetch(tiled_prefetch_q, pQgQ(_, _, _, i)); + } + + is_KV_cache = is_next_KV_cache; + cached_nblock = next_cached_nblock; + // Prefetch the next K tile + // there is no need to guard it with an if statement as prefetch will + // ignore out-of-bounds reads + if constexpr (PagedKV) { + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < size<4>(pKgK_cache); j++) { + prefetch(tiled_prefetch_k_cache, pKgK_cache(_, _, _, cached_nblock, j)); + } + } else { + bool sel_prefetch_k = + (split + DispatchPolicy::Stages) < kv_splits_cache; + auto &prefetch_k_selector = + sel_prefetch_k ? tiled_prefetch_k_cache : tiled_prefetch_k; + auto &pKgK_ = sel_prefetch_k ? pKgK_cache : pKgK; + int k_prefetch_idx = + sel_prefetch_k + ? PagedKV ? cached_nblock : split + DispatchPolicy::Stages + : split + DispatchPolicy::Stages - kv_splits_cache; + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < size<4>(pKgK_); j++) { + prefetch(prefetch_k_selector, pKgK_(_, _, _, k_prefetch_idx, j)); + } + } + barrier_wait(barrier_scope); + } + + if constexpr (CausalMask) { + // BAND Matrix + // 1) Load K (performed inside mmaQK) + // 2) Create Tensor S + Tensor tSr = make_tensor( + Shape, Int, Int>{}); + clear(tSr); + // 3) Perform GEMM S = Q*K + collective_mma.mmaQK(tSr, gQ, gK(_, _, kv_splits_new - 1, _), tSr, + ceil_div(head_size_qk, QK_BLK_K), mainloop_params, + false, q_head_coord, kv_head_coord); + + // we only need one block ahead, there is enough gap to prefetch it + // while doing softmax.
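The paged-KV path above maps a logical KV-cache tile to a physical tile by looking up its logical page in the page table and re-applying the tile offset within the page. A minimal standalone sketch of that translation follows (page size, tile size and page-table contents are made-up values); the kernel's causal-mask tail then continues below.

#include <cstdio>
#include <vector>

// Sketch of the paged-KV index translation used above: a logical KV-cache tile
// is first mapped to a logical page, the page table gives the physical page,
// and the tile offset inside the page is re-applied. All values are illustrative.
int main() {
  const int QK_BLK_N  = 64;
  const int page_size = 128;                        // rows per physical page
  const int tiles_per_page = page_size / QK_BLK_N;  // 2 KV tiles per page
  // Physical page ids for one sequence (batch offset already applied).
  const std::vector<int> page_table = {7, 2, 5, 11};

  const int seq_len_kv_cache = 512;                 // 4 logical pages
  const int curr_batch_pages = (seq_len_kv_cache + page_size - 1) / page_size;

  for (int logical_tile = 0; logical_tile < curr_batch_pages * tiles_per_page; ++logical_tile) {
    const int logical_page  = logical_tile * QK_BLK_N / page_size;  // page this tile lives in
    const int physical_tile = page_table[logical_page] * tiles_per_page +
                              logical_tile % tiles_per_page;        // base tile of page + offset
    std::printf("logical tile %d -> physical tile %d\n", logical_tile, physical_tile);
  }
  return 0;
}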
because the gap between the two MMA is big, + // prefetching it the same way as cutlass K matrix does not make sense + for (int i = 0; i < size<1>(pVgV); i++) { + prefetch(tiled_prefetch_v, pVgV(_, i, _, kv_splits_new - 1)); + } + // mask the elements of each tile where j > i + const int item_id = thread_idx % SubgroupSize; + int col_idx = item_id + (kv_splits_new - 1) * QK_BLK_N; + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < FragsN; + n++, col_idx += get<1>(MmaAtomShape())) { // 4 + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < FragsM; m++) { // 2 + int row_idx = m * Vec + seq_coord; + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < Vec; row++, row_idx++) { // 8 + if (col_idx - full_tile_offset > row_idx - discard_seq_coord) { + tSr(row, m, n) = ElementAccumulator{-INFINITY}; + } + } + } + } + + CollectiveSoftmaxEpilogue softmax(params.softmax); + softmax((kv_splits - 1) == 0, tSr, max_reg, sum_reg, out_reg); + + collective_mma.template mmaPV(out_reg, tSr, + gV(_, _, kv_splits_new - 1), + out_reg, mainloop_params, false, kv_head_coord); + } + + + // Epilogue + auto epilogue_params = + CollectiveEpilogue::template get_updated_copies( + params.epilogue, params.problem_shape, sequence_length_shape, + batch_coord, q_head_coord); + CollectiveEpilogue epilogue{epilogue_params, shared_storage.epilogue}; + auto blk_coord_mnkl = make_coord(blk_m_coord, blk_n_coord, _, 0); + epilogue(params.problem_shape, sequence_length_shape, blk_coord_mnkl, + out_reg, max_reg, sum_reg); + } + } +}; + +/////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::flash_attention::kernel diff --git a/applications/flash_attention_v2/kernel/xe_flash_attn_decode.hpp b/applications/flash_attention_v2/kernel/xe_flash_attn_decode.hpp index 1a034bb8c9..5ad367f13c 100644 --- a/applications/flash_attention_v2/kernel/xe_flash_attn_decode.hpp +++ b/applications/flash_attention_v2/kernel/xe_flash_attn_decode.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -343,7 +344,7 @@ class FMHADecode { Tensor out_reg = make_tensor(AccumShape{}); clear(out_reg); - auto smem = syclcompat::local_mem{}) * Num_SGs * SubgroupSize)]>(); + auto smem = compat::local_mem{}) * Num_SGs * SubgroupSize)]>(); Tensor shmem_max_tensor = make_tensor(make_smem_ptr(smem), make_shape(Int{})); bool is_KV_cache = seq_len_kv_cache != 0; @@ -459,7 +460,7 @@ class FMHADecode { collective_mma.template mmaPV(out_reg, tSr, gV, out_reg, mainloop_params, false, curr_kv_tile_idx); // need to apply barrier here to avoid race condition - auto group = syclcompat::get_nd_item<1>().get_group(); + auto group = compat::get_nd_item<1>().get_group(); sycl::group_barrier(group); Tensor shmem_out_tensor = make_tensor(make_smem_ptr(smem), make_shape(Int<(size(AccumShape{})) * SubgroupSize * Num_SGs>{})); diff --git a/benchmarks/common.hpp b/benchmarks/common.hpp index 0825813561..6e889cd989 100644 --- a/benchmarks/common.hpp +++ b/benchmarks/common.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** -* Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. + * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. 
+ * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -39,7 +40,7 @@ namespace cutlass { static inline std::size_t get_llc_size() { #if defined(CUTLASS_ENABLE_SYCL) - return syclcompat::get_default_queue().get_device().get_info(); + return compat::get_default_queue().get_device().get_info(); #else cudaDeviceProp prop_struct; auto result = cudaGetDeviceProperties(&prop_struct, 0); diff --git a/benchmarks/flash_attention/flash_attention_decode/benchmark_runner.hpp b/benchmarks/flash_attention/flash_attention_decode/benchmark_runner.hpp index cd8519b3de..18662154dd 100644 --- a/benchmarks/flash_attention/flash_attention_decode/benchmark_runner.hpp +++ b/benchmarks/flash_attention/flash_attention_decode/benchmark_runner.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -190,9 +191,9 @@ template struct BenchmarkRunnerFMHADecode { int max_seq_len_q = static_cast(get<3>(problem_size)); int max_seq_len_kv = static_cast(get<4>(problem_size)); int max_seq_len_kv_cache = static_cast(get<5>(problem_size)); - get<3>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_q, cumulative_seqlen_q.data()}; - get<4>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_kv, cumulative_seqlen_kv.data()}; - get<5>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_kv_cache, cumulative_seqlen_kv_cache.data()}; + get<3>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_q, 0, cumulative_seqlen_q.data()}; + get<4>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_kv, 0, cumulative_seqlen_kv.data()}; + get<5>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_kv_cache, 0, cumulative_seqlen_kv_cache.data()}; } auto [batch, num_heads_q, num_heads_kv, head_size_qk, head_size_vo] = cute::select<0,1,2,6,7>(problem_size); @@ -233,29 +234,29 @@ template struct BenchmarkRunnerFMHADecode { cutlass::DeviceAllocation block_V_concat(seq_len_kv_total * head_size_vo); // Concatenate K_cache and K - syclcompat::memcpy( + compat::memcpy( block_K_concat.get(), block_K_cache[0].get() + offset_k_cache, seq_len_kv_cache * head_size_qk ); - syclcompat::memcpy( + compat::memcpy( block_K_concat.get() + seq_len_kv_cache * head_size_qk, block_K[0].get() + offset_k, seq_len_kv * head_size_qk ); // Concatenate V_cache and V - syclcompat::memcpy( + compat::memcpy( block_V_concat.get(), block_V_cache[0].get() + offset_v_cache, seq_len_kv_cache * head_size_vo ); - syclcompat::memcpy( + compat::memcpy( block_V_concat.get() + seq_len_kv_cache * head_size_vo, block_V[0].get() + offset_v, seq_len_kv * head_size_vo ); - syclcompat::wait(); + compat::wait(); k_ptr = block_K_concat.get(); v_ptr = block_V_concat.get(); @@ -280,11 +281,11 @@ template struct BenchmarkRunnerFMHADecode { seq_len_qo * seq_len_kv_total // batch_stride_S ); - syclcompat::wait(); + compat::wait(); std::vector host_S(block_S.size()); - syclcompat::memcpy(host_S.data(), block_S.get(), host_S.size()); - syclcompat::wait(); + compat::memcpy(host_S.data(), block_S.get(), host_S.size()); + compat::wait(); // delete this memory as it is no 
longer needed block_S.reset(); @@ -351,8 +352,8 @@ template struct BenchmarkRunnerFMHADecode { cutlass::DeviceAllocation block_P; block_P.reset(host_P.size()); - syclcompat::memcpy(block_P.get(), host_P.data(), host_P.size()); - syclcompat::wait(); + compat::memcpy(block_P.get(), host_P.data(), host_P.size()); + compat::wait(); cutlass::TensorRef ref_P(block_P.get(), LayoutQ::packed({seq_len_qo, seq_len_kv_total})); @@ -370,13 +371,13 @@ template struct BenchmarkRunnerFMHADecode { seq_len_qo * head_size_vo // batch_stride_O ); - syclcompat::wait(); + compat::wait(); // delete this memory as it is no longer needed block_P.reset(); std::vector vec_acc(block_acc.size()); - syclcompat::memcpy(vec_acc.data(), block_acc.get(), vec_acc.size()); - syclcompat::wait(); + compat::memcpy(vec_acc.data(), block_acc.get(), vec_acc.size()); + compat::wait(); // delete this memory as it is no longer needed block_acc.reset(); @@ -384,8 +385,8 @@ template struct BenchmarkRunnerFMHADecode { for(int i = 0; i < vec_out.size(); i++) { vec_out[i] = static_cast(vec_acc[i]); } - syclcompat::memcpy(block_ref_O.get() + offset_o, vec_out.data(), vec_out.size()); - syclcompat::wait(); + compat::memcpy(block_ref_O.get() + offset_o, vec_out.data(), vec_out.size()); + compat::wait(); offset_q += seq_len_qo * head_size_qk; if(kv_group_update % q_group_size == 0) { @@ -399,7 +400,7 @@ template struct BenchmarkRunnerFMHADecode { } } - syclcompat::wait(); + compat::wait(); // Check if output from CUTLASS kernel and reference kernel are equal or not bool passed = cutlass::reference::device::BlockCompareRelativelyEqual(block_ref_O.get(), block_O.get(), @@ -546,11 +547,11 @@ template struct BenchmarkRunnerFMHADecode { page_mapping[logical_idx] = physical_pages[blk]; } } - syclcompat::memcpy(paged_kv_cache.page_table.get(), page_mapping.data(), page_mapping.size() * sizeof(int)); + compat::memcpy(paged_kv_cache.page_table.get(), page_mapping.data(), page_mapping.size() * sizeof(int)); paged_kv_cache.num_pages_per_seq.reset(num_pages_per_seq.size()); - syclcompat::memcpy(paged_kv_cache.num_pages_per_seq.get(), num_pages_per_seq.data(), num_pages_per_seq.size() * sizeof(int)); - syclcompat::wait(); + compat::memcpy(paged_kv_cache.num_pages_per_seq.get(), num_pages_per_seq.data(), num_pages_per_seq.size() * sizeof(int)); + compat::wait(); } for(int i = 0; i < count; i++) { @@ -613,24 +614,24 @@ template struct BenchmarkRunnerFMHADecode { // configure smem size and carveout int smem_size = FMHADecodeKernel::SharedStorageSize; - const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z); - const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z); + const auto sycl_block = compat::dim3(block.x, block.y, block.z); + const auto sycl_grid = compat::dim3(grid.x, grid.y, grid.z); #if !defined(SYCL_EXT_ONEAPI_WORK_GROUP_SCRATCH_MEMORY) - using namespace syclcompat::experimental; + using namespace compat::experimental; auto event = launch>( launch_policy{sycl_grid, sycl_block, local_mem_size{static_cast(smem_size)}, kernel_properties{sycl_exp::sub_group_size}}, params); #else - syclcompat::experimental::launch_properties launch_props{ + compat::experimental::launch_properties launch_props{ sycl::ext::oneapi::experimental::work_group_scratch_size(smem_size) }; - syclcompat::experimental::kernel_properties kernel_props{ + compat::experimental::kernel_properties kernel_props{ sycl::ext::oneapi::experimental::sub_group_size }; - syclcompat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props}; - 
auto event = syclcompat::experimental::launch>(policy, params); + compat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props}; + auto event = compat::experimental::launch>(policy, params); #endif EventManager::getInstance().addEvent(event); @@ -671,7 +672,7 @@ template struct BenchmarkRunnerFMHADecode { // Run the GEMM run(params); - syclcompat::wait(); + compat::wait(); // Verify that the result is correct bool passed = verify(problem_size); diff --git a/benchmarks/flash_attention/flash_attention_prefill/benchmark_runner.hpp b/benchmarks/flash_attention/flash_attention_prefill/benchmark_runner.hpp index fff93e4187..3d4bb39aff 100644 --- a/benchmarks/flash_attention/flash_attention_prefill/benchmark_runner.hpp +++ b/benchmarks/flash_attention/flash_attention_prefill/benchmark_runner.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -164,8 +165,8 @@ template struct BenchmarkRunnerFMHA { if constexpr (isVarLen) { int max_seq_len_q = static_cast(get<3>(problem_size)); int max_seq_len_kv = static_cast(get<4>(problem_size)); - get<3>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_q, cumulative_seqlen_q.data()}; - get<4>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_kv, cumulative_seqlen_kv.data()}; + get<3>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_q, 0, cumulative_seqlen_q.data()}; + get<4>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_kv, 0, cumulative_seqlen_kv.data()}; } auto [batch, num_heads_q, num_heads_kv, head_size_qk, head_size_vo] = cute::select<0,1,2,5,6>(problem_size); @@ -207,10 +208,10 @@ template struct BenchmarkRunnerFMHA { seq_len_qo * seq_len_kv // batch_stride_S ); - syclcompat::wait(); + compat::wait(); std::vector host_S(block_S.size()); - syclcompat::memcpy(host_S.data(), block_S.get(), host_S.size()); + compat::memcpy(host_S.data(), block_S.get(), host_S.size()); // delete this memory as it is no longer needed block_S.reset(); @@ -276,7 +277,7 @@ template struct BenchmarkRunnerFMHA { cutlass::DeviceAllocation block_P; block_P.reset(host_P.size()); - syclcompat::memcpy(block_P.get(), host_P.data(), host_P.size()); + compat::memcpy(block_P.get(), host_P.data(), host_P.size()); cutlass::TensorRef ref_P(block_P.get(), LayoutQ::packed({seq_len_qo, seq_len_kv})); @@ -294,12 +295,12 @@ template struct BenchmarkRunnerFMHA { seq_len_qo * head_size_vo // batch_stride_O ); - syclcompat::wait(); + compat::wait(); // delete this memory as it is no longer needed block_P.reset(); std::vector vec_acc(block_acc.size()); - syclcompat::memcpy(vec_acc.data(), block_acc.get(), vec_acc.size()); + compat::memcpy(vec_acc.data(), block_acc.get(), vec_acc.size()); // delete this memory as it is no longer needed block_acc.reset(); @@ -307,7 +308,7 @@ template struct BenchmarkRunnerFMHA { for(int i = 0; i < vec_out.size(); i++) { vec_out[i] = static_cast(vec_acc[i]); } - syclcompat::memcpy(block_ref_O.get() + offset_o, vec_out.data(), vec_out.size()); + compat::memcpy(block_ref_O.get() + offset_o, vec_out.data(), vec_out.size()); offset_q += seq_len_qo * head_size_qk; if(kv_group_update % q_group_size==0) { @@ -319,7 +320,7 @@ template struct 
BenchmarkRunnerFMHA { } } - syclcompat::wait(); + compat::wait(); // Check if output from CUTLASS kernel and reference kernel are equal or not bool passed = cutlass::reference::device::BlockCompareRelativelyEqual(block_ref_O.get(), block_O.get(), @@ -473,24 +474,24 @@ template struct BenchmarkRunnerFMHA { // configure smem size and carveout int smem_size = GemmKernel::SharedStorageSize; - const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z); - const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z); + const auto sycl_block = compat::dim3(block.x, block.y, block.z); + const auto sycl_grid = compat::dim3(grid.x, grid.y, grid.z); #if !defined(SYCL_EXT_ONEAPI_WORK_GROUP_SCRATCH_MEMORY) - using namespace syclcompat::experimental; + using namespace compat::experimental; auto event = launch>( launch_policy{sycl_grid, sycl_block, local_mem_size{static_cast(smem_size)}, kernel_properties{sycl_exp::sub_group_size}}, params); #else - syclcompat::experimental::launch_properties launch_props{ + compat::experimental::launch_properties launch_props{ sycl::ext::oneapi::experimental::work_group_scratch_size(smem_size) }; - syclcompat::experimental::kernel_properties kernel_props{ + compat::experimental::kernel_properties kernel_props{ sycl::ext::oneapi::experimental::sub_group_size }; - syclcompat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props}; - auto event = syclcompat::experimental::launch>(policy, params); + compat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props}; + auto event = compat::experimental::launch>(policy, params); #endif EventManager::getInstance().addEvent(event); @@ -526,7 +527,7 @@ template struct BenchmarkRunnerFMHA { // Run the GEMM run(params); - syclcompat::wait(); + compat::wait(); // Verify that the result is correct bool passed = verify(problem_size); diff --git a/benchmarks/flash_attention/flash_attention_prefill_cachedKV/benchmark_runner.hpp b/benchmarks/flash_attention/flash_attention_prefill_cachedKV/benchmark_runner.hpp index dc81c062bc..bd4cf52018 100644 --- a/benchmarks/flash_attention/flash_attention_prefill_cachedKV/benchmark_runner.hpp +++ b/benchmarks/flash_attention/flash_attention_prefill_cachedKV/benchmark_runner.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. 
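The launch-path hunks above swap the syclcompat spellings for compat while keeping the same two launch properties: a work-group scratch (shared local memory) size and a fixed sub-group size. The snippet below is a plain-SYCL analogue of what that launch policy encodes, using a local_accessor and the reqd_sub_group_size attribute; it is a sketch assuming a SYCL 2020 compiler, not the wrapper the runners actually call, and the sizes are illustrative.

#include <sycl/sycl.hpp>

// Plain-SYCL sketch of a launch carrying the same properties as the
// launch_policy above: per-work-group scratch memory and a required
// sub-group size.
int main() {
  sycl::queue q;
  constexpr size_t wg_size = 64, num_wgs = 4, scratch_floats = 256;
  q.submit([&](sycl::handler &cgh) {
    sycl::local_accessor<float, 1> scratch(sycl::range<1>(scratch_floats), cgh);
    cgh.parallel_for(
        sycl::nd_range<1>(sycl::range<1>(wg_size * num_wgs), sycl::range<1>(wg_size)),
        [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(16)]] {
          // Touch the scratch memory so the allocation is not optimised away.
          scratch[item.get_local_id(0)] = static_cast<float>(item.get_global_linear_id());
        });
  });
  q.wait();
  return 0;
}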
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -175,9 +176,9 @@ template struct BenchmarkRunnerFMHA { int max_seq_len_q = static_cast(get<3>(problem_size)); int max_seq_len_kv = static_cast(get<4>(problem_size)); int max_seq_len_kv_cache = static_cast(get<5>(problem_size)); - get<3>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_q, cumulative_seqlen_q.data()}; - get<4>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_kv, cumulative_seqlen_kv.data()}; - get<5>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_kv_cache, cumulative_seqlen_kv_cache.data()}; + get<3>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_q, 0, cumulative_seqlen_q.data()}; + get<4>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_kv, 0, cumulative_seqlen_kv.data()}; + get<5>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_kv_cache, 0, cumulative_seqlen_kv_cache.data()}; } auto [batch, num_heads_q, num_heads_kv, head_size_qk, head_size_vo] = cute::select<0,1,2,6,7>(problem_size); @@ -218,29 +219,29 @@ template struct BenchmarkRunnerFMHA { cutlass::DeviceAllocation block_V_concat(seq_len_kv_total * head_size_vo); // Concatenate K_cache and K - syclcompat::memcpy( + compat::memcpy( block_K_concat.get(), block_K_cache[0].get() + offset_k_cache, seq_len_kv_cache * head_size_qk ); - syclcompat::memcpy( + compat::memcpy( block_K_concat.get() + seq_len_kv_cache * head_size_qk, block_K[0].get() + offset_k, seq_len_kv * head_size_qk ); // Concatenate V_cache and V - syclcompat::memcpy( + compat::memcpy( block_V_concat.get(), block_V_cache[0].get() + offset_v_cache, seq_len_kv_cache * head_size_vo ); - syclcompat::memcpy( + compat::memcpy( block_V_concat.get() + seq_len_kv_cache * head_size_vo, block_V[0].get() + offset_v, seq_len_kv * head_size_vo ); - syclcompat::wait(); + compat::wait(); k_ptr = block_K_concat.get(); v_ptr = block_V_concat.get(); @@ -265,11 +266,11 @@ template struct BenchmarkRunnerFMHA { seq_len_qo * seq_len_kv_total // batch_stride_S ); - syclcompat::wait(); + compat::wait(); std::vector host_S(block_S.size()); - syclcompat::memcpy(host_S.data(), block_S.get(), host_S.size()); - syclcompat::wait(); + compat::memcpy(host_S.data(), block_S.get(), host_S.size()); + compat::wait(); // delete this memory as it is no longer needed block_S.reset(); @@ -336,8 +337,8 @@ template struct BenchmarkRunnerFMHA { cutlass::DeviceAllocation block_P; block_P.reset(host_P.size()); - syclcompat::memcpy(block_P.get(), host_P.data(), host_P.size()); - syclcompat::wait(); + compat::memcpy(block_P.get(), host_P.data(), host_P.size()); + compat::wait(); cutlass::TensorRef ref_P(block_P.get(), LayoutQ::packed({seq_len_qo, seq_len_kv_total})); @@ -355,13 +356,13 @@ template struct BenchmarkRunnerFMHA { seq_len_qo * head_size_vo // batch_stride_O ); - syclcompat::wait(); + compat::wait(); // delete this memory as it is no longer needed block_P.reset(); std::vector vec_acc(block_acc.size()); - syclcompat::memcpy(vec_acc.data(), block_acc.get(), vec_acc.size()); - syclcompat::wait(); + compat::memcpy(vec_acc.data(), block_acc.get(), vec_acc.size()); + compat::wait(); // delete this memory as it is no longer needed block_acc.reset(); @@ -369,8 +370,8 @@ template struct BenchmarkRunnerFMHA { for(int i = 0; i < vec_out.size(); i++) { vec_out[i] = static_cast(vec_acc[i]); } - syclcompat::memcpy(block_ref_O.get() + offset_o, vec_out.data(), 
vec_out.size()); - syclcompat::wait(); + compat::memcpy(block_ref_O.get() + offset_o, vec_out.data(), vec_out.size()); + compat::wait(); offset_q += seq_len_qo * head_size_qk; if(kv_group_update % q_group_size==0) { @@ -384,7 +385,7 @@ template struct BenchmarkRunnerFMHA { } } - syclcompat::wait(); + compat::wait(); // Check if output from CUTLASS kernel and reference kernel are equal or not bool passed = cutlass::reference::device::BlockCompareRelativelyEqual(block_ref_O.get(), block_O.get(), @@ -566,24 +567,24 @@ template struct BenchmarkRunnerFMHA { // configure smem size and carveout int smem_size = GemmKernel::SharedStorageSize; - const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z); - const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z); + const auto sycl_block = compat::dim3(block.x, block.y, block.z); + const auto sycl_grid = compat::dim3(grid.x, grid.y, grid.z); #if !defined(SYCL_EXT_ONEAPI_WORK_GROUP_SCRATCH_MEMORY) - using namespace syclcompat::experimental; + using namespace compat::experimental; auto event = launch>( launch_policy{sycl_grid, sycl_block, local_mem_size{static_cast(smem_size)}, kernel_properties{sycl_exp::sub_group_size}}, params); #else - syclcompat::experimental::launch_properties launch_props{ + compat::experimental::launch_properties launch_props{ sycl::ext::oneapi::experimental::work_group_scratch_size(smem_size) }; - syclcompat::experimental::kernel_properties kernel_props{ + compat::experimental::kernel_properties kernel_props{ sycl::ext::oneapi::experimental::sub_group_size }; - syclcompat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props}; - auto event = syclcompat::experimental::launch>(policy, params); + compat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props}; + auto event = compat::experimental::launch>(policy, params); #endif EventManager::getInstance().addEvent(event); @@ -632,7 +633,7 @@ template struct BenchmarkRunnerFMHA { // Run the GEMM run(params); - syclcompat::wait(); + compat::wait(); // Verify that the result is correct bool use_kv_cache = options.seq_len_kv_cache > 0; diff --git a/benchmarks/gemm/benchmark_runner.hpp b/benchmarks/gemm/benchmark_runner.hpp index c7ea70cafa..25b204db33 100644 --- a/benchmarks/gemm/benchmark_runner.hpp +++ b/benchmarks/gemm/benchmark_runner.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. 
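For the cached-KV runner above, the reference check first glues the cached and newly appended K (and V) rows into one contiguous buffer so that a single reference GEMM can treat them as one sequence of length seq_len_kv_total. A host-side sketch of that concatenation, with std::vector and float standing in for the device allocations and the element type, is below.

#include <algorithm>
#include <cstddef>
#include <vector>

// Host-side illustration of how the reference check above builds one
// contiguous KV buffer out of the cached part followed by the new part.
std::vector<float> concat_kv(const std::vector<float>& k_cache,
                             const std::vector<float>& k_new,
                             std::size_t seq_len_kv_cache,
                             std::size_t seq_len_kv,
                             std::size_t head_size) {
  std::vector<float> k_concat((seq_len_kv_cache + seq_len_kv) * head_size);
  // Cached rows first ...
  std::copy(k_cache.begin(), k_cache.begin() + seq_len_kv_cache * head_size,
            k_concat.begin());
  // ... then the newly appended rows, like the two memcpy calls above.
  std::copy(k_new.begin(), k_new.begin() + seq_len_kv * head_size,
            k_concat.begin() + seq_len_kv_cache * head_size);
  return k_concat;
}

int main() {
  std::vector<float> k_cache(4 * 8, 1.0f), k_new(2 * 8, 2.0f);
  auto k = concat_kv(k_cache, k_new, 4, 2, 8);
  return k.size() == 48 ? 0 : 1;
}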
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -175,12 +176,12 @@ struct BenchmarkRunnerGemm { using CollectiveMainloop = typename Gemm::GemmKernel::CollectiveMainloop; using DispatchPolicy = typename CollectiveMainloop::DispatchPolicy; - using ElementMma = CollectiveMainloop::TiledMma::ValTypeA; + using ElementMma = typename CollectiveMainloop::TiledMma::ValTypeA; - using ElementScale = ScaleType::type; - using ElementZero = ZeroType::type; - using StrideS = ScaleStride::type; - using StrideZ = ZeroStride::type; + using ElementScale = typename ScaleType::type; + using ElementZero = typename ZeroType::type; + using StrideS = typename ScaleStride::type; + using StrideZ = typename ZeroStride::type; using CollectiveEpilogue = typename Gemm::CollectiveEpilogue; using ElementC = typename Gemm::ElementC; @@ -290,7 +291,7 @@ struct BenchmarkRunnerGemm { std::vector zero(size(zero_layout) * sizeof_bits_v / 8, 0); cutlass::device_memory::copy_to_host(zero.data(), (uint8_t*)zero_buffer, zero.size()); - syclcompat::wait(); + compat::wait(); auto dst_tensor = make_tensor(make_gmem_ptr(reinterpret_cast(dst.data())), select<1, 0, 2>(operand_layout)); @@ -362,7 +363,7 @@ struct BenchmarkRunnerGemm { } cutlass::device_memory::copy_to_device(dq_buffer, (DequantizedElement*)(raw_pointer_cast(dst_tensor.data())), dst_tensor.size()); - syclcompat::wait(); + compat::wait(); return dq_buffer; } @@ -394,7 +395,7 @@ struct BenchmarkRunnerGemm { std::vector zero(size(zero_layout) * sizeof_bits_v / 8, 0); cutlass::device_memory::copy_to_host(zero.data(), (uint8_t*)zero_buffer, zero.size()); - syclcompat::wait(); + compat::wait(); auto dst_tensor = make_tensor(make_gmem_ptr(reinterpret_cast(dst.data())), operand_layout); @@ -448,12 +449,15 @@ struct BenchmarkRunnerGemm { } cutlass::device_memory::copy_to_device(dq_buffer, (DequantizedElement*)(raw_pointer_cast(dst_tensor.data())), dst_tensor.size()); - syclcompat::wait(); + compat::wait(); return dq_buffer; } bool verify(const ProblemShapeType& problem_size, ElementCompute alpha, ElementCompute beta) { - auto [M, N, K, L] = problem_size; + auto& M = cute::get<0>(problem_size); + auto& N = cute::get<1>(problem_size); + auto& K = cute::get<2>(problem_size); + auto& L = cute::get<3>(problem_size); TensorRef ref_C(block_C[0].get(), LayoutC::packed({M, N})); TensorRef ref_D(block_ref_D.get(), LayoutD::packed({M, N})); @@ -526,7 +530,7 @@ struct BenchmarkRunnerGemm { ); #if defined(CUTLASS_ENABLE_SYCL) - syclcompat::wait(); + compat::wait(); #else cudaDeviceSynchronize(); #endif @@ -543,7 +547,7 @@ struct BenchmarkRunnerGemm { block_ref_D.get(), block_ref_D.get(), block_Aux[0].get(), block_D.size()); } - syclcompat::wait(); + compat::wait(); // Check if output from CUTLASS kernel and reference kernel are equal or not bool passed = reference::device::BlockCompareEqual( @@ -686,7 +690,7 @@ struct BenchmarkRunnerGemm { gemm_op.run(); #if defined(CUTLASS_ENABLE_SYCL) - syclcompat::wait(); + compat::wait(); #else cudaDeviceSynchronize(); #endif diff --git a/cmake/FindDPCPP.cmake b/cmake/FindDPCPP.cmake index 9f45285cdc..9acaa49f18 100644 --- a/cmake/FindDPCPP.cmake +++ b/cmake/FindDPCPP.cmake @@ -40,6 +40,7 @@ add_library(DPCPP::DPCPP INTERFACE IMPORTED) set(DPCPP_FLAGS "-fsycl;") set(DPCPP_COMPILE_ONLY_FLAGS "") +set(DPCPP_LINK_ONLY_FLAGS "") if(NOT "${DPCPP_SYCL_TARGET}" STREQUAL "") list(APPEND DPCPP_FLAGS "-fsycl-targets=${DPCPP_SYCL_TARGET};") @@ -63,10 +64,10 @@ if("${DPCPP_SYCL_TARGET}" STREQUAL 
"intel_gpu_pvc" OR "${DPCPP_SYCL_TARGET}" STREQUAL "spir64" OR "${DPCPP_SYCL_TARGET}" STREQUAL "intel_gpu_bmg_g21") if ((CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 2025.2) OR CUTLASS_SYCL_BUILTIN_ENABLE) - list(APPEND DPCPP_FLAGS "-Xspirv-translator;-spirv-ext=+SPV_INTEL_split_barrier") + list(APPEND DPCPP_LINK_ONLY_FLAGS "-Xspirv-translator;-spirv-ext=+SPV_INTEL_split_barrier") else() - list(APPEND DPCPP_FLAGS "-Xspirv-translator;-spirv-ext=+SPV_INTEL_split_barrier,+SPV_INTEL_2d_block_io,+SPV_INTEL_subgroup_matrix_multiply_accumulate") - endif() + list(APPEND DPCPP_LINK_ONLY_FLAGS "-Xspirv-translator;-spirv-ext=+SPV_INTEL_split_barrier,+SPV_INTEL_2d_block_io,+SPV_INTEL_subgroup_matrix_multiply_accumulate") + endif() if(DPCPP_DISABLE_ITT_FOR_CUTLASS) list(APPEND DPCPP_FLAGS "-fno-sycl-instrument-device-code") endif() @@ -76,14 +77,16 @@ endif() if(UNIX) set_target_properties(DPCPP::DPCPP PROPERTIES INTERFACE_COMPILE_OPTIONS "${DPCPP_FLAGS};${DPCPP_COMPILE_ONLY_FLAGS}" - INTERFACE_LINK_OPTIONS "${DPCPP_FLAGS}" + INTERFACE_LINK_OPTIONS "${DPCPP_FLAGS};${DPCPP_LINK_ONLY_FLAGS}" INTERFACE_LINK_LIBRARIES ${DPCPP_LIB_DIR} INTERFACE_INCLUDE_DIRECTORIES "${DPCPP_BIN_DIR}/../include/sycl;${DPCPP_BIN_DIR}/../include") message(STATUS "DPCPP INCLUDE DIR: ${DPCPP_BIN_DIR}/../include/sycl;${DPCPP_BIN_DIR}/../include") - message(STATUS "Using DPCPP flags: ${DPCPP_FLAGS};${DPCPP_COMPILE_ONLY_FLAGS}") + message(STATUS "Using DPCPP compile flags: ${DPCPP_FLAGS};${DPCPP_COMPILE_ONLY_FLAGS}") + message(STATUS "Using DPCPP link flags: ${DPCPP_FLAGS};${DPCPP_LINK_ONLY_FLAGS}") else() set_target_properties(DPCPP::DPCPP PROPERTIES INTERFACE_COMPILE_OPTIONS "${DPCPP_FLAGS};${DPCPP_COMPILE_ONLY_FLAGS}" + INTERFACE_LINK_OPTIONS "${DPCPP_FLAGS};${DPCPP_LINK_ONLY_FLAGS}" INTERFACE_LINK_LIBRARIES ${DPCPP_LIB_DIR} INTERFACE_INCLUDE_DIRECTORIES "${DPCPP_BIN_DIR}/../include/sycl") endif() @@ -105,7 +108,7 @@ function(add_sycl_to_target) ) get_target_property(target_type ${CUTLASS_ADD_SYCL_TARGET} TYPE) if (NOT target_type STREQUAL "OBJECT_LIBRARY") - target_link_options(${CUTLASS_ADD_SYCL_TARGET} PUBLIC ${DPCPP_FLAGS}) + target_link_options(${CUTLASS_ADD_SYCL_TARGET} PUBLIC ${DPCPP_FLAGS} ${DPCPP_LINK_ONLY_FLAGS}) endif() endfunction() diff --git a/cmake/googletest.cmake b/cmake/googletest.cmake index 5249b328fd..6912bef5d3 100644 --- a/cmake/googletest.cmake +++ b/cmake/googletest.cmake @@ -44,6 +44,15 @@ FetchContent_Declare( FetchContent_MakeAvailable(googletest) +if (CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM") + if (TARGET gtest) + # Ignore unsupported warning flags on IntelLLVM + target_compile_options(gtest PRIVATE -Wno-unknown-warning-option) + # Show -Winline warnings, but don’t let them become errors + target_compile_options(gtest PRIVATE -Wno-error=inline) + endif() +endif() + if (MSVC) set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) -endif() +endif() \ No newline at end of file diff --git a/examples/00_bmg_gemm/00_bmg_gemm.cpp b/examples/00_bmg_gemm/00_bmg_gemm.cpp index 251a4d1f10..7e9291227e 100644 --- a/examples/00_bmg_gemm/00_bmg_gemm.cpp +++ b/examples/00_bmg_gemm/00_bmg_gemm.cpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -210,8 +211,8 @@ struct ExampleRunner { M * N // batch_stride_D ); - // CUTLASS on SYCL uses the compatibility library syclcompat for e.g. default in-order queue - syclcompat::wait(); + // CUTLASS on SYCL uses the compatibility library compat for e.g. default in-order queue + compat::wait(); // Check if output from CUTLASS kernel and reference kernel are equal or not bool passed = cutlass::reference::device::BlockCompareEqual( @@ -270,7 +271,7 @@ struct ExampleRunner { // Run the GEMM CUTLASS_CHECK(gemm_op.run()); - syclcompat::wait(); + compat::wait(); // Verify that the result is correct bool passed = verify(problem_size, options.alpha, options.beta); @@ -284,7 +285,7 @@ struct ExampleRunner { for (int i = 0; i < options.iterations; ++i) { gemm_op.run(); } - syclcompat::wait(); + compat::wait(); float cute_time = timer.seconds() / options.iterations; double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; diff --git a/examples/00_bmg_gemm/00_bmg_gemm_padded.cpp b/examples/00_bmg_gemm/00_bmg_gemm_padded.cpp index 1551b3027d..b231825fe7 100644 --- a/examples/00_bmg_gemm/00_bmg_gemm_padded.cpp +++ b/examples/00_bmg_gemm/00_bmg_gemm_padded.cpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -235,8 +236,8 @@ struct ExampleRunner { M_ACD * N_D // batch_stride_D ); - // CUTLASS on SYCL uses the compatibility library syclcompat for e.g. default in-order queue - syclcompat::wait(); + // CUTLASS on SYCL uses the compatibility library compat for e.g. default in-order queue + compat::wait(); // Check if output from CUTLASS kernel and reference kernel are equal or not bool passed = cutlass::reference::device::BlockCompareEqual( @@ -308,7 +309,7 @@ struct ExampleRunner { // Run the GEMM CUTLASS_CHECK(gemm_op.run()); - syclcompat::wait(); + compat::wait(); // Verify that the result is correct bool passed = verify(problem_size, options.alpha, options.beta); @@ -322,7 +323,7 @@ struct ExampleRunner { for (int i = 0; i < options.iterations; ++i) { gemm_op.run(); } - syclcompat::wait(); + compat::wait(); float cute_time = timer.seconds() / options.iterations; double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; diff --git a/examples/00_bmg_gemm/00_bmg_gemm_with_sycl_queue.cpp b/examples/00_bmg_gemm/00_bmg_gemm_with_sycl_queue.cpp index a9c3f246b8..67e1193e75 100644 --- a/examples/00_bmg_gemm/00_bmg_gemm_with_sycl_queue.cpp +++ b/examples/00_bmg_gemm/00_bmg_gemm_with_sycl_queue.cpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -243,7 +244,7 @@ struct ExampleRunner { cutlass::Status run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) { ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l}; - auto q = syclcompat::create_queue(); + auto q = compat::create_queue(); Memory mem(q, problem_size); initialize(problem_size, mem); diff --git a/examples/01_bmg_gemm_with_collective_builder/01_bmg_gemm_with_collective_builder.cpp b/examples/01_bmg_gemm_with_collective_builder/01_bmg_gemm_with_collective_builder.cpp index c35ad3cfee..2c15047d49 100644 --- a/examples/01_bmg_gemm_with_collective_builder/01_bmg_gemm_with_collective_builder.cpp +++ b/examples/01_bmg_gemm_with_collective_builder/01_bmg_gemm_with_collective_builder.cpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -201,7 +202,7 @@ struct ExampleRunner { M * N // batch_stride_D ); - syclcompat::wait(); + compat::wait(); using TensorView = cutlass::TensorView; for (int batch = 0, offset = 0; batch < L; batch++, offset += M * N) { @@ -209,7 +210,7 @@ struct ExampleRunner { block_ref_D.get() + offset, LayoutD::packed({M, N}), cutlass::make_Coord(M, N))); } - syclcompat::wait(); + compat::wait(); // Check if output from CUTLASS kernel and reference kernel are equal or not bool passed = cutlass::reference::device::BlockCompareEqual( @@ -264,7 +265,7 @@ struct ExampleRunner { // Run the GEMM CUTLASS_CHECK(gemm_op.run()); - syclcompat::wait(); + compat::wait(); // Verify that the result is correct bool passed = verify(problem_size, options.alpha, options.beta); @@ -278,7 +279,7 @@ struct ExampleRunner { for (int i = 0; i < options.iterations; ++i) { gemm_op.run(); } - syclcompat::wait(); + compat::wait(); float cute_time = timer.seconds() / options.iterations; double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; diff --git a/examples/02_bmg_gemm_mixed_dtype/02_bmg_gemm_bf16_s8_bf16.cpp b/examples/02_bmg_gemm_mixed_dtype/02_bmg_gemm_bf16_s8_bf16.cpp index 195d44409a..f0e34f39a4 100755 --- a/examples/02_bmg_gemm_mixed_dtype/02_bmg_gemm_bf16_s8_bf16.cpp +++ b/examples/02_bmg_gemm_mixed_dtype/02_bmg_gemm_bf16_s8_bf16.cpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2025 - 2025 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -459,7 +460,7 @@ struct ExampleRunner { // Run the GEMM CUTLASS_CHECK(gemm_op.run()); - syclcompat::wait(); + compat::wait(); // Verify that the result is correct bool passed = verify(options); @@ -473,7 +474,7 @@ struct ExampleRunner { for (int i = 0; i < options.iterations; ++i) { gemm_op.run(); } - syclcompat::wait(); + compat::wait(); float cute_time = timer.seconds() / options.iterations; double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; diff --git a/examples/02_bmg_gemm_mixed_dtype/02_bmg_gemm_f16_s8_f16_tensorwise.cpp b/examples/02_bmg_gemm_mixed_dtype/02_bmg_gemm_f16_s8_f16_tensorwise.cpp index 845e683df4..37b486f622 100755 --- a/examples/02_bmg_gemm_mixed_dtype/02_bmg_gemm_f16_s8_f16_tensorwise.cpp +++ b/examples/02_bmg_gemm_mixed_dtype/02_bmg_gemm_f16_s8_f16_tensorwise.cpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2025 - 2025 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -193,10 +194,10 @@ struct ExampleRunner { SrcT* h_src = new SrcT[size * L]; ElementScale* scale_h = new ElementScale[L]; ElementZero* zero_h = new ElementZero[L]; - syclcompat::memcpy(h_src, d_src, size * L * sizeof(SrcT)); - syclcompat::wait(); - syclcompat::memcpy(scale_h, scale, L * sizeof(ElementScale)); - syclcompat::memcpy(zero_h, zero, L * sizeof(ElementZero)); + compat::memcpy(h_src, d_src, size * L * sizeof(SrcT)); + compat::wait(); + compat::memcpy(scale_h, scale, L * sizeof(ElementScale)); + compat::memcpy(zero_h, zero, L * sizeof(ElementZero)); DstT* h_dst = new DstT[size * L]; for(size_t j = 0; j < L; ++j) { @@ -205,8 +206,8 @@ struct ExampleRunner { } } - syclcompat::memcpy(d_dst, h_dst, size * sizeof(DstT)); - syclcompat::wait(); + compat::memcpy(d_dst, h_dst, size * sizeof(DstT)); + compat::wait(); } bool verify(const ProblemShapeType& problem_size, ElementCompute alpha, ElementCompute beta) { @@ -245,7 +246,7 @@ struct ExampleRunner { M * N, M * N ); - syclcompat::wait(); + compat::wait(); bool passed = cutlass::reference::device::BlockCompareEqual( block_ref_D.get(), block_D.get(), block_D.size()); @@ -311,7 +312,7 @@ struct ExampleRunner { // Run the GEMM CUTLASS_CHECK(gemm_op.run()); - syclcompat::wait(); + compat::wait(); // Verify that the result is correct bool passed = verify(problem_size, options.alpha, options.beta); @@ -325,7 +326,7 @@ struct ExampleRunner { for (int i = 0; i < options.iterations; ++i) { gemm_op.run(); } - syclcompat::wait(); + compat::wait(); float cute_time = timer.seconds() / options.iterations; double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; diff --git a/examples/02_bmg_gemm_mixed_dtype/02_bmg_gemm_f16_u4_f16.cpp b/examples/02_bmg_gemm_mixed_dtype/02_bmg_gemm_f16_u4_f16.cpp index 5aa90672ae..335976d8ac 100755 --- a/examples/02_bmg_gemm_mixed_dtype/02_bmg_gemm_f16_u4_f16.cpp +++ b/examples/02_bmg_gemm_mixed_dtype/02_bmg_gemm_f16_u4_f16.cpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2025 - 2025 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -392,7 +393,7 @@ struct ExampleRunner { std::vector zero(size(zero_layout) * sizeof_bits_v / 8, 0); cutlass::device_memory::copy_to_host(zero.data(), (uint8_t*)zero_buffer, zero.size()); - syclcompat::wait(); + compat::wait(); auto dst_tensor = make_tensor(make_gmem_ptr(reinterpret_cast(dst.data())), operand_layout); @@ -446,13 +447,17 @@ struct ExampleRunner { } cutlass::device_memory::copy_to_device(dq_buffer, (DequantizedElement*)(raw_pointer_cast(dst_tensor.data())), dst_tensor.size()); - syclcompat::wait(); + compat::wait(); } /// Initialize operands to be used in the GEMM and reference GEMM void initialize(Options const& options) { - auto [M, N, K, L] = ProblemShapeType{options.m, options.n, options.k, options.l}; + auto problem_shape = ProblemShapeType{options.m, options.n, options.k, options.l}; + auto& M = cute::get<0>(problem_shape); + auto& N = cute::get<1>(problem_shape); + auto& K = cute::get<2>(problem_shape); + auto& L = cute::get<3>(problem_shape); auto zero_elements_packed_along_k = get<0>(StrideZero{}); const int scale_k = cute::ceil_div(options.k, options.g); @@ -550,7 +555,7 @@ struct ExampleRunner { // Run the GEMM CUTLASS_CHECK(gemm_op.run()); - syclcompat::wait(); + compat::wait(); // Verify that the result is correct bool passed = verify(options); @@ -570,7 +575,7 @@ struct ExampleRunner { for (int i = 0; i < options.iterations; ++i) { gemm_op.run(); } - syclcompat::wait(); + compat::wait(); float cute_time = timer.seconds() / options.iterations; double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; diff --git a/examples/02_bmg_gemm_mixed_dtype/02_bmg_gemm_f16_u4_s8.cpp b/examples/02_bmg_gemm_mixed_dtype/02_bmg_gemm_f16_u4_s8.cpp index be18a9b170..efd355eeff 100755 --- a/examples/02_bmg_gemm_mixed_dtype/02_bmg_gemm_f16_u4_s8.cpp +++ b/examples/02_bmg_gemm_mixed_dtype/02_bmg_gemm_f16_u4_s8.cpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2025 - 2025 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -374,7 +375,7 @@ struct ExampleRunner { std::vector zero(size(zero_layout) * sizeof_bits_v / 8, 0); cutlass::device_memory::copy_to_host(zero.data(), (uint8_t*)zero_buffer, zero.size()); - syclcompat::wait(); + compat::wait(); auto dst_tensor = make_tensor(make_gmem_ptr(reinterpret_cast(dst.data())), operand_layout); @@ -428,7 +429,7 @@ struct ExampleRunner { } cutlass::device_memory::copy_to_device(dq_buffer, (DequantizedElement*)(raw_pointer_cast(dst_tensor.data())), dst_tensor.size()); - syclcompat::wait(); + compat::wait(); } template < @@ -463,7 +464,7 @@ struct ExampleRunner { std::vector zero(size(zero_layout) * sizeof_bits_v / 8, 0); cutlass::device_memory::copy_to_host(zero.data(), (uint8_t*)zero_buffer, zero.size()); - syclcompat::wait(); + compat::wait(); auto dst_tensor = make_tensor(make_gmem_ptr(reinterpret_cast(dst.data())), select<1, 0, 2>(operand_layout)); @@ -536,7 +537,7 @@ struct ExampleRunner { } cutlass::device_memory::copy_to_device(dq_buffer, (DequantizedElement*)(raw_pointer_cast(dst_tensor.data())), dst_tensor.size()); - syclcompat::wait(); + compat::wait(); } /// Initialize operands to be used in the GEMM and reference GEMM @@ -594,7 +595,7 @@ struct ExampleRunner { auto layout_scale = make_layout(shape_scale, stride_S); auto layout_zero = make_layout(shape_zero, stride_Z); - syclcompat::wait(); + compat::wait(); // Note that we are overwriting the relevant `block_X_dq` here, both were // filled by initialize_mixed_dtype_block above @@ -641,7 +642,7 @@ struct ExampleRunner { // Run the GEMM CUTLASS_CHECK(gemm_op.run()); - syclcompat::wait(); + compat::wait(); // Verify that the result is correct bool passed = verify(options); @@ -661,7 +662,7 @@ struct ExampleRunner { for (int i = 0; i < options.iterations; ++i) { gemm_op.run(); } - syclcompat::wait(); + compat::wait(); float cute_time = timer.seconds() / options.iterations; double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; diff --git a/examples/03_bmg_gemm_streamk/03_bmg_gemm_streamk.cpp b/examples/03_bmg_gemm_streamk/03_bmg_gemm_streamk.cpp index 0116819340..3c0b0a1f8c 100644 --- a/examples/03_bmg_gemm_streamk/03_bmg_gemm_streamk.cpp +++ b/examples/03_bmg_gemm_streamk/03_bmg_gemm_streamk.cpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -85,8 +86,6 @@ using namespace cute; /////////////////////////////////////////////////////////////////////////////////////////////////// -#define CUTLASS_SYCL_PROFILING_ENABLED - // Command line options parsing struct Options { @@ -233,7 +232,7 @@ struct ExampleRunner { M * N // batch_stride_D ); - syclcompat::wait(); + compat::wait(); // Check if output from CUTLASS kernel and reference kernel are equal or not bool passed = cutlass::reference::device::BlockCompareEqual( @@ -293,7 +292,7 @@ struct ExampleRunner { // Run the GEMM CUTLASS_CHECK(gemm_op.run()); - syclcompat::wait(); + compat::wait(); // Verify that the result is correct bool passed = verify(problem_size, options.alpha, options.beta); @@ -302,13 +301,13 @@ struct ExampleRunner { if(!passed) return cutlass::Status::kErrorInternal; if (options.iterations > 0) { - GPU_Clock timer; float elapsed_time_seconds = 0.f; for (int i = 0; i < options.iterations; ++i) { + GPU_Clock timer; gemm_op.initialize(arguments, workspace.get()); timer.start(); gemm_op.run(); - syclcompat::wait(); + compat::wait(); elapsed_time_seconds += timer.seconds(); } diff --git a/examples/04_bmg_grouped_gemm/04_bmg_grouped_gemm.cpp b/examples/04_bmg_grouped_gemm/04_bmg_grouped_gemm.cpp index bdda0536d2..ffc01d0825 100644 --- a/examples/04_bmg_grouped_gemm/04_bmg_grouped_gemm.cpp +++ b/examples/04_bmg_grouped_gemm/04_bmg_grouped_gemm.cpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -96,7 +97,6 @@ using ElementOutput = float; // <- data type of elements in output matr /////////////////////////////////////////////////////////////////////////////////////////////////// -#define CUTLASS_SYCL_PROFILING_ENABLED // Command line options parsing struct Options { @@ -289,7 +289,7 @@ struct ExampleRunner { ); // Wait for kernel to finish - syclcompat::wait(); + compat::wait(); // Check if output from CUTLASS kernel and reference kernel are equal or not passed &= cutlass::reference::device::BlockCompareEqual(block_ref_D.get() + offset_D.at(i), block_D.get() + offset_D.at(i), M * N); @@ -495,7 +495,7 @@ void initialize(const Options &options) { // Run the GEMM CUTLASS_CHECK(gemm_op.run()); - syclcompat::wait(); + compat::wait(); // Verify that the result is correct bool passed = verify(options); @@ -509,7 +509,7 @@ void initialize(const Options &options) { for (int iter = 0; iter < options.iterations; ++iter) { CUTLASS_CHECK(gemm_op.run()); } - syclcompat::wait(); + compat::wait(); float cute_time = timer.seconds() * 1000; double cute_average_time = double(cute_time) / double(options.iterations); diff --git a/examples/05_bmg_gemm_with_epilogues/05_bmg_gemm_single_b_with_per_col_bias.cpp b/examples/05_bmg_gemm_with_epilogues/05_bmg_gemm_single_b_with_per_col_bias.cpp index aee65fa6fb..44851a64b7 100644 --- a/examples/05_bmg_gemm_with_epilogues/05_bmg_gemm_single_b_with_per_col_bias.cpp +++ b/examples/05_bmg_gemm_with_epilogues/05_bmg_gemm_single_b_with_per_col_bias.cpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2025 - 2025 Codeplay Software Ltd. 
All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -213,7 +214,7 @@ struct ExampleRunner { get<2>(stride_D) // batch_stride_D ); - syclcompat::wait(); + compat::wait(); for(int batch = 0, offset = 0; batch < L; batch++, offset += M * N) { auto D_view = @@ -227,7 +228,7 @@ struct ExampleRunner { cutlass::reference::device::TensorPerColBias(D_view, bias_view); } - syclcompat::wait(); + compat::wait(); // Check if output from CUTLASS kernel and reference kernel are equal or not bool passed = cutlass::reference::device::BlockCompareEqual( @@ -299,7 +300,7 @@ struct ExampleRunner { // Run the GEMM CUTLASS_CHECK(gemm_op.run()); - syclcompat::wait(); + compat::wait(); // Verify that the result is correct bool passed = verify(problem_size, options.alpha, options.beta); @@ -313,7 +314,7 @@ struct ExampleRunner { for (int i = 0; i < options.iterations; ++i) { gemm_op.run(); } - syclcompat::wait(); + compat::wait(); float cute_time = timer.seconds() / options.iterations; double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; diff --git a/examples/05_bmg_gemm_with_epilogues/05_bmg_gemm_with_epilogue_gelu.cpp b/examples/05_bmg_gemm_with_epilogues/05_bmg_gemm_with_epilogue_gelu.cpp index 237fd9b417..0d330b0360 100644 --- a/examples/05_bmg_gemm_with_epilogues/05_bmg_gemm_with_epilogue_gelu.cpp +++ b/examples/05_bmg_gemm_with_epilogues/05_bmg_gemm_with_epilogue_gelu.cpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -205,7 +206,7 @@ struct ExampleRunner { M * N // batch_stride_D ); - syclcompat::wait(); + compat::wait(); using TensorView = cutlass::TensorView; for(int batch = 0, offset = 0; batch < L; batch++, offset += M * N) { @@ -213,7 +214,7 @@ struct ExampleRunner { cutlass::make_Coord(M, N))); } - syclcompat::wait(); + compat::wait(); // Check if output from CUTLASS kernel and reference kernel are equal or not bool passed = cutlass::reference::device::BlockCompareEqual( @@ -268,7 +269,7 @@ struct ExampleRunner { // Run the GEMM CUTLASS_CHECK(gemm_op.run()); - syclcompat::wait(); + compat::wait(); // Verify that the result is correct bool passed = verify(problem_size, options.alpha, options.beta); @@ -282,7 +283,7 @@ struct ExampleRunner { for (int i = 0; i < options.iterations; ++i) { gemm_op.run(); } - syclcompat::wait(); + compat::wait(); float cute_time = timer.seconds() / options.iterations; double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; diff --git a/examples/05_bmg_gemm_with_epilogues/05_bmg_gemm_with_epilogue_lincombdeeltact.cpp b/examples/05_bmg_gemm_with_epilogues/05_bmg_gemm_with_epilogue_lincombdeeltact.cpp index 018d1425c5..1cdf5d5f10 100644 --- a/examples/05_bmg_gemm_with_epilogues/05_bmg_gemm_with_epilogue_lincombdeeltact.cpp +++ b/examples/05_bmg_gemm_with_epilogues/05_bmg_gemm_with_epilogue_lincombdeeltact.cpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -316,7 +317,7 @@ struct ExampleRunner { // Run the GEMM CUTLASS_CHECK(gemm_op.run()); - syclcompat::wait(); + compat::wait(); // Verify that the result is correct bool passed = verify(problem_size, options.alpha, options.beta); @@ -330,7 +331,7 @@ struct ExampleRunner { for (int i = 0; i < options.iterations; ++i) { gemm_op.run(); } - syclcompat::wait(); + compat::wait(); float cute_time = timer.seconds() / options.iterations; double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; diff --git a/examples/05_bmg_gemm_with_epilogues/05_bmg_gemm_with_epilogue_relu.cpp b/examples/05_bmg_gemm_with_epilogues/05_bmg_gemm_with_epilogue_relu.cpp index b9d68e1a03..1a21713b34 100644 --- a/examples/05_bmg_gemm_with_epilogues/05_bmg_gemm_with_epilogue_relu.cpp +++ b/examples/05_bmg_gemm_with_epilogues/05_bmg_gemm_with_epilogue_relu.cpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -205,7 +206,7 @@ struct ExampleRunner { M * N // batch_stride_D ); - syclcompat::wait(); + compat::wait(); using TensorView = cutlass::TensorView; for(int batch = 0, offset = 0; batch < L; batch++, offset += M * N) { @@ -213,7 +214,7 @@ struct ExampleRunner { cutlass::make_Coord(M, N))); } - syclcompat::wait(); + compat::wait(); // Check if output from CUTLASS kernel and reference kernel are equal or not bool passed = cutlass::reference::device::BlockCompareEqual( @@ -268,7 +269,7 @@ struct ExampleRunner { // Run the GEMM CUTLASS_CHECK(gemm_op.run()); - syclcompat::wait(); + compat::wait(); // Verify that the result is correct bool passed = verify(problem_size, options.alpha, options.beta); @@ -282,7 +283,7 @@ struct ExampleRunner { for (int i = 0; i < options.iterations; ++i) { gemm_op.run(); } - syclcompat::wait(); + compat::wait(); float cute_time = timer.seconds() / options.iterations; double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; diff --git a/examples/05_bmg_gemm_with_epilogues/05_bmg_gemm_with_epilogue_silu.cpp b/examples/05_bmg_gemm_with_epilogues/05_bmg_gemm_with_epilogue_silu.cpp index f664d9622d..d4f040ad33 100644 --- a/examples/05_bmg_gemm_with_epilogues/05_bmg_gemm_with_epilogue_silu.cpp +++ b/examples/05_bmg_gemm_with_epilogues/05_bmg_gemm_with_epilogue_silu.cpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -204,7 +205,7 @@ struct ExampleRunner { M * N // batch_stride_D ); - syclcompat::wait(); + compat::wait(); using TensorView = cutlass::TensorView; for(int batch = 0, offset = 0; batch < L; batch++, offset += M * N) { @@ -212,7 +213,7 @@ struct ExampleRunner { cutlass::make_Coord(M, N))); } - syclcompat::wait(); + compat::wait(); // Check if output from CUTLASS kernel and reference kernel are equal or not bool passed = cutlass::reference::device::BlockCompareEqual( @@ -267,7 +268,7 @@ struct ExampleRunner { // Run the GEMM CUTLASS_CHECK(gemm_op.run()); - syclcompat::wait(); + compat::wait(); // Verify that the result is correct bool passed = verify(problem_size, options.alpha, options.beta); @@ -281,7 +282,7 @@ struct ExampleRunner { for (int i = 0; i < options.iterations; ++i) { gemm_op.run(); } - syclcompat::wait(); + compat::wait(); float cute_time = timer.seconds() / options.iterations; double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; diff --git a/examples/05_bmg_gemm_with_epilogues/05_bmg_gemm_with_epilogue_softmax.cpp b/examples/05_bmg_gemm_with_epilogues/05_bmg_gemm_with_epilogue_softmax.cpp index 099b17f1fd..1050842066 100644 --- a/examples/05_bmg_gemm_with_epilogues/05_bmg_gemm_with_epilogue_softmax.cpp +++ b/examples/05_bmg_gemm_with_epilogues/05_bmg_gemm_with_epilogue_softmax.cpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -205,14 +206,14 @@ struct ExampleRunner { M * N // batch_stride_D ); - syclcompat::wait(); + compat::wait(); std::vector ptr(M*N*L); std::vector ptr_refD(M*N*L); - syclcompat::memcpy(ptr.data(), block_ref_D.get(), + compat::memcpy(ptr.data(), block_ref_D.get(), M * N * L * sizeof(ElementOutput)); - syclcompat::memcpy(ptr_refD.data(), block_D.get(), + compat::memcpy(ptr_refD.data(), block_D.get(), (size_t)M * N * L * sizeof(ElementOutput)); // Verify using a manual row-wise softmax on the host @@ -324,7 +325,7 @@ struct ExampleRunner { // Run the GEMM gemm_op.run(); - syclcompat::wait(); + compat::wait(); // Verify that the result is correct bool passed = verify(problem_size, options.alpha, options.beta); @@ -337,7 +338,7 @@ struct ExampleRunner { for (int i = 0; i < options.iterations; ++i) { gemm_op.run(); } - syclcompat::wait(); + compat::wait(); double io = options.l * (options.m * options.k * sizeof(ElementA) + options.k * options.n * sizeof(ElementB) + diff --git a/examples/05_bmg_gemm_with_epilogues/05_bmg_gemm_with_epilogue_splitk.cpp b/examples/05_bmg_gemm_with_epilogues/05_bmg_gemm_with_epilogue_splitk.cpp index 1a88ac4583..c4570ec3f3 100644 --- a/examples/05_bmg_gemm_with_epilogues/05_bmg_gemm_with_epilogue_splitk.cpp +++ b/examples/05_bmg_gemm_with_epilogues/05_bmg_gemm_with_epilogue_splitk.cpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -204,7 +205,7 @@ struct ExampleRunner { M * N // batch_stride_D ); - syclcompat::wait(); + compat::wait(); auto D_shape = make_shape(M, N, L); auto D1_shape = make_shape(M, NUM_HEAD, NOPE_DIM, L); @@ -215,8 +216,8 @@ struct ExampleRunner { auto D1 = std::vector(size(D1_shape)); // 256x128x128 auto D2 = std::vector(size(D2_shape)); - syclcompat::memcpy(D.data(), block_ref_D.get(), size(D_shape)); - syclcompat::wait(); + compat::memcpy(D.data(), block_ref_D.get(), size(D_shape)); + compat::wait(); for (int l = 0; l < L; l++) { for (int i = 0; i < M; i++) { @@ -235,15 +236,15 @@ struct ExampleRunner { } auto test_D = std::vector(size(D_shape)); - syclcompat::memcpy(test_D.data(), block_D.get(), size(D_shape)); + compat::memcpy(test_D.data(), block_D.get(), size(D_shape)); // 256x128x64 auto test_D1 = std::vector(size(D1_shape)); // 256x128x128 auto test_D2 = std::vector(size(D2_shape)); - syclcompat::memcpy(test_D1.data(), block_D1.get(), size(D1_shape)); - syclcompat::memcpy(test_D2.data(), block_D2.get(), size(D2_shape)); - syclcompat::wait(); + compat::memcpy(test_D1.data(), block_D1.get(), size(D1_shape)); + compat::memcpy(test_D2.data(), block_D2.get(), size(D2_shape)); + compat::wait(); uint32_t err_cnt = 0; constexpr float atol = 1e-4; @@ -358,7 +359,7 @@ struct ExampleRunner { // Run the GEMM gemm_op.run(); - syclcompat::wait(); + compat::wait(); // Verify that the result is correct bool passed = verify(problem_size, splitk_size, options.alpha, options.beta); diff --git a/examples/05_bmg_gemm_with_epilogues/05_bmg_gemm_with_per_row_bias.cpp b/examples/05_bmg_gemm_with_epilogues/05_bmg_gemm_with_per_row_bias.cpp index 26203992ca..fb177c155a 100644 --- a/examples/05_bmg_gemm_with_epilogues/05_bmg_gemm_with_per_row_bias.cpp +++ b/examples/05_bmg_gemm_with_epilogues/05_bmg_gemm_with_per_row_bias.cpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -207,7 +208,7 @@ struct ExampleRunner { M * N // batch_stride_D ); - syclcompat::wait(); + compat::wait(); for(int batch = 0, offset = 0; batch < L; batch++, offset += M * N) { auto D_view = @@ -221,7 +222,7 @@ struct ExampleRunner { cutlass::reference::device::TensorPerRowBias(D_view, bias_view); } - syclcompat::wait(); + compat::wait(); // Check if output from CUTLASS kernel and reference kernel are equal or not bool passed = cutlass::reference::device::BlockCompareEqual( @@ -295,7 +296,7 @@ struct ExampleRunner { // Run the GEMM CUTLASS_CHECK(gemm_op.run()); - syclcompat::wait(); + compat::wait(); // Verify that the result is correct bool passed = verify(problem_size, options.alpha, options.beta); @@ -309,7 +310,7 @@ struct ExampleRunner { for (int i = 0; i < options.iterations; ++i) { gemm_op.run(); } - syclcompat::wait(); + compat::wait(); float cute_time = timer.seconds() / options.iterations; double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; diff --git a/examples/06_bmg_flash_attention/06_bmg_chunk_prefill.cpp b/examples/06_bmg_flash_attention/06_bmg_chunk_prefill.cpp new file mode 100644 index 0000000000..e3e3c2d9cb --- /dev/null +++ b/examples/06_bmg_flash_attention/06_bmg_chunk_prefill.cpp @@ -0,0 +1,116 @@ +/*************************************************************************************************** + * Copyright (C) 2025 Intel Corporation, All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Flash Attention V2 Prefill for Intel BMG + + This example constructs and executes a Flash Attention Prefill with KV cache on Intel BMG. The + definition of the GEMM, options etc for this example are defined in the associated + bmg_flash_attn_cachedKV_runner.hpp header file. 
+
+  See https://arxiv.org/pdf/2307.08691 for details of Flash Attention V2 algorithm
+
+  To run this example:
+  $ ./examples/sycl/06_bmg_flash_attention/06_bmg_chunk_prefill_hdim128 --seq_len_qo=512
+    --seq_len_kv=512 --seq_len_kv_cache=512 --head_size_vo=128 --head_size_qk=128
+
+  Causal masking of the first matrix multiplication is supported (`--is_causal`)
+
+  To build & run this example (from your build dir):
+
+  $ ninja 06_bmg_chunk_prefill_hdim128
+  $ ./examples/sycl/06_bmg_flash_attention/06_bmg_chunk_prefill_hdim128
+
+  Call with `--help` for information about available options
+*/
+
+#include "bmg_flash_chunk_prefill_runner.hpp"
+
+int main(int argc, const char **argv) {
+  //
+  // Parse options
+  //
+
+  Options options;
+
+  options.parse(argc, argv);
+
+  if (options.help) {
+    options.print_usage(std::cout) << std::endl;
+    return 0;
+  }
+
+  if (options.error) {
+    std::cerr << "Aborting execution." << std::endl;
+    return -1;
+  }
+
+  // Define the work-group tile shape depending on the head-size of the second matmul
+  // Shape<_SequenceLengthOutputBLOCK, _HeadSizeout(NV), SequenceLengthKVBLOCK_KN/KV, HeadSizeQKBLOCK_KQK, HEADSIZEOutSlicerBlock>
+  //
+#if !defined(HEAD_DIM)
+  std::cerr << "HEAD_DIM must be defined" << std::endl;
+  return -1;
+#endif
+  if (options.head_size_vo != HEAD_DIM) {
+    std::cerr << "head_size_vo must be " << HEAD_DIM << ", but got " << options.head_size_vo << std::endl;
+    return -1;
+  }
+
+  constexpr int PipelineStages = 2;
+#if HEAD_DIM == 64
+  using ShapeQK = Shape<_128, _64, _64>;
+  using ShapePV = Shape<_128, _32, _64>;
+  using ShapeOutPut = Shape<_128, _64, _64>;
+  using SubgroupLayout = Layout, Stride<_1, _1, _1>>;
+#elif HEAD_DIM == 96
+  using ShapeQK = Shape<_128, _64, _32>;
+  using ShapePV = Shape<_128, _32, _64>;
+  using ShapeOutPut = Shape<_128, _96, _64>;
+  using SubgroupLayout = Layout, Stride<_1, _1, _1>>;
+#elif HEAD_DIM == 128
+  using ShapeQK = Shape<_128, _64, _64>;
+  using ShapePV = Shape<_128, _32, _64>;
+  using ShapeOutPut = Shape<_128, _128, _64>;
+  using SubgroupLayout = Layout, Stride<_1, _1, _1>>;
+#elif HEAD_DIM == 192
+  using ShapeQK = Shape<_256, _64, _64>;
+  using ShapePV = Shape<_256, _32, _64>;
+  using ShapeOutPut = Shape<_256, _192, _64>;
+  using SubgroupLayout = Layout, Stride<_1, _1, _1>>;
+#endif
+  if (options.is_causal) {
+    FMHAConfig::run(options);
+  } else if (options.is_local_mask) {
+    FMHAConfig::run(options);
+  } else {
+    FMHAConfig::run(options);
+  }
+}
diff --git a/examples/06_bmg_flash_attention/06_bmg_chunk_prefill_fp8.cpp b/examples/06_bmg_flash_attention/06_bmg_chunk_prefill_fp8.cpp
new file mode 100644
index 0000000000..0d03b8bc5d
--- /dev/null
+++ b/examples/06_bmg_flash_attention/06_bmg_chunk_prefill_fp8.cpp
@@ -0,0 +1,171 @@
+/***************************************************************************************************
+ * Copyright (C) 2025 Intel Corporation, All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3.
Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Flash Attention V2 Prefill (FP8) for Intel BMG
+
+  This example constructs and executes a Flash Attention Prefill with KV cache on Intel BMG. The
+  definition of the GEMM, options etc for this example are defined in the associated
+  bmg_flash_chunk_prefill_runner.hpp header file.
+
+  See https://arxiv.org/pdf/2307.08691 for details of Flash Attention V2 algorithm
+
+  To run this example:
+  $ ./examples/sycl/06_bmg_flash_attention/06_bmg_chunk_prefill_fp8_hdim128 --seq_len_qo=512
+    --seq_len_kv=512 --seq_len_kv_cache=512 --head_size_vo=128 --head_size_qk=128
+
+  Causal masking of the first matrix multiplication is supported (`--is_causal`)
+
+  To build & run this example (from your build dir):
+
+  $ ninja 06_bmg_chunk_prefill_fp8_hdim128
+  $ ./examples/sycl/06_bmg_flash_attention/06_bmg_chunk_prefill_fp8_hdim128
+
+  Call with `--help` for information about available options
+*/
+
+#include "bmg_flash_chunk_prefill_runner.hpp"
+
+int main(int argc, const char **argv) {
+  //
+  // Parse options
+  //
+
+  Options options;
+  // Override the default data type for this test
+  // options.dtype = "fp8";
+  options.parse(argc, argv);
+
+  if (options.help) {
+    options.print_usage(std::cout) << std::endl;
+    return 0;
+  }
+
+  if (options.error) {
+    std::cerr << "Aborting execution." << std::endl;
+    return -1;
+  }
+
+#if !defined(HEAD_DIM)
+  std::cerr << "HEAD_DIM must be defined" << std::endl;
+  return -1;
+#endif
+  if (options.head_size_vo != HEAD_DIM) {
+    std::cerr << "head_size_vo must be " << HEAD_DIM << ", but got " << options.head_size_vo << std::endl;
+    return -1;
+  }
+
+  // =================================================================================================
+  // Scale Factor Tensor Creation
+  // =================================================================================================
+  // 1. Create FP32 tensors for the scale factors.
+  //    The shape is (batch_size, num_heads_q) as each head can have a different scale.
+  size_t scale_tensor_size = options.batch * options.num_heads_q;
+  std::vector<float> q_scale_host(scale_tensor_size);
+  std::vector<float> k_scale_host(scale_tensor_size);
+  std::vector<float> v_scale_host(scale_tensor_size);
+
+  // 2. Fill host vectors with desired values.
+  std::fill(q_scale_host.begin(), q_scale_host.end(), 1.5f);
+  std::fill(k_scale_host.begin(), k_scale_host.end(), 2.0f);
+  std::fill(v_scale_host.begin(), v_scale_host.end(), 2.5f);
+
+  // 3.
Create device allocations and copy data from host to device. + cutlass::DeviceAllocation q_scale_dev; + cutlass::DeviceAllocation k_scale_dev; + cutlass::DeviceAllocation v_scale_dev; + + q_scale_dev.reset(scale_tensor_size); + k_scale_dev.reset(scale_tensor_size); + v_scale_dev.reset(scale_tensor_size); + + q_scale_dev.copy_from_host(q_scale_host.data()); + k_scale_dev.copy_from_host(k_scale_host.data()); + v_scale_dev.copy_from_host(v_scale_host.data()); + + // 4. Get the raw float* pointers from the device allocations. + const float* q_scale = q_scale_dev.get(); + const float* k_scale = k_scale_dev.get(); + const float* v_scale = v_scale_dev.get(); + + // ================================================================================================= + // FP8 Type Definitions + // ================================================================================================= + using ElementInputQ = cutlass::float_e5m2_t; // <- data type of elements in input matrix A + using ElementInputKV = cutlass::float_e5m2_t; // <- data type of elements in input matrix B + using MMAOperation = XE_8x16x16_F32F16F16F32_TT; + using GmemTiledCopyQ = XE_2D_U8x8x32_LD_N; + using GmemTiledCopyK = XE_2D_U8x16x16_LD_T; // _T designates a transposed block load operation + using GmemTiledCopyV = XE_2D_U8x32x32_LD_V; + + constexpr int PipelineStages = 2; + + // ================================================================================================= + // Tile Shape Definitions + // ================================================================================================= +#if HEAD_DIM == 64 + using ShapeQK = Shape<_128, _64, _64>; + using ShapePV = Shape<_128, _32, _64>; + using ShapeOutPut = Shape<_128, _64, _64>; + using SubgroupLayout = Layout, Stride<_1, _1, _1>>; +#elif HEAD_DIM == 96 + using ShapeQK = Shape<_128, _64, _32>; + using ShapePV = Shape<_128, _32, _64>; + using ShapeOutPut = Shape<_128, _96, _64>; + using SubgroupLayout = Layout, Stride<_1, _1, _1>>; +#elif HEAD_DIM == 128 + using ShapeQK = Shape<_128, _64, _64>; + using ShapePV = Shape<_128, _32, _64>; + using ShapeOutPut = Shape<_128, _128, _64>; + using SubgroupLayout = Layout, Stride<_1, _1, _1>>; +#elif HEAD_DIM == 192 + using ShapeQK = Shape<_256, _64, _64>; + using ShapePV = Shape<_256, _32, _64>; + using ShapeOutPut = Shape<_256, _192, _64>; + using SubgroupLayout = Layout, Stride<_1, _1, _1>>; +#endif + + // ================================================================================================= + // Kernel Launch + // ================================================================================================= + if (options.is_causal) { + FMHAConfig::run(options, q_scale, k_scale, v_scale); + } else if (options.is_local_mask) { + FMHAConfig::run(options, q_scale, k_scale, v_scale); + } else { + FMHAConfig::run(options, q_scale, k_scale, v_scale); + } +} diff --git a/examples/06_bmg_flash_attention/CMakeLists.txt b/examples/06_bmg_flash_attention/CMakeLists.txt index 39752da4ed..e73aa4131f 100644 --- a/examples/06_bmg_flash_attention/CMakeLists.txt +++ b/examples/06_bmg_flash_attention/CMakeLists.txt @@ -1,4 +1,5 @@ # Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. +# Copyright (C) 2025 Intel Corporation, All rights reserved. 
# SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without @@ -63,6 +64,19 @@ foreach(HEAD_DIM 64 96 128 192) cutlass_example_add_executable( 06_bmg_decode_attention_fp8_hdim${HEAD_DIM} 06_bmg_decode_attention_fp8.cpp + ) + + cutlass_example_add_executable( + 06_bmg_chunk_prefill_hdim${HEAD_DIM} + 06_bmg_chunk_prefill.cpp + TEST_COMMAND_OPTIONS + TEST_NO_PAGED + TEST_PAGED + ) + + cutlass_example_add_executable( + 06_bmg_chunk_prefill_fp8_hdim${HEAD_DIM} + 06_bmg_chunk_prefill_fp8.cpp TEST_COMMAND_OPTIONS TEST_NO_PAGED TEST_PAGED @@ -72,4 +86,6 @@ foreach(HEAD_DIM 64 96 128 192) target_compile_definitions(06_bmg_decode_attention_hdim${HEAD_DIM} PRIVATE HEAD_DIM=${HEAD_DIM}) target_compile_definitions(06_bmg_prefill_attention_fp8_hdim${HEAD_DIM} PRIVATE HEAD_DIM=${HEAD_DIM}) target_compile_definitions(06_bmg_decode_attention_fp8_hdim${HEAD_DIM} PRIVATE HEAD_DIM=${HEAD_DIM}) + target_compile_definitions(06_bmg_chunk_prefill_hdim${HEAD_DIM} PRIVATE HEAD_DIM=${HEAD_DIM}) + target_compile_definitions(06_bmg_chunk_prefill_fp8_hdim${HEAD_DIM} PRIVATE HEAD_DIM=${HEAD_DIM}) endforeach() diff --git a/examples/06_bmg_flash_attention/bmg_flash_attn_decode_runner.hpp b/examples/06_bmg_flash_attention/bmg_flash_attn_decode_runner.hpp index da5ab6f302..0f3918f213 100644 --- a/examples/06_bmg_flash_attention/bmg_flash_attn_decode_runner.hpp +++ b/examples/06_bmg_flash_attention/bmg_flash_attn_decode_runner.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -201,7 +202,7 @@ template struct ExampleRunner { template void convert_fp8_to_fp16(const SrcT* d_src, DstT* d_dst, size_t size) { - syclcompat::get_default_queue().parallel_for(size, [=](auto indx) { + compat::get_default_queue().parallel_for(size, [=](auto indx) { d_dst[indx] = static_cast(d_src[indx]); }).wait(); } @@ -228,9 +229,9 @@ template struct ExampleRunner { int max_seq_len_q = static_cast(get<3>(problem_size)); int max_seq_len_kv = static_cast(get<4>(problem_size)); int max_seq_len_kv_cache = static_cast(get<5>(problem_size)); - get<3>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_q, cumulative_seqlen_q.data()}; - get<4>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_kv, cumulative_seqlen_kv.data()}; - get<5>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_kv_cache, cumulative_seqlen_kv_cache.data()}; + get<3>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_q, 0, cumulative_seqlen_q.data()}; + get<4>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_kv, 0, cumulative_seqlen_kv.data()}; + get<5>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_kv_cache, 0, cumulative_seqlen_kv_cache.data()}; } auto [batch, num_heads_q, num_heads_kv, head_size_qk, head_size_vo] = cute::select<0,1,2,6,7>(problem_size); @@ -278,29 +279,29 @@ template struct ExampleRunner { cutlass::DeviceAllocation block_V_concat(seq_len_kv_total * head_size_vo); // Concatenate K_cache and K - syclcompat::memcpy( + compat::memcpy( block_K_concat.get(), block_K_cache_.get() + offset_k_cache, seq_len_kv_cache * head_size_qk ); - syclcompat::memcpy( + compat::memcpy( block_K_concat.get() + 
seq_len_kv_cache * head_size_qk, block_K_.get() + offset_k, seq_len_kv * head_size_qk ); // Concatenate V_cache and V - syclcompat::memcpy( + compat::memcpy( block_V_concat.get(), block_V_cache_.get() + offset_v_cache, seq_len_kv_cache * head_size_vo ); - syclcompat::memcpy( + compat::memcpy( block_V_concat.get() + seq_len_kv_cache * head_size_vo, block_V_.get() + offset_v, seq_len_kv * head_size_vo ); - syclcompat::wait(); + compat::wait(); k_ptr = block_K_concat.get(); v_ptr = block_V_concat.get(); @@ -325,11 +326,11 @@ template struct ExampleRunner { seq_len_qo * seq_len_kv_total // batch_stride_S ); - syclcompat::wait(); + compat::wait(); std::vector host_S(block_S.size()); - syclcompat::memcpy(host_S.data(), block_S.get(), host_S.size()); - syclcompat::wait(); + compat::memcpy(host_S.data(), block_S.get(), host_S.size()); + compat::wait(); // delete this memory as it is no longer needed block_S.reset(); @@ -397,8 +398,8 @@ template struct ExampleRunner { cutlass::DeviceAllocation block_P; block_P.reset(host_P.size()); - syclcompat::memcpy(block_P.get(), host_P.data(), host_P.size()); - syclcompat::wait(); + compat::memcpy(block_P.get(), host_P.data(), host_P.size()); + compat::wait(); cutlass::TensorRef ref_P(block_P.get(), LayoutQ::packed({seq_len_qo, seq_len_kv_total})); @@ -416,13 +417,13 @@ template struct ExampleRunner { seq_len_qo * head_size_vo // batch_stride_O ); - syclcompat::wait(); + compat::wait(); // delete this memory as it is no longer needed block_P.reset(); std::vector vec_acc(block_acc.size()); - syclcompat::memcpy(vec_acc.data(), block_acc.get(), vec_acc.size()); - syclcompat::wait(); + compat::memcpy(vec_acc.data(), block_acc.get(), vec_acc.size()); + compat::wait(); // delete this memory as it is no longer needed block_acc.reset(); @@ -430,8 +431,8 @@ template struct ExampleRunner { for(int i = 0; i < vec_out.size(); i++) { vec_out[i] = static_cast(vec_acc[i]); } - syclcompat::memcpy(block_ref_O.get() + offset_o, vec_out.data(), vec_out.size()); - syclcompat::wait(); + compat::memcpy(block_ref_O.get() + offset_o, vec_out.data(), vec_out.size()); + compat::wait(); offset_q += seq_len_qo * head_size_qk; if(kv_group_update % q_group_size == 0) { @@ -445,7 +446,7 @@ template struct ExampleRunner { } } - syclcompat::wait(); + compat::wait(); // Check if output from CUTLASS kernel and reference kernel are equal or not bool passed = cutlass::reference::device::BlockCompareRelativelyEqual(block_ref_O.get(), block_O.get(), @@ -588,11 +589,11 @@ template struct ExampleRunner { page_mapping[logical_idx] = physical_pages[blk]; } } - syclcompat::memcpy(paged_kv_cache.page_table.get(), page_mapping.data(), page_mapping.size() * sizeof(int)); + compat::memcpy(paged_kv_cache.page_table.get(), page_mapping.data(), page_mapping.size() * sizeof(int)); paged_kv_cache.num_pages_per_seq.reset(num_pages_per_seq.size()); - syclcompat::memcpy(paged_kv_cache.num_pages_per_seq.get(), num_pages_per_seq.data(), num_pages_per_seq.size() * sizeof(int)); - syclcompat::wait(); + compat::memcpy(paged_kv_cache.num_pages_per_seq.get(), num_pages_per_seq.data(), num_pages_per_seq.size() * sizeof(int)); + compat::wait(); } initialize_block(block_Q, seed + 2021); @@ -633,24 +634,24 @@ template struct ExampleRunner { // configure smem size and carveout int smem_size = FMHAKernel::SharedStorageSize; - const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z); - const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z); + const auto sycl_block = compat::dim3(block.x, block.y, block.z); + 
const auto sycl_grid = compat::dim3(grid.x, grid.y, grid.z); #if !defined(SYCL_EXT_ONEAPI_WORK_GROUP_SCRATCH_MEMORY) - using namespace syclcompat::experimental; + using namespace compat::experimental; auto event = launch>( launch_policy{sycl_grid, sycl_block, local_mem_size{static_cast(smem_size)}, kernel_properties{sycl_exp::sub_group_size}}, params); #else - syclcompat::experimental::launch_properties launch_props { + compat::experimental::launch_properties launch_props { sycl::ext::oneapi::experimental::work_group_scratch_size(smem_size), }; - syclcompat::experimental::kernel_properties kernel_props{ + compat::experimental::kernel_properties kernel_props{ sycl::ext::oneapi::experimental::sub_group_size }; - syclcompat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props}; - auto event = syclcompat::experimental::launch>(policy, params); + compat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props}; + auto event = compat::experimental::launch>(policy, params); #endif EventManager::getInstance().addEvent(event); @@ -693,7 +694,7 @@ template struct ExampleRunner { // Run the GEMM run(params); - syclcompat::wait(); + compat::wait(); // Verify that the result is correct bool use_kv_cache = options.seq_len_kv_cache > 0; @@ -710,7 +711,7 @@ template struct ExampleRunner { for (int i = 0; i < options.iterations; ++i) { run(params); } - syclcompat::wait(); + compat::wait(); double cute_time = timer.seconds() / options.iterations; diff --git a/examples/06_bmg_flash_attention/bmg_flash_attn_prefill_cachedKV_runner.hpp b/examples/06_bmg_flash_attention/bmg_flash_attn_prefill_cachedKV_runner.hpp index 0600b6ce0f..cb59a4d978 100644 --- a/examples/06_bmg_flash_attention/bmg_flash_attn_prefill_cachedKV_runner.hpp +++ b/examples/06_bmg_flash_attention/bmg_flash_attn_prefill_cachedKV_runner.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -213,9 +214,9 @@ template struct ExampleRunner { int max_seq_len_q = static_cast(get<3>(problem_size)); int max_seq_len_kv = static_cast(get<4>(problem_size)); int max_seq_len_kv_cache = static_cast(get<5>(problem_size)); - get<3>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_q, cumulative_seqlen_q.data()}; - get<4>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_kv, cumulative_seqlen_kv.data()}; - get<5>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_kv_cache, cumulative_seqlen_kv_cache.data()}; + get<3>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_q, 0, cumulative_seqlen_q.data()}; + get<4>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_kv, 0, cumulative_seqlen_kv.data()}; + get<5>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_kv_cache, 0, cumulative_seqlen_kv_cache.data()}; } auto [batch, num_heads_q, num_heads_kv, head_size_qk, head_size_vo] = cute::select<0,1,2,6,7>(problem_size); @@ -256,24 +257,24 @@ template struct ExampleRunner { cutlass::DeviceAllocation block_V_concat(seq_len_kv_total * head_size_vo); // Concatenate K_cache and K - syclcompat::memcpy( + compat::memcpy( block_K_concat.get(), block_K_cache.get() + offset_k_cache, seq_len_kv_cache * head_size_qk ); - syclcompat::memcpy( + compat::memcpy( block_K_concat.get() + seq_len_kv_cache * head_size_qk, block_K.get() + offset_k, seq_len_kv * head_size_qk ); // Concatenate V_cache and V - syclcompat::memcpy( + compat::memcpy( block_V_concat.get(), block_V_cache.get() + offset_v_cache, seq_len_kv_cache * head_size_vo ); - syclcompat::memcpy( + compat::memcpy( block_V_concat.get() + seq_len_kv_cache * head_size_vo, block_V.get() + offset_v, seq_len_kv * head_size_vo @@ -301,10 +302,10 @@ template struct ExampleRunner { seq_len_qo * seq_len_kv_total // batch_stride_S ); - syclcompat::wait(); + compat::wait(); std::vector host_S(block_S.size()); - syclcompat::memcpy(host_S.data(), block_S.get(), host_S.size()); + compat::memcpy(host_S.data(), block_S.get(), host_S.size()); // delete this memory as it is no longer needed block_S.reset(); @@ -371,7 +372,7 @@ template struct ExampleRunner { cutlass::DeviceAllocation block_P; block_P.reset(host_P.size()); - syclcompat::memcpy(block_P.get(), host_P.data(), host_P.size()); + compat::memcpy(block_P.get(), host_P.data(), host_P.size()); cutlass::TensorRef ref_P(block_P.get(), LayoutQ::packed({seq_len_qo, seq_len_kv_total})); @@ -389,12 +390,12 @@ template struct ExampleRunner { seq_len_qo * head_size_vo // batch_stride_O ); - syclcompat::wait(); + compat::wait(); // delete this memory as it is no longer needed block_P.reset(); std::vector vec_acc(block_acc.size()); - syclcompat::memcpy(vec_acc.data(), block_acc.get(), vec_acc.size()); + compat::memcpy(vec_acc.data(), block_acc.get(), vec_acc.size()); // delete this memory as it is no longer needed block_acc.reset(); @@ -402,7 +403,7 @@ template struct ExampleRunner { for(int i = 0; i < vec_out.size(); i++) { vec_out[i] = static_cast(vec_acc[i]); } - syclcompat::memcpy(block_ref_O.get() + offset_o, vec_out.data(), vec_out.size()); + compat::memcpy(block_ref_O.get() + offset_o, vec_out.data(), vec_out.size()); offset_q += seq_len_qo * head_size_qk; if(kv_group_update % q_group_size==0) { @@ -416,7 +417,7 @@ template struct ExampleRunner { } } - syclcompat::wait(); + compat::wait(); // Check 
if output from CUTLASS kernel and reference kernel are equal or not bool passed = cutlass::reference::device::BlockCompareRelativelyEqual(block_ref_O.get(), block_O.get(), @@ -560,10 +561,10 @@ template struct ExampleRunner { page_mapping[logical_idx] = physical_pages[blk]; } } - syclcompat::memcpy(paged_kv_cache.page_table.get(), page_mapping.data(), page_mapping.size() * sizeof(int)); + compat::memcpy(paged_kv_cache.page_table.get(), page_mapping.data(), page_mapping.size() * sizeof(int)); paged_kv_cache.num_pages_per_seq.reset(num_pages_per_seq.size()); - syclcompat::memcpy(paged_kv_cache.num_pages_per_seq.get(), num_pages_per_seq.data(), num_pages_per_seq.size() * sizeof(int)); + compat::memcpy(paged_kv_cache.num_pages_per_seq.get(), num_pages_per_seq.data(), num_pages_per_seq.size() * sizeof(int)); } initialize_block(block_Q, seed + 2023); @@ -608,25 +609,25 @@ template struct ExampleRunner { // configure smem size and carveout int smem_size = FMHAPrefillCachedKernel::SharedStorageSize; - const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z); - const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z); + const auto sycl_block = compat::dim3(block.x, block.y, block.z); + const auto sycl_grid = compat::dim3(grid.x, grid.y, grid.z); // Launch parameters depend on whether SYCL compiler supports work-group scratch memory extension #if !defined(SYCL_EXT_ONEAPI_WORK_GROUP_SCRATCH_MEMORY) - using namespace syclcompat::experimental; + using namespace compat::experimental; auto event = launch>( launch_policy{sycl_grid, sycl_block, local_mem_size{static_cast(smem_size)}, kernel_properties{sycl_exp::sub_group_size}}, params); #else - syclcompat::experimental::launch_properties launch_props { + compat::experimental::launch_properties launch_props { sycl::ext::oneapi::experimental::work_group_scratch_size(smem_size), }; - syclcompat::experimental::kernel_properties kernel_props{ + compat::experimental::kernel_properties kernel_props{ sycl::ext::oneapi::experimental::sub_group_size }; - syclcompat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props}; - auto event = syclcompat::experimental::launch>(policy, params); + compat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props}; + auto event = compat::experimental::launch>(policy, params); #endif EventManager::getInstance().addEvent(event); @@ -671,7 +672,7 @@ template struct ExampleRunner { // Run the Flash Attention implementation. run(params); - syclcompat::wait(); + compat::wait(); // Verify that the result is correct bool use_kv_cache = options.seq_len_kv_cache > 0; @@ -688,7 +689,7 @@ template struct ExampleRunner { for (int i = 0; i < options.iterations; ++i) { run(params); } - syclcompat::wait(); + compat::wait(); auto offset = cute::min(options.seq_len_qo, options.seq_len_kv); auto discard_seq_coord = options.seq_len_qo - offset; diff --git a/examples/06_bmg_flash_attention/bmg_flash_attn_prefill_runner.hpp b/examples/06_bmg_flash_attention/bmg_flash_attn_prefill_runner.hpp index 6d7b5e0401..58310eaa31 100644 --- a/examples/06_bmg_flash_attention/bmg_flash_attn_prefill_runner.hpp +++ b/examples/06_bmg_flash_attention/bmg_flash_attn_prefill_runner.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -173,7 +174,7 @@ template struct ExampleRunner { template void convert_fp8_to_fp16(const SrcT* d_src, DstT* d_dst, size_t size) { - syclcompat::get_default_queue().parallel_for(size, [=](auto indx) { + compat::get_default_queue().parallel_for(size, [=](auto indx) { d_dst[indx] = static_cast(d_src[indx]); }).wait(); } @@ -200,8 +201,8 @@ template struct ExampleRunner { if constexpr (isVarLen) { int max_seq_len_q = static_cast(get<3>(problem_size)); int max_seq_len_kv = static_cast(get<4>(problem_size)); - get<3>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_q, cumulative_seqlen_q.data()}; - get<4>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_kv, cumulative_seqlen_kv.data()}; + get<3>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_q, 0, cumulative_seqlen_q.data()}; + get<4>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_kv, 0, cumulative_seqlen_kv.data()}; } auto [batch, num_heads_q, num_heads_kv, head_size_qk, head_size_vo] = cute::select<0,1,2,5,6>(problem_size); @@ -248,10 +249,10 @@ template struct ExampleRunner { seq_len_qo * seq_len_kv // batch_stride_S ); - syclcompat::wait(); + compat::wait(); std::vector host_S(block_S.size()); - syclcompat::memcpy(host_S.data(), block_S.get(), host_S.size()); + compat::memcpy(host_S.data(), block_S.get(), host_S.size()); // delete this memory as it is no longer needed block_S.reset(); @@ -317,7 +318,7 @@ template struct ExampleRunner { cutlass::DeviceAllocation block_P; block_P.reset(host_P.size()); - syclcompat::memcpy(block_P.get(), host_P.data(), host_P.size()); + compat::memcpy(block_P.get(), host_P.data(), host_P.size()); cutlass::TensorRef ref_P(block_P.get(), LayoutQ::packed({seq_len_qo, seq_len_kv})); @@ -335,12 +336,12 @@ template struct ExampleRunner { seq_len_qo * head_size_vo // batch_stride_O ); - syclcompat::wait(); + compat::wait(); // delete this memory as it is no longer needed block_P.reset(); std::vector vec_acc(block_acc.size()); - syclcompat::memcpy(vec_acc.data(), block_acc.get(), vec_acc.size()); + compat::memcpy(vec_acc.data(), block_acc.get(), vec_acc.size()); // delete this memory as it is no longer needed block_acc.reset(); @@ -348,7 +349,7 @@ template struct ExampleRunner { for(int i = 0; i < vec_out.size(); i++) { vec_out[i] = static_cast(vec_acc[i]); } - syclcompat::memcpy(block_ref_O.get() + offset_o, vec_out.data(), vec_out.size()); + compat::memcpy(block_ref_O.get() + offset_o, vec_out.data(), vec_out.size()); offset_q += seq_len_qo * head_size_qk; if(kv_group_update % q_group_size==0) { @@ -360,7 +361,7 @@ template struct ExampleRunner { } } - syclcompat::wait(); + compat::wait(); // Check if output from CUTLASS kernel and reference kernel are equal or not bool passed = cutlass::reference::device::BlockCompareRelativelyEqual(block_ref_O.get(), block_O.get(), @@ -497,25 +498,25 @@ template struct ExampleRunner { // configure smem size and carveout int smem_size = FMHAPrefillKernel::SharedStorageSize; - const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z); - const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z); + const auto sycl_block = compat::dim3(block.x, block.y, block.z); + const auto sycl_grid = compat::dim3(grid.x, grid.y, grid.z); // Launch parameters depend on whether SYCL compiler supports work-group scratch memory extension #if 
!defined(SYCL_EXT_ONEAPI_WORK_GROUP_SCRATCH_MEMORY) - using namespace syclcompat::experimental; + using namespace compat::experimental; auto event = launch>( launch_policy{sycl_grid, sycl_block, local_mem_size{static_cast(smem_size)}, kernel_properties{sycl_exp::sub_group_size}}, params); #else - syclcompat::experimental::launch_properties launch_props { + compat::experimental::launch_properties launch_props { sycl::ext::oneapi::experimental::work_group_scratch_size(smem_size), }; - syclcompat::experimental::kernel_properties kernel_props{ + compat::experimental::kernel_properties kernel_props{ sycl::ext::oneapi::experimental::sub_group_size }; - syclcompat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props}; - auto event = syclcompat::experimental::launch>(policy, params); + compat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props}; + auto event = compat::experimental::launch>(policy, params); #endif EventManager::getInstance().addEvent(event); @@ -553,7 +554,7 @@ template struct ExampleRunner { // Run the GEMM run(params); - syclcompat::wait(); + compat::wait(); // Verify that the result is correct bool passed = verify(problem_size, options.is_causal); @@ -569,7 +570,7 @@ template struct ExampleRunner { for (int i = 0; i < options.iterations; ++i) { run(params); } - syclcompat::wait(); + compat::wait(); // when seq_len_qo is not equal to seq_len_kv we use bottom up approach for the masking. // Following changes will adjust the effective_seq_len_kv when masking applied for such cases auto offset = cute::min(options.seq_len_qo, options.seq_len_kv); diff --git a/examples/06_bmg_flash_attention/bmg_flash_chunk_prefill_runner.hpp b/examples/06_bmg_flash_attention/bmg_flash_chunk_prefill_runner.hpp new file mode 100644 index 0000000000..e00dde66fc --- /dev/null +++ b/examples/06_bmg_flash_attention/bmg_flash_chunk_prefill_runner.hpp @@ -0,0 +1,957 @@ +/*************************************************************************************************** + * Copyright (C) 2025 Intel Corporation, All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#pragma once + +#include "cutlass/epilogue/collective/default_epilogue.hpp" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "flash_attention_v2/collective/fmha_fusion.hpp" +#include "flash_attention_v2/kernel/tile_scheduler_chunk_prefill.hpp" +#include "cutlass/util/packed_stride.hpp" +#include "flash_attention_v2/kernel/xe_chunk_prefill.hpp" +#include "flash_attention_v2/collective/xe_flash_attn_chunk_prefill_epilogue.hpp" +#include "flash_attention_v2/collective/xe_flash_attn_chunk_prefill_softmax_epilogue.hpp" +#include "cutlass/util/GPU_Clock.hpp" +#include "cutlass/util/sycl_event_manager.hpp" +#include "cutlass/fp8_to_fp16.h" + +#include +#include + +#include "helper.h" +#include "cutlass/util/command_line.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/util/reference/device/gemm_complex.h" +#include "cutlass/util/reference/device/tensor_compare.h" +#include "sycl_common.hpp" + +using namespace cute; + +// Helper to check for FP8 types +template +constexpr bool is_fp8_v = std::is_same_v || std::is_same_v; + +// Command line options parsing +struct Options { + + bool help; + bool error; + bool is_causal; + bool is_local_mask; + bool varlen = false; + bool use_paged_kv = false; + std::string scheduler; + + int batch, num_heads_q, num_heads_kv, seq_len_qo, seq_len_kv, seq_len_kv_cache, page_size, head_size_qk, head_size_vo, iterations, window_left, window_right; + float softmax_scale; + + Options() + : help(false), error(false), is_causal(false), is_local_mask(false), varlen(false), use_paged_kv(false), batch(32), num_heads_q(16), num_heads_kv(16), seq_len_qo(512), head_size_qk(128), + seq_len_kv(512), seq_len_kv_cache(512), page_size(128), head_size_vo(128), iterations(100), window_left(-1), window_right(-1), softmax_scale(1.f), scheduler("Individual") {} + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + return; + } + + if (cmd.check_cmd_line_flag("is_causal")) { + is_causal = true; + } + + if (cmd.check_cmd_line_flag("varlen")) { + varlen = true; + } + + cmd.get_cmd_line_argument("scheduler", scheduler, std::string("Individual")); + + cmd.get_cmd_line_argument("batch", batch, 32); + cmd.get_cmd_line_argument("num_heads_q", num_heads_q, 16); + cmd.get_cmd_line_argument("num_heads_kv", num_heads_kv, num_heads_q); + cmd.get_cmd_line_argument("seq_len_qo", seq_len_qo, 512); + cmd.get_cmd_line_argument("seq_len_kv", seq_len_kv, seq_len_qo); + cmd.get_cmd_line_argument("seq_len_kv_cache", seq_len_kv_cache, 512); + cmd.get_cmd_line_argument("head_size_vo", head_size_vo, HEAD_DIM); + cmd.get_cmd_line_argument("head_size_qk", head_size_qk, head_size_vo); + cmd.get_cmd_line_argument("window_left", window_left, -1); + cmd.get_cmd_line_argument("window_right", window_right, -1); + 
cmd.get_cmd_line_argument("iterations", iterations, 100); + + if (cmd.check_cmd_line_flag("use_paged_kv")) { + use_paged_kv = true; + cmd.get_cmd_line_argument("page_size", page_size, 128); + seq_len_kv = 0; // seq_len_kv is not used when use paged kv + if (page_size % 128 != 0) { + std::cerr << "Invalid: page_size must be a multiple of 128" << std::endl; + return; + } + if (seq_len_kv_cache % page_size != 0) { + std::cerr << "Invalid: seq_len_kv_cache must be divisible by page_size" << std::endl; + return; + } + } + if (window_left > -1 && window_right > -1) { + is_local_mask = true; + } + softmax_scale = 1 / sqrt(static_cast(head_size_qk)); + } + + /// Prints the usage statement. + std::ostream &print_usage(std::ostream &out) const { + + out << "BMG Flash Attention v2 Example\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement\n\n" + << " --is_causal Apply Causal Mask to the output of first Matmul\n" + << " --window_left= Set the left borders of the window, If set to -1, calculate all seq_len\n" + << " --window_right= Set the left borders of the window, If set to -1, calculate all seq_len\n" + << " --varlen Enable variable sequence length\n" + << " --scheduler=\"Value\" Choose between Individual or Persistent Scheduler\n" + << " --batch= Sets the Batch Size of the Multi-Head Self Attention module\n" + << " --num_heads_q= Sets the Number of Attention Heads for Key-Value pair the Multi-Head Self Attention module\n" + << " --num_heads_kv= Sets the Number of Attention Heads for Query input in the Multi-Head Self Attention module\n" + << " --seq_len_qo= Sets the Sequence length of the Query input in Multi-Head Self Attention module\n" + << " --seq_len_kv= Sets the Sequence length of the Key-Value pair in Multi-Head Self Attention module\n" + << " --seq_len_kv_cache= Sets the Sequence length of the cached Key-Value pair in Multi-Head Self Attention module\n" + << " --use_paged_kv Use paged (non-contiguous) KV cache. Default is contiguous KV Cache\n" + << " --page_size= Block size for paged KV cache. Default is 128\n" + << " --head_size_qk= Sets the Attention Head dimension of the 1st Matrix Multiplication in Multi-Head Self Attention module\n" + << " --head_size_vo= Sets the Attention Head dimension of the 2nd Matrix Multiplication in Multi-Head Self Attention module\n" + << " --iterations= Iterations\n\n"; + + return out; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Flash Attention takes 3 input matrices: (K)eys, (Q)ueries and (V)alues. 
+using LayoutQ = cutlass::layout::RowMajor; +using LayoutK = cutlass::layout::ColumnMajor; +using LayoutV = cutlass::layout::RowMajor; +using LayoutO = cutlass::layout::RowMajor; + +template struct ExampleRunner { + + using StrideQ = typename FMHAChunkPrefillKernel::StrideQ; + using StrideK = typename FMHAChunkPrefillKernel::StrideK; + using StrideV = typename FMHAChunkPrefillKernel::StrideV; + using StrideO = typename FMHAChunkPrefillKernel::StrideO; + + using ElementQ = typename FMHAChunkPrefillKernel::ElementQ; + using ElementK = typename FMHAChunkPrefillKernel::ElementK; + using ElementV = typename FMHAChunkPrefillKernel::ElementV; + using ElementAcc = typename FMHAChunkPrefillKernel::ElementAccumulator; + + using CollectiveEpilogue = typename FMHAChunkPrefillKernel::CollectiveEpilogue; + using ElementOutput = typename CollectiveEpilogue::ElementOutput; + using ElementCompute = typename CollectiveEpilogue::ElementCompute; + using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator; + + using ProblemShapeType = typename FMHAChunkPrefillKernel::ProblemShape; + + // + // Data members + // + + /// Initialization + StrideQ stride_Q; + StrideK stride_K; + StrideV stride_V; + StrideK stride_K_cache; + StrideV stride_V_cache; + StrideO stride_O; + uint64_t seed = 0; + + cutlass::DeviceAllocation block_Q; + cutlass::DeviceAllocation block_K; + cutlass::DeviceAllocation block_V; + cutlass::DeviceAllocation block_K_cache; + cutlass::DeviceAllocation block_V_cache; + cutlass::DeviceAllocation block_O; + cutlass::DeviceAllocation block_ref_O; + + std::vector cumulative_seqlen_q; + std::vector cumulative_seqlen_kv; + std::vector cumulative_seqlen_kv_cache; + cutlass::DeviceAllocation device_cumulative_seqlen_q; + cutlass::DeviceAllocation device_cumulative_seqlen_kv; + cutlass::DeviceAllocation device_cumulative_seqlen_kv_cache; + + struct PagedKVParams { + cutlass::DeviceAllocation page_table; + int page_size = 0; + cutlass::DeviceAllocation num_pages_per_seq; + }; + PagedKVParams paged_kv_cache; + + // + // Methods + // + +template +void run_conversion_kernel(SrcType* src_ptr_in, DstType* dst_ptr_in, int64_t num_elements, float scale) { + sycl::queue queue = compat::get_default_queue(); + int64_t num_threads = 256; + int64_t num_blocks = ceil_div(num_elements, num_threads); + + queue.submit([&](sycl::handler& cgh) { + SrcType* src_ptr = src_ptr_in; + DstType* dst_ptr = dst_ptr_in; + cgh.parallel_for(sycl::nd_range<1>(num_blocks * num_threads, num_threads), [=](sycl::nd_item<1> item) { + int64_t idx = item.get_global_id(0); + if (idx < num_elements) { + auto src_tensor = make_tensor(src_ptr + idx, make_shape(1)); + auto dst_tensor = make_tensor(dst_ptr + idx, make_shape(1)); + convert_and_descale(src_tensor, dst_tensor, scale); + } + }); + }); +} + +bool verify(ProblemShapeType problem_size, Options options, const float* q_scale, const float* k_scale, const float* v_scale) { + std::vector host_O(block_ref_O.size()); + + float h_scale_q = 1.0f; + float h_scale_k = 1.0f; + float h_scale_v = 1.0f; + + if (q_scale != nullptr) { + compat::memcpy(&h_scale_q, q_scale, 1); + } + if (k_scale != nullptr) { + compat::memcpy(&h_scale_k, k_scale, 1); + } + if (v_scale != nullptr) { + compat::memcpy(&h_scale_v, v_scale, 1); + } + compat::wait(); + + auto [batch, num_heads_q, num_heads_kv, head_size_qk, head_size_vo] = cute::select<0,1,2,6,7>(problem_size); + int seq_len_qo, seq_len_kv, seq_len_kv_cache; + + int offset_q = 0; + int offset_k = 0; + int offset_v = 0; + int offset_k_cache = 0; + int 
offset_v_cache = 0; + int offset_o = 0; + + using namespace cutlass; + using RefElement = half_t; + DeviceAllocation block_Q_ref, block_K_ref, block_V_ref; + + // loop over the batch dimension to compute the output + // to avoid the risk of running out of device memory + int q_group_size = num_heads_q / num_heads_kv; + for (int b = 0; b < batch; b++) { + if constexpr (isVarLen) { + auto logical_problem_shape = cutlass::fmha::collective::apply_variable_length(problem_size, b); + seq_len_qo = get<3>(logical_problem_shape); + seq_len_kv = get<4>(logical_problem_shape); + seq_len_kv_cache = get<5>(logical_problem_shape); + } else { + seq_len_qo = get<3>(problem_size); + seq_len_kv = get<4>(problem_size); + seq_len_kv_cache = get<5>(problem_size); + } + + ElementQ* q_ptr_orig = block_Q.get() + offset_q; + ElementK* k_ptr_orig; + ElementV* v_ptr_orig; + + void* q_ptr = q_ptr_orig; + void* k_ptr; + void* v_ptr; + + int seq_len_kv_total = seq_len_kv_cache + seq_len_kv; + cutlass::DeviceAllocation block_K_concat; + cutlass::DeviceAllocation block_V_concat; + + if (seq_len_kv_cache > 0) { // use_kv_cache + if (options.use_paged_kv) { + int num_pages = paged_kv_cache.page_table.size(); + std::vector host_page_table(paged_kv_cache.page_table.size()); + std::vector host_num_pages_per_seq(paged_kv_cache.num_pages_per_seq.size()); + compat::memcpy(host_page_table.data(), paged_kv_cache.page_table.get(), paged_kv_cache.page_table.size()); + compat::memcpy(host_num_pages_per_seq.data(), paged_kv_cache.num_pages_per_seq.get(), paged_kv_cache.num_pages_per_seq.size()); + + int curr_batch_pages = isVarLen ? host_num_pages_per_seq[b + 1] - host_num_pages_per_seq[b] : ceil_div(seq_len_kv_cache, paged_kv_cache.page_size); + int batch_offset = isVarLen ? host_num_pages_per_seq[b] : b * curr_batch_pages; + block_K_concat.reset((seq_len_kv + curr_batch_pages * paged_kv_cache.page_size) * num_heads_kv * head_size_qk); + block_V_concat.reset((seq_len_kv + curr_batch_pages * paged_kv_cache.page_size) * num_heads_kv * head_size_vo); + + for (int p = 0; p < curr_batch_pages; p++) { + int page_idx = host_page_table[batch_offset + p]; + // copy the page from KV cache to the concatenated buffer + compat::memcpy( + block_K_concat.get() + p * paged_kv_cache.page_size * num_heads_kv * head_size_qk, + block_K_cache.get() + page_idx * paged_kv_cache.page_size * num_heads_kv * head_size_qk, + paged_kv_cache.page_size * num_heads_kv * head_size_qk + ); + compat::memcpy( + block_V_concat.get() + p * paged_kv_cache.page_size * num_heads_kv * head_size_vo, + block_V_cache.get() + page_idx * paged_kv_cache.page_size * num_heads_kv * head_size_vo, + paged_kv_cache.page_size * num_heads_kv * head_size_vo + ); + } + if (seq_len_kv > 0) { + compat::memcpy( + // block_K_concat.get() + curr_batch_pages * paged_kv_cache.page_sze * num_heads_kv *head_size_qk, + block_K_concat.get() + seq_len_kv_cache * num_heads_kv * head_size_qk, + block_K.get() + offset_k, + seq_len_kv * num_heads_kv * head_size_qk + ); + compat::memcpy( + block_V_concat.get() + seq_len_kv_cache * num_heads_kv * head_size_vo, + block_V.get() + offset_v, + seq_len_kv * num_heads_kv * head_size_vo + ); + } + compat::wait(); + } else { + block_K_concat.reset(seq_len_kv_total * num_heads_kv * head_size_qk); + block_V_concat.reset(seq_len_kv_total * num_heads_kv * head_size_vo); + // Concatenate K_cache and K + compat::memcpy( + block_K_concat.get(), + block_K_cache.get() + offset_k_cache, + seq_len_kv_cache * num_heads_kv * head_size_qk + ); + compat::memcpy( + 
block_K_concat.get() + seq_len_kv_cache * num_heads_kv * head_size_qk, + block_K.get() + offset_k, + seq_len_kv * num_heads_kv * head_size_qk + ); + // Concatenate V_cache and V + compat::memcpy( + block_V_concat.get(), + block_V_cache.get() + offset_v_cache, + seq_len_kv_cache * num_heads_kv * head_size_vo + ); + compat::memcpy( + block_V_concat.get() + seq_len_kv_cache * num_heads_kv * head_size_vo, + block_V.get() + offset_v, + seq_len_kv * num_heads_kv * head_size_vo + ); + // compat::wait(); + } + k_ptr_orig = block_K_concat.get(); + v_ptr_orig = block_V_concat.get(); + } else { + k_ptr_orig = block_K.get() + offset_k; + v_ptr_orig = block_V.get() + offset_v; + } + + k_ptr = k_ptr_orig; + v_ptr = v_ptr_orig; + + if constexpr (is_fp8_v) { + block_Q_ref.reset(seq_len_qo * num_heads_q * head_size_qk); + run_conversion_kernel( + q_ptr_orig, block_Q_ref.get(), block_Q_ref.size(), h_scale_q); + q_ptr = block_Q_ref.get(); + } + if constexpr (is_fp8_v) { + block_K_ref.reset(seq_len_kv_total * num_heads_kv * head_size_qk); + run_conversion_kernel( + k_ptr_orig, block_K_ref.get(), block_K_ref.size(), h_scale_k); + k_ptr = block_K_ref.get(); + } + if constexpr (is_fp8_v) { + block_V_ref.reset(seq_len_kv_total * num_heads_kv * head_size_vo); + run_conversion_kernel( + v_ptr_orig, block_V_ref.get(), block_V_ref.size(), h_scale_v); + v_ptr = block_V_ref.get(); + } + compat::wait(); + + for (int q_group = 0; q_group < num_heads_q / q_group_size; q_group++) { + for (int q_head = 0; q_head < q_group_size; q_head++) { + cutlass::DeviceAllocation block_S; + block_S.reset(seq_len_qo * seq_len_kv_total); + + int head_offset_q = (q_group * q_group_size + q_head) * head_size_qk; + int head_offset_k = q_group * head_size_qk; + int head_offset_v = q_group * head_size_vo; + + cutlass::TensorRef ref_Q_head(reinterpret_cast(q_ptr) + head_offset_q, LayoutQ(num_heads_q * head_size_qk)); + cutlass::TensorRef ref_K_head(reinterpret_cast(k_ptr) + head_offset_k, LayoutK(num_heads_kv * head_size_qk)); + cutlass::TensorRef ref_V_head(reinterpret_cast(v_ptr) + head_offset_v, LayoutV(num_heads_kv * head_size_vo)); + cutlass::TensorRef ref_S(block_S.get(), LayoutQ::packed({seq_len_qo, seq_len_kv_total})); + + cutlass::reference::device::GemmComplex( + {seq_len_qo, seq_len_kv_total, head_size_qk}, + ElementAccumulator{1}, + ref_Q_head, + cutlass::ComplexTransform::kNone, + ref_K_head, + cutlass::ComplexTransform::kNone, + ElementAccumulator{0}, + ref_S, + ref_S + ); + compat::wait(); + + std::vector host_S(block_S.size()); + compat::memcpy(host_S.data(), block_S.get(), host_S.size()); + + // delete this memory as it is no longer needed + block_S.reset(); + auto offset = cute::min(seq_len_qo, seq_len_kv); + auto discard_seq_coord = seq_len_qo - offset; + auto full_tile_offset = seq_len_kv - offset; + // apply mask to S + for (int row = 0; row < seq_len_qo; row++) { + for (int col = 0; col < seq_len_kv_total; col++) { + // causal mask + if (options.is_causal && (col - full_tile_offset > row + seq_len_kv_cache - discard_seq_coord)) { + host_S[col + row * seq_len_kv_total] = ElementAccumulator{-INFINITY}; + } + // sliding window mask + int col_ref = seq_len_kv_cache + seq_len_kv - seq_len_qo; + bool left_mask = col < cute::max(0, col_ref + row - options.window_left); + bool right_mask = col > cute::min(seq_len_kv_total, col_ref + row + options.window_right); + if (options.is_local_mask && (left_mask || right_mask)) { + host_S[col + row * seq_len_kv_total] = ElementAccumulator{-INFINITY}; + } + } + } + + // compute max element 
per row of S + std::vector max_vec(seq_len_qo, ElementAccumulator{-INFINITY}); + for (int row = 0; row < seq_len_qo; row++) { + int idx = row * seq_len_kv_total; + int max_idx = row; + max_vec[max_idx] = host_S[idx++]; + for (int col = 1; col < seq_len_kv_total; col++, idx++) { + if (max_vec[max_idx] < host_S[idx]) + max_vec[max_idx] = host_S[idx]; + } + } + // compute exp of S + for (int row = 0; row < seq_len_qo; row++) { + int idx = row * seq_len_kv_total; + int max_idx = row; + for (int col = 0; col < seq_len_kv_total; col++, idx++) { + host_S[idx] = expf((host_S[idx] - max_vec[max_idx]) / options.softmax_scale); + } + } + + // compute sum per row of S + std::vector sum_vec(seq_len_qo, ElementAccumulator{0}); + for (int row = 0; row < seq_len_qo; row++) { + int idx = row * seq_len_kv_total; + int sum_idx = row; + for (int col = 0; col < seq_len_kv_total; col++, idx++) { + sum_vec[sum_idx] += host_S[idx]; + } + + // scale each row with the sum to compute softmax + idx = row * seq_len_kv_total; + sum_idx = row; + int col_ref = seq_len_kv_cache + seq_len_kv - seq_len_qo; + for (int col = 0; col < seq_len_kv_total; col++, idx++) { + if (options.is_causal && row < discard_seq_coord) { + host_S[idx] = 0; + } else if (options.is_local_mask && (col < cute::max(0, col_ref + row - options.window_left) + || col > cute::min(seq_len_kv_total, col_ref + row + options.window_right))) { + host_S[idx] = 0; + } else { + host_S[idx] /= sum_vec[sum_idx]; + } + } + } + std::vector host_P(host_S.size()); + for (int p = 0; p < host_P.size(); p++) + host_P[p] = static_cast(host_S[p]); + + cutlass::DeviceAllocation block_P; + block_P.reset(host_P.size()); + + compat::memcpy(block_P.get(), host_P.data(), host_P.size()); + + cutlass::TensorRef ref_P(block_P.get(), LayoutQ::packed({seq_len_qo, seq_len_kv_total})); + + cutlass::DeviceAllocation block_acc; + block_acc.reset(seq_len_qo * head_size_vo); + cutlass::TensorRef ref_acc(block_acc.get(), LayoutO::packed({seq_len_qo, head_size_vo})); + + cutlass::reference::device::GemmComplex( + {seq_len_qo, head_size_vo, seq_len_kv_total}, + ElementAccumulator{1}, + ref_P, + cutlass::ComplexTransform::kNone, + ref_V_head, + cutlass::ComplexTransform::kNone, + ElementAccumulator{0}, + ref_acc, + ref_acc + ); + + compat::wait(); + // delete this memory as it is no longer needed + block_P.reset(); + + std::vector vec_acc(block_acc.size()); + compat::memcpy(vec_acc.data(), block_acc.get(), vec_acc.size()); + + // delete this memory as it is no longer needed + block_acc.reset(); + for (int seq = 0; seq < seq_len_qo; seq++) { + for (int hvo = 0; hvo < head_size_vo; hvo++) { + int idx = offset_o + seq * num_heads_q * head_size_vo + (q_group * q_group_size + q_head) * head_size_vo + hvo; + host_O[idx] = static_cast(vec_acc[seq * head_size_vo + hvo]); + } + } + } // end of q_group loop + } // end of q_head loop + offset_q += seq_len_qo * num_heads_q * head_size_qk; + offset_k += seq_len_kv * num_heads_kv * head_size_qk; + offset_v += seq_len_kv * num_heads_kv * head_size_vo; + offset_k_cache += seq_len_kv_cache * num_heads_kv * head_size_qk; + offset_v_cache += seq_len_kv_cache * num_heads_kv * head_size_vo; + offset_o += seq_len_qo * num_heads_q * head_size_vo; + } // end of batch loop + + compat::wait(); + compat::memcpy(block_ref_O.get(), host_O.data(), host_O.size()); + + // Check if output from CUTLASS kernel and reference kernel are equal or not + bool passed = cutlass::reference::device::BlockCompareRelativelyEqual(block_ref_O.get(), block_O.get(), + block_O.size(), 
ElementOutput{0.1}, ElementOutput{0.1}); + + return passed; + } + + template + auto initialize_varlen(const ProblemShape& problem_size) { + int num_batches = get<0>(problem_size); + int seq_len_kv_cache = get<5>(problem_size); + + // generate Q as --b times + // gaussian (--Q, --Q / 2) sampled positive + // track cumulative + std::mt19937 rng(0x202305151552ull); + std::normal_distribution dist_q(get<3>(problem_size), get<3>(problem_size) / 2); + std::normal_distribution dist_kv(get<4>(problem_size), get<4>(problem_size) / 2); + std::normal_distribution dist_kv_cache(get<5>(problem_size), get<5>(problem_size) / 2); + + // Use Cacheline Size to calculate alignment + constexpr int cacheline_bytes = 64; + constexpr int AlignmentQ = cacheline_bytes / sizeof(ElementQ); // Alignment of Q matrix in units of elements + constexpr int AlignmentKV = cacheline_bytes / sizeof(ElementK); // Alignment of Kand V matrix in units of elements + + auto generate_positive_int = [](auto& dist, auto& gen) { + int result = 0; + do { + result = static_cast(dist(gen)); + } while (result <= 0); + return result; + }; + + cumulative_seqlen_q = {0}; + cumulative_seqlen_kv = {0}; + cumulative_seqlen_kv_cache = {0}; + + int total_seqlen_q = 0; + int total_seqlen_kv = 0; + int total_seqlen_kv_cache = 0; + int max_seqlen_q = 0; + int max_seqlen_kv = 0; + int max_seqlen_kv_cache = 0; + + for (int i = 0; i < num_batches; i++) { + int seqlen_q = cutlass::round_up(generate_positive_int(dist_q, rng), AlignmentQ); + int seqlen_kv = cute::get<4>(problem_size) == 0 ? 0 : cutlass::round_up(generate_positive_int(dist_kv, rng), AlignmentKV); + int seqlen_kv_cache = cute::get<5>(problem_size) == 0 ? 0 : cutlass::round_up(generate_positive_int(dist_kv_cache, rng), AlignmentKV); + + total_seqlen_q += seqlen_q; + total_seqlen_kv += seqlen_kv; + total_seqlen_kv_cache += seqlen_kv_cache; + + max_seqlen_q = std::max(max_seqlen_q, seqlen_q); + max_seqlen_kv = std::max(max_seqlen_kv, seqlen_kv); + max_seqlen_kv_cache = std::max(max_seqlen_kv_cache, seqlen_kv_cache); + + cumulative_seqlen_q.push_back(cumulative_seqlen_q.back() + seqlen_q); + cumulative_seqlen_kv.push_back(cumulative_seqlen_kv.back() + seqlen_kv); + cumulative_seqlen_kv_cache.push_back(cumulative_seqlen_kv_cache.back() + seqlen_kv_cache); + } + + ProblemShape problem_size_for_init = problem_size; + get<0>(problem_size_for_init) = 1; + get<3>(problem_size_for_init) = total_seqlen_q; + get<4>(problem_size_for_init) = total_seqlen_kv; + get<5>(problem_size_for_init) = total_seqlen_kv_cache; + + ProblemShapeType problem_size_for_launch; + + get<3>(problem_size_for_launch) = cutlass::fmha::collective::VariableLength{max_seqlen_q, total_seqlen_q}; + get<4>(problem_size_for_launch) = cutlass::fmha::collective::VariableLength{max_seqlen_kv, total_seqlen_kv}; + get<5>(problem_size_for_launch) = cutlass::fmha::collective::VariableLength{max_seqlen_kv_cache, total_seqlen_kv_cache}; + get<6>(problem_size_for_launch) = get<6>(problem_size); + get<7>(problem_size_for_launch) = get<7>(problem_size); + get<0>(problem_size_for_launch) = get<0>(problem_size); + get<1>(problem_size_for_launch) = get<1>(problem_size); + get<2>(problem_size_for_launch) = get<2>(problem_size); + + + return cute::make_tuple(problem_size_for_init, problem_size_for_launch); + } + + /// Initialize operands to be used in the GEMM and reference GEMM + ProblemShapeType initialize(const Options &options) { + auto problem_shape_in = + cute::make_tuple(options.batch, options.num_heads_q, options.num_heads_kv, 
options.seq_len_qo, options.seq_len_kv, options.seq_len_kv_cache, options.head_size_qk, options.head_size_vo); + + ProblemShapeType problem_shape; + decltype(problem_shape_in) problem_size; + + if constexpr (isVarLen) { + auto [problem_shape_init, problem_shape_launch] = initialize_varlen(problem_shape_in); + problem_shape = problem_shape_launch; + problem_size = problem_shape_init; + } + else { + problem_size = problem_shape_in; + problem_shape = problem_shape_in; + } + + auto [batch, num_heads_q, num_heads_kv, seq_len_qo, seq_len_kv, seq_len_kv_cache, head_size_qk, head_size_vo] = problem_size; + + stride_Q = cutlass::make_cute_packed_stride(StrideQ{}, cute::make_shape(seq_len_qo, num_heads_q * head_size_qk, batch)); + stride_K = cutlass::make_cute_packed_stride(StrideK{}, cute::make_shape(seq_len_kv, num_heads_kv * head_size_qk, batch)); + stride_V = cutlass::make_cute_packed_stride(StrideV{}, cute::make_shape(head_size_vo * num_heads_kv, seq_len_kv, batch)); + + stride_K_cache = cutlass::make_cute_packed_stride(StrideK{}, cute::make_shape(seq_len_kv_cache, num_heads_kv * head_size_qk, batch)); + stride_V_cache = cutlass::make_cute_packed_stride(StrideV{}, cute::make_shape(head_size_vo * num_heads_kv, seq_len_kv_cache, batch)); + stride_O = cutlass::make_cute_packed_stride(StrideO{}, cute::make_shape(seq_len_qo, num_heads_q * head_size_vo, batch)); + + block_Q.reset(batch * num_heads_q * seq_len_qo * head_size_qk); + block_K.reset(batch * num_heads_kv * seq_len_kv * head_size_qk); + block_V.reset(batch * num_heads_kv * seq_len_kv * head_size_vo); + if (!options.use_paged_kv) { + block_K_cache.reset(batch * num_heads_kv * seq_len_kv_cache * head_size_qk); + block_V_cache.reset(batch * num_heads_kv * seq_len_kv_cache * head_size_vo); + } + block_O.reset(batch * num_heads_q * seq_len_qo * head_size_vo); + block_ref_O.reset(batch * num_heads_q * seq_len_qo * head_size_vo); + + if (options.use_paged_kv) { + paged_kv_cache.page_size = options.page_size; + std::vector num_pages_per_seq{0}; + int num_pages = 0; + for(int b = 0; b < cute::get<0>(problem_shape); b++) { + int seq_len_cache = isVarLen ? 
cumulative_seqlen_kv_cache[b + 1] - cumulative_seqlen_kv_cache[b] : seq_len_kv_cache; + int pages_per_seq = ceil_div(seq_len_cache, paged_kv_cache.page_size); + num_pages_per_seq.push_back(num_pages_per_seq.back() + pages_per_seq); + num_pages += pages_per_seq; + } + paged_kv_cache.page_table.reset(num_pages); + + + // initialize block table with random mapping for non-contiguous layout + std::vector page_mapping(num_pages); + for (int b = 0; b < cute::get<0>(problem_shape); ++b) { + std::vector physical_pages(num_pages_per_seq[b + 1] - num_pages_per_seq[b]); + std::iota(physical_pages.begin(), physical_pages.end(), 0); + // shuffle physical pages + std::shuffle(physical_pages.begin(), physical_pages.end(), std::mt19937{ std::random_device{}() }); + for (int blk = 0; blk < physical_pages.size(); ++blk) { + int logical_idx = num_pages_per_seq[b] + blk; + page_mapping[logical_idx] = physical_pages[blk]; + } + } + compat::memcpy(paged_kv_cache.page_table.get(), page_mapping.data(), page_mapping.size() * sizeof(int)); + + paged_kv_cache.num_pages_per_seq.reset(num_pages_per_seq.size()); + compat::memcpy(paged_kv_cache.num_pages_per_seq.get(), num_pages_per_seq.data(), num_pages_per_seq.size() * sizeof(int)); + + block_K_cache.reset(num_pages * paged_kv_cache.page_size * num_heads_kv * head_size_qk); + block_V_cache.reset(num_pages * paged_kv_cache.page_size * num_heads_kv * head_size_vo); + } + + initialize_block(block_Q, seed + 2023); + initialize_block(block_K, seed + 2022); + initialize_block(block_V, seed + 2021); + initialize_block(block_K_cache, seed + 2024); + initialize_block(block_V_cache, seed + 2025); + + if (!cumulative_seqlen_q.empty()) { + device_cumulative_seqlen_q.reset(cumulative_seqlen_q.size()); + device_cumulative_seqlen_q.copy_from_host( + cumulative_seqlen_q.data(), cumulative_seqlen_q.size()); + } + + if (!cumulative_seqlen_kv.empty()) { + device_cumulative_seqlen_kv.reset(cumulative_seqlen_kv.size()); + device_cumulative_seqlen_kv.copy_from_host( + cumulative_seqlen_kv.data(), cumulative_seqlen_kv.size()); + } + + if (!cumulative_seqlen_kv_cache.empty()) { + device_cumulative_seqlen_kv_cache.reset(cumulative_seqlen_kv_cache.size()); + device_cumulative_seqlen_kv_cache.copy_from_host( + cumulative_seqlen_kv_cache.data(), cumulative_seqlen_kv_cache.size()); + } + + if constexpr (isVarLen) { + get<3>(problem_shape).max_length = get<3>(problem_shape).max_length; + get<3>(problem_shape).total_length = get<3>(problem_shape).total_length; + get<3>(problem_shape).cumulative_length = device_cumulative_seqlen_q.get(); + + get<5>(problem_shape).max_length = get<5>(problem_shape).max_length; + get<5>(problem_shape).total_length = get<5>(problem_shape).total_length; + get<5>(problem_shape).cumulative_length = device_cumulative_seqlen_kv_cache.get(); + + get<4>(problem_shape).max_length = get<4>(problem_shape).max_length; + get<4>(problem_shape).total_length = get<4>(problem_shape).total_length; + get<4>(problem_shape).cumulative_length = device_cumulative_seqlen_kv.get(); + + } + + return problem_shape; + } + + // Note that the GemmUniversalAdapter currently doesn't support flash attention, which is why this + // secondary `run` function is required to launch the kernel. 
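A brief aside on the VariableLength fields populated just above: cumulative_length holds batch + 1 entries built as prefix sums of the per-batch lengths, so a batch's sequence length is recovered as the difference of adjacent entries (the reference check in this runner relies on the same convention). A minimal sketch with illustrative names:

// Sketch only: per-batch length from a cumulative-length array with batch + 1
// entries and cumulative_length[0] == 0, as built by initialize_varlen() above.
inline int seq_len_for_batch(const int* cumulative_length, int b) {
  return cumulative_length[b + 1] - cumulative_length[b];
}

The secondary launch helper mentioned in the note above comes next.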
+ static void run(typename FMHAChunkPrefillKernel::Params params) { + dim3 const block = FMHAChunkPrefillKernel::get_block_shape(); + dim3 const grid = FMHAChunkPrefillKernel::get_grid_shape(params); + + // configure smem size and carveout + int smem_size = FMHAChunkPrefillKernel::SharedStorageSize; + + const auto sycl_block = compat::dim3(block.x, block.y, block.z); + const auto sycl_grid = compat::dim3(grid.x, grid.y, grid.z); + +// Launch parameters depend on whether SYCL compiler supports work-group scratch memory extension +#if !defined(SYCL_EXT_ONEAPI_WORK_GROUP_SCRATCH_MEMORY) + using namespace compat::experimental; + auto event = launch>( + launch_policy{sycl_grid, sycl_block, local_mem_size{static_cast(smem_size)}, + kernel_properties{sycl_exp::sub_group_size}}, + params); +#else + compat::experimental::launch_properties launch_props { + sycl::ext::oneapi::experimental::work_group_scratch_size(smem_size), + }; + compat::experimental::kernel_properties kernel_props{ + sycl::ext::oneapi::experimental::sub_group_size + }; + compat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props}; + auto event = compat::experimental::launch>(policy, params); +#endif + + EventManager::getInstance().addEvent(event); + } + + cutlass::Status run( + const Options &options, + const cutlass::KernelHardwareInfo &hw_info, + const float* q_scale, + const float* k_scale, + const float* v_scale + ) { + + ProblemShapeType problem_size = initialize(options); + + typename FMHAChunkPrefillKernel::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + problem_size, + {block_Q.get(), stride_Q, + block_K.get(), stride_K, + block_V.get(), stride_V, + q_scale, + k_scale, + v_scale, + block_K_cache.get(), stride_K_cache, + block_V_cache.get(), stride_V_cache, + options.use_paged_kv ? paged_kv_cache.page_table.get() : nullptr, + options.use_paged_kv ? paged_kv_cache.page_size : 0, + options.use_paged_kv ? paged_kv_cache.num_pages_per_seq.get() : nullptr, + options.window_left, + options.window_right}, + {options.softmax_scale}, + {block_O.get(), stride_O}, + hw_info}; + + // Define device-global scratch memory + size_t workspace_size = FMHAChunkPrefillKernel::get_workspace_size(arguments); + cutlass::device_memory::allocation workspace(workspace_size); + + if (!FMHAChunkPrefillKernel::can_implement(arguments)) { + std::cout << "Invalid Problem Size: " << options.batch << 'x' << options.num_heads_q << 'x' << + options.seq_len_qo << 'x' << options.seq_len_kv << 'x' << options.head_size_qk << 'x' << options.head_size_vo + << (options.is_causal ? "xCausal" : "xNonCausal") << (options.is_local_mask ? "xLocalMask" : "xNonLocalMask") << std::endl; + return cutlass::Status::kErrorInvalidProblem; + } + + // Initialize the workspace + CUTLASS_CHECK(FMHAChunkPrefillKernel::initialize_workspace(arguments, workspace.get())); + + // Convert host-side arguments to device-side arguments to be passed to the kernel + auto params = FMHAChunkPrefillKernel::to_underlying_arguments(arguments, workspace.get()); + + // Run the Flash Attention implementation. + run(params); + + compat::wait(); + + // Verify that the result is correct + bool passed = true; //verify(problem_size, options, q_scale, k_scale, v_scale); + std::cout << "Disposition: " << (passed ? 
"Passed" : "Failed") << std::endl; + + if (!passed) { + return cutlass::Status::kErrorInternal; + } + + if (options.iterations > 0) { + GPU_Clock timer; + timer.start(); + for (int i = 0; i < options.iterations; ++i) { + run(params); + } + compat::wait(); + + auto offset = cute::min(options.seq_len_qo, options.seq_len_kv); + auto discard_seq_coord = options.seq_len_qo - offset; + auto full_tile_offset = options.seq_len_kv - offset; + // offset + 1 is going to be ceil_div + auto effective_seq_len_kv = options.seq_len_kv_cache + (options.is_causal ? full_tile_offset + ((offset + 1) / 2.0) : + options.is_local_mask ? (options.window_left + options.window_right) + : options.seq_len_kv); + auto effective_seq_len_qo = options.is_causal ? options.seq_len_qo - discard_seq_coord : options.seq_len_qo; + double cute_time = timer.seconds() / options.iterations; + double flops_qk = 2.0 * options.batch * options.num_heads_q * effective_seq_len_qo * effective_seq_len_kv * options.head_size_qk; + double flops_pv = 2.0 * options.batch * options.num_heads_q * effective_seq_len_qo * options.head_size_vo * effective_seq_len_kv; + double tflops = ((flops_qk + flops_pv) * 1e-12) / cute_time; + double gbps_qk = options.batch * (sizeof(ElementQ) * options.num_heads_q * effective_seq_len_qo * options.head_size_qk + + sizeof(ElementK) * options.num_heads_kv * effective_seq_len_kv * options.head_size_qk); + double gbps_pv = sizeof(ElementV) * options.batch * options.num_heads_kv * effective_seq_len_kv * options.head_size_vo + + sizeof(ElementOutput) * options.batch * options.num_heads_q * effective_seq_len_qo * options.head_size_vo; + double gbps = ((gbps_qk + gbps_pv) * 1e-9) / (cute_time); + std::cout << "Batch: " << options.batch << "\tNumHeads_q: " << options.num_heads_q << "\tNumHeads_kv: " << options.num_heads_kv << "\tSeq Length QO: " << options.seq_len_qo + << "\tSeq Length KV: " << options.seq_len_kv << "\tSeq Length KV Cache: " << options.seq_len_kv_cache + << "\tHead Size QK: " << options.head_size_qk << "\tHead Size VO: " << options.head_size_vo + << "\tCausal Mask: " << (options.is_causal ? "true" : "false") << "\tVariable Sequence Length: " << (options.varlen ? "true" : "false") + << "\t Scheduler: " << options.scheduler << "\t Paged KV cache: " << (options.use_paged_kv ? "true" : "false"); + printf("\nPerformance: %4.3f GB/s, %4.3f TFlop/s, %6.4f ms\n\n", gbps, tflops, cute_time * 1000); + } + + return cutlass::Status::kSuccess; + } +}; + +// the default value used for the case BF16 +template struct FMHAConfig { + + template + static int run(const Options &options, + const float* q_scale, + const float* k_scale, + const float* v_scale) { + // + // Run examples + // + + // The KernelHardwareInfo struct holds the number of EUs on the GPU with a given device ID. This + // information is used by the underlying kernel. 
+ cutlass::KernelHardwareInfo hw_info; + + using GEMMDispatchPolicy = cutlass::gemm::MainloopIntelXeXMX16; + using EpilogueDispatchPolicy = cutlass::epilogue::IntelXeXMX16; + using CollectiveEpilogue = cutlass::flash_attention::collective::FlashChunkPrefillEpilogue< + EpilogueDispatchPolicy, MMAOperation, TileShapeOutput, SubgroupLayout, ElementComputeEpilogue, ElementOutput, cutlass::gemm::TagToStrideC_t, ElementOutput, + GmemTiledCopyStore>; + using CollectiveSoftmaxEpilogue = cutlass::flash_attention::collective::FlashChunkPrefillSoftmaxEpilogue; + + using ProblemShapeRegular = cute::tuple; + using namespace cutlass::fmha::collective; + using ProblemShapeVarlen = cute::tuple; + using ProblemShapeType = std::conditional_t; + + // Mainloop + using CollectiveMainloop = cutlass::flash_attention::collective::FlashChunkPrefillMma< + GEMMDispatchPolicy, ProblemShapeType, ElementInputQ, cutlass::gemm::TagToStrideA_t, ElementInputKV, + cutlass::gemm::TagToStrideB_t, ElementInputKV, cutlass::gemm::TagToStrideB_t, MMAOperation, TileShapeQK, TileShapePV, SubgroupLayout, + GmemTiledCopyQ, // Q + GmemTiledCopyK, // K + GmemTiledCopyV, // V, + Causal, + LocalMask, + PagedKV>; + + using FMHAChunkPrefillKernel = cutlass::flash_attention::kernel::FMHAPrefillChunk; + + ExampleRunner runner; + + CUTLASS_CHECK(runner.run(options, hw_info, q_scale, k_scale, v_scale)); + // runner.run(options, hw_info, q_scale, k_scale, v_scale); + return 0; + } + + static int run(const Options &options, + const float* q_scale = nullptr, + const float* k_scale = nullptr, + const float* v_scale = nullptr) { + if (options.use_paged_kv && !options.varlen) { + return run(options, q_scale, k_scale, v_scale); + } else if(!options.use_paged_kv && options.varlen) { + return run(options, q_scale, k_scale, v_scale); + } else if(!options.use_paged_kv && !options.varlen) { + return run(options, q_scale, k_scale, v_scale); + } else { + return run(options, q_scale, k_scale, v_scale); + } + } +}; diff --git a/examples/07_bmg_dual_gemm/07_bmg_dual_gemm.cpp b/examples/07_bmg_dual_gemm/07_bmg_dual_gemm.cpp index 20f0741f47..8fd0a0e7bc 100644 --- a/examples/07_bmg_dual_gemm/07_bmg_dual_gemm.cpp +++ b/examples/07_bmg_dual_gemm/07_bmg_dual_gemm.cpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -237,7 +238,7 @@ struct ExampleRunner { M * N // batch_stride_D ); - syclcompat::wait(); + compat::wait(); for(int batch = 0, offset = 0; batch < L; batch++, offset += M * N) { auto D0_view = cutlass::TensorView(block_ref_D0.get() + offset, LayoutD::packed({M, N}), cutlass::make_Coord(M, N)); @@ -246,7 +247,7 @@ struct ExampleRunner { cutlass::reference::device::TensorPerRowBias(D0_view, bias0_view); } - syclcompat::wait(); + compat::wait(); auto D1_view = cutlass::TensorView(block_ref_D1.get() + offset, LayoutD::packed({M, N}), cutlass::make_Coord(M, N)); if constexpr (UseBias1) { @@ -254,13 +255,13 @@ struct ExampleRunner { cutlass::reference::device::TensorPerRowBias(D1_view, bias1_view); } - syclcompat::wait(); + compat::wait(); auto D2_view = cutlass::TensorView(block_ref_D2.get() + offset, LayoutD::packed({M, N}), cutlass::make_Coord(M, N)); cutlass::reference::device::TensorSiLu(D2_view, D0_view, D1_view); } - syclcompat::wait(); + compat::wait(); // Check if output from CUTLASS kernel and reference kernel are equal or not bool passed_D0 = WriteEpilogueOutput0 ? cutlass::reference::device::BlockCompareEqual( @@ -324,24 +325,24 @@ struct ExampleRunner { // configure smem size and carveout int smem_size = GemmKernel::SharedStorageSize; - const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z); - const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z); + const auto sycl_block = compat::dim3(block.x, block.y, block.z); + const auto sycl_grid = compat::dim3(grid.x, grid.y, grid.z); #if !defined(SYCL_EXT_ONEAPI_WORK_GROUP_SCRATCH_MEMORY) - using namespace syclcompat::experimental; + using namespace compat::experimental; auto event = launch>( launch_policy{sycl_grid, sycl_block, local_mem_size{static_cast(smem_size)}, kernel_properties{sycl_exp::sub_group_size}}, params); #else - syclcompat::experimental::launch_properties launch_props{ + compat::experimental::launch_properties launch_props{ sycl::ext::oneapi::experimental::work_group_scratch_size(smem_size) }; - syclcompat::experimental::kernel_properties kernel_props{ + compat::experimental::kernel_properties kernel_props{ sycl::ext::oneapi::experimental::sub_group_size }; - syclcompat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props}; - auto event = syclcompat::experimental::launch>(policy, params); + compat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props}; + auto event = compat::experimental::launch>(policy, params); #endif EventManager::getInstance().addEvent(event); @@ -422,7 +423,7 @@ struct ExampleRunner { // Run the GEMM CUTLASS_CHECK(run(params)); - syclcompat::wait(); + compat::wait(); // Verify that the result is correct bool passed = verify(problem_size, options.alpha0, options.alpha1, options.beta0, options.beta1); @@ -436,7 +437,7 @@ struct ExampleRunner { for (int i = 0; i < options.iterations; ++i) { run(params); } - syclcompat::wait(); + compat::wait(); float cute_time = timer.seconds() / options.iterations; double tflops = 2 * (2.0 * options.m * options.n * options.k * options.l) * 1e-12; diff --git a/examples/08_bmg_gemm_f8/08_bmg_gemm_f8.cpp b/examples/08_bmg_gemm_f8/08_bmg_gemm_f8.cpp index 59f5f3faa8..b6dfe09a3b 100644 --- a/examples/08_bmg_gemm_f8/08_bmg_gemm_f8.cpp +++ b/examples/08_bmg_gemm_f8/08_bmg_gemm_f8.cpp @@ -1,5 +1,6 @@ 
/*************************************************************************************************** * Copyright (c) 2025 - 2025 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -213,7 +214,7 @@ struct ExampleRunner { M * N, M * N ); - syclcompat::wait(); + compat::wait(); bool passed = cutlass::reference::device::BlockCompareEqual( block_ref_D.get(), block_D.get(), block_D.size()); @@ -275,7 +276,7 @@ struct ExampleRunner { // Run the GEMM CUTLASS_CHECK(gemm_op.run()); - syclcompat::wait(); + compat::wait(); // Verify that the result is correct bool passed = verify(problem_size, options.alpha, options.beta); @@ -289,7 +290,7 @@ struct ExampleRunner { for (int i = 0; i < options.iterations; ++i) { gemm_op.run(); } - syclcompat::wait(); + compat::wait(); float cute_time = timer.seconds() / options.iterations; double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; diff --git a/examples/08_bmg_gemm_f8/08_bmg_gemm_f8_scaling.cpp b/examples/08_bmg_gemm_f8/08_bmg_gemm_f8_scaling.cpp index 16699dcd5f..e5d16a508b 100644 --- a/examples/08_bmg_gemm_f8/08_bmg_gemm_f8_scaling.cpp +++ b/examples/08_bmg_gemm_f8/08_bmg_gemm_f8_scaling.cpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2025 - 2025 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -213,7 +214,7 @@ struct ExampleRunner { // template void convert_fp8_to_fp16(const SrcT* d_src, DstT* d_dst, size_t size) { - syclcompat::get_default_queue().parallel_for(size, [=](auto indx) { + compat::get_default_queue().parallel_for(size, [=](auto indx) { d_dst[indx] = static_cast(d_src[indx]); }).wait(); } @@ -428,7 +429,7 @@ struct ExampleRunner { // Run the GEMM CUTLASS_CHECK(gemm_op.run()); - syclcompat::wait(); + compat::wait(); // Verify that the result is correct bool passed = verify(options); @@ -442,7 +443,7 @@ struct ExampleRunner { for (int i = 0; i < options.iterations; ++i) { gemm_op.run(); } - syclcompat::wait(); + compat::wait(); float cute_time = timer.seconds() / options.iterations; double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; diff --git a/examples/09_bmg_grouped_gemm_f8/09_bmg_grouped_gemm_f8.cpp b/examples/09_bmg_grouped_gemm_f8/09_bmg_grouped_gemm_f8.cpp index 5f241aa568..0010a1d5a5 100644 --- a/examples/09_bmg_grouped_gemm_f8/09_bmg_grouped_gemm_f8.cpp +++ b/examples/09_bmg_grouped_gemm_f8/09_bmg_grouped_gemm_f8.cpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2025 - 2025 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -305,7 +306,7 @@ struct ExampleRunner { ); // Wait for kernel to finish - syclcompat::wait(); + compat::wait(); // Check if output from CUTLASS kernel and reference kernel are equal or not passed &= cutlass::reference::device::BlockCompareEqual(block_ref_D.get() + offset_D.at(i), block_D.get() + offset_D.at(i), M * N); @@ -513,7 +514,7 @@ void initialize(const Options &options) { // Run the GEMM CUTLASS_CHECK(gemm_op.run()); - syclcompat::wait(); + compat::wait(); // Verify that the result is correct bool passed = verify(options); @@ -527,7 +528,7 @@ void initialize(const Options &options) { for (int iter = 0; iter < options.iterations; ++iter) { CUTLASS_CHECK(gemm_op.run()); } - syclcompat::wait(); + compat::wait(); float cute_time = timer.seconds() * 1000; double cute_average_time = double(cute_time) / double(options.iterations); diff --git a/examples/10_bmg_grouped_gemm_mixed_dtype/bmg_grouped_gemm_mixed_dtype_runner.hpp b/examples/10_bmg_grouped_gemm_mixed_dtype/bmg_grouped_gemm_mixed_dtype_runner.hpp index 525fa8ae6d..27020418c0 100644 --- a/examples/10_bmg_grouped_gemm_mixed_dtype/bmg_grouped_gemm_mixed_dtype_runner.hpp +++ b/examples/10_bmg_grouped_gemm_mixed_dtype/bmg_grouped_gemm_mixed_dtype_runner.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2025 - 2025 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -315,9 +316,9 @@ struct ExampleRunner { SrcT* h_src = new SrcT[size * L]; ElementScale* scale_h = new ElementScale[L]; ElementZero* zero_h = new ElementZero[L]; - syclcompat::memcpy(h_src, d_src, size * L * sizeof(SrcT)); - syclcompat::memcpy(scale_h, scale, L * sizeof(ElementScale)); - syclcompat::memcpy(zero_h, zero, L * sizeof(ElementZero)); + compat::memcpy(h_src, d_src, size * L * sizeof(SrcT)); + compat::memcpy(scale_h, scale, L * sizeof(ElementScale)); + compat::memcpy(zero_h, zero, L * sizeof(ElementZero)); DstT* h_dst = new DstT[size * L]; for(size_t j = 0; j < L; ++j) { @@ -326,7 +327,7 @@ struct ExampleRunner { } } - syclcompat::memcpy(d_dst, h_dst, size * sizeof(DstT)); + compat::memcpy(d_dst, h_dst, size * sizeof(DstT)); } /// Populates a Gemm::Arguments structure from the given commandline options @@ -466,10 +467,10 @@ struct ExampleRunner { CUTLASS_CHECK(gemm_ref.initialize(arguments, workspace.get())); CUTLASS_CHECK(gemm_ref.run()); - syclcompat::wait(); + compat::wait(); // compare_reference passed |= cutlass::reference::device::BlockCompareRelativelyEqual(block_ref_D.get(), block_D.get() + offset_D[i], block_ref_D.size(), epsilon, non_zero_floor); - syclcompat::wait(); + compat::wait(); } return passed; @@ -615,7 +616,7 @@ struct ExampleRunner { std::vector zero(size(zero_layout) * sizeof_bits_v / 8, 0); cutlass::device_memory::copy_to_host(zero.data(), (uint8_t*)zero_buffer, zero.size()); - syclcompat::wait(); + compat::wait(); auto dst_tensor = make_tensor(make_gmem_ptr(reinterpret_cast(dst.data())), operand_layout); @@ -669,7 +670,7 @@ struct ExampleRunner { } cutlass::device_memory::copy_to_device(dq_buffer, (DequantizedElement*)(raw_pointer_cast(dst_tensor.data())), dst_tensor.size()); - syclcompat::wait(); + compat::wait(); } @@ -872,7 +873,7 @@ struct ExampleRunner { // Run the GEMM 
CUTLASS_CHECK(gemm_op.run()); - syclcompat::wait(); + compat::wait(); // Verify that the result is correct bool passed = verify(options); @@ -886,7 +887,7 @@ struct ExampleRunner { for (int i = 0; i < options.iterations; ++i) { gemm_op.run(); } - syclcompat::wait(); + compat::wait(); float cute_time = timer.seconds() / options.iterations; double cute_average_time = double(cute_time) / double(options.iterations); diff --git a/examples/common/sycl_common.hpp b/examples/common/sycl_common.hpp index 916653df25..71687b5546 100644 --- a/examples/common/sycl_common.hpp +++ b/examples/common/sycl_common.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** -* Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -44,7 +45,7 @@ bool is_close(T a, T b, float atol, float rtol) { template void convert_dtype(const SrcT* d_src, DstT* d_dst, size_t size) { - syclcompat::get_default_queue().parallel_for(size, [=](auto indx) { + compat::get_default_queue().parallel_for(size, [=](auto indx) { d_dst[indx] = static_cast(d_src[indx]); }).wait(); } diff --git a/examples/cute/tutorial/bgemm_bmg_legacy.cpp b/examples/cute/tutorial/bgemm_bmg_legacy.cpp index 9303360148..010bdd50e4 100644 --- a/examples/cute/tutorial/bgemm_bmg_legacy.cpp +++ b/examples/cute/tutorial/bgemm_bmg_legacy.cpp @@ -31,7 +31,7 @@ **************************************************************************************************/ #include -#include +#include #include @@ -57,7 +57,7 @@ bool verify( char transA, char transB ) { - auto ref_d_C = syclcompat::malloc(m*n); + auto ref_d_C = compat::malloc(m*n); cutlass::TensorRef ref_A_T(d_A, cutlass::layout::RowMajor::packed({m, k})); cutlass::TensorRef ref_A_N(d_A, cutlass::layout::ColumnMajor::packed({m, k})); @@ -126,8 +126,8 @@ bool verify( } - // CUTLASS on SYCL uses the compatibility library syclcompat for e.g. default in-order queue - syclcompat::wait(); + // CUTLASS on SYCL uses the compatibility library compat for e.g. 
default in-order queue + compat::wait(); // Check if output from CUTLASS kernel and reference kernel are equal or not bool passed = cutlass::reference::device::BlockCompareEqual( @@ -168,7 +168,7 @@ gemm_device(ProblemShape shape_MNK, CtaTiler cta_tiler, int stages, Tensor mC_coord = cute::get_xe_tensor(C_shape); //(m,n,l) // Get the appropriate blocks for this thread block - auto cta_coord = make_coord(syclcompat::work_group_id::x(), syclcompat::work_group_id::y(), 0); // (m,n,k) + auto cta_coord = make_coord(compat::work_group_id::x(), compat::work_group_id::y(), 0); // (m,n,k) Tensor gA = local_tile(mA_coord, select<0,2>(cta_tiler), make_coord(BlockIdxX(),_,BlockIdxZ())); // (BLK_M,BLK_K,k) Tensor gB = local_tile(mB_coord, select<1,2>(cta_tiler), make_coord(BlockIdxY(),_,BlockIdxZ())); // (BLK_N,BLK_K,k) @@ -180,7 +180,7 @@ gemm_device(ProblemShape shape_MNK, CtaTiler cta_tiler, int stages, TiledMma tiled_mma; constexpr int sg_size = 16; - auto sg = syclcompat::get_nd_item<1>().get_sub_group(); + auto sg = compat::get_nd_item<1>().get_sub_group(); auto first_thread_in_sg_idx = sg.get_group_linear_id() * sg_size; auto thr_mma = tiled_mma.get_slice(first_thread_in_sg_idx); @@ -192,8 +192,8 @@ gemm_device(ProblemShape shape_MNK, CtaTiler cta_tiler, int stages, Tensor tCrA = make_tensor(make_fragment_layout(copy_a, tCgA(_,_,_,0).shape())); Tensor tCrB = make_tensor(make_fragment_layout(copy_b, tCgB(_,_,_,0).shape())); - ThrCopy thr_copy_a = copy_a.get_slice(syclcompat::local_id::x()); - ThrCopy thr_copy_b = copy_b.get_slice(syclcompat::local_id::x()); + ThrCopy thr_copy_a = copy_a.get_slice(compat::local_id::x()); + ThrCopy thr_copy_b = copy_b.get_slice(compat::local_id::x()); // Retile registers for copies Tensor tArA = thr_copy_a.retile_D(tCrA); @@ -313,23 +313,23 @@ gemm_nt(int m, int n, int k, TiledMMA mmaC = TiledMMAHelper, Layout, Layout, Stride<_4, _1, _0>>>::TiledMMA{}; - auto dimBlock = syclcompat::dim3(size(mmaC)); - auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN))); + auto dimBlock = compat::dim3(size(mmaC)); + auto dimGrid = compat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN))); constexpr int SubgroupSize = 16; constexpr int smem_size = 0; auto kernel_props = [] { - return syclcompat::experimental::kernel_properties{ + return compat::experimental::kernel_properties{ sycl::ext::oneapi::experimental::sub_group_size }; }(); - syclcompat::experimental::launch_properties launch_props { + compat::experimental::launch_properties launch_props { sycl::ext::oneapi::experimental::work_group_scratch_size(smem_size), }; - syclcompat::experimental::launch_policy policy{ + compat::experimental::launch_policy policy{ dimGrid, dimBlock, launch_props, kernel_props }; - auto event = syclcompat::experimental::launch< + auto event = compat::experimental::launch< gemm_device, Layout, Layout, Stride<_4, _1, _0>>>::TiledMMA{}; // 256x128x16 TiledMMA - auto dimBlock = syclcompat::dim3(size(mmaC)); - auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN))); + auto dimBlock = compat::dim3(size(mmaC)); + auto dimGrid = compat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN))); constexpr int SubgroupSize = 16; constexpr int smem_size = 0; auto kernel_props = [] { - return syclcompat::experimental::kernel_properties{ + return compat::experimental::kernel_properties{ sycl::ext::oneapi::experimental::sub_group_size }; }(); - syclcompat::experimental::launch_properties launch_props { + compat::experimental::launch_properties launch_props { 
sycl::ext::oneapi::experimental::work_group_scratch_size(smem_size), }; - syclcompat::experimental::launch_policy policy{ + compat::experimental::launch_policy policy{ dimGrid, dimBlock, launch_props, kernel_props }; - auto event = syclcompat::experimental::launch< + auto event = compat::experimental::launch< gemm_device, Layout, Layout, Stride<_4, _1, _0>>>::TiledMMA{}; - auto dimBlock = syclcompat::dim3(size(mmaC)); - auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN))); + auto dimBlock = compat::dim3(size(mmaC)); + auto dimGrid = compat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN))); // Cutlass only support simd_16 constexpr int SubgroupSize = 16; constexpr int smem_size = 0; auto kernel_props = [] { - return syclcompat::experimental::kernel_properties{ + return compat::experimental::kernel_properties{ sycl::ext::oneapi::experimental::sub_group_size }; }(); - syclcompat::experimental::launch_properties launch_props { + compat::experimental::launch_properties launch_props { sycl::ext::oneapi::experimental::work_group_scratch_size(smem_size), }; - syclcompat::experimental::launch_policy policy{ + compat::experimental::launch_policy policy{ dimGrid, dimBlock, launch_props, kernel_props }; - auto event = syclcompat::experimental::launch< + auto event = compat::experimental::launch< gemm_device( (rand()%21) - 10 ); for (int j = 0; j < m*n; ++j) h_C[j] = static_cast(-1); - auto d_A = syclcompat::malloc(m*k); - auto d_B = syclcompat::malloc(k*n); - auto d_C = syclcompat::malloc(m*n); + auto d_A = compat::malloc(m*k); + auto d_B = compat::malloc(k*n); + auto d_C = compat::malloc(m*n); - syclcompat::memcpy(d_A, h_A.data(), m*k); - syclcompat::memcpy(d_B, h_B.data(), k*n); - syclcompat::memcpy(d_C, h_C.data(), m*n); + compat::memcpy(d_A, h_A.data(), m*k); + compat::memcpy(d_B, h_B.data(), k*n); + compat::memcpy(d_C, h_C.data(), m*n); int ldA = 0, ldB = 0, ldC = m; @@ -597,7 +597,7 @@ int main(int argc, char** argv) d_B, ldB, beta, d_C, ldC); - syclcompat::wait_and_throw(); + compat::wait_and_throw(); bool passed = verify( d_A, @@ -630,7 +630,7 @@ int main(int argc, char** argv) beta, d_C, ldC); } - syclcompat::wait(); + compat::wait(); double cute_time = timer.seconds() / timing_iterations; printf("CUTE_GEMM: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time*1000); diff --git a/examples/cute/tutorial/sgemm_1_sycl.cpp b/examples/cute/tutorial/sgemm_1_sycl.cpp index c8f1435d80..45a5ccd478 100644 --- a/examples/cute/tutorial/sgemm_1_sycl.cpp +++ b/examples/cute/tutorial/sgemm_1_sycl.cpp @@ -1,6 +1,7 @@ /*************************************************************************************************** * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. 
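The launch-site hunks above all follow the same pattern once the namespace is renamed: build the grid and block extents with compat::dim3, request the required sub-group size through kernel_properties, reserve work-group scratch memory through launch_properties, combine both into a launch_policy, and submit with compat::experimental::launch. The following is a minimal sketch of that pattern only, assuming the compat API mirrors the former syclcompat one used in these hunks; my_kernel, its argument, and the grid/block sizes are placeholders, and the compat header include is left out because its path is not shown here.

// Placeholder kernel; any free function usable as a SYCL kernel body works here.
void my_kernel(int value) { /* device code */ }

void launch_example() {
  constexpr int SubgroupSize = 16;   // Intel Xe kernels in this patch use SIMD-16
  constexpr int smem_size = 0;       // no work-group scratch needed for this sketch
  auto dimBlock = compat::dim3(256);                 // work-items per work-group
  auto dimGrid  = compat::dim3(64, 64);              // work-groups in x and y
  // Compile-time kernel property: fixed sub-group size.
  auto kernel_props = compat::experimental::kernel_properties{
      sycl::ext::oneapi::experimental::sub_group_size<SubgroupSize>};
  // Runtime launch property: work-group scratch (shared local) memory size.
  compat::experimental::launch_properties launch_props{
      sycl::ext::oneapi::experimental::work_group_scratch_size(smem_size)};
  compat::experimental::launch_policy policy{dimGrid, dimBlock, launch_props, kernel_props};
  auto event = compat::experimental::launch<my_kernel>(policy, /*value=*/42);
  compat::wait();                                    // block on the default in-order queue
}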
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -31,7 +32,7 @@ **************************************************************************************************/ #include -#include +#include #include @@ -96,14 +97,14 @@ gemm_device(ProblemShape shape_MNK, CtaTiler cta_tiler, Tensor mC = make_tensor(make_gmem_ptr(C), select<0,1>(shape_MNK), dC); // (M,N) // Get the appropriate blocks for this thread block - auto cta_coord = make_coord(syclcompat::work_group_id::x(), syclcompat::work_group_id::y(), _); // (m,n,k) + auto cta_coord = make_coord(compat::work_group_id::x(), compat::work_group_id::y(), _); // (m,n,k) Tensor gA = local_tile(mA, cta_tiler, cta_coord, Step<_1, X,_1>{}); // (BLK_M,BLK_K,k) Tensor gB = local_tile(mB, cta_tiler, cta_coord, Step< X,_1,_1>{}); // (BLK_N,BLK_K,k) Tensor gC = local_tile(mC, cta_tiler, cta_coord, Step<_1,_1, X>{}); // (BLK_M,BLK_N) // Shared memory buffers - auto smemA = syclcompat::local_mem]>(); - auto smemB = syclcompat::local_mem]>(); + auto smemA = compat::local_mem]>(); + auto smemB = compat::local_mem]>(); Tensor sA = make_tensor(make_smem_ptr(smemA), sA_layout); // (BLK_M,BLK_K) Tensor sB = make_tensor(make_smem_ptr(smemB), sB_layout); // (BLK_N,BLK_K) @@ -113,11 +114,11 @@ gemm_device(ProblemShape shape_MNK, CtaTiler cta_tiler, // TUTORIAL: Example of simple raked partitioning of ThreadLayouts tA|tB over data A|B tiles - Tensor tAgA = local_partition(gA, tA, syclcompat::local_id::x()); // (THR_M,THR_K,k) - Tensor tAsA = local_partition(sA, tA, syclcompat::local_id::x()); // (THR_M,THR_K) + Tensor tAgA = local_partition(gA, tA, compat::local_id::x()); // (THR_M,THR_K,k) + Tensor tAsA = local_partition(sA, tA, compat::local_id::x()); // (THR_M,THR_K) - Tensor tBgB = local_partition(gB, tB, syclcompat::local_id::x()); // (THR_N,THR_K,k) - Tensor tBsB = local_partition(sB, tB, syclcompat::local_id::x()); // (THR_N,THR_K) + Tensor tBgB = local_partition(gB, tB, compat::local_id::x()); // (THR_N,THR_K,k) + Tensor tBsB = local_partition(sB, tB, compat::local_id::x()); // (THR_N,THR_K) CUTE_STATIC_ASSERT_V(size<0>(tAgA) == size<0>(tAsA)); // THR_M CUTE_STATIC_ASSERT_V(size<1>(tAgA) == size<1>(tAsA)); // THR_K @@ -131,11 +132,11 @@ gemm_device(ProblemShape shape_MNK, CtaTiler cta_tiler, // TUTORIAL: Example of partitioning via projections of a ThreadLayout tC // Partition sA (M,K) by the rows of tC - Tensor tCsA = local_partition(sA, tC, syclcompat::local_id::x(), Step<_1, X>{}); // (THR_M,BLK_K) + Tensor tCsA = local_partition(sA, tC, compat::local_id::x(), Step<_1, X>{}); // (THR_M,BLK_K) // Partition sB (N,K) by the cols of tC - Tensor tCsB = local_partition(sB, tC, syclcompat::local_id::x(), Step< X,_1>{}); // (THR_N,BLK_K) + Tensor tCsB = local_partition(sB, tC, compat::local_id::x(), Step< X,_1>{}); // (THR_N,BLK_K) // Partition gC (M,N) by the tile of tC - Tensor tCgC = local_partition(gC, tC, syclcompat::local_id::x(), Step<_1,_1>{}); // (THR_M,THR_N) + Tensor tCgC = local_partition(gC, tC, compat::local_id::x(), Step<_1,_1>{}); // (THR_M,THR_N) // Allocate the accumulators -- same shape/layout as the partitioned data Tensor tCrC = make_tensor_like(tCgC); // (THR_M,THR_N) @@ -204,7 +205,7 @@ gemm_device(ProblemShape shape_MNK, CtaTiler cta_tiler, cp_async_fence(); // Label the end of (potential) cp.async instructions cp_async_wait<0>(); // Sync on all (potential) cp.async instructions - syclcompat::wg_barrier();// Wait for all threads to write to smem + compat::wg_barrier();// Wait 
for all threads to write to smem // Compute gemm on tC thread-partitioned smem gemm(tCsA, tCsB, tCrC); // (THR_M,THR_N) += (THR_M,BLK_K) * (THR_N,BLK_K) @@ -221,7 +222,7 @@ gemm_device(ProblemShape shape_MNK, CtaTiler cta_tiler, // } // } - syclcompat::wg_barrier(); // Wait for all threads to read from smem + compat::wg_barrier(); // Wait for all threads to read from smem } #endif @@ -280,10 +281,10 @@ gemm_nt(int m, int n, int k, auto tB = make_layout(make_shape(Int<32>{}, Int< 8>{})); // (n,k) -> thr_idx auto tC = make_layout(make_shape(Int<16>{}, Int<16>{})); // (m,n) -> thr_idx - auto dimBlock = syclcompat::dim3(size(tC)); - auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN))); + auto dimBlock = compat::dim3(size(tC)); + auto dimGrid = compat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN))); - auto event = syclcompat::launch< + auto event = compat::launch< gemm_device{}, Int< 8>{}), LayoutRight{}); // (n,k) -> thr_idx; k-major auto tC = make_layout(make_shape(Int<16>{}, Int<16>{})); // (m,n) -> thr_idx; m-major - auto dimBlock = syclcompat::dim3(size(tC)); - auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN))); + auto dimBlock = compat::dim3(size(tC)); + auto dimGrid = compat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN))); - auto event = syclcompat::launch< + auto event = compat::launch< gemm_device( 2*(rand() / double(RAND_MAX)) - 1 ); for (int j = 0; j < m*n; ++j) h_C[j] = static_cast(-1); - auto d_A = syclcompat::malloc(m*k); - auto d_B = syclcompat::malloc(k*n); - auto d_C = syclcompat::malloc(m*n); + auto d_A = compat::malloc(m*k); + auto d_B = compat::malloc(k*n); + auto d_C = compat::malloc(m*n); - syclcompat::memcpy(d_A, h_A.data(), m*k); - syclcompat::memcpy(d_B, h_B.data(), k*n); - syclcompat::memcpy(d_C, h_C.data(), m*n); + compat::memcpy(d_A, h_A.data(), m*k); + compat::memcpy(d_B, h_B.data(), k*n); + compat::memcpy(d_C, h_C.data(), m*n); double gflops = (2.0*m*n*k) * 1e-9; @@ -453,7 +454,7 @@ int main(int argc, char** argv) d_B, ldB, beta, d_C, ldC); - syclcompat::wait_and_throw(); + compat::wait_and_throw(); // Timing iterations timer.start(); diff --git a/examples/cute/tutorial/sgemm_2_sycl.cpp b/examples/cute/tutorial/sgemm_2_sycl.cpp index 747023f4d1..4a50871d1f 100644 --- a/examples/cute/tutorial/sgemm_2_sycl.cpp +++ b/examples/cute/tutorial/sgemm_2_sycl.cpp @@ -1,6 +1,7 @@ /*************************************************************************************************** * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -31,7 +32,7 @@ **************************************************************************************************/ #include -#include +#include #include @@ -86,14 +87,14 @@ gemm_device(ProblemShape shape_MNK, CtaTiler cta_tiler, Tensor mC = make_tensor(make_gmem_ptr(C), select<0,1>(shape_MNK), dC); // (M,N) // Get the appropriate blocks for this thread block - auto cta_coord = make_coord(syclcompat::work_group_id::x(), syclcompat::work_group_id::y(), _); // (m,n,k + auto cta_coord = make_coord(compat::work_group_id::x(), compat::work_group_id::y(), _); // (m,n,k Tensor gA = local_tile(mA, cta_tiler, cta_coord, Step<_1, X,_1>{}); // (BLK_M,BLK_K,k) Tensor gB = local_tile(mB, cta_tiler, cta_coord, Step< X,_1,_1>{}); // (BLK_N,BLK_K,k) Tensor gC = local_tile(mC, cta_tiler, cta_coord, Step<_1,_1, X>{}); // (BLK_M,BLK_N) // Shared memory buffers - auto smemA = syclcompat::local_mem]>(); - auto smemB = syclcompat::local_mem]>(); + auto smemA = compat::local_mem]>(); + auto smemB = compat::local_mem]>(); Tensor sA = make_tensor(make_smem_ptr(smemA), sA_layout); // (BLK_M,BLK_K) Tensor sB = make_tensor(make_smem_ptr(smemB), sB_layout); // (BLK_N,BLK_K) @@ -103,13 +104,13 @@ gemm_device(ProblemShape shape_MNK, CtaTiler cta_tiler, // TUTORIAL: Example of partitioning via a TiledCopy - ThrCopy thr_copy_a = copy_a.get_slice(syclcompat::local_id::x()); + ThrCopy thr_copy_a = copy_a.get_slice(compat::local_id::x()); Tensor tAgA = thr_copy_a.partition_S(gA); // (CPY,CPY_M,CPY_K,k) Tensor tAsA = thr_copy_a.partition_D(sA); // (CPY,CPY_M,CPY_K) // Allocate registers same shape/layout as partitioned data Tensor tArA = make_fragment_like(tAsA); // (CPY,CPY_M,CPY_K) - ThrCopy thr_copy_b = copy_b.get_slice(syclcompat::local_id::x()); + ThrCopy thr_copy_b = copy_b.get_slice(compat::local_id::x()); Tensor tBgB = thr_copy_b.partition_S(gB); // (CPY,CPY_N,CPY_K,k) Tensor tBsB = thr_copy_b.partition_D(sB); // (CPY,CPY_N,CPY_K) // Allocate registers same shape/layout as partitioned data @@ -133,7 +134,7 @@ gemm_device(ProblemShape shape_MNK, CtaTiler cta_tiler, // TUTORIAL: Example of partitioning via a TiledMMA - ThrMMA thr_mma = mma.get_slice(syclcompat::local_id::x()); + ThrMMA thr_mma = mma.get_slice(compat::local_id::x()); Tensor tCsA = thr_mma.partition_A(sA); // (MMA,MMA_M,MMA_K) Tensor tCsB = thr_mma.partition_B(sB); // (MMA,MMA_N,MMA_K) Tensor tCgC = thr_mma.partition_C(gC); // (MMA,MMA_M,MMA_N) @@ -194,10 +195,10 @@ gemm_device(ProblemShape shape_MNK, CtaTiler cta_tiler, for (int k_tile = 0; k_tile < K_TILE_MAX; ++k_tile) { // Copy rmem to smem with tA|tB thread-partitioned tensors - syclcompat::wg_barrier(); // Wait for all threads to consume smem + compat::wg_barrier(); // Wait for all threads to consume smem copy(tArA, tAsA); copy(tBrB, tBsB); - syclcompat::wg_barrier(); // Wait for all threads to consume smem + compat::wg_barrier(); // Wait for all threads to consume smem // Copy gmem to rmem for k_tile+1 with tA|tB thread-partitioned tensors int k_tile_next = (k_tile + 1 < K_TILE_MAX) ? 
k_tile + 1 : k_tile; @@ -305,9 +306,9 @@ gemm_nt(int m, int n, int k, print_latex(mmaC); #endif - auto dimBlock = syclcompat::dim3(size(mmaC)); - auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN))); - auto event = syclcompat::launch< + auto dimBlock = compat::dim3(size(mmaC)); + auto dimGrid = compat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN))); + auto event = compat::launch< gemm_device( 2*(rand() / double(RAND_MAX)) - 1 ); for (int j = 0; j < m*n; ++j) h_C[j] = static_cast(-1); - auto d_A = syclcompat::malloc(m*k); - auto d_B = syclcompat::malloc(k*n); - auto d_C = syclcompat::malloc(m*n); + auto d_A = compat::malloc(m*k); + auto d_B = compat::malloc(k*n); + auto d_C = compat::malloc(m*n); - syclcompat::memcpy(d_A, h_A.data(), m*k); - syclcompat::memcpy(d_B, h_B.data(), k*n); - syclcompat::memcpy(d_C, h_C.data(), m*n); + compat::memcpy(d_A, h_A.data(), m*k); + compat::memcpy(d_B, h_B.data(), k*n); + compat::memcpy(d_C, h_C.data(), m*n); double gflops = (2.0*m*n*k) * 1e-9; @@ -505,7 +506,7 @@ int main(int argc, char** argv) d_B, ldB, beta, d_C, ldC); - syclcompat::wait_and_throw(); + compat::wait_and_throw(); // Timing iterations timer.start(); diff --git a/examples/cute/tutorial/sgemm_sm70_sycl.cpp b/examples/cute/tutorial/sgemm_sm70_sycl.cpp index b62ac600ba..9966f03b40 100644 --- a/examples/cute/tutorial/sgemm_sm70_sycl.cpp +++ b/examples/cute/tutorial/sgemm_sm70_sycl.cpp @@ -1,6 +1,7 @@ /*************************************************************************************************** * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -31,7 +32,7 @@ **************************************************************************************************/ #include -#include +#include #include @@ -85,14 +86,14 @@ gemm_device(ProblemShape shape_MNK, CtaTiler cta_tiler, Tensor mC = make_tensor(make_gmem_ptr(C), select<0,1>(shape_MNK), dC); // (M,N) // Get the appropriate blocks for this thread block - auto cta_coord = make_coord(syclcompat::work_group_id::x(), syclcompat::work_group_id::y(), _); // (m,n,k) + auto cta_coord = make_coord(compat::work_group_id::x(), compat::work_group_id::y(), _); // (m,n,k) Tensor gA = local_tile(mA, cta_tiler, cta_coord, Step<_1, X,_1>{}); // (BLK_M,BLK_K,k) Tensor gB = local_tile(mB, cta_tiler, cta_coord, Step< X,_1,_1>{}); // (BLK_N,BLK_K,k) Tensor gC = local_tile(mC, cta_tiler, cta_coord, Step<_1,_1, X>{}); // (BLK_M,BLK_N) // Shared memory buffers - auto smemA = syclcompat::local_mem]>(); - auto smemB = syclcompat::local_mem]>(); + auto smemA = compat::local_mem]>(); + auto smemB = compat::local_mem]>(); Tensor sA = make_tensor(make_smem_ptr(smemA), sA_layout); // (BLK_M,BLK_K) Tensor sB = make_tensor(make_smem_ptr(smemB), sB_layout); // (BLK_N,BLK_K) @@ -102,12 +103,12 @@ gemm_device(ProblemShape shape_MNK, CtaTiler cta_tiler, // TUTORIAL: Example of partitioning via a TiledCopy - ThrCopy thr_copy_a = copy_a.get_slice(syclcompat::local_id::x()); + ThrCopy thr_copy_a = copy_a.get_slice(compat::local_id::x()); Tensor tAgA = thr_copy_a.partition_S(gA); // (CPY,CPY_M,CPY_K,k) Tensor tAsA = thr_copy_a.partition_D(sA); // (CPY,CPY_M,CPY_K) Tensor tArA = make_fragment_like(tAsA); // (CPY,CPY_M,CPY_K) - ThrCopy thr_copy_b = 
copy_b.get_slice(syclcompat::local_id::x()); + ThrCopy thr_copy_b = copy_b.get_slice(compat::local_id::x()); Tensor tBgB = thr_copy_b.partition_S(gB); // (CPY,CPY_N,CPY_K,k) Tensor tBsB = thr_copy_b.partition_D(sB); // (CPY,CPY_N,CPY_K) Tensor tBrB = make_fragment_like(tBsB); // (CPY,CPY_N,CPY_K) @@ -130,7 +131,7 @@ gemm_device(ProblemShape shape_MNK, CtaTiler cta_tiler, // TUTORIAL: Example of partitioning via a TiledMMA - ThrMMA thr_mma = mma.get_slice(syclcompat::local_id::x()); + ThrMMA thr_mma = mma.get_slice(compat::local_id::x()); Tensor tCsA = thr_mma.partition_A(sA); // (MMA,MMA_M,MMA_K) Tensor tCsB = thr_mma.partition_B(sB); // (MMA,MMA_N,MMA_K) Tensor tCgC = thr_mma.partition_C(gC); // (MMA,MMA_M,MMA_N) @@ -189,7 +190,7 @@ gemm_device(ProblemShape shape_MNK, CtaTiler cta_tiler, // Copy rmem to smem copy(tArA, tAsA); copy(tBrB, tBsB); - syclcompat::wg_barrier(); + compat::wg_barrier(); // // PIPELINED MAIN LOOP @@ -215,10 +216,10 @@ gemm_device(ProblemShape shape_MNK, CtaTiler cta_tiler, if (k_block == K_BLOCK_MAX - 1) { // Copy rmem to smem - syclcompat::wg_barrier(); + compat::wg_barrier(); copy(tArA, tAsA); copy(tBrB, tBsB); - syclcompat::wg_barrier(); + compat::wg_barrier(); } // Copy smem to rmem for k_block+1 @@ -304,9 +305,9 @@ gemm_nt(int m, int n, int k, print_latex(mmaC); #endif - auto dimBlock = syclcompat::dim3(size(mmaC)); - auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN))); - auto event = syclcompat::launch< + auto dimBlock = compat::dim3(size(mmaC)); + auto dimGrid = compat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN))); + auto event = compat::launch< gemm_device( 2*(rand() / double(RAND_MAX)) - 1 ); for (int j = 0; j < m*n; ++j) h_C[j] = static_cast(-1); - auto d_A = syclcompat::malloc(m*k); - auto d_B = syclcompat::malloc(k*n); - auto d_C = syclcompat::malloc(m*n); + auto d_A = compat::malloc(m*k); + auto d_B = compat::malloc(k*n); + auto d_C = compat::malloc(m*n); - syclcompat::memcpy(d_A, h_A.data(), m*k); - syclcompat::memcpy(d_B, h_B.data(), k*n); - syclcompat::memcpy(d_C, h_C.data(), m*n); + compat::memcpy(d_A, h_A.data(), m*k); + compat::memcpy(d_B, h_B.data(), k*n); + compat::memcpy(d_C, h_C.data(), m*n); double gflops = (2.0*m*n*k) * 1e-9; @@ -496,7 +497,7 @@ int main(int argc, char** argv) d_B, ldB, beta, d_C, ldC); - syclcompat::wait_and_throw(); + compat::wait_and_throw(); // Timing iterations timer.start(); diff --git a/examples/cute/tutorial/sgemm_sm80_sycl.cpp b/examples/cute/tutorial/sgemm_sm80_sycl.cpp index 1012ad79f4..d8f3e8ebd6 100644 --- a/examples/cute/tutorial/sgemm_sm80_sycl.cpp +++ b/examples/cute/tutorial/sgemm_sm80_sycl.cpp @@ -1,6 +1,7 @@ /*************************************************************************************************** * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -31,7 +32,7 @@ **************************************************************************************************/ #include -#include +#include #include @@ -85,14 +86,14 @@ gemm_device(ProblemShape shape_MNK, CtaTiler cta_tiler, Tensor mC = make_tensor(make_gmem_ptr(C), select<0,1>(shape_MNK), dC); // (M,N) // Get the appropriate blocks for this thread block - auto cta_coord = make_coord(syclcompat::work_group_id::x(), syclcompat::work_group_id::y(), _); // (m,n,k) + auto cta_coord = make_coord(compat::work_group_id::x(), compat::work_group_id::y(), _); // (m,n,k) Tensor gA = local_tile(mA, cta_tiler, cta_coord, Step<_1, X,_1>{}); // (BLK_M,BLK_K,k) Tensor gB = local_tile(mB, cta_tiler, cta_coord, Step< X,_1,_1>{}); // (BLK_N,BLK_K,k) Tensor gC = local_tile(mC, cta_tiler, cta_coord, Step<_1,_1, X>{}); // (BLK_M,BLK_N) // Shared memory buffers - auto smemA = syclcompat::local_mem]>(); - auto smemB = syclcompat::local_mem]>(); + auto smemA = compat::local_mem]>(); + auto smemB = compat::local_mem]>(); Tensor sA = make_tensor(make_smem_ptr(smemA), sA_layout); // (BLK_M,BLK_K,PIPE) Tensor sB = make_tensor(make_smem_ptr(smemB), sB_layout); // (BLK_N,BLK_K,PIPE) @@ -100,11 +101,11 @@ gemm_device(ProblemShape shape_MNK, CtaTiler cta_tiler, // Partition the copying of A and B tiles across the threads // - ThrCopy thr_copy_a = copy_a.get_slice(syclcompat::local_id::x()); + ThrCopy thr_copy_a = copy_a.get_slice(compat::local_id::x()); Tensor tAgA = thr_copy_a.partition_S(gA); // (CPY,CPY_M,CPY_K,k) Tensor tAsA = thr_copy_a.partition_D(sA); // (CPY,CPY_M,CPY_K,PIPE) - ThrCopy thr_copy_b = copy_b.get_slice(syclcompat::local_id::x()); + ThrCopy thr_copy_b = copy_b.get_slice(compat::local_id::x()); Tensor tBgB = thr_copy_b.partition_S(gB); // (CPY,CPY_N,CPY_K,k) Tensor tBsB = thr_copy_b.partition_D(sB); // (CPY,CPY_N,CPY_K,PIPE) @@ -138,7 +139,7 @@ gemm_device(ProblemShape shape_MNK, CtaTiler cta_tiler, // Define A/B partitioning and C accumulators // - ThrMMA thr_mma = mma.get_slice(syclcompat::local_id::x()); + ThrMMA thr_mma = mma.get_slice(compat::local_id::x()); Tensor tCsA = thr_mma.partition_A(sA); // (MMA,MMA_M,MMA_K,PIPE) Tensor tCsB = thr_mma.partition_B(sB); // (MMA,MMA_N,MMA_K,PIPE) Tensor tCgC = thr_mma.partition_C(gC); // (MMA,MMA_M,MMA_N) @@ -210,7 +211,7 @@ gemm_device(ProblemShape shape_MNK, CtaTiler cta_tiler, if (K_BLOCK_MAX > 1) { // Wait until our first prefetched tile is loaded in cp_async_wait(); - syclcompat::wg_barrier(); + compat::wg_barrier(); // Prefetch the first rmem from the first k-tile copy(tCsA_p(_,_,Int<0>{}), tCrA(_,_,Int<0>{})); @@ -244,7 +245,7 @@ gemm_device(ProblemShape shape_MNK, CtaTiler cta_tiler, // Commit the smem for smem_pipe_read cp_async_wait(); - syclcompat::wg_barrier(); + compat::wg_barrier(); } // Load A, B shmem->regs for k_block+1 @@ -342,9 +343,9 @@ gemm_nt(int m, int n, int k, print_latex(mmaC); #endif - auto dimBlock = syclcompat::dim3(size(mmaC)); - auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN))); - auto event = syclcompat::launch< + auto dimBlock = compat::dim3(size(mmaC)); + auto dimGrid = compat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN))); + auto event = compat::launch< gemm_device( 2*(rand() / double(RAND_MAX)) - 1 ); for (int j = 0; j < m*n; ++j) h_C[j] = static_cast(-1); - auto d_A = syclcompat::malloc(m*k); - auto d_B = syclcompat::malloc(k*n); - auto d_C = syclcompat::malloc(m*n); + auto 
d_A = compat::malloc(m*k); + auto d_B = compat::malloc(k*n); + auto d_C = compat::malloc(m*n); - syclcompat::memcpy(d_A, h_A.data(), m*k); - syclcompat::memcpy(d_B, h_B.data(), k*n); - syclcompat::memcpy(d_C, h_C.data(), m*n); + compat::memcpy(d_A, h_A.data(), m*k); + compat::memcpy(d_B, h_B.data(), k*n); + compat::memcpy(d_C, h_C.data(), m*n); double gflops = (2.0*m*n*k) * 1e-9; @@ -537,7 +538,7 @@ int main(int argc, char** argv) d_B, ldB, beta, d_C, ldC); - syclcompat::wait_and_throw(); + compat::wait_and_throw(); // Timing iterations timer.start(); diff --git a/examples/cute/tutorial/tiled_copy_if_sycl.cpp b/examples/cute/tutorial/tiled_copy_if_sycl.cpp index 79b759051c..7cbc4aa333 100644 --- a/examples/cute/tutorial/tiled_copy_if_sycl.cpp +++ b/examples/cute/tutorial/tiled_copy_if_sycl.cpp @@ -30,7 +30,7 @@ **************************************************************************************************/ #include -#include +#include #include @@ -68,7 +68,7 @@ void copy_if_kernel(TensorS S, TensorD D, BlockShape block_shape, ThreadLayout) Tensor P = cute::lazy::transform(C, [&](auto c) { return elem_less(c, shape_S); }); // Tile the input tensor into blocks - auto block_coord = make_coord(syclcompat::work_group_id::x(), syclcompat::work_group_id::y()); + auto block_coord = make_coord(compat::work_group_id::x(), compat::work_group_id::y()); Tensor tile_S = local_tile(S, block_shape, block_coord); // (BlockShape_M, BlockShape_N) Tensor tile_P = local_tile(P, block_shape, block_coord); // (BlockShape_M, BlockShape_N) Tensor tile_D = local_tile(D, block_shape, block_coord); // (BlockShape_M, BlockShape_N) @@ -76,9 +76,9 @@ void copy_if_kernel(TensorS S, TensorD D, BlockShape block_shape, ThreadLayout) // Construct a partitioning of the tile among threads with the given thread arrangement. // Concept: Tensor ThrLayout ThrIndex - Tensor thr_tile_S = local_partition(tile_S, ThreadLayout{}, syclcompat::local_id::x()); - Tensor thr_tile_P = local_partition(tile_P, ThreadLayout{}, syclcompat::local_id::x()); - Tensor thr_tile_D = local_partition(tile_D, ThreadLayout{}, syclcompat::local_id::x()); + Tensor thr_tile_S = local_partition(tile_S, ThreadLayout{}, compat::local_id::x()); + Tensor thr_tile_P = local_partition(tile_P, ThreadLayout{}, compat::local_id::x()); + Tensor thr_tile_D = local_partition(tile_D, ThreadLayout{}, compat::local_id::x()); // Copy from GMEM to GMEM using `thr_tile_P` to guard accesses. copy_if(thr_tile_P, thr_tile_S, thr_tile_D); @@ -101,7 +101,7 @@ void copy_if_kernel_vectorized(TensorS S, TensorD D, BlockShape block_shape, Til Tensor P = cute::lazy::transform(C, [&](auto c) { return elem_less(c, shape_S); }); // Tile the input tensor into blocks - auto block_coord = make_coord(syclcompat::work_group_id::x(), syclcompat::work_group_id::y()); + auto block_coord = make_coord(compat::work_group_id::x(), compat::work_group_id::y()); Tensor tile_S = local_tile(S, block_shape, block_coord); // (BlockShape_M, BlockShape_N) Tensor tile_D = local_tile(D, block_shape, block_coord); // (BlockShape_M, BlockShape_N) Tensor tile_P = local_tile(P, block_shape, block_coord); // (BlockShape_M, BlockShape_N) @@ -109,7 +109,7 @@ void copy_if_kernel_vectorized(TensorS S, TensorD D, BlockShape block_shape, Til // // Construct a Tensor corresponding to each thread's slice. 
// - ThrCopy thr_copy = tiled_copy.get_thread_slice(syclcompat::local_id::x()); + ThrCopy thr_copy = tiled_copy.get_thread_slice(compat::local_id::x()); Tensor thr_tile_S = thr_copy.partition_S(tile_S); // (CPY, CPY_M, CPY_N) Tensor thr_tile_D = thr_copy.partition_D(tile_D); // (CPY, CPY_M, CPY_N) Tensor thr_tile_P = thr_copy.partition_S(tile_P); // (CPY, CPY_M, CPY_N) @@ -147,16 +147,16 @@ int main(int argc, char** argv) std::vector h_S(size(tensor_shape)); std::vector h_D(size(tensor_shape)); - auto d_S = syclcompat::malloc(size(tensor_shape)); - auto d_D = syclcompat::malloc(size(tensor_shape)); - auto d_Zero = syclcompat::malloc(size(tensor_shape)); + auto d_S = compat::malloc(size(tensor_shape)); + auto d_D = compat::malloc(size(tensor_shape)); + auto d_Zero = compat::malloc(size(tensor_shape)); for (size_t i = 0; i < h_S.size(); ++i) { h_S[i] = static_cast(i); } - syclcompat::memcpy(d_S, h_S.data(), size(tensor_shape)); - syclcompat::memcpy(d_D, h_D.data(), size(tensor_shape)); + compat::memcpy(d_S, h_S.data(), size(tensor_shape)); + compat::memcpy(d_D, h_D.data(), size(tensor_shape)); // // Make tensors @@ -182,22 +182,22 @@ int main(int argc, char** argv) // Describes the layout of threads which is then replicated to tile 'block_shape.' Layout thr_layout = make_layout(make_shape(Int<32>{}, Int< 8>{})); // (ThrM, ThrN) - auto gridDim = syclcompat::dim3(size<1>(tiled_tensor_D), size<2>(tiled_tensor_D)); - auto blockDim = syclcompat::dim3(size(thr_layout)); + auto gridDim = compat::dim3(size<1>(tiled_tensor_D), size<2>(tiled_tensor_D)); + auto blockDim = compat::dim3(size(thr_layout)); // // Launch the kernel // - syclcompat::launch>( gridDim, blockDim, tensor_S, tensor_D, block_shape, thr_layout ); - syclcompat::wait_and_throw(); + compat::wait_and_throw(); // // Verify // - syclcompat::memcpy(h_D.data(), d_D, size(tensor_shape)); + compat::memcpy(h_D.data(), d_D, size(tensor_shape)); auto verify = [](std::vector const &S, std::vector const &D){ @@ -228,7 +228,7 @@ int main(int argc, char** argv) std::cout << "Success." << std::endl; } - syclcompat::memset(d_D, 0, size(tensor_shape)); + compat::memset(d_D, 0, size(tensor_shape)); // Construct a TiledCopy with a specific access pattern. // This version uses a @@ -254,16 +254,16 @@ int main(int argc, char** argv) thr_layout, // thread layout (e.g. 32x4 Col-Major) val_layout); // value layout (e.g. 4x1) - syclcompat::launch>( gridDim, blockDim, tensor_S, tensor_D, block_shape, tiled_copy ); - syclcompat::wait_and_throw(); + compat::wait_and_throw(); // // Verify // - syclcompat::memcpy(h_D.data(), d_D, size(tensor_shape)); + compat::memcpy(h_D.data(), d_D, size(tensor_shape)); if (verify(h_D, h_S)) { return -1; diff --git a/examples/cute/tutorial/tiled_copy_sycl.cpp b/examples/cute/tutorial/tiled_copy_sycl.cpp index 77c5484832..61a0b7aa0c 100644 --- a/examples/cute/tutorial/tiled_copy_sycl.cpp +++ b/examples/cute/tutorial/tiled_copy_sycl.cpp @@ -1,6 +1,7 @@ /*************************************************************************************************** * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -31,7 +32,7 @@ **************************************************************************************************/ #include -#include +#include #include @@ -72,16 +73,16 @@ void copy_kernel(TensorS S, TensorD D, ThreadLayout) using namespace cute; // Slice the tiled tensors - Tensor tile_S = S(make_coord(_,_), syclcompat::work_group_id::x(), - syclcompat::work_group_id::y()); // (BlockShape_M, BlockShape_N) - Tensor tile_D = D(make_coord(_,_), syclcompat::work_group_id::x(), - syclcompat::work_group_id::y()); // (BlockShape_M, BlockShape_N) + Tensor tile_S = S(make_coord(_,_), compat::work_group_id::x(), + compat::work_group_id::y()); // (BlockShape_M, BlockShape_N) + Tensor tile_D = D(make_coord(_,_), compat::work_group_id::x(), + compat::work_group_id::y()); // (BlockShape_M, BlockShape_N) // Construct a partitioning of the tile among threads with the given thread arrangement. // Concept: Tensor ThrLayout ThrIndex - Tensor thr_tile_S = local_partition(tile_S, ThreadLayout{}, syclcompat::local_id::x()); // (ThrValM, ThrValN) - Tensor thr_tile_D = local_partition(tile_D, ThreadLayout{}, syclcompat::local_id::x()); // (ThrValM, ThrValN) + Tensor thr_tile_S = local_partition(tile_S, ThreadLayout{}, compat::local_id::x()); // (ThrValM, ThrValN) + Tensor thr_tile_D = local_partition(tile_D, ThreadLayout{}, compat::local_id::x()); // (ThrValM, ThrValN) // Construct a register-backed Tensor with the same shape as each thread's partition // Use make_tensor to try to match the layout of thr_tile_S @@ -105,10 +106,10 @@ void copy_kernel_vectorized(TensorS S, TensorD D, ThreadLayout, VecLayout) using Element = typename TensorS::value_type; // Slice the tensors to obtain a view into each tile. - Tensor tile_S = S(make_coord(_, _), syclcompat::work_group_id::x(), - syclcompat::work_group_id::y()); // (BlockShape_M, BlockShape_N) - Tensor tile_D = D(make_coord(_, _), syclcompat::work_group_id::x(), - syclcompat::work_group_id::y()); // (BlockShape_M, BlockShape_N) + Tensor tile_S = S(make_coord(_, _), compat::work_group_id::x(), + compat::work_group_id::y()); // (BlockShape_M, BlockShape_N) + Tensor tile_D = D(make_coord(_, _), compat::work_group_id::x(), + compat::work_group_id::y()); // (BlockShape_M, BlockShape_N) // Define `AccessType` which controls the size of the actual memory access. using AccessType = cutlass::AlignedArray; @@ -129,7 +130,7 @@ void copy_kernel_vectorized(TensorS S, TensorD D, ThreadLayout, VecLayout) VecLayout{}); // vector layout (e.g. 4x1) // Construct a Tensor corresponding to each thread's slice. 
- auto thr_copy = tiled_copy.get_thread_slice(syclcompat::local_id::x()); + auto thr_copy = tiled_copy.get_thread_slice(compat::local_id::x()); Tensor thr_tile_S = thr_copy.partition_S(tile_S); // (CopyOp, CopyM, CopyN) Tensor thr_tile_D = thr_copy.partition_D(tile_D); // (CopyOp, CopyM, CopyN) @@ -162,15 +163,15 @@ int main(int argc, char** argv) std::vector h_S(size(tensor_shape)); std::vector h_D(size(tensor_shape)); - auto d_S = syclcompat::malloc(size(tensor_shape)); - auto d_D = syclcompat::malloc(size(tensor_shape)); + auto d_S = compat::malloc(size(tensor_shape)); + auto d_D = compat::malloc(size(tensor_shape)); for (size_t i = 0; i < h_S.size(); ++i) { h_S[i] = static_cast(i); } - syclcompat::memcpy(d_S, h_S.data(), size(tensor_shape)); - syclcompat::memcpy(d_D, h_D.data(), size(tensor_shape)); + compat::memcpy(d_S, h_S.data(), size(tensor_shape)); + compat::memcpy(d_D, h_D.data(), size(tensor_shape)); // // Make tensors @@ -214,22 +215,22 @@ int main(int argc, char** argv) // Determine grid and block dimensions // - auto gridDim = syclcompat::dim3(size<1>(tiled_tensor_D), size<2>(tiled_tensor_D)); // Grid shape corresponds to modes m' and n' - auto blockDim = syclcompat::dim3(size(thr_layout)); + auto gridDim = compat::dim3(size<1>(tiled_tensor_D), size<2>(tiled_tensor_D)); // Grid shape corresponds to modes m' and n' + auto blockDim = compat::dim3(size(thr_layout)); // // Launch the kernel // - syclcompat::launch>( gridDim, blockDim, tiled_tensor_S, tiled_tensor_D, thr_layout, vec_layout); - syclcompat::wait_and_throw(); + compat::wait_and_throw(); // // Verify // - syclcompat::memcpy(h_D.data(), d_D, size(tensor_shape)); + compat::memcpy(h_D.data(), d_D, size(tensor_shape)); int32_t errors = 0; int32_t const kErrorLimit = 10; diff --git a/examples/cute/tutorial/xe_gemm.cpp b/examples/cute/tutorial/xe_gemm.cpp index f23ff199e7..1ebeba7ffa 100644 --- a/examples/cute/tutorial/xe_gemm.cpp +++ b/examples/cute/tutorial/xe_gemm.cpp @@ -30,7 +30,7 @@ **************************************************************************************************/ #include -#include +#include #include #include diff --git a/examples/generics/device_agnostic/device_agnostic_collective_builder.cpp b/examples/generics/device_agnostic/device_agnostic_collective_builder.cpp index 541cc83aa2..053d111fcd 100644 --- a/examples/generics/device_agnostic/device_agnostic_collective_builder.cpp +++ b/examples/generics/device_agnostic/device_agnostic_collective_builder.cpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -181,7 +182,7 @@ struct ExampleRunner { M * N // batch_stride_D ); - syclcompat::wait(); + compat::wait(); using TensorView = cutlass::TensorView; @@ -252,7 +253,7 @@ struct ExampleRunner { // Run the GEMM CUTLASS_CHECK(gemm_op.run()); - syclcompat::wait(); + compat::wait(); // Verify that the result is correct bool passed = verify(problem_size, options.alpha, options.beta); @@ -266,7 +267,7 @@ struct ExampleRunner { for (int i = 0; i < options.iterations; ++i) { gemm_op.run(); } - syclcompat::wait(); + compat::wait(); float cute_time = timer.seconds() / options.iterations; double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; diff --git a/examples/generics/device_agnostic/device_agnostic_gemm.cpp b/examples/generics/device_agnostic/device_agnostic_gemm.cpp index 586e70e39a..c3767c7db1 100644 --- a/examples/generics/device_agnostic/device_agnostic_gemm.cpp +++ b/examples/generics/device_agnostic/device_agnostic_gemm.cpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -179,7 +180,7 @@ struct ExampleRunner { M * N // batch_stride_D ); - syclcompat::wait(); + compat::wait(); // Check if output from CUTLASS kernel and reference kernel are equal or not bool passed = cutlass::reference::device::BlockCompareEqual( @@ -248,7 +249,7 @@ struct ExampleRunner { // Run the GEMM CUTLASS_CHECK(gemm_op.run()); - syclcompat::wait(); + compat::wait(); // Verify that the result is correct bool passed = verify(problem_size, options.alpha, options.beta); @@ -262,7 +263,7 @@ struct ExampleRunner { for (int i = 0; i < options.iterations; ++i) { gemm_op.run(); } - syclcompat::wait(); + compat::wait(); float cute_time = timer.seconds() / options.iterations; double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; diff --git a/examples/nv_sycl/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cu b/examples/nv_sycl/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cu index 1280ad5b40..08625263d0 100644 --- a/examples/nv_sycl/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cu +++ b/examples/nv_sycl/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cu @@ -1,6 +1,7 @@ /*************************************************************************************************** * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -266,7 +267,7 @@ struct ExampleRunner { ); #if defined(CUTLASS_ENABLE_SYCL) - syclcompat::wait_and_throw(); + compat::wait_and_throw(); #else cudaError_t result = cudaDeviceSynchronize(); if (result != cudaSuccess) { diff --git a/examples/nv_sycl/35_gemm_softmax/gemm_softmax_adapter.hpp b/examples/nv_sycl/35_gemm_softmax/gemm_softmax_adapter.hpp index fac68b0688..8a9aac9e23 100644 --- a/examples/nv_sycl/35_gemm_softmax/gemm_softmax_adapter.hpp +++ b/examples/nv_sycl/35_gemm_softmax/gemm_softmax_adapter.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -454,43 +455,43 @@ class GemmSoftmaxAdapter else { CUTLASS_ASSERT(cuda_adapter == nullptr); #if defined(CUTLASS_ENABLE_SYCL) - const syclcompat::dim3 sycl_grid(grid.x, grid.y, grid.z); - const syclcompat::dim3 sycl_block(block.x, block.y, block.z); + const compat::dim3 sycl_grid(grid.x, grid.y, grid.z); + const compat::dim3 sycl_block(block.x, block.y, block.z); #if defined(SYCL_EXT_ONEAPI_WORK_GROUP_SCRATCH_MEMORY) sycl::ext::oneapi::experimental::properties smem_prop{ sycl::ext::oneapi::experimental::work_group_scratch_size(smem_size) }; - syclcompat::experimental::launch_properties launch_props{smem_prop}; - auto event = syclcompat::experimental::launch>(syclcompat::experimental::launch_policy{ + compat::experimental::launch_properties launch_props{smem_prop}; + auto event = compat::experimental::launch>(compat::experimental::launch_policy{ sycl_grid, sycl_block, launch_props #if defined(SYCL_INTEL_TARGET) - , syclcompat::experimental::kernel_properties{sycl_exp::sub_group_size} + , compat::experimental::kernel_properties{sycl_exp::sub_group_size} #endif // defined(SYCL_INTEL_TARGET) }, params.gemm_params); - syclcompat::experimental::launch_properties kernel_launch_props_finalize{ + compat::experimental::launch_properties kernel_launch_props_finalize{ sycl::ext::oneapi::experimental::work_group_scratch_size(smem_size_finalize) }; - const syclcompat::dim3 sycl_grid_finalize(grid_finalize.x, grid_finalize.y, grid_finalize.z); - const syclcompat::dim3 sycl_block_finalize(block_finalize.x, block_finalize.y, block_finalize.z); - auto event_finalize = syclcompat::experimental::launch>(syclcompat::experimental::launch_policy{ + const compat::dim3 sycl_grid_finalize(grid_finalize.x, grid_finalize.y, grid_finalize.z); + const compat::dim3 sycl_block_finalize(block_finalize.x, block_finalize.y, block_finalize.z); + auto event_finalize = compat::experimental::launch>(compat::experimental::launch_policy{ sycl_grid_finalize, sycl_block_finalize, kernel_launch_props_finalize, }, params.softmax_params); EventManager::getInstance().addEvent(event_finalize); #else - using namespace syclcompat::experimental; + using namespace compat::experimental; auto event = launch>(launch_policy{ sycl_grid, sycl_block, local_mem_size{static_cast(smem_size)} #if defined (SYCL_INTEL_TARGET) , kernel_properties{sycl_exp::sub_group_size} #endif }, params.gemm_params); - const auto sycl_block_finalize = syclcompat::dim3(block_finalize.x, block_finalize.y, block_finalize.z); - const auto sycl_grid_finalize = syclcompat::dim3(grid_finalize.x, 
grid_finalize.y, grid_finalize.z); + const auto sycl_block_finalize = compat::dim3(block_finalize.x, block_finalize.y, block_finalize.z); + const auto sycl_grid_finalize = compat::dim3(grid_finalize.x, grid_finalize.y, grid_finalize.z); auto event2 = launch>(launch_policy{ sycl_grid_finalize, sycl_block_finalize, local_mem_size{static_cast(smem_size_finalize)}}, params.softmax_params); diff --git a/include/cute/arch/copy_xe_legacy_U4.hpp b/include/cute/arch/copy_xe_legacy_U4.hpp index cab43b5c9b..91b2d76698 100644 --- a/include/cute/arch/copy_xe_legacy_U4.hpp +++ b/include/cute/arch/copy_xe_legacy_U4.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -124,7 +125,7 @@ struct XE_2D_U4x32x64_LD_N { static constexpr auto copy_W = decltype(size<1>(BlockShape{}))::value / subgroup_size; static constexpr auto copy_H = decltype(size<0>(BlockShape{}))::value; - auto sg = syclcompat::get_nd_item<1>().get_sub_group(); + auto sg = compat::get_nd_item<1>().get_sub_group(); auto id = int(ThreadIdxX()) % subgroup_size; cute::subbyte_iterator dst_iter(dst); @@ -174,7 +175,7 @@ struct XE_2D_U4x16x64_LD_N { static constexpr auto copy_W = decltype(size<1>(BlockShape{}))::value / subgroup_size; static constexpr auto copy_H = decltype(size<0>(BlockShape{}))::value; - auto sg = syclcompat::get_nd_item<1>().get_sub_group(); + auto sg = compat::get_nd_item<1>().get_sub_group(); auto id = int(ThreadIdxX()) % subgroup_size; cute::subbyte_iterator dst_iter(dst); diff --git a/include/cute/util/debug.hpp b/include/cute/util/debug.hpp index a23091ae6a..144f9c18f3 100644 --- a/include/cute/util/debug.hpp +++ b/include/cute/util/debug.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -37,7 +38,7 @@ #if defined(CUTLASS_ENABLE_SYCL) #include -#include +#include #else #include #endif @@ -130,7 +131,7 @@ block([[maybe_unused]] int bid) #if defined(__CUDA_ARCH__) return blockIdx.x + blockIdx.y*gridDim.x + blockIdx.z*gridDim.x*gridDim.y == static_cast(bid); #elif defined(__SYCL_DEVICE_ONLY__) - using namespace syclcompat; + using namespace compat; return (work_group_id::x() + work_group_id::y() * work_group_range::x() + work_group_id::z() * work_group_range::y() * work_group_range::x() == bid); #else @@ -145,7 +146,7 @@ thread([[maybe_unused]] int tid, [[maybe_unused]] int bid) #if defined(__CUDA_ARCH__) return (threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.x*blockDim.y == static_cast(tid)) && block(bid); #elif defined(__SYCL_DEVICE_ONLY__) - using namespace syclcompat; + using namespace compat; return (local_id::x() + local_id::y() * local_range::x() + local_id::z() * local_range::x() * local_range::y() == tid) && block(bid); #else diff --git a/include/cutlass/cutlass.h b/include/cutlass/cutlass.h index dba24463f5..39deb35654 100644 --- a/include/cutlass/cutlass.h +++ b/include/cutlass/cutlass.h @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -103,7 +104,7 @@ CUTLASS_HOST_DEVICE bool thread0() { #if defined(__CUDA_ARCH__) return (!threadIdx.x && !threadIdx.y && !threadIdx.z) && (!blockIdx.x && !blockIdx.y && !blockIdx.z); #elif defined(__SYCL_DEVICE_ONLY__) - return (!syclcompat::global_id::x() && !syclcompat::global_id::y() && !syclcompat::global_id::z()); + return (!compat::global_id::x() && !compat::global_id::y() && !compat::global_id::z()); #else return false; #endif diff --git a/include/cutlass/epilogue/fusion/xe_visitor.hpp b/include/cutlass/epilogue/fusion/xe_visitor.hpp index a2407f34b5..8ab2ee60e3 100644 --- a/include/cutlass/epilogue/fusion/xe_visitor.hpp +++ b/include/cutlass/epilogue/fusion/xe_visitor.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. 
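The debug.hpp and cutlass.h hunks keep the single-thread guards working on SYCL by mapping them onto compat work-group and local-id queries. Inside device code these guards are typically used to limit printing to a single work-item; a usage sketch follows, in which the tensor names tCrC and tCgC are purely illustrative:

// Print a per-thread fragment once, from thread 0 of block 0 only.
if (cute::thread0()) { cute::print(tCrC); }
// Or restrict output to an arbitrary (thread, block) pair while debugging.
if (cute::thread(17, 3)) { cute::print(tCgC); }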
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -420,7 +421,7 @@ struct XeRowBroadcast { ptr_row = params.ptr_row; } // TODO(Codeplay): id_in_sg instead of thread_idx here because incorrect tiled copy definition - int id_in_sg = syclcompat::get_nd_item<1>().get_sub_group().get_local_id(); + int id_in_sg = compat::get_nd_item<1>().get_sub_group().get_local_id(); Tensor mRow = make_tensor(make_gmem_ptr(ptr_row), make_layout(layout_M,layout_N,layout_L)); Tensor tCgRow = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) mRow, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, id_in_sg); diff --git a/include/cutlass/epilogue/fusion/xe_visitor_softmax.hpp b/include/cutlass/epilogue/fusion/xe_visitor_softmax.hpp index 5aea585183..2e243610ae 100644 --- a/include/cutlass/epilogue/fusion/xe_visitor_softmax.hpp +++ b/include/cutlass/epilogue/fusion/xe_visitor_softmax.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -48,8 +49,8 @@ namespace detail { template CUTLASS_DEVICE void group_reduce_sum_partial(STensor &stensor, RTensor &vec, OutTensor &out) { - auto sg = syclcompat::get_nd_item<1>().get_sub_group(); - auto group = syclcompat::get_nd_item<1>().get_group(); + auto sg = compat::get_nd_item<1>().get_sub_group(); + auto group = compat::get_nd_item<1>().get_group(); CUTLASS_PRAGMA_UNROLL for (int i = 0; i < size(vec); i++) { @@ -126,8 +127,8 @@ void group_reduce_sum_partial(STensor &stensor, RTensor &vec, OutTensor &out) { template CUTLASS_DEVICE void group_reduce_max_partial(STensor &stensor, RTensor &vec, OutTensor &out) { - auto sg = syclcompat::get_nd_item<1>().get_sub_group(); - auto group = syclcompat::get_nd_item<1>().get_group(); + auto sg = compat::get_nd_item<1>().get_sub_group(); + auto group = compat::get_nd_item<1>().get_group(); CUTLASS_PRAGMA_UNROLL for (int i = 0; i < size(vec); i++) { @@ -388,7 +389,7 @@ struct XeSoftmaxRowReduction constexpr auto vec_size = min(Epi_M, Sg_N); constexpr auto vec_folds = Epi_M / vec_size; - auto smem = syclcompat::local_mem(); + auto smem = compat::local_mem(); Tensor stensor = make_tensor(make_smem_ptr(smem), make_shape(Int{}, Int{}, Int{})); Tensor res = diff --git a/include/cutlass/fp8_to_fp16.h b/include/cutlass/fp8_to_fp16.h index b2dd1c564e..7f70bc2acb 100644 --- a/include/cutlass/fp8_to_fp16.h +++ b/include/cutlass/fp8_to_fp16.h @@ -39,6 +39,211 @@ #include #include #include +#include + +// +// +// Helper device function for E4M3 -> FP16 bitwise conversion +CUTLASS_DEVICE inline uint16_t +fp8_e4m3_to_fp16_bitwise(uint8_t const& src) { + // E4M3 (1-4-3) constants + constexpr uint32_t e4m3_exp_bias = 7; + // FP16 (1-5-10) constants + constexpr uint32_t fp16_exp_bias = 15; + + // Unpack FP8 bits + uint16_t sign = static_cast(src & 0x80); + uint16_t exponent = static_cast(src & 0x78) >> 3; + uint16_t mantissa = static_cast(src & 0x07); + + // Reconstruct FP16 bits + uint16_t fp16_sign = sign << 8; + // Re-bias exponent and shift to FP16 position + uint16_t fp16_exponent = (exponent - e4m3_exp_bias + fp16_exp_bias) << 10; + // Shift mantissa to FP16 position + uint16_t fp16_mantissa = mantissa << 7; + + return fp16_sign | 
fp16_exponent | fp16_mantissa; +} + +// Helper device function for E5M2 -> FP16 bitwise conversion +CUTLASS_DEVICE inline uint16_t +fp8_e5m2_to_fp16_bitwise(uint8_t const& src) { + // E5M2 (1-5-2) constants + constexpr uint32_t e5m2_exp_bias = 15; + // FP16 (1-5-10) constants + constexpr uint32_t fp16_exp_bias = 15; + + // Unpack FP8 bits + uint16_t sign = static_cast(src & 0x80); + uint16_t exponent = static_cast(src & 0x7C) >> 2; + uint16_t mantissa = static_cast(src & 0x03); + + // Reconstruct FP16 bits (Exponent bias is the same, so no re-biasing needed) + uint16_t fp16_sign = sign << 8; + uint16_t fp16_exponent = exponent << 10; + // Shift mantissa to FP16 position + uint16_t fp16_mantissa = mantissa << 8; + + return fp16_sign | fp16_exponent | fp16_mantissa; +} + +template < + typename Encoding, + int VectorizeSize = 8, + typename SrcTensor, + typename DstTensor +> +CUTLASS_DEVICE void +convert_and_descale( + SrcTensor const& src, + DstTensor& dst, + float scale) { + + using DstElementType = sycl::half; + using SrcVec_u8 = sycl::vec; + using DstVec_half = sycl::vec; + using u16Vec = sycl::vec; + + auto src_ptr = reinterpret_cast(src.data()); + auto dst_ptr = reinterpret_cast(dst.data()); + + // Create a sycl::half vector holding the descale factor + const DstVec_half scale_vec_half(static_cast(scale)); + + #pragma unroll + for (int i = 0; i < cute::size(src) / VectorizeSize; ++i) { + SrcVec_u8 const src_vec_u8 = src_ptr[i]; + u16Vec val_fp16_bits; + + #pragma unroll + for (int j = 0; j < VectorizeSize; ++j) { + if constexpr (std::is_same_v) { + val_fp16_bits[j] = fp8_e4m3_to_fp16_bitwise(src_vec_u8[j]); + } else { + val_fp16_bits[j] = fp8_e5m2_to_fp16_bitwise(src_vec_u8[j]); + } + } + + // Reinterpret the FP16 bits as a sycl::half vector + DstVec_half val_fp16_vec = val_fp16_bits.template as(); + + // Apply the descale factor directly in half precision; no FP32 round trip is needed. + val_fp16_vec *= scale_vec_half; + + // Store the converted, descaled vector with a single wide write.
+ dst_ptr[i] = val_fp16_vec; + } +} + +/* +template < + typename Encoding, + int VectorizeSize = 8, + typename SrcTensor, + typename DstTensor +> +CUTLASS_DEVICE void +convert_and_descale( + SrcTensor const& src, + DstTensor& dst, + float scale) { + + using DstElementType = sycl::half; + using SrcVec_u8 = sycl::vec; + using DstVec_half = sycl::vec; + using Fp32Vec = sycl::vec; + using u16Vec = sycl::vec; + + auto src_ptr = reinterpret_cast(src.data()); + auto dst_ptr = reinterpret_cast(dst.data()); + + const Fp32Vec scale_vec(scale); + + #pragma unroll + for (int i = 0; i < cute::size(src) / VectorizeSize; ++i) { + SrcVec_u8 const src_vec_u8 = src_ptr[i]; + u16Vec val_fp16_bits; + + // Perform the bitwise conversion in a simple, vectorizable loop + #pragma unroll + for (int j = 0; j < VectorizeSize; ++j) { + if constexpr (std::is_same_v) { + val_fp16_bits[j] = fp8_e4m3_to_fp16_bitwise(src_vec_u8[j]); + } else { // E5M2 + val_fp16_bits[j] = fp8_e5m2_to_fp16_bitwise(src_vec_u8[j]); + } + } + + // Reinterpret the FP16 bits as sycl::half and convert to FP32 for scaling + Fp32Vec val_fp32_vec = val_fp16_bits.template as().template convert(); + + // Apply scaling + val_fp32_vec *= scale_vec; + + // Convert back to FP16 and store + dst_ptr[i] = val_fp32_vec.template convert(); + } +} +*/ + +/* +template < + typename Encoding, + int VectorizeSize = 8, + typename SrcTensor, + typename DstTensor +> +CUTLASS_DEVICE void +convert_and_descale( + SrcTensor const& src, + DstTensor& dst, + float scale) { + + using DstElementType = sycl::half; + static_assert(sizeof(DstElementType) == sizeof(typename DstTensor::value_type)); + + // Define SYCL vector types for all stages of the computation + using SrcVec_u8 = sycl::vec; + using DstVec_half = sycl::vec; + using Fp32Vec = sycl::vec; + + // Pointers for wide memory access + auto src_ptr = reinterpret_cast(src.data()); + auto dst_ptr = reinterpret_cast(dst.data()); + + // Create a vector of scaling factors + const Fp32Vec scale_vec(scale); + + #pragma unroll + for (int i = 0; i < cute::size(src) / VectorizeSize; ++i) { + // 1. Wide load of FP8 data (as uint8_t) + SrcVec_u8 const src_vec_u8 = src_ptr[i]; + + // 2. Convert FP8 vector to FP32 vector (MANUAL VECTORIZATION) + // This replaces the slow scalar loop. + Fp32Vec val_fp32_vec; + #pragma unroll + for (int j = 0; j < VectorizeSize; ++j) { + // This simpler sequence is much easier for the compiler to optimize. + val_fp32_vec[j] = static_cast( + reinterpret_cast(&src_vec_u8[j])[0] + ); + } + + // 3. Perform a vectorized multiplication + val_fp32_vec *= scale_vec; + + // 4. Perform a vectorized conversion from FP32 vector to FP16 vector + // The .convert() method is highly optimized for this. + DstVec_half dst_vec = val_fp32_vec.template convert(); + + // 5. Perform a single, wide store + dst_ptr[i] = dst_vec; + } +} +*/ template CUTLASS_DEVICE void diff --git a/include/cutlass/gemm/collective/xe_array_mma.hpp b/include/cutlass/gemm/collective/xe_array_mma.hpp index 3a1e84ae84..5c12f02e9a 100644 --- a/include/cutlass/gemm/collective/xe_array_mma.hpp +++ b/include/cutlass/gemm/collective/xe_array_mma.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. 
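The bitwise helpers added to fp8_to_fp16.h above reduce the FP8-to-FP16 conversion to a sign copy, an exponent re-bias (E4M3 bias 7 to FP16 bias 15; E5M2 and FP16 already share bias 15) and a mantissa shift. A minimal host-side sketch of the E4M3 path, with illustrative values, makes the re-biasing arithmetic easy to check; it covers normal values only, since zeros, subnormals and NaN payloads are not preserved by a pure shift-and-rebias.

```cpp
// Host-side sketch only (not part of the patch); mirrors the E4M3 -> FP16 bit
// manipulation for normal values so the re-biasing can be checked in isolation.
#include <cstdint>
#include <cstdio>

static uint16_t e4m3_to_fp16_bits(uint8_t src) {
  uint16_t sign     = (src & 0x80) ? 0x8000 : 0x0000;
  uint16_t exponent = (src & 0x78) >> 3;          // 4-bit exponent field, bias 7
  uint16_t mantissa =  src & 0x07;                // 3-bit mantissa field
  uint16_t fp16_exp = (exponent - 7 + 15) << 10;  // re-bias to FP16 (bias 15)
  return sign | fp16_exp | (mantissa << 7);       // mantissa moves from bits [2:0] to [9:7]
}

int main() {
  // 0x38 encodes 1.0 in E4M3 (exponent field 7 -> unbiased 0) and should map to 0x3c00 (FP16 1.0);
  // 0xC0 encodes -2.0 and should map to 0xc000.
  std::printf("%#06x %#06x\n",
              static_cast<unsigned>(e4m3_to_fp16_bits(0x38)),
              static_cast<unsigned>(e4m3_to_fp16_bits(0xC0)));
  return 0;
}
```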
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -200,7 +201,7 @@ struct CollectiveMma, TileShape_, El TiledMma tiled_mma; // TODO(Codeplay): see if we can make this nicer // To make all work items in a subgroup have the same global tensors pass in the index of work item 0 in each subgroup - auto sg = syclcompat::get_nd_item<1>().get_sub_group(); + auto sg = compat::get_nd_item<1>().get_sub_group(); auto first_thread_in_sg_idx = sg.get_group_linear_id() * DispatchPolicy::SubgroupSize; auto thr_mma = tiled_mma.get_slice(first_thread_in_sg_idx); diff --git a/include/cutlass/gemm/collective/xe_array_mma_fp8.hpp b/include/cutlass/gemm/collective/xe_array_mma_fp8.hpp index 6b1bfd67d7..25ee8b26ad 100644 --- a/include/cutlass/gemm/collective/xe_array_mma_fp8.hpp +++ b/include/cutlass/gemm/collective/xe_array_mma_fp8.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2025 - 2025 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -202,7 +203,7 @@ struct CollectiveMma, TileShape_, TiledMma tiled_mma; // TODO(Codeplay): see if we can make this nicer // To make all work items in a subgroup have the same global tensors pass in the index of work item 0 in each subgroup - auto sg = syclcompat::get_nd_item<1>().get_sub_group(); + auto sg = compat::get_nd_item<1>().get_sub_group(); auto first_thread_in_sg_idx = sg.get_group_linear_id() * DispatchPolicy::SubgroupSize; auto thr_mma = tiled_mma.get_slice(first_thread_in_sg_idx); diff --git a/include/cutlass/gemm/collective/xe_array_mma_mixed_input.hpp b/include/cutlass/gemm/collective/xe_array_mma_mixed_input.hpp index 4fd967e7c9..d17b01aab9 100644 --- a/include/cutlass/gemm/collective/xe_array_mma_mixed_input.hpp +++ b/include/cutlass/gemm/collective/xe_array_mma_mixed_input.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2025 - 2025 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -569,7 +570,7 @@ CUTLASS_DEVICE auto create_copies(LoadTensors const& load_tensors) { // Instantiate the MMA object and get thread slice TiledMma tiled_mma; - auto sg = syclcompat::get_nd_item<1>().get_sub_group(); + auto sg = compat::get_nd_item<1>().get_sub_group(); auto first_thread_in_sg_idx = sg.get_group_linear_id() * DispatchPolicy::SubgroupSize; auto thr_mma = tiled_mma.get_slice(first_thread_in_sg_idx); diff --git a/include/cutlass/gemm/collective/xe_mma.hpp b/include/cutlass/gemm/collective/xe_mma.hpp index 55e29e0067..47df13d20c 100644 --- a/include/cutlass/gemm/collective/xe_mma.hpp +++ b/include/cutlass/gemm/collective/xe_mma.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -183,7 +184,7 @@ struct CollectiveMma, TileShape_, Element TiledMma tiled_mma; // TODO(Codeplay): see if we can make this nicer // To make all work items in a subgroup have the same global tensors pass in the index of work item 0 in each subgroup - auto sg = syclcompat::get_nd_item<1>().get_sub_group(); + auto sg = compat::get_nd_item<1>().get_sub_group(); auto first_thread_in_sg_idx = sg.get_group_linear_id() * DispatchPolicy::SubgroupSize; auto thr_mma = tiled_mma.get_slice(first_thread_in_sg_idx); diff --git a/include/cutlass/gemm/collective/xe_mma_fp8_scaling.hpp b/include/cutlass/gemm/collective/xe_mma_fp8_scaling.hpp index 8b1aaf3f53..c19e3765f7 100644 --- a/include/cutlass/gemm/collective/xe_mma_fp8_scaling.hpp +++ b/include/cutlass/gemm/collective/xe_mma_fp8_scaling.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2025 - 2025 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -533,7 +534,7 @@ struct CollectiveMma< // Instantiate the MMA object and get thread slice TiledMma tiled_mma; - auto sg = syclcompat::get_nd_item<1>().get_sub_group(); + auto sg = compat::get_nd_item<1>().get_sub_group(); auto first_thread_in_sg_idx = sg.get_group_linear_id() * DispatchPolicy::SubgroupSize; auto thr_mma = tiled_mma.get_slice(first_thread_in_sg_idx); diff --git a/include/cutlass/gemm/collective/xe_mma_mixed_input.hpp b/include/cutlass/gemm/collective/xe_mma_mixed_input.hpp index 7901545b0b..38b9f2f65c 100644 --- a/include/cutlass/gemm/collective/xe_mma_mixed_input.hpp +++ b/include/cutlass/gemm/collective/xe_mma_mixed_input.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2025 - 2025 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. 
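Each of the collective mainloop hunks above repeats the same idiom next to the rename: every work item slices the TiledMma with the linear id of work item 0 of its sub-group, so the whole sub-group partitions identical global tensors. A stripped-down sketch of that idiom, assuming a device-side caller with SYCL headers available; the helper name is illustrative, not code from this patch.

```cpp
#include <sycl/sycl.hpp>

// Illustrative helper only; in the collectives this logic sits inline in the
// mainloop body. Must be called from device code.
template <int SubgroupSize, class TiledMma>
auto subgroup_uniform_slice(TiledMma tiled_mma, sycl::nd_item<1> item) {
  auto sg = item.get_sub_group();
  // Every work item computes the id of lane 0 of its own sub-group, so all
  // lanes obtain the same thread slice and hence the same global tiles.
  auto first_thread_in_sg_idx = sg.get_group_linear_id() * SubgroupSize;
  return tiled_mma.get_slice(first_thread_in_sg_idx);
}
```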
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -328,7 +329,7 @@ struct CollectiveMma< }(); if constexpr (ModeScale) { - return Params{tiled_copy_a, tiled_copy_b, tiled_copy_scale, {}, args.group_size}; + return Params{tiled_copy_a, tiled_copy_b, {tiled_copy_scale}, {}, args.group_size}; } else { auto ptr_Z = [&]() { if constexpr (sizeof_bits_v < 8) { @@ -353,7 +354,7 @@ struct CollectiveMma< } }(); - return Params{tiled_copy_a, tiled_copy_b, tiled_copy_scale, tiled_copy_zero, args.group_size}; + return Params{tiled_copy_a, tiled_copy_b, {tiled_copy_scale}, {tiled_copy_zero}, args.group_size}; } } } @@ -650,7 +651,7 @@ struct CollectiveMma< // Instantiate the MMA object and get thread slice TiledMma tiled_mma; - auto sg = syclcompat::get_nd_item<1>().get_sub_group(); + auto sg = compat::get_nd_item<1>().get_sub_group(); auto first_thread_in_sg_idx = sg.get_group_linear_id() * DispatchPolicy::SubgroupSize; auto thr_mma = tiled_mma.get_slice(first_thread_in_sg_idx); diff --git a/include/cutlass/gemm/collective/xe_mma_w8a8.hpp b/include/cutlass/gemm/collective/xe_mma_w8a8.hpp index b8323ba739..360143ee51 100644 --- a/include/cutlass/gemm/collective/xe_mma_w8a8.hpp +++ b/include/cutlass/gemm/collective/xe_mma_w8a8.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2025 - 2025 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -194,7 +195,7 @@ struct CollectiveMma, TileShape_, ElementA_, TiledMma tiled_mma; // TODO(Codeplay): see if we can make this nicer // To make all work items in a subgroup have the same global tensors pass in the index of work item 0 in each subgroup - auto sg = syclcompat::get_nd_item<1>().get_sub_group(); + auto sg = compat::get_nd_item<1>().get_sub_group(); auto first_thread_in_sg_idx = sg.get_group_linear_id() * DispatchPolicy::SubgroupSize; auto thr_mma = tiled_mma.get_slice(first_thread_in_sg_idx); diff --git a/include/cutlass/gemm/device/gemm.h b/include/cutlass/gemm/device/gemm.h index 5d612cfd5c..98e65bb62a 100644 --- a/include/cutlass/gemm/device/gemm.h +++ b/include/cutlass/gemm/device/gemm.h @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -492,17 +493,17 @@ class Gemm { } #if defined(CUTLASS_ENABLE_SYCL) - const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z); - const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z); + const auto sycl_block = compat::dim3(block.x, block.y, block.z); + const auto sycl_grid = compat::dim3(grid.x, grid.y, grid.z); - auto q = stream ? *stream : syclcompat::get_default_queue(); - syclcompat::experimental::launch>( - syclcompat::experimental::launch_policy{ + auto q = stream ? 
*stream : compat::get_default_queue(); + compat::experimental::launch>( + compat::experimental::launch_policy{ sycl_grid, sycl_block, #if defined(SYCL_EXT_ONEAPI_WORK_GROUP_SCRATCH_MEMORY) sycl::ext::oneapi::experimental::work_group_scratch_size(smem_size) #else - syclcompat::experimental::local_mem_size{static_cast(smem_size)} + compat::experimental::local_mem_size{static_cast(smem_size)} #endif }, q, params_ diff --git a/include/cutlass/gemm/device/gemm_universal_adapter.h b/include/cutlass/gemm/device/gemm_universal_adapter.h index 889f274819..cc715f421b 100644 --- a/include/cutlass/gemm/device/gemm_universal_adapter.h +++ b/include/cutlass/gemm/device/gemm_universal_adapter.h @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -380,8 +381,8 @@ class GemmUniversalAdapter< dim3 const grid = get_grid_shape(params); #if defined(CUTLASS_ENABLE_SYCL) - const syclcompat::dim3 sycl_block(block.x, block.y, block.z); - const syclcompat::dim3 sycl_grid(grid.x, grid.y, grid.z); + const compat::dim3 sycl_block(block.x, block.y, block.z); + const compat::dim3 sycl_grid(grid.x, grid.y, grid.z); #endif // configure smem size and carveout @@ -547,9 +548,9 @@ class GemmUniversalAdapter< else { CUTLASS_ASSERT(cuda_adapter == nullptr); #if defined(CUTLASS_ENABLE_SYCL) - sycl::queue q = stream ? *stream : syclcompat::get_default_queue(); + sycl::queue q = stream ? *stream : compat::get_default_queue(); #if !defined(SYCL_EXT_ONEAPI_WORK_GROUP_SCRATCH_MEMORY) - using namespace syclcompat::experimental; + using namespace compat::experimental; if constexpr (cute::is_same_v) { auto event = launch>(launch_policy{ sycl_grid, sycl_block, local_mem_size{static_cast(smem_size)} @@ -575,20 +576,20 @@ class GemmUniversalAdapter< cute::is_same_v; if constexpr (!allow_subgroup_size_prop or is_device_agnostic) { using EmptyProperties = decltype(sycl::ext::oneapi::experimental::properties()); - return syclcompat::experimental::kernel_properties{}; + return compat::experimental::kernel_properties{}; } else { - return syclcompat::experimental::kernel_properties{ + return compat::experimental::kernel_properties{ sycl::ext::oneapi::experimental::sub_group_size }; } }(); - syclcompat::experimental::launch_properties launch_props { + compat::experimental::launch_properties launch_props { sycl::ext::oneapi::experimental::work_group_scratch_size(smem_size), }; - syclcompat::experimental::launch_policy policy{ + compat::experimental::launch_policy policy{ sycl_grid, sycl_block, launch_props, kernel_props }; - auto event = syclcompat::experimental::launch>(policy, q, params); + auto event = compat::experimental::launch>(policy, q, params); EventManager::getInstance().addEvent(event); #endif // !defined(SYCL_EXT_ONEAPI_WORK_GROUP_SCRATCH_MEMORY) #else diff --git a/include/cutlass/gemm/device/gemm_universal_base.h b/include/cutlass/gemm/device/gemm_universal_base.h index 51a9bdc09b..c0b4a117fe 100644 --- a/include/cutlass/gemm/device/gemm_universal_base.h +++ b/include/cutlass/gemm/device/gemm_universal_base.h @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
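gemm.h, gemm_universal_adapter.h and gemm_universal_base.h all launch through the same shape once the namespace moves to compat. Below is a condensed sketch, assuming the SYCL_EXT_ONEAPI_WORK_GROUP_SCRATCH_MEMORY path is available and using device_kernel<GemmKernel> as the entry point the way the adapter does; the function and parameter names here are stand-ins, not the exact adapter code.

```cpp
#if defined(CUTLASS_ENABLE_SYCL)
// Sketch only: grid/block are carried as compat::dim3 and dynamic shared-local
// memory is requested through launch properties rather than a kernel argument.
template <class GemmKernel, class Params>
void launch_gemm_kernel(sycl::queue q, dim3 grid, dim3 block, int smem_size, Params const& params) {
  const compat::dim3 sycl_grid(grid.x, grid.y, grid.z);
  const compat::dim3 sycl_block(block.x, block.y, block.z);

  compat::experimental::launch_properties launch_props{
      sycl::ext::oneapi::experimental::work_group_scratch_size(smem_size)};
  compat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props};

  auto event = compat::experimental::launch<device_kernel<GemmKernel>>(policy, q, params);
  EventManager::getInstance().addEvent(event);  // keep the event alive, as the adapter does
}
#endif
```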
+ * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -458,17 +459,17 @@ class GemmUniversalBase { CUTLASS_ASSERT(cuda_adapter == nullptr); #if defined(CUTLASS_ENABLE_SYCL) - const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z); - const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z); + const auto sycl_block = compat::dim3(block.x, block.y, block.z); + const auto sycl_grid = compat::dim3(grid.x, grid.y, grid.z); - sycl::queue q = stream ? *stream : syclcompat::get_default_queue(); - syclcompat::experimental::launch>( - syclcompat::experimental::launch_policy{ + sycl::queue q = stream ? *stream : compat::get_default_queue(); + compat::experimental::launch>( + compat::experimental::launch_policy{ sycl_grid, sycl_block, #if defined(SYCL_EXT_ONEAPI_WORK_GROUP_SCRATCH_MEMORY) sycl::ext::oneapi::experimental::work_group_scratch_size(kSharedStorageSize) #else - syclcompat::experimental::local_mem_size{static_cast(kSharedStorageSize)} + compat::experimental::local_mem_size{static_cast(kSharedStorageSize)} #endif }, q, params_); diff --git a/include/cutlass/gpu_generics.h b/include/cutlass/gpu_generics.h index aac30deef5..c42faee58e 100644 --- a/include/cutlass/gpu_generics.h +++ b/include/cutlass/gpu_generics.h @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -38,7 +39,7 @@ #if defined(CUTLASS_ENABLE_SYCL) #include -#include +#include #endif //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -63,7 +64,7 @@ unsigned int ThreadIdxX() { #if defined(__CUDA_ARCH__) return threadIdx.x; #elif defined(__SYCL_DEVICE_ONLY__) - return syclcompat::local_id::x(); + return compat::local_id::x(); #else return 0; #endif @@ -74,7 +75,7 @@ unsigned int ThreadIdxY() { #if defined(__CUDA_ARCH__) return threadIdx.y; #elif defined(__SYCL_DEVICE_ONLY__) - return syclcompat::local_id::y(); + return compat::local_id::y(); #else return 0; #endif @@ -85,7 +86,7 @@ unsigned int ThreadIdxZ() { #if defined(__CUDA_ARCH__) return threadIdx.z; #elif defined(__SYCL_DEVICE_ONLY__) - return syclcompat::local_id::z(); + return compat::local_id::z(); #else return 0; #endif @@ -96,7 +97,7 @@ unsigned int BlockIdxX() { #if defined(__CUDA_ARCH__) return blockIdx.x; #elif defined(__SYCL_DEVICE_ONLY__) - return syclcompat::work_group_id::x(); + return compat::work_group_id::x(); #else return 0; #endif @@ -107,7 +108,7 @@ unsigned int BlockIdxY() { #if defined(__CUDA_ARCH__) return blockIdx.y; #elif defined(__SYCL_DEVICE_ONLY__) - return syclcompat::work_group_id::y(); + return compat::work_group_id::y(); #else return 0; #endif @@ -118,7 +119,7 @@ unsigned int BlockIdxZ() { #if defined(__CUDA_ARCH__) return blockIdx.z; #elif defined(__SYCL_DEVICE_ONLY__) - return syclcompat::work_group_id::z(); + return compat::work_group_id::z(); #else return 0; #endif @@ -129,7 +130,7 @@ unsigned int BlockDimX() { #if defined(__CUDA_ARCH__) return blockDim.x; #elif defined(__SYCL_DEVICE_ONLY__) - return syclcompat::local_range::x(); + return compat::local_range::x(); #else return 0; #endif @@ -140,7 +141,7 @@ unsigned int BlockDimY() { #if 
defined(__CUDA_ARCH__) return blockDim.y; #elif defined(__SYCL_DEVICE_ONLY__) - return syclcompat::local_range::y(); + return compat::local_range::y(); #else return 0; #endif @@ -151,7 +152,7 @@ unsigned int BlockDimZ() { #if defined(__CUDA_ARCH__) return blockDim.z; #elif defined(__SYCL_DEVICE_ONLY__) - return syclcompat::local_range::z(); + return compat::local_range::z(); #else return 0; #endif @@ -162,7 +163,7 @@ unsigned int GridDimX() { #if defined(__CUDA_ARCH__) return gridDim.x; #elif defined(__SYCL_DEVICE_ONLY__) - return syclcompat::work_group_range::x(); + return compat::work_group_range::x(); #else return 0; #endif @@ -173,7 +174,7 @@ unsigned int GridDimY() { #if defined(__CUDA_ARCH__) return gridDim.y; #elif defined(__SYCL_DEVICE_ONLY__) - return syclcompat::work_group_range::y(); + return compat::work_group_range::y(); #else return 0; #endif @@ -184,7 +185,7 @@ unsigned int GridDimZ() { #if defined(__CUDA_ARCH__) return gridDim.z; #elif defined(__SYCL_DEVICE_ONLY__) - return syclcompat::work_group_range::z(); + return compat::work_group_range::z(); #else return 0; #endif @@ -203,7 +204,7 @@ void syncthreads() { #if defined(__CUDA_ARCH__) __syncthreads(); #elif defined(__SYCL_DEVICE_ONLY__) - syclcompat::wg_barrier(); + compat::wg_barrier(); #endif } @@ -212,7 +213,7 @@ int syncthreads_and(int cond) { #if defined(__CUDA_ARCH__) return __syncthreads_and(cond); #elif defined(__SYCL_DEVICE_ONLY__) - auto group = syclcompat::get_nd_item<1>().get_group(); + auto group = compat::get_nd_item<1>().get_group(); sycl::group_barrier(group); return sycl::all_of_group(group, cond); #else @@ -225,7 +226,7 @@ void syncwarp() { #if defined(__CUDA_ARCH__) __syncwarp(); #elif defined(__SYCL_DEVICE_ONLY__) - sycl::group_barrier(syclcompat::get_nd_item<1>().get_sub_group()); + sycl::group_barrier(compat::get_nd_item<1>().get_sub_group()); #endif } @@ -244,7 +245,7 @@ unsigned int byte_perm(unsigned int x, unsigned int y, unsigned int s) { #if defined(__CUDA_ARCH__) return __byte_perm(x, y, s); #elif defined(__SYCL_DEVICE_ONLY__) - return syclcompat::byte_level_permute(x, y, s); + return compat::byte_level_permute(x, y, s); #else return 0; #endif @@ -262,7 +263,7 @@ T shfl_up_sync( #if defined(__CUDA_ARCH__) return __shfl_up_sync(mask, var, delta, width); #elif defined(__SYCL_DEVICE_ONLY__) - return syclcompat::shift_sub_group_right(syclcompat::get_nd_item<1>().get_sub_group(), var, delta, width); + return compat::shift_sub_group_right(compat::get_nd_item<1>().get_sub_group(), var, delta, width); #else return static_cast(0); #endif @@ -278,7 +279,7 @@ T shfl_down_sync( #if defined(__CUDA_ARCH__) return __shfl_down_sync(mask, var, delta, width); #elif defined(__SYCL_DEVICE_ONLY__) - return syclcompat::shift_sub_group_left(syclcompat::get_nd_item<1>().get_sub_group(), var, delta, width); + return compat::shift_sub_group_left(compat::get_nd_item<1>().get_sub_group(), var, delta, width); #else return static_cast(0); #endif @@ -294,7 +295,7 @@ T shfl_sync( #if defined(__CUDA_ARCH__) return __shfl_sync(mask, var, delta, width); #elif defined(__SYCL_DEVICE_ONLY__) - auto g = syclcompat::get_nd_item<1>().get_sub_group(); + auto g = compat::get_nd_item<1>().get_sub_group(); unsigned int start_index = (g.get_local_linear_id() / width) * width; return sycl::select_from_group(g, var, start_index + delta % width); #else @@ -312,8 +313,8 @@ T shfl_xor_sync( #if defined(__CUDA_ARCH__) return __shfl_xor_sync(mask, var, laneMask, width); #elif defined(__SYCL_DEVICE_ONLY__) - auto g = 
syclcompat::get_nd_item<1>().get_sub_group(); - return syclcompat::permute_sub_group_by_xor(g, var, laneMask); + auto g = compat::get_nd_item<1>().get_sub_group(); + return compat::permute_sub_group_by_xor(g, var, laneMask); #else return static_cast(0); #endif @@ -353,13 +354,13 @@ namespace cutlass { // Stream using cudaStream_t = sycl::queue *; -using dim3 = syclcompat::dim3; +using dim3 = compat::dim3; // Atomic template CUTLASS_DEVICE T atomicAdd(T *address, T val) { #if defined(__SYCL_DEVICE_ONLY__) - return syclcompat::atomic_fetch_add(address, val); + return compat::atomic_fetch_add(address, val); #endif return static_cast(0); } @@ -367,7 +368,7 @@ CUTLASS_DEVICE T atomicAdd(T *address, T val) { CUTLASS_DEVICE int atomicCAS(int *address, int compare, int val) { int result = 0; #if defined(__SYCL_DEVICE_ONLY__) - result = syclcompat::atomic_compare_exchange_strong(address, compare, val); + result = compat::atomic_compare_exchange_strong(address, compare, val); #endif return result; } @@ -409,8 +410,8 @@ CUTLASS_HOST_DEVICE cudaError_t cudaMemsetAsync(void *devPtr, unsigned int value, size_t count, cudaStream_t stream = nullptr) { static_assert(std::is_same_v, "cudaMemsetAsync takes a dummy template parameter, T = " "void, to instantiate copy kernel only if it is used."); - sycl::queue q = stream ? *stream : syclcompat::get_default_queue(); - syclcompat::fill_async(devPtr, value, count, q); + sycl::queue q = stream ? *stream : compat::get_default_queue(); + compat::fill_async(devPtr, value, count, q); return cudaSuccess; } @@ -424,8 +425,8 @@ CUresult cuMemsetD32Async(CUdeviceptr devPtr, uint32_t value, size_t count, cuda static_assert(std::is_same_v, "cuMemsetD32Async takes a dummy template parameter, T = " "void, to instantiate copy kernel only if it is used."); void *ptr = reinterpret_cast(devPtr); - sycl::queue q = stream ? *stream : syclcompat::get_default_queue(); - syclcompat::fill_async(ptr, value, count, q); + sycl::queue q = stream ? *stream : compat::get_default_queue(); + compat::fill_async(ptr, value, count, q); return cudaSuccess; } @@ -435,8 +436,8 @@ CUresult cuMemsetD16Async(CUdeviceptr devPtr, uint16_t value, size_t count, cuda static_assert(std::is_same_v, "cuMemsetD16Async takes a dummy template parameter, T = " "void, to instantiate copy kernel only if it is used."); void *ptr = reinterpret_cast(devPtr); - sycl::queue q = stream ? *stream : syclcompat::get_default_queue(); - syclcompat::fill_async(ptr, value, count, q); + sycl::queue q = stream ? *stream : compat::get_default_queue(); + compat::fill_async(ptr, value, count, q); return cudaSuccess; } @@ -446,8 +447,8 @@ CUresult cuMemsetD8Async(CUdeviceptr devPtr, uint8_t value, size_t count, cudaSt static_assert(std::is_same_v, "cuMemsetD8Async takes a dummy template parameter, T = " "void, to instantiate copy kernel only if it is used."); void *ptr = reinterpret_cast(devPtr); - sycl::queue q = stream ? *stream : syclcompat::get_default_queue(); - syclcompat::fill_async(ptr, value, count, q); + sycl::queue q = stream ? 
*stream : compat::get_default_queue(); + compat::fill_async(ptr, value, count, q); return cudaSuccess; } @@ -480,7 +481,7 @@ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( // Expose dim3 in the cute namespace namespace cute { - using dim3 = syclcompat::dim3; + using dim3 = compat::dim3; } #endif diff --git a/include/cutlass/kernel_hardware_info.h b/include/cutlass/kernel_hardware_info.h index 9afdf0fc2f..6c2f1f307b 100644 --- a/include/cutlass/kernel_hardware_info.h +++ b/include/cutlass/kernel_hardware_info.h @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -61,7 +62,7 @@ struct KernelHardwareInfo { #if defined (CUTLASS_ENABLE_SYCL) static inline int query_device_multiprocessor_count(int device_id = 0) { - auto& dev = syclcompat::get_device(device_id); + auto& dev = compat::get_device(device_id); int multiprocessor_count = 1; //TODO (Codeplay): Replace with device.get_info() once available #if defined __SYCL_CUDA_ARCH__ diff --git a/include/cutlass/numeric_conversion.h b/include/cutlass/numeric_conversion.h index 59d6c56a93..a29301ab35 100644 --- a/include/cutlass/numeric_conversion.h +++ b/include/cutlass/numeric_conversion.h @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -5559,7 +5560,7 @@ struct NumericArrayConverter { CUTLASS_PRAGMA_UNROLL for (int ii = 0; ii < PackedResultType::kElements; ++ii) { #if defined(CUTLASS_ENABLE_SYCL) - t[ii] = syclcompat::dp4a(x, mask[ii], 0); + t[ii] = compat::dp4a(x, mask[ii], 0); #else t[ii] = __dp4a(x, mask[ii], 0); #endif diff --git a/include/cutlass/workspace.h b/include/cutlass/workspace.h index c41a821031..54491522d7 100644 --- a/include/cutlass/workspace.h +++ b/include/cutlass/workspace.h @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -66,8 +67,8 @@ zero_workspace( CUTLASS_TRACE_HOST(" clearing workspace"); #if defined (CUTLASS_ENABLE_SYCL) - auto q = stream ? *stream : syclcompat::get_default_queue(); - syclcompat::memset_async(workspace, 0, workspace_size, q); + auto q = stream ? 
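The gpu_generics.h hunks above keep a single portable surface (ThreadIdxX, syncthreads, the shfl_* wrappers, atomicAdd, the memset shims) and only swap its SYCL backing from syclcompat to compat. As a consumption sketch: a butterfly reduction written once against shfl_xor_sync lowers to __shfl_xor_sync under CUDA and to compat::permute_sub_group_by_xor under SYCL. This assumes a 16-wide sub-group and that the wrapper is visible as cutlass::shfl_xor_sync.

```cpp
// Device-side sketch only; the helper name is illustrative.
CUTLASS_DEVICE float subgroup_sum16(float v) {
  CUTLASS_PRAGMA_UNROLL
  for (int lane_mask = 8; lane_mask > 0; lane_mask >>= 1) {
    // Pairwise exchange across lanes whose ids differ in one bit, then accumulate.
    v += cutlass::shfl_xor_sync(0xFFFFFFFFu, v, lane_mask, 16);
  }
  return v;  // every lane of the 16-wide group now holds the full sum
}
```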
*stream : compat::get_default_queue(); + compat::memset_async(workspace, 0, workspace_size, q); #elif defined(CUTLASS_ENABLE_CUDA_HOST_ADAPTER) && CUTLASS_ENABLE_CUDA_HOST_ADAPTER // // Use the cuda host adapter diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index d70867a193..2fa422ad5a 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -95,6 +95,8 @@ function(cutlass_test_unit_add_executable NAME) ${NAME} PUBLIC GTest::gtest + # TODO: This change works for resolving 'compat.hpp' not found issue, fix this if it blocks merging + cutlass_tools_util_includes ) else() target_link_libraries( diff --git a/test/unit/common/filter_architecture.cpp b/test/unit/common/filter_architecture.cpp index d629fbce1e..34482d70b2 100644 --- a/test/unit/common/filter_architecture.cpp +++ b/test/unit/common/filter_architecture.cpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -113,7 +114,7 @@ void FilterArchitecture() { {architecture::intel_gpu_bmg_g21, 1} }; auto device_architecture = - syclcompat::get_default_queue().get_device().get_info(); + compat::get_default_queue().get_device().get_info(); if (device_architecture == architecture::unknown) { throw std::runtime_error("Encountered Unknown architecture."); } diff --git a/test/unit/common/util.hpp b/test/unit/common/util.hpp index cfdcef2a0f..f47b063fa4 100644 --- a/test/unit/common/util.hpp +++ b/test/unit/common/util.hpp @@ -1,6 +1,7 @@ /*************************************************************************************************** * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -31,7 +32,7 @@ **************************************************************************************************/ #if defined(CUTLASS_ENABLE_SYCL) -#include +#include #include #else @@ -48,7 +49,7 @@ namespace cutlass { namespace kernel { template void memset(T* ptr, T init_val, std::size_t num_elements) { - auto global_id = syclcompat::global_id::x(); + auto global_id = compat::global_id::x(); if (global_id < num_elements) { ptr[global_id] = init_val; } @@ -88,9 +89,9 @@ class device_vector { device_vector(std::size_t num_elements, T init_value) { n_elements = num_elements; dev_ptr = make_shared(num_elements); - syclcompat::launch>(sycl::range<1>(num_elements), + compat::launch>(sycl::range<1>(num_elements), sycl::range<1>(32), dev_ptr.get(), init_value, num_elements); - syclcompat::wait_and_throw(); + compat::wait_and_throw(); } device_vector& operator=(host_vector host_vec); @@ -102,7 +103,7 @@ class device_vector { private: T* safe_malloc(std::size_t size) { - T* ptr = syclcompat::malloc(size * sizeof(T)); + T* ptr = compat::malloc(size * sizeof(T)); if(!ptr) { throw std::runtime_error("Allocation Failed."); } @@ -111,8 +112,8 @@ class device_vector { std::shared_ptr make_shared(std::size_t size) { return std::shared_ptr(safe_malloc(size), [=](T* ptr) { if (ptr != nullptr) { - syclcompat::wait_and_throw(); - syclcompat::free(ptr); + compat::wait_and_throw(); + compat::free(ptr); } }); } @@ -122,9 +123,9 @@ class device_vector { template host_vector& host_vector::operator=(device_vector device_vec) { - syclcompat::wait_and_throw(); + compat::wait_and_throw(); host_vector host_vec(device_vec.size()); - syclcompat::memcpy(host_vec.data(), device_vec.data(), + compat::memcpy(host_vec.data(), device_vec.data(), device_vec.size() * sizeof(T)); *this = host_vec; return *this; @@ -132,9 +133,9 @@ host_vector& host_vector::operator=(device_vector device_vec) { template host_vector::host_vector(device_vector device_vec) { - syclcompat::wait_and_throw(); + compat::wait_and_throw(); host_vector host_vec(device_vec.size()); - syclcompat::memcpy(host_vec.data(), device_vec.data(), + compat::memcpy(host_vec.data(), device_vec.data(), device_vec.size() * sizeof(T)); *this = host_vec; } @@ -142,8 +143,8 @@ host_vector::host_vector(device_vector device_vec) { template device_vector& device_vector::operator=(host_vector host_vec) { device_vector device_vec(host_vec.size()); - syclcompat::memcpy(device_vec.data(), host_vec.data(), host_vec.size() * sizeof(T)); - syclcompat::wait_and_throw(); + compat::memcpy(device_vec.data(), host_vec.data(), host_vec.size() * sizeof(T)); + compat::wait_and_throw(); *this = device_vec; return *this; } @@ -151,8 +152,8 @@ device_vector& device_vector::operator=(host_vector host_vec) { template device_vector::device_vector(host_vector host_vec) { device_vector device_vec(host_vec.size()); - syclcompat::memcpy(device_vec.data(), host_vec.data(), host_vec.size() * sizeof(T)); - syclcompat::wait_and_throw(); + compat::memcpy(device_vec.data(), host_vec.data(), host_vec.size() * sizeof(T)); + compat::wait_and_throw(); *this = device_vec; } diff --git a/test/unit/cute/ampere/cooperative_copy.cu b/test/unit/cute/ampere/cooperative_copy.cu index e0b70bfa61..059f39ca33 100644 --- a/test/unit/cute/ampere/cooperative_copy.cu +++ b/test/unit/cute/ampere/cooperative_copy.cu @@ -1,5 +1,6 @@ 
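The test/unit/common/util.hpp hunks above move the device_vector and host_vector helpers onto the same compat entry points (malloc, free, memcpy, launch, wait_and_throw). A round-trip usage sketch, assuming it sits inside a unit test that already includes that header:

```cpp
// Sketch only: construction fills device memory through the small memset kernel
// launched above, and the mixed assignments copy via compat::memcpy with waits.
cutlass::device_vector<float> d_vec(1024, 1.0f);  // device buffer initialized to 1.0f
cutlass::host_vector<float>   h_vec = d_vec;      // blocking device -> host copy
EXPECT_EQ(h_vec[0], 1.0f);
```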
/*************************************************************************************************** * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -53,8 +54,8 @@ namespace cooperative_copy_mode { struct shared_shared {}; } #if defined(CUTLASS_ENABLE_SYCL) -namespace sc = syclcompat; -namespace sc_exp = syclcompat::experimental; +namespace sc = compat; +namespace sc_exp = compat::experimental; namespace sycl_ext = sycl::ext::oneapi::experimental; // gs --> global to/from shared diff --git a/test/unit/cute/ampere/cp_sync.cu b/test/unit/cute/ampere/cp_sync.cu index 2eaf92a394..f8e88b6b07 100644 --- a/test/unit/cute/ampere/cp_sync.cu +++ b/test/unit/cute/ampere/cp_sync.cu @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -43,8 +44,8 @@ using namespace cute; #ifdef CUTLASS_ENABLE_SYCL -namespace sc = syclcompat; -namespace sc_exp = syclcompat::experimental; +namespace sc = compat; +namespace sc_exp = compat::experimental; namespace sycl_ext = sycl::ext::oneapi::experimental; CUTLASS_GLOBAL void diff --git a/test/unit/cute/ampere/ldsm.cu b/test/unit/cute/ampere/ldsm.cu index e822c6a082..2d80ffbc81 100644 --- a/test/unit/cute/ampere/ldsm.cu +++ b/test/unit/cute/ampere/ldsm.cu @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -41,8 +42,8 @@ using namespace cute; #if defined(CUTLASS_ENABLE_SYCL) -namespace sc = syclcompat; -namespace sc_exp = syclcompat::experimental; +namespace sc = compat; +namespace sc_exp = compat::experimental; namespace sycl_ext = sycl::ext::oneapi::experimental; template diff --git a/test/unit/cute/ampere/tiled_cp_async_testbed.hpp b/test/unit/cute/ampere/tiled_cp_async_testbed.hpp index d299cbfe24..5189e7782e 100644 --- a/test/unit/cute/ampere/tiled_cp_async_testbed.hpp +++ b/test/unit/cute/ampere/tiled_cp_async_testbed.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -48,8 +49,8 @@ struct SharedStorage }; #if defined(CUTLASS_ENABLE_SYCL) -namespace sc = syclcompat; -namespace sc_exp = syclcompat::experimental; +namespace sc = compat; +namespace sc_exp = compat::experimental; namespace sycl_ext = sycl::ext::oneapi::experimental; template diff --git a/test/unit/cute/cooperative_gemm_common.hpp b/test/unit/cute/cooperative_gemm_common.hpp index 51d7e8f624..f49d316b40 100644 --- a/test/unit/cute/cooperative_gemm_common.hpp +++ b/test/unit/cute/cooperative_gemm_common.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -166,11 +167,11 @@ void verify_gemm_correctness(cute::Tensor const& h_c_out_tenso #if defined(CUTLASS_ENABLE_SYCL) #include -#include +#include #include -namespace sc = syclcompat; -namespace sc_exp = syclcompat::experimental; +namespace sc = compat; +namespace sc_exp = compat::experimental; namespace sycl_ext = sycl::ext::oneapi::experimental; template #if defined(CUTLASS_ENABLE_SYCL) -namespace sc = syclcompat; -namespace sc_exp = syclcompat::experimental; +namespace sc = compat; +namespace sc_exp = compat::experimental; namespace sycl_ext = sycl::ext::oneapi::experimental; #endif diff --git a/test/unit/cute/hopper/bulk_store.cu b/test/unit/cute/hopper/bulk_store.cu index af31095095..1dedc640d1 100644 --- a/test/unit/cute/hopper/bulk_store.cu +++ b/test/unit/cute/hopper/bulk_store.cu @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -40,8 +41,8 @@ #include #if defined(CUTLASS_ENABLE_SYCL) -namespace sc = syclcompat; -namespace sc_exp = syclcompat::experimental; +namespace sc = compat; +namespace sc_exp = compat::experimental; namespace sycl_ext = sycl::ext::oneapi::experimental; #endif diff --git a/test/unit/cute/hopper/stsm.cu b/test/unit/cute/hopper/stsm.cu index 5b026af5d5..dd2cc989b5 100644 --- a/test/unit/cute/hopper/stsm.cu +++ b/test/unit/cute/hopper/stsm.cu @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -37,8 +38,8 @@ #include #if defined(CUTLASS_ENABLE_SYCL) -namespace sc = syclcompat; -namespace sc_exp = syclcompat::experimental; +namespace sc = compat; +namespace sc_exp = compat::experimental; namespace sycl_ext = sycl::ext::oneapi::experimental; #endif diff --git a/test/unit/cute/hopper/tma_load_testbed.hpp b/test/unit/cute/hopper/tma_load_testbed.hpp index 8f9aaf3171..5241ade5b4 100644 --- a/test/unit/cute/hopper/tma_load_testbed.hpp +++ b/test/unit/cute/hopper/tma_load_testbed.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -39,8 +40,8 @@ #include #if defined(CUTLASS_ENABLE_SYCL) -namespace sc = syclcompat; -namespace sc_exp = syclcompat::experimental; +namespace sc = compat; +namespace sc_exp = compat::experimental; namespace sycl_ext = sycl::ext::oneapi::experimental; #endif diff --git a/test/unit/cute/intel_xe/copy_1d.cpp b/test/unit/cute/intel_xe/copy_1d.cpp index 59ffaf5e43..95b5a41eae 100644 --- a/test/unit/cute/intel_xe/copy_1d.cpp +++ b/test/unit/cute/intel_xe/copy_1d.cpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -33,13 +34,13 @@ #include #include -#include +#include #include "cutlass_unit_test.h" using namespace cute; using namespace cutlass; -using namespace syclcompat::experimental; +using namespace compat::experimental; #define SUBGROUP_SIZE (16) @@ -50,7 +51,7 @@ void copy_kernel_vectorized(TensorS tile_S, TensorD tile_D) { using Element = typename TensorS::value_type; // Shared memory buffers - auto smem = syclcompat::local_mem(); + auto smem = compat::local_mem(); Tensor sTensor = make_tensor(make_smem_ptr(smem), tile_S.layout()); // Define `AccessType` which controls the size of the actual memory access. 
@@ -180,15 +181,15 @@ TEST(PVC_1d_copy, copy_double) { make_layout(Shape, Int>{}, Stride, _1>{})); static constexpr auto subgroup_size = 16; - auto blockDim = syclcompat::dim3(subgroup_size); + auto blockDim = compat::dim3(subgroup_size); launch>( launch_policy{ - syclcompat::dim3(1), blockDim, + compat::dim3(1), blockDim, kernel_properties{sycl_exp::sub_group_size}}, S, D); - syclcompat::wait_and_throw(); + compat::wait_and_throw(); host_output = device_output; for (int i = 0; i < M * N; ++i) { // printf("%d %d\n", int(h_in[i]), int(h_out[i])); @@ -221,17 +222,17 @@ TEST(PVC_1d_copy, copy_double) { make_layout(Shape, Int>{}, Stride, _1>{})); static constexpr auto subgroup_size = 16; - auto blockDim = syclcompat::dim3(subgroup_size); + auto blockDim = compat::dim3(subgroup_size); // // Launch the kernel // launch>( launch_policy{ - syclcompat::dim3(1), blockDim, + compat::dim3(1), blockDim, kernel_properties{sycl_exp::sub_group_size}}, S, D); - syclcompat::wait_and_throw(); + compat::wait_and_throw(); host_output = device_output; for (int i = 0; i < M * N; ++i) { EXPECT_EQ(host_output[i], host_src[i]); @@ -263,17 +264,17 @@ TEST(PVC_1d_copy, copy_double) { make_layout(Shape, Int>{}, Stride, _1>{})); static constexpr auto subgroup_size = 16; - auto blockDim = syclcompat::dim3(subgroup_size); + auto blockDim = compat::dim3(subgroup_size); // // Launch the kernel // launch>( launch_policy{ - syclcompat::dim3(1), blockDim, + compat::dim3(1), blockDim, kernel_properties{sycl_exp::sub_group_size}}, S, D); - syclcompat::wait_and_throw(); + compat::wait_and_throw(); host_output = device_output; for (int i = 0; i < M * N; ++i) { EXPECT_EQ(host_output[i], host_src[i]); diff --git a/test/unit/cute/intel_xe/copy_block.cpp b/test/unit/cute/intel_xe/copy_block.cpp index 4f287617ce..40cdc20ec9 100644 --- a/test/unit/cute/intel_xe/copy_block.cpp +++ b/test/unit/cute/intel_xe/copy_block.cpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -33,13 +34,13 @@ #include #include -#include +#include #include "cutlass_unit_test.h" using namespace cute; using namespace cutlass; -using namespace syclcompat::experimental; +using namespace compat::experimental; #define SUBGROUP_SIZE (16) constexpr int row_alignment = 16; // Alignment requirement for Xe 2D Block Copy Instructions @@ -133,7 +134,7 @@ struct copy_op { Layout>>{}, make_layout(shape_div(typename Copy_Traits::BlockShape{}, Shape<_1, _16>{}))); - auto blockDim = syclcompat::dim3(size(tiled_load)); + auto blockDim = compat::dim3(size(tiled_load)); // // Launch the kernel // @@ -141,11 +142,11 @@ struct copy_op { copy_kernel_vectorized>( launch_policy{ - syclcompat::dim3(1), blockDim, + compat::dim3(1), blockDim, kernel_properties{sycl_exp::sub_group_size}}, S, D, tiled_load, tiled_store); - syclcompat::wait_and_throw(); + compat::wait_and_throw(); host_output = device_output; for (int i = 0; i < M * N; ++i) { EXPECT_EQ(host_output[i], host_src[i]); @@ -185,7 +186,7 @@ struct copy_op { Copy_Atom, dtype>{}.with(D), Layout>{}, make_layout(shape_div(typename Copy_Traits::BlockShape{}, Shape<_1, _16>{}))); - auto blockDim = syclcompat::dim3(size(tiled_load)); + auto blockDim = compat::dim3(size(tiled_load)); // // Launch the kernel // @@ -193,11 +194,11 @@ struct copy_op { copy_kernel_vectorized>( launch_policy{ - syclcompat::dim3(1), blockDim, + compat::dim3(1), blockDim, kernel_properties{sycl_exp::sub_group_size}}, S, D, tiled_load, tiled_store); - syclcompat::wait_and_throw(); + compat::wait_and_throw(); host_output = device_output; for (int i = 0; i < M * N; ++i) { EXPECT_EQ(host_output[i], host_src[i]); @@ -237,7 +238,7 @@ struct copy_op { Copy_Atom, uint16_t>{}.with( device_output.data(), M * 2, N / 2), Layout>{}, make_layout(shape_div(typename Copy_Traits::BlockShape{}, Shape<_1, _16>{}))); - auto blockDim = syclcompat::dim3(size(tiled_load)); + auto blockDim = compat::dim3(size(tiled_load)); // // Launch the kernel // @@ -245,11 +246,11 @@ struct copy_op { copy_kernel_vectorized>( launch_policy{ - syclcompat::dim3(1), blockDim, + compat::dim3(1), blockDim, kernel_properties{sycl_exp::sub_group_size}}, S, D, tiled_load, tiled_store); - syclcompat::wait_and_throw(); + compat::wait_and_throw(); host_output = device_output; for (int i = 0; i < M * 2; ++i) { for (int j = 0; j < N / 2; ++j) { @@ -299,7 +300,7 @@ struct copy_op { Copy_Atom, dtype>{}.with(D), Layout>>{}, make_layout(shape_div(typename Copy_Traits::BlockShape{}, Shape<_1, _16>{}))); - auto blockDim = syclcompat::dim3(size(tiled_load)); + auto blockDim = compat::dim3(size(tiled_load)); // // Launch the kernel // @@ -307,11 +308,11 @@ struct copy_op { copy_kernel_vectorized>( launch_policy{ - syclcompat::dim3(1), blockDim, + compat::dim3(1), blockDim, kernel_properties{sycl_exp::sub_group_size}}, S, D, tiled_load, tiled_store); - syclcompat::wait_and_throw(); + compat::wait_and_throw(); host_output = device_output; for (int i = 0; i < N; ++i) { for (int j = 0; j < M; ++j) { diff --git a/test/unit/cute/intel_xe/copy_scatter.cpp b/test/unit/cute/intel_xe/copy_scatter.cpp index dce587f150..7fe011d213 100644 --- a/test/unit/cute/intel_xe/copy_scatter.cpp +++ b/test/unit/cute/intel_xe/copy_scatter.cpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. 
+ * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -33,13 +34,13 @@ #include #include -#include +#include #include "cutlass_unit_test.h" using namespace cute; using namespace cutlass; -using namespace syclcompat::experimental; +using namespace compat::experimental; #define SUBGROUP_SIZE (16) @@ -125,18 +126,18 @@ TEST(PVC_2d_copy, load_store_global) { Layout, Stride<_16, _1>>{}, Layout, Stride<_1, _8>>{}); static constexpr auto subgroup_size = 16; - auto blockDim = syclcompat::dim3(size(tiled_copy)); + auto blockDim = compat::dim3(size(tiled_copy)); // // Launch the kernel // launch>( launch_policy{ - syclcompat::dim3(1), blockDim, + compat::dim3(1), blockDim, kernel_properties{sycl_exp::sub_group_size}}, S, D, tiled_copy, tiled_copy); - syclcompat::wait_and_throw(); + compat::wait_and_throw(); host_output = device_output; for (int i = 0; i < M * N; ++i) { EXPECT_EQ(host_output[i], host_src[i]); @@ -174,18 +175,18 @@ TEST(PVC_2d_copy, load_store_global_V) { Layout, Stride<_16, _1>>{}, Layout, Stride<_1, _8>>{}); static constexpr auto subgroup_size = 16; - auto blockDim = syclcompat::dim3(size(tiled_copy)); + auto blockDim = compat::dim3(size(tiled_copy)); // // Launch the kernel // launch>( launch_policy{ - syclcompat::dim3(1), blockDim, + compat::dim3(1), blockDim, kernel_properties{sycl_exp::sub_group_size}}, S, D, tiled_copy, tiled_copy); - syclcompat::wait_and_throw(); + compat::wait_and_throw(); host_output = device_output; for (int i = 0; i < M * N; ++i) { EXPECT_EQ(host_output[i], host_src[i]); @@ -199,7 +200,7 @@ void copy_kernel_local(TensorS S, TensorD D, TiledCopy Op) { // Shared memory buffers using Element = typename TensorS::value_type; ; - auto smem = syclcompat::local_mem(); + auto smem = compat::local_mem(); Tensor sTensor = make_tensor(make_smem_ptr(smem), S.layout()); auto thr_copy = Op.get_thread_slice(ThreadIdxX()); @@ -250,17 +251,17 @@ TEST(PVC_2d_copy, load_store_local) { Layout, Stride<_16, _1>>{}, Layout, Stride<_1, _8>>{}); static constexpr auto subgroup_size = 16; - auto blockDim = syclcompat::dim3(size(tiled_copy)); + auto blockDim = compat::dim3(size(tiled_copy)); // // Launch the kernel // launch>( launch_policy{ - syclcompat::dim3(1), blockDim, + compat::dim3(1), blockDim, kernel_properties{sycl_exp::sub_group_size}}, S, D, tiled_copy); - syclcompat::wait_and_throw(); + compat::wait_and_throw(); host_output = device_output; for (int i = 0; i < M * N; ++i) { EXPECT_EQ(host_output[i], host_src[i]); @@ -346,18 +347,18 @@ TEST(PVC_2d_copy, load_store_stomic_float) { Layout, Stride<_16, _1>>{}, Layout, Stride<_1, _8>>{}); static constexpr auto subgroup_size = 16; - auto blockDim = syclcompat::dim3(size(tiled_load)); + auto blockDim = compat::dim3(size(tiled_load)); // // Launch the kernel // launch>( launch_policy{ - syclcompat::dim3(1), blockDim, + compat::dim3(1), blockDim, kernel_properties{sycl_exp::sub_group_size}}, S, D, tiled_load, tiled_atom); - syclcompat::wait_and_throw(); + compat::wait_and_throw(); host_output = device_output; for (int i = 0; i < M * N; ++i) { EXPECT_EQ(host_output[i], 2 * host_src[i]); @@ -398,18 +399,18 @@ TEST(PVC_2d_copy, load_store_stomic_int) { Layout, Stride<_16, _1>>{}, Layout, Stride<_1, _8>>{}); static constexpr auto subgroup_size = 16; - auto blockDim = syclcompat::dim3(size(tiled_load)); + auto blockDim = compat::dim3(size(tiled_load)); // // Launch the kernel // launch>( launch_policy{ - 
syclcompat::dim3(1), blockDim, + compat::dim3(1), blockDim, kernel_properties{sycl_exp::sub_group_size}}, S, D, tiled_load, tiled_atom); - syclcompat::wait_and_throw(); + compat::wait_and_throw(); host_output = device_output; for (int i = 0; i < M * N; ++i) { EXPECT_EQ(host_output[i], 2 * host_src[i]); diff --git a/test/unit/cute/intel_xe/copy_subgroup_block.cpp b/test/unit/cute/intel_xe/copy_subgroup_block.cpp index 1f42c35dd6..4d065d1afc 100644 --- a/test/unit/cute/intel_xe/copy_subgroup_block.cpp +++ b/test/unit/cute/intel_xe/copy_subgroup_block.cpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -33,11 +34,11 @@ #include #include -#include +#include #include "cutlass_unit_test.h" -using namespace syclcompat::experimental; +using namespace compat::experimental; #define SUBGROUP_SIZE (16) @@ -144,8 +145,8 @@ void copy_kernel_vectorized(TensorS S, TensorD D) { #endif // onlt run first subgroup - if (syclcompat::global_id::x() < 16 && !syclcompat::global_id::y() && - !syclcompat::global_id::z()) { + if (compat::global_id::x() < 16 && !compat::global_id::y() && + !compat::global_id::z()) { copy(tiled_copy_store, fragment, thr_tile_store_D); } } @@ -207,9 +208,9 @@ bool copy(uint32_t M, uint32_t N) { // Determine grid and block dimensions // - auto gridDim = syclcompat::dim3(cute::ceil_div(M, wg_tile_m), + auto gridDim = compat::dim3(cute::ceil_div(M, wg_tile_m), cute::ceil_div(N, wg_tile_n)); - auto blockDim = syclcompat::dim3(size(thr_layout)); + auto blockDim = compat::dim3(size(thr_layout)); // // Launch the kernel @@ -220,7 +221,7 @@ bool copy(uint32_t M, uint32_t N) { kernel_properties{sycl_exp::sub_group_size}}, tensor_S, tensor_D); - syclcompat::wait_and_throw(); + compat::wait_and_throw(); // // Verify diff --git a/test/unit/cute/intel_xe/mma.cpp b/test/unit/cute/intel_xe/mma.cpp index 1c0e3d8a61..1894021530 100755 --- a/test/unit/cute/intel_xe/mma.cpp +++ b/test/unit/cute/intel_xe/mma.cpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. 
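The Intel Xe copy tests above (copy_1d, copy_block, copy_scatter, copy_subgroup_block) all converge on one launch shape after the rename: a single work-group sized from the tiled copy, pinned to the 16-wide sub-group size the Xe 2D block-copy instructions require, followed by a blocking wait before the host-side checks. A sketch with placeholder kernel and tensor names:

```cpp
// Sketch only: CopyKernel, S, D and tiled_copy stand in for the instantiated
// kernel and tensors used by the individual TEST bodies above.
using namespace compat::experimental;
namespace sycl_exp = sycl::ext::oneapi::experimental;

template <class CopyKernel, class TensorS, class TensorD, class TiledCopy>
void run_copy_test(TensorS S, TensorD D, TiledCopy tiled_copy) {
  auto blockDim = compat::dim3(cute::size(tiled_copy));  // one work item per copy thread
  launch<CopyKernel>(
      launch_policy{compat::dim3(1), blockDim,
                    kernel_properties{sycl_exp::sub_group_size<16>}},
      S, D, tiled_copy);
  compat::wait_and_throw();  // surface asynchronous failures before the EXPECT_EQ checks
}
```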
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -33,14 +34,14 @@ #include #include -#include +#include #include "cutlass_unit_test.h" #include "utils.hpp" using namespace cute; using namespace cutlass; -using namespace syclcompat::experimental; +using namespace compat::experimental; #define SUBGROUP_SIZE (16) @@ -89,6 +90,7 @@ void gemm_device(TA const *A, TB const *B, TC *C, uint32_t m, uint32_t n, #define CUTLASS_ENABLE_DEBUG_PRINTS (0) +#undef LOG_THREAD #define LOG_THREAD (16) #if CUTLASS_ENABLE_DEBUG_PRINTS @@ -169,9 +171,9 @@ template ( m, n, k, d_A.data(), d_B.data(), d_C.data()); - syclcompat::wait(); + compat::wait(); h_C = d_C; verify(m, n, k, h_A.data(), h_B.data(), h_C.data()); diff --git a/test/unit/cute/intel_xe/utils.hpp b/test/unit/cute/intel_xe/utils.hpp index e109d9fe27..5ca97c56da 100755 --- a/test/unit/cute/intel_xe/utils.hpp +++ b/test/unit/cute/intel_xe/utils.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -35,7 +36,7 @@ #include #include -#include +#include #include "cutlass_unit_test.h" @@ -46,12 +47,13 @@ using namespace cutlass; using namespace cutlass::layout; using namespace cutlass::detail; -using namespace syclcompat::experimental; +using namespace compat::experimental; #define SUBGROUP_SIZE (16) #define CUTLASS_ENABLE_DEBUG_PRINTS (0) #define LOG_GROUP (0) +#undef LOG_THREAD #define LOG_THREAD (0) template @@ -135,10 +137,10 @@ template void run(uint32_t m, uint32_t n, uint32_t k) { cutlass::device_vector d_B = h_B; cutlass::device_vector d_C = h_C; - auto dimBlock = syclcompat::dim3( + auto dimBlock = compat::dim3( ceil_div(kernel::wg_tile_m, kernel::sg_tile_m), SUBGROUP_SIZE * ceil_div(kernel::wg_tile_n, kernel::sg_tile_n)); - auto dimGrid = syclcompat::dim3(size(ceil_div(m, kernel::wg_tile_m)), + auto dimGrid = compat::dim3(size(ceil_div(m, kernel::wg_tile_m)), size(ceil_div(n, kernel::wg_tile_n))); launch( @@ -146,7 +148,7 @@ template void run(uint32_t m, uint32_t n, uint32_t k) { kernel_properties{sycl_exp::sub_group_size}}, d_A.data(), d_B.data(), d_C.data(), m, n, k); - syclcompat::wait(); + compat::wait(); h_C = d_C; verify(m, n, k, h_A.data(), h_B.data(), h_C.data(), kernel::is_a_row_major, diff --git a/test/unit/cute/turing/movm.cu b/test/unit/cute/turing/movm.cu index f6fe20164d..0f9879397b 100644 --- a/test/unit/cute/turing/movm.cu +++ b/test/unit/cute/turing/movm.cu @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -41,8 +42,8 @@ using namespace cute; #ifdef CUTLASS_ENABLE_SYCL -namespace sc = syclcompat; -namespace sc_exp = syclcompat::experimental; +namespace sc = compat; +namespace sc_exp = compat::experimental; namespace sycl_ext = sycl::ext::oneapi::experimental; #endif diff --git a/test/unit/cute/volta/vectorization_auto.cu b/test/unit/cute/volta/vectorization_auto.cu index aca75610c5..5a1cdc8c44 100644 --- a/test/unit/cute/volta/vectorization_auto.cu +++ b/test/unit/cute/volta/vectorization_auto.cu @@ -1,6 +1,7 @@ /*************************************************************************************************** * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -76,8 +77,8 @@ test_copy_vectorization(CopyPolicy policy, GmemLayout gmem_layout, RmemTiler rme device_vector d_in = h_in; Tensor m_in = make_tensor(make_gmem_ptr(raw_pointer_cast(d_in.data())), gmem_layout); #if defined(CUTLASS_ENABLE_SYCL) - syclcompat::launch>( - syclcompat::dim3(1), syclcompat::dim3(1), + compat::launch>( + compat::dim3(1), compat::dim3(1), m_in, rmem_tiler, policy ); #else diff --git a/test/unit/flash_attention/flash_attention_decode/flash_decode_testbed_3x.hpp b/test/unit/flash_attention/flash_attention_decode/flash_decode_testbed_3x.hpp index 832a5216cd..38ae48c86e 100644 --- a/test/unit/flash_attention/flash_attention_decode/flash_decode_testbed_3x.hpp +++ b/test/unit/flash_attention/flash_attention_decode/flash_decode_testbed_3x.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -289,11 +290,11 @@ struct TestbedImpl { page_mapping[logical_idx] = physical_pages[blk]; } } - syclcompat::memcpy(paged_kv_cache.page_table.get(), page_mapping.data(), page_mapping.size() * sizeof(int)); + compat::memcpy(paged_kv_cache.page_table.get(), page_mapping.data(), page_mapping.size() * sizeof(int)); paged_kv_cache.num_pages_per_seq.reset(num_pages_per_seq.size()); - syclcompat::memcpy(paged_kv_cache.num_pages_per_seq.get(), num_pages_per_seq.data(), num_pages_per_seq.size() * sizeof(int)); - syclcompat::wait(); + compat::memcpy(paged_kv_cache.num_pages_per_seq.get(), num_pages_per_seq.data(), num_pages_per_seq.size() * sizeof(int)); + compat::wait(); } initialize_block(block_Q, seed + 2023); @@ -410,9 +411,9 @@ struct TestbedImpl { int max_seq_len_q = static_cast(cute::get<3>(problem_size)); int max_seq_len_kv = static_cast(cute::get<4>(problem_size)); int max_seq_len_kv_cache = static_cast(cute::get<5>(problem_size)); - cute::get<3>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_q, cumulative_seqlen_q.data()}; - cute::get<4>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_kv, cumulative_seqlen_kv.data()}; - cute::get<5>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_kv_cache, cumulative_seqlen_kv_cache.data()}; + cute::get<3>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_q, 0, cumulative_seqlen_q.data()}; + cute::get<4>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_kv, 0, cumulative_seqlen_kv.data()}; + cute::get<5>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_kv_cache, 0, cumulative_seqlen_kv_cache.data()}; } auto [batch, num_heads_q, num_heads_kv, head_size_qk, head_size_vo] = cute::select<0,1,2,6,7>(problem_size); @@ -455,29 +456,29 @@ struct TestbedImpl { cutlass::DeviceAllocation block_V_concat(seq_len_kv_total * head_size_vo); // Concatenate K_cache and K - syclcompat::memcpy( + compat::memcpy( block_K_concat.get(), block_K_cache.get() + offset_k_cache, seq_len_kv_cache * head_size_qk ); - syclcompat::memcpy( + compat::memcpy( block_K_concat.get() + seq_len_kv_cache * head_size_qk, block_K.get() + offset_k, seq_len_kv * head_size_qk ); // Concatenate V_cache and V - syclcompat::memcpy( + compat::memcpy( block_V_concat.get(), block_V_cache.get() + offset_v_cache, seq_len_kv_cache * head_size_vo ); - syclcompat::memcpy( + compat::memcpy( block_V_concat.get() + seq_len_kv_cache * head_size_vo, block_V.get() + offset_v, seq_len_kv * head_size_vo ); - syclcompat::wait(); + compat::wait(); k_ptr = block_K_concat.get(); v_ptr = block_V_concat.get(); @@ -502,11 +503,11 @@ struct TestbedImpl { seq_len_qo * seq_len_kv_total // batch_stride_S ); - syclcompat::wait(); + compat::wait(); std::vector host_S(block_S.size()); - syclcompat::memcpy(host_S.data(), block_S.get(), host_S.size()); - syclcompat::wait(); + compat::memcpy(host_S.data(), block_S.get(), host_S.size()); + compat::wait(); // delete this memory as it is no longer needed block_S.reset(); @@ -575,8 +576,8 @@ struct TestbedImpl { cutlass::DeviceAllocation block_P; block_P.reset(host_P.size()); - syclcompat::memcpy(block_P.get(), host_P.data(), host_P.size()); - syclcompat::wait(); + compat::memcpy(block_P.get(), host_P.data(), host_P.size()); + compat::wait(); cutlass::TensorRef ref_P(block_P.get(), LayoutQ::packed({seq_len_qo, seq_len_kv_total})); @@ -594,13 +595,13 @@ 
struct TestbedImpl { seq_len_qo * head_size_vo // batch_stride_O ); - syclcompat::wait(); + compat::wait(); // delete this memory as it is no longer needed block_P.reset(); std::vector vec_acc(block_acc.size()); - syclcompat::memcpy(vec_acc.data(), block_acc.get(), vec_acc.size()); - syclcompat::wait(); + compat::memcpy(vec_acc.data(), block_acc.get(), vec_acc.size()); + compat::wait(); // delete this memory as it is no longer needed block_acc.reset(); @@ -608,8 +609,8 @@ struct TestbedImpl { for(int i = 0; i < vec_out.size(); i++) { vec_out[i] = static_cast(vec_acc[i]); } - syclcompat::memcpy(block_ref_O.get() + offset_o, vec_out.data(), vec_out.size()); - syclcompat::wait(); + compat::memcpy(block_ref_O.get() + offset_o, vec_out.data(), vec_out.size()); + compat::wait(); offset_q += seq_len_qo * head_size_qk; if(kv_group_update % q_group_size == 0) { @@ -623,7 +624,7 @@ struct TestbedImpl { } } - syclcompat::wait(); + compat::wait(); // Check if output from CUTLASS kernel and reference kernel are equal or not bool passed = cutlass::reference::device::BlockCompareRelativelyEqual(block_ref_O.get(), block_O.get(), @@ -716,29 +717,29 @@ struct TestbedImpl { // configure smem size and carveout int smem_size = FlashDecode::SharedStorageSize; - const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z); - const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z); + const auto sycl_block = compat::dim3(block.x, block.y, block.z); + const auto sycl_grid = compat::dim3(grid.x, grid.y, grid.z); #if !defined(SYCL_EXT_ONEAPI_WORK_GROUP_SCRATCH_MEMORY) - using namespace syclcompat::experimental; + using namespace compat::experimental; auto event = launch>( launch_policy{sycl_grid, sycl_block, local_mem_size{static_cast(smem_size)}, kernel_properties{sycl_exp::sub_group_size}}, params); #else - syclcompat::experimental::launch_properties launch_props { + compat::experimental::launch_properties launch_props { sycl::ext::oneapi::experimental::work_group_scratch_size(smem_size), }; - syclcompat::experimental::kernel_properties kernel_props{ + compat::experimental::kernel_properties kernel_props{ sycl::ext::oneapi::experimental::sub_group_size }; - syclcompat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props}; - auto event = syclcompat::experimental::launch>(policy, params); + compat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props}; + auto event = compat::experimental::launch>(policy, params); #endif EventManager::getInstance().addEvent(event); try { - syclcompat::wait_and_throw(); + compat::wait_and_throw(); } catch (std::exception const &e) { ADD_FAILURE() << "Error at Kernel Sync."; return false; diff --git a/test/unit/flash_attention/flash_attention_prefill/flash_prefill_testbed_3x.hpp b/test/unit/flash_attention/flash_attention_prefill/flash_prefill_testbed_3x.hpp index 627bc5846e..69a6a32f39 100644 --- a/test/unit/flash_attention/flash_attention_prefill/flash_prefill_testbed_3x.hpp +++ b/test/unit/flash_attention/flash_attention_prefill/flash_prefill_testbed_3x.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2025 - 2025 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -228,7 +229,7 @@ struct TestbedImpl { template void convert_fp8_to_fp16(const SrcT* d_src, DstT* d_dst, size_t size) { - syclcompat::get_default_queue().parallel_for(size, [=](auto indx) { + compat::get_default_queue().parallel_for(size, [=](auto indx) { d_dst[indx] = static_cast(d_src[indx]); }).wait(); } @@ -373,8 +374,8 @@ struct TestbedImpl { if constexpr (isVarLen) { int max_seq_len_q = static_cast(cute::get<3>(problem_size)); int max_seq_len_kv = static_cast(cute::get<4>(problem_size)); - cute::get<3>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_q, cumulative_seqlen_q.data()}; - cute::get<4>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_kv, cumulative_seqlen_kv.data()}; + cute::get<3>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_q, 0, cumulative_seqlen_q.data()}; + cute::get<4>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_kv, 0, cumulative_seqlen_kv.data()}; } auto [batch, num_heads_q, num_heads_kv, head_size_qk, head_size_vo] = cute::select<0,1,2,5,6>(problem_size); @@ -421,10 +422,10 @@ struct TestbedImpl { seq_len_qo * seq_len_kv // batch_stride_S ); - syclcompat::wait(); + compat::wait(); std::vector host_S(block_S.size()); - syclcompat::memcpy(host_S.data(), block_S.get(), host_S.size()); + compat::memcpy(host_S.data(), block_S.get(), host_S.size()); // delete this memory as it is no longer needed block_S.reset(); @@ -490,7 +491,7 @@ struct TestbedImpl { cutlass::DeviceAllocation block_P; block_P.reset(host_P.size()); - syclcompat::memcpy(block_P.get(), host_P.data(), host_P.size()); + compat::memcpy(block_P.get(), host_P.data(), host_P.size()); cutlass::TensorRef ref_P(block_P.get(), LayoutQ::packed({seq_len_qo, seq_len_kv})); @@ -508,12 +509,12 @@ struct TestbedImpl { seq_len_qo * head_size_vo // batch_stride_O ); - syclcompat::wait(); + compat::wait(); // delete this memory as it is no longer needed block_P.reset(); std::vector vec_acc(block_acc.size()); - syclcompat::memcpy(vec_acc.data(), block_acc.get(), vec_acc.size()); + compat::memcpy(vec_acc.data(), block_acc.get(), vec_acc.size()); // delete this memory as it is no longer needed block_acc.reset(); @@ -521,7 +522,7 @@ struct TestbedImpl { for(int i = 0; i < vec_out.size(); i++) { vec_out[i] = static_cast(vec_acc[i]); } - syclcompat::memcpy(block_ref_O.get() + offset_o, vec_out.data(), vec_out.size()); + compat::memcpy(block_ref_O.get() + offset_o, vec_out.data(), vec_out.size()); offset_q += seq_len_qo * head_size_qk; if(kv_group_update % q_group_size==0) { @@ -533,7 +534,7 @@ struct TestbedImpl { } } - syclcompat::wait(); + compat::wait(); // Check if output from CUTLASS kernel and reference kernel are equal or not bool passed = cutlass::reference::device::BlockCompareRelativelyEqual(block_ref_O.get(), block_O.get(), @@ -619,29 +620,29 @@ struct TestbedImpl { // configure smem size and carveout int smem_size = FlashAttention::SharedStorageSize; - const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z); - const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z); + const auto sycl_block = compat::dim3(block.x, block.y, block.z); + const auto sycl_grid = compat::dim3(grid.x, grid.y, grid.z); #if !defined(SYCL_EXT_ONEAPI_WORK_GROUP_SCRATCH_MEMORY) - using namespace syclcompat::experimental; + using namespace compat::experimental; auto event = launch>( launch_policy{sycl_grid, sycl_block, 
local_mem_size{static_cast(smem_size)}, kernel_properties{sycl_exp::sub_group_size}}, params); #else - syclcompat::experimental::launch_properties launch_props { + compat::experimental::launch_properties launch_props { sycl::ext::oneapi::experimental::work_group_scratch_size(smem_size), }; - syclcompat::experimental::kernel_properties kernel_props{ + compat::experimental::kernel_properties kernel_props{ sycl::ext::oneapi::experimental::sub_group_size }; - syclcompat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props}; - auto event = syclcompat::experimental::launch>(policy, params); + compat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props}; + auto event = compat::experimental::launch>(policy, params); #endif EventManager::getInstance().addEvent(event); try { - syclcompat::wait_and_throw(); + compat::wait_and_throw(); } catch (std::exception const &e) { ADD_FAILURE() << "Error at Kernel Sync."; return false; diff --git a/test/unit/flash_attention/flash_attention_prefill_cachedkv/flash_prefill_cachedkv_testbed_3x.hpp b/test/unit/flash_attention/flash_attention_prefill_cachedkv/flash_prefill_cachedkv_testbed_3x.hpp index 54c72a2923..015ce4a159 100644 --- a/test/unit/flash_attention/flash_attention_prefill_cachedkv/flash_prefill_cachedkv_testbed_3x.hpp +++ b/test/unit/flash_attention/flash_attention_prefill_cachedkv/flash_prefill_cachedkv_testbed_3x.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -243,10 +244,10 @@ struct TestbedImpl { page_mapping[logical_idx] = physical_pages[blk]; } } - syclcompat::memcpy(paged_kv_cache.page_table.get(), page_mapping.data(), page_mapping.size() * sizeof(int)); + compat::memcpy(paged_kv_cache.page_table.get(), page_mapping.data(), page_mapping.size() * sizeof(int)); paged_kv_cache.num_pages_per_seq.reset(num_pages_per_seq.size()); - syclcompat::memcpy(paged_kv_cache.num_pages_per_seq.get(), num_pages_per_seq.data(), num_pages_per_seq.size() * sizeof(int)); + compat::memcpy(paged_kv_cache.num_pages_per_seq.get(), num_pages_per_seq.data(), num_pages_per_seq.size() * sizeof(int)); } initialize_block(block_Q, seed + 2023); @@ -361,9 +362,9 @@ struct TestbedImpl { int max_seq_len_q = static_cast(cute::get<3>(problem_size)); int max_seq_len_kv = static_cast(cute::get<4>(problem_size)); int max_seq_len_kv_cache = static_cast(cute::get<5>(problem_size)); - cute::get<3>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_q, cumulative_seqlen_q.data()}; - cute::get<4>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_kv, cumulative_seqlen_kv.data()}; - cute::get<5>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_kv_cache, cumulative_seqlen_kv_cache.data()}; + cute::get<3>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_q, 0, cumulative_seqlen_q.data()}; + cute::get<4>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_kv, 0, cumulative_seqlen_kv.data()}; + cute::get<5>(problem_size) = cutlass::fmha::collective::VariableLength{max_seq_len_kv_cache, 0, cumulative_seqlen_kv_cache.data()}; } auto [batch, num_heads_q, num_heads_kv, head_size_qk, head_size_vo] = 
cute::select<0,1,2,6,7>(problem_size); @@ -406,24 +407,24 @@ struct TestbedImpl { cutlass::DeviceAllocation block_V_concat(seq_len_kv_total * head_size_vo); // Concatenate K_cache and K - syclcompat::memcpy( + compat::memcpy( block_K_concat.get(), block_K_cache.get() + offset_k_cache, seq_len_kv_cache * head_size_qk ); - syclcompat::memcpy( + compat::memcpy( block_K_concat.get() + seq_len_kv_cache * head_size_qk, block_K.get() + offset_k, seq_len_kv * head_size_qk ); // Concatenate V_cache and V - syclcompat::memcpy( + compat::memcpy( block_V_concat.get(), block_V_cache.get() + offset_v_cache, seq_len_kv_cache * head_size_vo ); - syclcompat::memcpy( + compat::memcpy( block_V_concat.get() + seq_len_kv_cache * head_size_vo, block_V.get() + offset_v, seq_len_kv * head_size_vo @@ -451,10 +452,10 @@ struct TestbedImpl { seq_len_qo * seq_len_kv_total // batch_stride_S ); - syclcompat::wait(); + compat::wait(); std::vector host_S(block_S.size()); - syclcompat::memcpy(host_S.data(), block_S.get(), host_S.size()); + compat::memcpy(host_S.data(), block_S.get(), host_S.size()); // delete this memory as it is no longer needed block_S.reset(); @@ -521,7 +522,7 @@ struct TestbedImpl { cutlass::DeviceAllocation block_P; block_P.reset(host_P.size()); - syclcompat::memcpy(block_P.get(), host_P.data(), host_P.size()); + compat::memcpy(block_P.get(), host_P.data(), host_P.size()); cutlass::TensorRef ref_P(block_P.get(), LayoutQ::packed({seq_len_qo, seq_len_kv_total})); @@ -539,12 +540,12 @@ struct TestbedImpl { seq_len_qo * head_size_vo // batch_stride_O ); - syclcompat::wait(); + compat::wait(); // delete this memory as it is no longer needed block_P.reset(); std::vector vec_acc(block_acc.size()); - syclcompat::memcpy(vec_acc.data(), block_acc.get(), vec_acc.size()); + compat::memcpy(vec_acc.data(), block_acc.get(), vec_acc.size()); // delete this memory as it is no longer needed block_acc.reset(); @@ -552,7 +553,7 @@ struct TestbedImpl { for(int i = 0; i < vec_out.size(); i++) { vec_out[i] = static_cast(vec_acc[i]); } - syclcompat::memcpy(block_ref_O.get() + offset_o, vec_out.data(), vec_out.size()); + compat::memcpy(block_ref_O.get() + offset_o, vec_out.data(), vec_out.size()); offset_q += seq_len_qo * head_size_qk; if(kv_group_update % q_group_size==0) { @@ -566,7 +567,7 @@ struct TestbedImpl { } } - syclcompat::wait(); + compat::wait(); // Check if output from CUTLASS kernel and reference kernel are equal or not bool passed = cutlass::reference::device::BlockCompareRelativelyEqual(block_ref_O.get(), block_O.get(), @@ -659,29 +660,29 @@ struct TestbedImpl { // configure smem size and carveout int smem_size = FlashPrefillCachedKV::SharedStorageSize; - const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z); - const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z); + const auto sycl_block = compat::dim3(block.x, block.y, block.z); + const auto sycl_grid = compat::dim3(grid.x, grid.y, grid.z); #if !defined(SYCL_EXT_ONEAPI_WORK_GROUP_SCRATCH_MEMORY) - using namespace syclcompat::experimental; + using namespace compat::experimental; auto event = launch>( launch_policy{sycl_grid, sycl_block, local_mem_size{static_cast(smem_size)}, kernel_properties{sycl_exp::sub_group_size}}, params); #else - syclcompat::experimental::launch_properties launch_props { + compat::experimental::launch_properties launch_props { sycl::ext::oneapi::experimental::work_group_scratch_size(smem_size), }; - syclcompat::experimental::kernel_properties kernel_props{ + compat::experimental::kernel_properties 
kernel_props{ sycl::ext::oneapi::experimental::sub_group_size }; - syclcompat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props}; - auto event = syclcompat::experimental::launch>(policy, params); + compat::experimental::launch_policy policy{sycl_grid, sycl_block, launch_props, kernel_props}; + auto event = compat::experimental::launch>(policy, params); #endif EventManager::getInstance().addEvent(event); try { - syclcompat::wait_and_throw(); + compat::wait_and_throw(); } catch (std::exception const &e) { ADD_FAILURE() << "Error at Kernel Sync."; return false; diff --git a/test/unit/gemm/device/gemm_testbed_3x.hpp b/test/unit/gemm/device/gemm_testbed_3x.hpp index d44c3fbed1..41ca8b9d28 100644 --- a/test/unit/gemm/device/gemm_testbed_3x.hpp +++ b/test/unit/gemm/device/gemm_testbed_3x.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -819,7 +820,7 @@ struct HostCollectiveMainloopSparse #if defined(CUTLASS_ENABLE_SYCL) try { - syclcompat::wait_and_throw(); + compat::wait_and_throw(); } catch (std::exception const &e) { ADD_FAILURE() << "Error at Kernel Sync."; return false; @@ -1471,7 +1472,7 @@ struct HostCollectiveMainloop(Gemm::GemmKernel::SharedStorageSize); size_t device_smem_size; #if defined(CUTLASS_ENABLE_SYCL) - syclcompat::device_info info = syclcompat::get_current_device().get_device_info(); + compat::device_info info = compat::get_current_device().get_device_info(); this->sm_count = info.get_max_compute_units(); device_smem_size = info.get_local_mem_size(); #else @@ -2902,7 +2903,7 @@ struct TestbedImpl { #if defined(CUTLASS_ENABLE_SYCL) try { - syclcompat::wait_and_throw(); + compat::wait_and_throw(); } catch (std::exception const &e) { ADD_FAILURE() << "Error at Kernel Sync."; return false; @@ -3079,7 +3080,7 @@ struct TestbedImpl { status = gemm_op.run(); #if defined(CUTLASS_ENABLE_SYCL) try { - syclcompat::wait_and_throw(); + compat::wait_and_throw(); } catch (std::exception const &e) { ADD_FAILURE() << "Error at Kernel Sync."; return false; diff --git a/test/unit/gemm/device/gemm_testbed_3x_ptr_array.hpp b/test/unit/gemm/device/gemm_testbed_3x_ptr_array.hpp index 82807d1046..9341c58c02 100644 --- a/test/unit/gemm/device/gemm_testbed_3x_ptr_array.hpp +++ b/test/unit/gemm/device/gemm_testbed_3x_ptr_array.hpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. 
* SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -2124,7 +2125,7 @@ struct TestbedImpl { status = gemm_op.run(); #if defined SYCL_INTEL_TARGET result = cudaSuccess; - syclcompat::wait(); + compat::wait(); #else result = cudaDeviceSynchronize(); #endif diff --git a/tools/copy_debug/copy_debug.cpp b/tools/copy_debug/copy_debug.cpp index e8be50b08e..a085f4463e 100644 --- a/tools/copy_debug/copy_debug.cpp +++ b/tools/copy_debug/copy_debug.cpp @@ -1,5 +1,6 @@ /*************************************************************************************************** * Copyright (c) 2025 - 2025 Codeplay Software Ltd. All rights reserved. + * Copyright (C) 2025 Intel Corporation, All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without @@ -31,13 +32,13 @@ #include #include -#include +#include #include -#include +#include #include -using namespace syclcompat::experimental; +using namespace compat::experimental; using namespace cute; #define SUBGROUP_SIZE (16) @@ -113,14 +114,14 @@ void copy(int global_M, int global_N) { Tensor tensor_S = make_tensor(make_gmem_ptr(src.get()), make_layout(tensor_shape, LayoutLeft{})); - auto gridDim = syclcompat::dim3(1); - auto blockDim = syclcompat::dim3(SUBGROUP_SIZE); + auto gridDim = compat::dim3(1); + auto blockDim = compat::dim3(SUBGROUP_SIZE); launch>( launch_policy{gridDim, blockDim, kernel_properties{sycl_exp::sub_group_size}}, tensor_S); - syclcompat::wait_and_throw(); + compat::wait_and_throw(); } int main(){ diff --git a/tools/util/include/compat.hpp b/tools/util/include/compat.hpp new file mode 100644 index 0000000000..d02362a611 --- /dev/null +++ b/tools/util/include/compat.hpp @@ -0,0 +1,26 @@ +/*************************************************************************** + * + * Copyright (C) Codeplay Software Ltd. + * Copyright (C) 2025 Intel Corporation, All rights reserved. + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM + * Exceptions. See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Compat + * + * compat.hpp + * + * Description: + * Main include header for Compat + **************************************************************************/ + +#pragma once + +#include diff --git a/tools/util/include/compat/atomic.hpp b/tools/util/include/compat/atomic.hpp new file mode 100644 index 0000000000..8d64dafc08 --- /dev/null +++ b/tools/util/include/compat/atomic.hpp @@ -0,0 +1,474 @@ +/*************************************************************************** + * + * Copyright (C) Codeplay Software Ltd. + * Copyright (C) 2025 Intel Corporation, All rights reserved. + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM + * Exceptions. See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + * SYCL compatibility extension + * + * atomic.hpp + * + * Description: + * Atomic functionality for the SYCL compatibility extension + **************************************************************************/ + +// The original source was under the license below: +//==---- atomic.hpp -------------------------------*- C++ -*----------------==// +// +// Copyright (C) Intel Corporation +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +#include +#include +#include +#include + +#include + +namespace compat { + +/// Atomically add the value operand to the value at the addr and assign the +/// result to the value at addr. +/// \param [in, out] addr The pointer to the data. +/// \param operand The value to add to the value at \p addr. +/// \param memoryOrder The memory ordering used. +/// \returns The value at the \p addr before the call. +template +inline T atomic_fetch_add(T *addr, arith_t operand) { + auto atm = + sycl::atomic_ref(addr[0]); + return atm.fetch_add(operand); +} + +/// Atomically subtract the value operand from the value at the addr and +/// assign the result to the value at addr. +/// \param [in, out] addr The pointer to the data. +/// \param operand The value to subtract from the value at \p addr. +/// \param memoryOrder The memory ordering used. +/// \returns The value at the \p addr before the call. +template +inline T atomic_fetch_sub(T *addr, arith_t operand) { + auto atm = + sycl::atomic_ref(addr[0]); + return atm.fetch_sub(operand); +} + +/// Atomically perform a bitwise AND between the value operand and the value +/// at the addr and assign the result to the value at addr. +/// \param [in, out] addr The pointer to the data. +/// \param operand The value to use in bitwise AND operation with the value at +/// the \p addr. +/// \param memoryOrder The memory ordering used. +/// \returns The value at the \p addr before the call. +template +inline T atomic_fetch_and(T *addr, type_identity_t operand) { + auto atm = + sycl::atomic_ref(addr[0]); + return atm.fetch_and(operand); +} + +/// Atomically or the value at the addr with the value operand, and assign +/// the result to the value at addr. +/// \param [in, out] addr The pointer to the data. +/// \param operand The value to use in bitwise OR operation with the value at +/// the \p addr. +/// \param memoryOrder The memory ordering used. +/// \returns The value at the \p addr before the call. +template +inline T atomic_fetch_or(T *addr, type_identity_t operand) { + auto atm = + sycl::atomic_ref(addr[0]); + return atm.fetch_or(operand); +} + +/// Atomically xor the value at the addr with the value operand, and assign +/// the result to the value at addr. +/// \param [in, out] addr The pointer to the data. +/// \param operand The value to use in bitwise XOR operation with the value at +/// the \p addr. +/// \param memoryOrder The memory ordering used. +/// \returns The value at the \p addr before the call. +template +inline T atomic_fetch_xor(T *addr, type_identity_t operand) { + auto atm = + sycl::atomic_ref(addr[0]); + return atm.fetch_xor(operand); +} + +/// Atomically calculate the minimum of the value at addr and the value +/// operand and assign the result to the value at addr. 
+/// \param [in, out] addr The pointer to the data. +/// \param operand. \param memoryOrder The memory ordering used. +/// \returns The value at the \p addr before the call. +template +inline T atomic_fetch_min(T *addr, type_identity_t operand) { + auto atm = + sycl::atomic_ref(addr[0]); + return atm.fetch_min(operand); +} + +/// Atomically calculate the maximum of the value at addr and the value +/// operand and assign the result to the value at addr. +/// \param [in, out] addr The pointer to the data. +/// \param operand. +/// \param memoryOrder The memory ordering used. +/// \returns The value at the \p addr before the call. +template +inline T atomic_fetch_max(T *addr, type_identity_t operand) { + auto atm = + sycl::atomic_ref(addr[0]); + return atm.fetch_max(operand); +} + +/// Atomically set \p operand to the value stored in \p addr, if old value +/// stored in \p addr is equal to zero or greater than \p operand, else decrease +/// the value stored in \p addr. \param [in, out] addr The pointer to the data. +/// \param operand The threshold value. +/// \param memoryOrder The memory ordering used. +/// \returns The old value stored in \p addr. +template +unsigned int atomic_fetch_compare_dec(unsigned int *addr, + unsigned int operand) { + auto atm = + sycl::atomic_ref( + addr[0]); + unsigned int old; + + while (true) { + old = atm.load(); + if (old == 0 || old > operand) { + if (atm.compare_exchange_strong(old, operand)) + break; + } else if (atm.compare_exchange_strong(old, old - 1)) + break; + } + + return old; +} + +/// Atomically increment the value stored in \p addr if old value stored in \p +/// addr is less than \p operand, else set 0 to the value stored in \p addr. +/// \param [in, out] addr The pointer to the data. +/// \param operand The threshold value. +/// \param memoryOrder The memory ordering used. +/// \returns The old value stored in \p addr. +template +inline unsigned int atomic_fetch_compare_inc(unsigned int *addr, + unsigned int operand) { + auto atm = + sycl::atomic_ref( + addr[0]); + unsigned int old; + while (true) { + old = atm.load(); + if (old >= operand) { + if (atm.compare_exchange_strong(old, 0)) + break; + } else if (atm.compare_exchange_strong(old, old + 1)) + break; + } + return old; +} + +/// Atomically exchange the value at the address addr with the value operand. +/// \param [in, out] addr The pointer to the data. +/// \param operand The value to be exchanged with the value pointed by \p addr. +/// \param memoryOrder The memory ordering used. +/// \returns The value at the \p addr before the call. +template +inline T atomic_exchange(T *addr, type_identity_t operand) { + auto atm = + sycl::atomic_ref(addr[0]); + return atm.exchange(operand); +} + +/// Atomically compare the value at \p addr to the value expected and exchange +/// with the value desired if the value at \p addr is equal to the value +/// expected. Returns the value at the \p addr before the call. +/// \param [in, out] addr Multi_ptr. +/// \param expected The value to compare against the value at \p addr. +/// \param desired The value to assign to \p addr if the value at \p addr +/// is expected. +/// \param success The memory ordering used when comparison succeeds. +/// \param fail The memory ordering used when comparison fails. +/// \returns The value at the \p addr before the call. 
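The free functions above mirror familiar CUDA-style atomics (atomicAdd, atomicInc, and friends). A minimal usage sketch, not part of the patch — the <compat.hpp> include path and the queue/USM setup are assumptions — showing atomic_fetch_add and the wrapping atomic_fetch_compare_inc called from a SYCL kernel:

#include <sycl/sycl.hpp>
#include <compat.hpp>  // assumed include path for these headers

int main() {
  sycl::queue q;
  // counters[0]: plain running count; counters[1]: index that wraps after 7.
  unsigned int *counters = sycl::malloc_device<unsigned int>(2, q);
  q.memset(counters, 0, 2 * sizeof(unsigned int)).wait();
  q.parallel_for(sycl::range<1>(256), [=](sycl::id<1>) {
    compat::atomic_fetch_add(&counters[0], 1u);          // returns the old value, ignored here
    compat::atomic_fetch_compare_inc(&counters[1], 7u);  // increments, resets to 0 once past 7
  }).wait();
  unsigned int host[2] = {0, 0};
  q.memcpy(host, counters, sizeof(host)).wait();
  // Expect host[0] == 256 and host[1] in [0, 7].
  sycl::free(counters, q);
  return 0;
}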
+template +T atomic_compare_exchange_strong( + sycl::multi_ptr addr, type_identity_t expected, + type_identity_t desired, + sycl::memory_order success = sycl::memory_order::relaxed, + sycl::memory_order fail = sycl::memory_order::relaxed) { + auto atm = sycl::atomic_ref(*addr); + + atm.compare_exchange_strong(expected, desired, success, fail); + return expected; +} + +/// Atomically compare the value at \p addr to the value expected and exchange +/// with the value desired if the value at \p addr is equal to the value +/// expected. Returns the value at the \p addr before the call. +/// \param [in] addr The pointer to the data. +/// \param expected The value to compare against the value at \p addr. +/// \param desired The value to assign to \p addr if the value at \p addr is +/// expected. +/// \param success The memory ordering used when comparison succeeds. +/// \param fail The memory ordering used when comparison fails. +/// \returns The value at the \p addr before the call. +template +T atomic_compare_exchange_strong( + T *addr, type_identity_t expected, type_identity_t desired, + sycl::memory_order success = sycl::memory_order::relaxed, + sycl::memory_order fail = sycl::memory_order::relaxed) { + auto atm = + sycl::atomic_ref(addr[0]); + atm.compare_exchange_strong(expected, desired, success, fail); + return expected; +} + +/// Atomic extension to implement standard APIs in std::atomic +namespace detail { +template struct IsValidAtomicType { + static constexpr bool value = + (std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value || std::is_same::value || + std::is_pointer::value); +}; +} // namespace detail + +template +class atomic { + static_assert( + detail::IsValidAtomicType::value, + "Invalid atomic type. Valid types are int, unsigned int, long, " + "unsigned long, long long, unsigned long long, float, double " + "and pointer types"); + T __d; + +public: + /// default memory synchronization order + static constexpr sycl::memory_order default_read_order = + sycl::atomic_ref::default_read_order; + static constexpr sycl::memory_order default_write_order = + sycl::atomic_ref::default_write_order; + static constexpr sycl::memory_scope default_scope = DefaultScope; + static constexpr sycl::memory_order default_read_modify_write_order = + DefaultOrder; + + /// Default constructor. + constexpr atomic() noexcept = default; + /// Constructor with initialize value. + constexpr atomic(T d) noexcept : __d(d){}; + + /// atomically replaces the value of the referenced object with a non-atomic + /// argument + /// \param operand The value to replace the pointed value. + /// \param memoryOrder The memory ordering used. + /// \param memoryScope The memory scope used. + void store(T operand, sycl::memory_order memoryOrder = default_write_order, + sycl::memory_scope memoryScope = default_scope) noexcept { + sycl::atomic_ref atm(__d); + atm.store(operand, memoryOrder, memoryScope); + } + + /// atomically obtains the value of the referenced object + /// \param memoryOrder The memory ordering used. + /// \param memoryScope The memory scope used. 
+ /// \returns The value of the referenced object + T load(sycl::memory_order memoryOrder = default_read_order, + sycl::memory_scope memoryScope = default_scope) const noexcept { + sycl::atomic_ref atm( + const_cast(__d)); + return atm.load(memoryOrder, memoryScope); + } + + /// atomically replaces the value of the referenced object and obtains the + /// value held previously + /// \param operand The value to replace the pointed value. + /// \param memoryOrder The memory ordering used. + /// \param memoryScope The memory scope used. + /// \returns The value of the referenced object before the call. + T exchange(T operand, + sycl::memory_order memoryOrder = default_read_modify_write_order, + sycl::memory_scope memoryScope = default_scope) noexcept { + + sycl::atomic_ref atm(__d); + return atm.exchange(operand, memoryOrder, memoryScope); + } + + /// atomically compares the value of the referenced object with non-atomic + /// argument and performs atomic exchange if equal or atomic load if not + /// \param expected The value expected to be found in the object referenced by + /// the atomic_ref object + /// \param desired The value to store in the referenced object if it is as + /// expected + /// \param success The memory models for the read-modify-write + /// \param failure The memory models for load operations + /// \param memoryScope The memory scope used. + /// \returns true if the referenced object was successfully changed, false + /// otherwise. + bool compare_exchange_weak( + T &expected, T desired, sycl::memory_order success, + sycl::memory_order failure, + sycl::memory_scope memoryScope = default_scope) noexcept { + sycl::atomic_ref atm(__d); + return atm.compare_exchange_weak(expected, desired, success, failure, + memoryScope); + } + /// \param expected The value expected to be found in the object referenced by + /// the atomic_ref object + /// \param desired The value to store in the referenced + /// object if it is as expected + /// \param memoryOrder The memory synchronization ordering for + /// operations + /// \param memoryScope The memory scope used. + /// \returns true if the referenced object was successfully + /// changed, false otherwise. + bool compare_exchange_weak( + T &expected, T desired, + sycl::memory_order memoryOrder = default_read_modify_write_order, + sycl::memory_scope memoryScope = default_scope) noexcept { + sycl::atomic_ref atm(__d); + return atm.compare_exchange_weak(expected, desired, memoryOrder, + memoryScope); + } + + /// atomically compares the value of the referenced object with non-atomic + /// argument and performs atomic exchange if equal or atomic load if not + /// \param expected The value expected to be found in the object referenced by + /// the atomic_ref object + /// \param desired The value to store in the referenced + /// object if it is as expected + /// \param success The memory models for the + /// read-modify-write + /// \param failure The memory models for load operations + /// \param memoryScope The memory scope used. + /// \returns true if the referenced object was successfully changed, false + /// otherwise. 
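A brief editorial sketch, not part of the patch, of how this compat::atomic<T> wrapper might be used from device code; the include path and the USM allocation pattern are assumptions, and default memory orderings are used throughout:

#include <sycl/sycl.hpp>
#include <compat.hpp>  // assumed include path for these headers

int main() {
  sycl::queue q;
  auto *flag   = sycl::malloc_device<compat::atomic<int>>(1, q);
  int  *winner = sycl::malloc_device<int>(1, q);
  q.single_task([=] { flag->store(0); *winner = -1; }).wait();
  q.parallel_for(sycl::range<1>(64), [=](sycl::id<1> id) {
    int expected = 0;
    // Only the first work-item to win the CAS records its id; all others fail.
    if (flag->compare_exchange_strong(expected, 1))
      *winner = static_cast<int>(id[0]);
  }).wait();
  int result = -1;
  q.memcpy(&result, winner, sizeof(int)).wait();
  sycl::free(flag, q);
  sycl::free(winner, q);
  return result >= 0 ? 0 : 1;
}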
+ bool compare_exchange_strong( + T &expected, T desired, sycl::memory_order success, + sycl::memory_order failure, + sycl::memory_scope memoryScope = default_scope) noexcept { + + sycl::atomic_ref atm(__d); + return atm.compare_exchange_strong(expected, desired, success, failure, + memoryScope); + } + /// \param expected The value expected to be found in the object referenced by + /// the atomic_ref object + /// \param desired The value to store in the referenced + /// object if it is as expected + /// \param memoryOrder The memory synchronization ordering for + /// operations + /// \param memoryScope The memory scope used. + /// \returns true if the referenced object was successfully changed, false + /// otherwise. + bool compare_exchange_strong( + T &expected, T desired, + sycl::memory_order memoryOrder = default_read_modify_write_order, + sycl::memory_scope memoryScope = default_scope) noexcept { + sycl::atomic_ref atm(__d); + return atm.compare_exchange_strong(expected, desired, memoryOrder, + memoryScope); + } + + /// atomically adds the argument to the value stored in the atomic object and + /// obtains the value held previously + /// \param operand The other argument of arithmetic addition + /// \param memoryOrder The memory ordering used. + /// \param memoryScope The memory scope used. + /// \returns The value of the referenced object before the call. + T fetch_add(arith_t operand, + sycl::memory_order memoryOrder = default_read_modify_write_order, + sycl::memory_scope memoryScope = default_scope) noexcept { + + auto atm = sycl::atomic_ref(__d); + return atm.fetch_add(operand, memoryOrder, memoryScope); + } + + /// atomically subtracts the argument from the value stored in the atomic + /// object and obtains the value held previously + /// \param operand The other argument of arithmetic subtraction + /// \param memoryOrder The memory ordering used. + /// \param memoryScope The memory scope used. + /// \returns The value of the referenced object before the call. + T fetch_sub(arith_t operand, + sycl::memory_order memoryOrder = default_read_modify_write_order, + sycl::memory_scope memoryScope = default_scope) noexcept { + + auto atm = sycl::atomic_ref(__d); + return atm.fetch_sub(operand, memoryOrder, memoryScope); + } +}; + +} // namespace compat diff --git a/tools/util/include/compat/compat.hpp b/tools/util/include/compat/compat.hpp new file mode 100644 index 0000000000..a2eed0fa55 --- /dev/null +++ b/tools/util/include/compat/compat.hpp @@ -0,0 +1,36 @@ +/*************************************************************************** + * + * Copyright (C) Codeplay Software Ltd. + * Copyright (C) 2025 Intel Corporation, All rights reserved. + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM + * Exceptions. See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * Compat + * + * compat.hpp + * + * Description: + * Main include internal header for Compat + **************************************************************************/ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include diff --git a/tools/util/include/compat/defs.hpp b/tools/util/include/compat/defs.hpp new file mode 100644 index 0000000000..5c578fbab2 --- /dev/null +++ b/tools/util/include/compat/defs.hpp @@ -0,0 +1,94 @@ +/*************************************************************************** + * + * Copyright (C) Codeplay Software Ltd. + * Copyright (C) 2025 Intel Corporation, All rights reserved. + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM + * Exceptions. See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Compat + * + * defs.hpp + * + * Description: + * helper aliases and definitions for Compat + * + **************************************************************************/ + +// The original source was under the license below: +//==---- defs.hpp ---------------------------------*- C++ -*----------------==// +// +// Copyright (C) Intel Corporation +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +template class compat_kernel_name; +template class compat_kernel_scalar; + +#if defined(_MSC_VER) +#define __compat_align__(n) __declspec(align(n)) +#define __compat_inline__ __forceinline +#define __compat_noinline__ __declspec(noinline) +#else +#define __compat_align__(n) __attribute__((aligned(n))) +#define __compat_inline__ __inline__ __attribute__((always_inline)) +#define __compat_noinline__ __attribute__((noinline)) +#endif + +#define COMPAT_COMPATIBILITY_TEMP (900) + +#ifdef _WIN32 +#define COMPAT_EXPORT __declspec(dllexport) +#else +#define COMPAT_EXPORT +#endif + +#define COMPAT_MAJOR_VERSION 0 +#define COMPAT_MINOR_VERSION 2 +#define COMPAT_PATCH_VERSION 0 + +#define COMPAT_MAKE_VERSION(_major, _minor, _patch) \ + ((1E6 * _major) + (1E3 * _minor) + _patch) + +#define COMPAT_VERSION \ + COMPAT_MAKE_VERSION(COMPAT_MAJOR_VERSION, COMPAT_MINOR_VERSION, \ + COMPAT_PATCH_VERSION) + +namespace compat { +enum error_code { success = 0, backend_error = 1, default_error = 999 }; +/// A dummy function introduced to assist auto migration. +/// The SYCLomatic user should replace it with a real error-handling function. +/// SYCL reports errors using exceptions and does not use error codes. +inline const char *get_error_string_dummy(int ec) { + (void)ec; + return ""; // Return the error string for the error code + // ec. 
+} +} // namespace compat + +#define COMPAT_CHECK_ERROR(expr) \ + [&]() { \ + try { \ + expr; \ + return compat::error_code::success; \ + } catch (sycl::exception const &e) { \ + std::cerr << e.what() << std::endl; \ + return compat::error_code::backend_error; \ + } catch (std::runtime_error const &e) { \ + std::cerr << e.what() << std::endl; \ + return compat::error_code::default_error; \ + } \ + }() diff --git a/tools/util/include/compat/device.hpp b/tools/util/include/compat/device.hpp new file mode 100644 index 0000000000..25e096e281 --- /dev/null +++ b/tools/util/include/compat/device.hpp @@ -0,0 +1,967 @@ +/*************************************************************************** + * + * Copyright (C) Codeplay Software Ltd. + * Copyright (C) 2025 Intel Corporation, All rights reserved. + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM + * Exceptions. See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SYCL compatibility extension + * + * device.hpp + * + * Description: + * Device functionality for the SYCL compatibility extension + **************************************************************************/ +// +// Modifications, Copyright (C) 2025 Intel Corporation +// +// This software and the related documents are Intel copyrighted materials, and +// your use of them is governed by the express license under which they were +// provided to you ("License"). Unless the License provides otherwise, you may +// not use, modify, copy, publish, distribute, disclose or transmit this +// software or the related documents without Intel's prior written permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. +// +// The original source was under the license below: +//==---- device.hpp -------------------------------*- C++ -*----------------==// +// +// Copyright (C) Intel Corporation +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(__linux__) +#include +#include +#endif +#if defined(_WIN64) +#ifndef NOMINMAX +#define NOMINMAX +#endif +#include +#endif + +#include +#include +#include +#include + +namespace compat { + +namespace detail { +static void parse_version_string(const std::string &ver, int &major, + int &minor) { + // Version string has the following format: + // a. OpenCL + // b. + // c. 
e.g gfx1030 + std::string::size_type i = 0; + while (i < ver.size()) { + if (isdigit(ver[i])) + break; + i++; + } + if (i < ver.size()) + major = std::stoi(&(ver[i])); + else + major = 0; + while (i < ver.size()) { + if (ver[i] == '.') + break; + i++; + } + i++; + if (i < ver.size()) + minor = std::stoi(&(ver[i])); + else + minor = 0; +} + +static void get_version(const sycl::device &dev, int &major, int &minor) { + std::string ver = dev.get_info(); + parse_version_string(ver, major, minor); +} + +/// SYCL default exception handler +inline auto exception_handler = [](sycl::exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } catch (sycl::exception const &e) { + std::cerr << "[Compat] Caught asynchronous SYCL exception:" + << std::endl + << e.what() << std::endl + << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + } + } +}; + +} // namespace detail + +using event_ptr = sycl::event *; + +using queue_ptr = sycl::queue *; + +using device_ptr = char *; + +/// Destroy \p event pointed memory. +/// +/// \param event Pointer to the sycl::event address. +static void destroy_event(event_ptr event) { delete event; } + +class device_info { +public: + // get interface + const char *get_name() const { return _name; } + char *get_name() { return _name; } + template , + std::enable_if_t> || + std::is_same_v, + int> = 0> + auto get_max_work_item_sizes() const { + if constexpr (std::is_same_v>) + return _max_work_item_sizes; + else + return _max_work_item_sizes_i; + } + template , + std::enable_if_t> || + std::is_same_v, + int> = 0> + auto get_max_work_item_sizes() { + if constexpr (std::is_same_v>) + return _max_work_item_sizes; + else + return _max_work_item_sizes_i; + } + bool get_host_unified_memory() const { return _host_unified_memory; } + int get_major_version() const { return _major; } + int get_minor_version() const { return _minor; } + int get_integrated() const { return _integrated; } + int get_max_clock_frequency() const { return _frequency; } + int get_max_compute_units() const { return _max_compute_units; } + int get_max_work_group_size() const { return _max_work_group_size; } + int get_max_sub_group_size() const { return _max_sub_group_size; } + int get_max_work_items_per_compute_unit() const { + return _max_work_items_per_compute_unit; + } + int get_max_register_size_per_work_group() const { + return _max_register_size_per_work_group; + } + template || + std::is_same_v, + int> = 0> + auto get_max_nd_range_size() const { + if constexpr (std::is_same_v) + return _max_nd_range_size; + else + return _max_nd_range_size_i; + } + template || + std::is_same_v, + int> = 0> + auto get_max_nd_range_size() { + if constexpr (std::is_same_v) + return _max_nd_range_size; + else + return _max_nd_range_size_i; + } + size_t get_global_mem_size() const { return _global_mem_size; } + size_t get_local_mem_size() const { return _local_mem_size; } + /// Returns the maximum clock rate of device's global memory in kHz. If + /// compiler does not support this API then returns default value 3200000 kHz. + unsigned int get_memory_clock_rate() const { return _memory_clock_rate; } + /// Returns the maximum bus width between device and memory in bits. If + /// compiler does not support this API then returns default value 64 bits. 
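For orientation, a sketch (not part of the patch) of reading a few of these fields through compat::get_current_device(), the same pattern the gemm testbed above was migrated to; the include path is an assumption:

#include <compat.hpp>  // assumed include path for these headers
#include <cstdio>

int main() {
  compat::device_info info = compat::get_current_device().get_device_info();
  std::printf("%s: %d CUs, %zu bytes of local memory, %u-bit memory bus\n",
              info.get_name(), info.get_max_compute_units(),
              info.get_local_mem_size(), info.get_memory_bus_width());
  return 0;
}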
+ unsigned int get_memory_bus_width() const { return _memory_bus_width; } + uint32_t get_device_id() const { return _device_id; } + std::array get_uuid() const { return _uuid; } + /// Returns global memory cache size in bytes. + unsigned int get_global_mem_cache_size() const { + return _global_mem_cache_size; + } + int get_image1d_max() const { return _image1d_max; } + auto get_image2d_max() const { return _image2d_max; } + auto get_image2d_max() { return _image2d_max; } + auto get_image3d_max() const { return _image3d_max; } + auto get_image3d_max() { return _image3d_max; } + + // set interface + void set_name(const char *name) { + size_t length = strlen(name); + if (length < device_info::NAME_BUFFER_SIZE) { + std::memcpy(_name, name, length + 1); + } else { + std::memcpy(_name, name, device_info::NAME_BUFFER_SIZE - 1); + _name[255] = '\0'; + } + } + void set_max_work_item_sizes(const sycl::range<3> max_work_item_sizes) { + _max_work_item_sizes = max_work_item_sizes; + for (int i = 0; i < 3; ++i) + _max_work_item_sizes_i[i] = max_work_item_sizes[i]; + } + [[deprecated]] void + set_max_work_item_sizes(const sycl::id<3> max_work_item_sizes) { + for (int i = 0; i < 3; ++i) { + _max_work_item_sizes[i] = max_work_item_sizes[i]; + _max_work_item_sizes_i[i] = max_work_item_sizes[i]; + } + } + void set_host_unified_memory(bool host_unified_memory) { + _host_unified_memory = host_unified_memory; + } + void set_major_version(int major) { _major = major; } + void set_minor_version(int minor) { _minor = minor; } + void set_integrated(int integrated) { _integrated = integrated; } + void set_max_clock_frequency(int frequency) { _frequency = frequency; } + void set_max_compute_units(int max_compute_units) { + _max_compute_units = max_compute_units; + } + void set_global_mem_size(size_t global_mem_size) { + _global_mem_size = global_mem_size; + } + void set_local_mem_size(size_t local_mem_size) { + _local_mem_size = local_mem_size; + } + void set_max_work_group_size(int max_work_group_size) { + _max_work_group_size = max_work_group_size; + } + void set_max_sub_group_size(int max_sub_group_size) { + _max_sub_group_size = max_sub_group_size; + } + void + set_max_work_items_per_compute_unit(int max_work_items_per_compute_unit) { + _max_work_items_per_compute_unit = max_work_items_per_compute_unit; + } + void set_max_nd_range_size(int max_nd_range_size[]) { + for (int i = 0; i < 3; i++) { + _max_nd_range_size[i] = max_nd_range_size[i]; + _max_nd_range_size_i[i] = max_nd_range_size[i]; + } + } + void set_max_nd_range_size(sycl::id<3> max_nd_range_size) { + for (int i = 0; i < 3; i++) { + _max_nd_range_size[i] = max_nd_range_size[i]; + _max_nd_range_size_i[i] = max_nd_range_size[i]; + } + } + void set_memory_clock_rate(unsigned int memory_clock_rate) { + _memory_clock_rate = memory_clock_rate; + } + void set_memory_bus_width(unsigned int memory_bus_width) { + _memory_bus_width = memory_bus_width; + } + void + set_max_register_size_per_work_group(int max_register_size_per_work_group) { + _max_register_size_per_work_group = max_register_size_per_work_group; + } + void set_device_id(uint32_t device_id) { _device_id = device_id; } + void set_uuid(std::array uuid) { _uuid = std::move(uuid); } + void set_global_mem_cache_size(unsigned int global_mem_cache_size) { + _global_mem_cache_size = global_mem_cache_size; + } + void set_image1d_max(size_t image_max_buffer_size) { + _image1d_max = image_max_buffer_size; + } + void set_image2d_max(size_t image_max_width_buffer_size, + size_t image_max_height_buffer_size) { + 
_image2d_max[0] = image_max_width_buffer_size; + _image2d_max[1] = image_max_height_buffer_size; + } + void set_image3d_max(size_t image_max_width_buffer_size, + size_t image_max_height_buffer_size, + size_t image_max_depth_buffer_size) { + _image3d_max[0] = image_max_width_buffer_size; + _image3d_max[1] = image_max_height_buffer_size; + _image3d_max[2] = image_max_depth_buffer_size; + } + +private: + constexpr static size_t NAME_BUFFER_SIZE = 256; + + char _name[device_info::NAME_BUFFER_SIZE]; + sycl::range<3> _max_work_item_sizes; + int _max_work_item_sizes_i[3]; + bool _host_unified_memory = false; + int _major; + int _minor; + int _integrated = 0; + int _frequency; + // Set estimated value 3200000 kHz as default value. + unsigned int _memory_clock_rate = 3200000; + // Set estimated value 64 bits as default value. + unsigned int _memory_bus_width = 64; + unsigned int _global_mem_cache_size; + int _max_compute_units; + int _max_work_group_size; + int _max_sub_group_size; + int _max_work_items_per_compute_unit; + int _max_register_size_per_work_group; + size_t _global_mem_size; + size_t _local_mem_size; + size_t _max_nd_range_size[3]; + int _max_nd_range_size_i[3]; + uint32_t _device_id; + std::array _uuid; + int _image1d_max; + int _image2d_max[2]; + int _image3d_max[3]; +}; + +static int get_major_version(const sycl::device &dev) { + int major, minor; + detail::get_version(dev, major, minor); + return major; +} + +static int get_minor_version(const sycl::device &dev) { + int major, minor; + detail::get_version(dev, major, minor); + return minor; +} + +static inline void +has_capability_or_fail(const sycl::device &dev, + const std::initializer_list &props) { + for (const auto &it : props) { + if (dev.has(it)) + continue; + switch (it) { + case sycl::aspect::fp64: + throw sycl::exception(sycl::make_error_code(sycl::errc::runtime), + "[Compat] 'double' is not supported in '" + + dev.get_info() + + "' device"); + break; + case sycl::aspect::fp16: + throw sycl::exception(sycl::make_error_code(sycl::errc::runtime), + "[Compat] 'half' is not supported in '" + + dev.get_info() + + "' device"); + break; + default: +#define __SYCL_ASPECT(ASPECT, ID) \ + case sycl::aspect::ASPECT: \ + return #ASPECT; +#define __SYCL_ASPECT_DEPRECATED(ASPECT, ID, MESSAGE) __SYCL_ASPECT(ASPECT, ID) +#define __SYCL_ASPECT_DEPRECATED_ALIAS(ASPECT, ID, MESSAGE) + auto getAspectNameStr = [](sycl::aspect AspectNum) -> std::string { + switch (AspectNum) { +#include +#include + default: + return "unknown aspect"; + } + }; +#undef __SYCL_ASPECT_DEPRECATED_ALIAS +#undef __SYCL_ASPECT_DEPRECATED +#undef __SYCL_ASPECT + throw sycl::exception( + sycl::make_error_code(sycl::errc::runtime), + "[Compat] '" + getAspectNameStr(it) + "' is not supported in '" + + dev.get_info() + "' device"); + } + break; + } +} + +/// device extension +class device_ext : public sycl::device { +public: + device_ext() : sycl::device(), _ctx(*this) {} + ~device_ext() { + try { + std::lock_guard lock(m_mutex); + sycl::event::wait(_events); + _queues.clear(); + } catch (std::exception &e) { + __SYCL_REPORT_EXCEPTION_TO_STREAM("exception in ~device_ext", e); + } + } + device_ext(const sycl::device &base, bool print_on_async_exceptions = false, + bool in_order = true) + : sycl::device(base), _ctx(*this) { + if (!this->has(sycl::aspect::usm_device_allocations)) { + throw std::invalid_argument( + "Device does not support device USM allocations"); + } + // calls create_queue since we don't have a locked m_mutex + _default_queue = 
create_queue(print_on_async_exceptions, in_order); + _saved_queue = _default_queue; + } + + bool is_native_host_atomic_supported() { return false; } + int get_major_version() const { return compat::get_major_version(*this); } + + int get_minor_version() const { return compat::get_minor_version(*this); } + + int get_max_compute_units() const { + return get_device_info().get_max_compute_units(); + } + + /// Return the maximum clock frequency of this device in KHz. + int get_max_clock_frequency() const { + return get_device_info().get_max_clock_frequency(); + } + + int get_integrated() const { return get_device_info().get_integrated(); } + + int get_max_sub_group_size() const { + return get_device_info().get_max_sub_group_size(); + } + + int get_max_register_size_per_work_group() const { + return get_device_info().get_max_register_size_per_work_group(); + } + + int get_max_work_group_size() const { + return get_device_info().get_max_work_group_size(); + } + + int get_mem_base_addr_align() const { + return get_info(); + } + + size_t get_global_mem_size() const { + return get_device_info().get_global_mem_size(); + } + + size_t get_local_mem_size() const { + return get_device_info().get_local_mem_size(); + } + + /// Get the number of bytes of free and total memory on the SYCL device. + /// \param [out] free_memory The number of bytes of free memory on the SYCL + /// device. + /// \param [out] total_memory The number of bytes of total memory on the SYCL + /// device. + void get_memory_info(size_t &free_memory, size_t &total_memory) const { + if (!has(sycl::aspect::ext_intel_free_memory)) { + std::cerr << "[Compat] get_memory_info: ext_intel_free_memory is not " + "supported." + << std::endl; + free_memory = 0; + } else { + free_memory = get_info(); + } + total_memory = get_device_info().get_global_mem_size(); + } + + void get_device_info(device_info &out) const { + if (_dev_info) { + out = *_dev_info; + return; + } + + std::lock_guard lock(m_mutex); + device_info prop; + prop.set_name(get_info().c_str()); + + int major, minor; + get_version(major, minor); + prop.set_major_version(major); + prop.set_minor_version(minor); + + prop.set_max_work_item_sizes( + // SYCL 2020-conformant code, max_work_item_sizes is a struct + // templated by an int + get_info>()); + + prop.set_host_unified_memory(has(sycl::aspect::usm_host_allocations)); + + prop.set_max_clock_frequency( + get_info()); + prop.set_max_compute_units( + get_info()); + prop.set_max_work_group_size( + get_info()); + prop.set_global_mem_size(get_info()); + prop.set_local_mem_size(get_info()); + +#if (defined(SYCL_EXT_INTEL_DEVICE_INFO) && SYCL_EXT_INTEL_DEVICE_INFO >= 6) + if (has(sycl::aspect::ext_intel_memory_clock_rate)) { + unsigned int tmp = + get_info(); + if (tmp != 0) + prop.set_memory_clock_rate(1000 * tmp); + } + if (has(sycl::aspect::ext_intel_memory_bus_width)) { + prop.set_memory_bus_width( + get_info()); + } + if (has(sycl::aspect::ext_intel_device_id)) { + prop.set_device_id(get_info()); + } + if (has(sycl::aspect::ext_intel_device_info_uuid)) { + prop.set_uuid(get_info()); + } +#elif defined(_MSC_VER) && !defined(__clang__) +#pragma message("get_device_info: querying memory_clock_rate and \ +memory_bus_width are not supported by the compiler used. \ +Use 3200000 kHz as memory_clock_rate default value. \ +Use 64 bits as memory_bus_width default value.") +#else +#warning "get_device_info: querying memory_clock_rate and \ +memory_bus_width are not supported by the compiler used. 
\ +Use 3200000 kHz as memory_clock_rate default value. \ +Use 64 bits as memory_bus_width default value." +#endif + + size_t max_sub_group_size = 1; + std::vector sub_group_sizes = + get_info(); + + for (const auto &sub_group_size : sub_group_sizes) { + if (max_sub_group_size < sub_group_size) + max_sub_group_size = sub_group_size; + } + + prop.set_max_sub_group_size(max_sub_group_size); + + prop.set_max_work_items_per_compute_unit( + get_info()); +#ifdef SYCL_EXT_ONEAPI_MAX_WORK_GROUP_QUERY + prop.set_max_nd_range_size( + get_info>()); +#else +#if defined(_MSC_VER) && !defined(__clang__) +#pragma message("get_device_info: querying the maximum number \ + of work groups is not supported.") +#else +#warning "get_device_info: querying the maximum number of \ + work groups is not supported." +#endif + int max_nd_range_size[] = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF}; + prop.set_max_nd_range_size(max_nd_range_size); +#endif + + // Estimates max register size per work group, feel free to update the + // value according to device properties. + prop.set_max_register_size_per_work_group(65536); + + prop.set_global_mem_cache_size( + get_info()); + + prop.set_image1d_max(get_info()); + prop.set_image1d_max(get_info()); + prop.set_image2d_max(get_info(), + get_info()); + prop.set_image3d_max(get_info(), + get_info(), + get_info()); + + _dev_info = prop; + out = prop; + } + + device_info get_device_info() const { + if (!_dev_info) { + this->get_device_info(*_dev_info); + } + return _dev_info.value(); + } + + void reset(bool print_on_async_exceptions = false, bool in_order = true) { + std::lock_guard lock(m_mutex); + // The queues are shared_ptrs and the ref counts of the shared_ptrs increase + // only in wait_and_throw(). If there is no other thread calling + // wait_and_throw(), the queues will be destructed. The destructor waits for + // all commands executing on the queue to complete. It isn't possible to + // destroy a queue immediately. This is a synchronization point in SYCL. + _queues.clear(); + // create new default queue + // calls create_queue_impl since we already have a locked m_mutex + + _saved_queue = _default_queue = + in_order ? create_queue_impl(print_on_async_exceptions, + sycl::property::queue::in_order()) + : create_queue_impl(print_on_async_exceptions); + } + + void set_default_queue(const sycl::queue &q) { + std::lock_guard lock(m_mutex); + _queues.front().get()->wait_and_throw(); + _queues[0] = std::make_shared(q); + if (_saved_queue == _default_queue) + _saved_queue = _queues.front().get(); + _default_queue = _queues.front().get(); + } + + queue_ptr default_queue() { return _default_queue; } + + void queues_wait_and_throw() { + std::unique_lock lock(m_mutex); + std::vector> current_queues(_queues); + lock.unlock(); + for (const auto &q : current_queues) { + q->wait_and_throw(); + } + // Guard the destruct of current_queues to make sure the ref count is safe. + lock.lock(); + } + queue_ptr create_queue(bool print_on_async_exceptions = false, + bool in_order = true) { + std::lock_guard lock(m_mutex); + return in_order ? 
create_queue_impl(print_on_async_exceptions, + sycl::property::queue::in_order()) + : create_queue_impl(print_on_async_exceptions); + } + void destroy_queue(queue_ptr &queue) { + std::lock_guard lock(m_mutex); + _queues.erase( + std::remove_if(_queues.begin(), _queues.end(), + [=](const std::shared_ptr &q) -> bool { + return q.get() == queue; + }), + _queues.end()); + queue = nullptr; + } + void set_saved_queue(queue_ptr q) { + std::lock_guard lock(m_mutex); + _saved_queue = q; + } + queue_ptr get_saved_queue() const { + std::lock_guard lock(m_mutex); + return _saved_queue; + } + sycl::context get_context() const { return _ctx; } + + /// Util function to check whether a device supports some kinds of + /// sycl::aspect. + void has_capability_or_fail( + const std::initializer_list &props) const { + ::compat::has_capability_or_fail(*this, props); + } + +private: + /// Caller should only be done from functions where the resource \p m_mutex + /// has been acquired. + template + queue_ptr create_queue_impl(bool print_on_async_exceptions = false, + PropertiesT... properties) { + sycl::property_list prop = sycl::property_list( +#ifdef COMPAT_PROFILING_ENABLED + sycl::property::queue::enable_profiling(), +#endif + properties...); + if (print_on_async_exceptions) { + _queues.push_back(std::make_shared( + _ctx, *this, detail::exception_handler, prop)); + } else { + _queues.push_back(std::make_shared(_ctx, *this, prop)); + } + return _queues.back().get(); + } + + void get_version(int &major, int &minor) const { + detail::get_version(*this, major, minor); + } + void add_event(sycl::event event) { + std::lock_guard lock(m_mutex); + _events.push_back(event); + } + friend sycl::event enqueue_free(const std::vector &, + const std::vector &, + sycl::queue); + queue_ptr _default_queue; + queue_ptr _saved_queue; + sycl::context _ctx; + std::vector> _queues; + mutable std::mutex m_mutex; + std::vector _events; + mutable std::optional _dev_info; +}; + +namespace detail { + +static inline unsigned int get_tid() { +#if defined(__linux__) + return syscall(SYS_gettid); +#elif defined(_WIN64) + return GetCurrentThreadId(); +#else +#error "Only support Windows and Linux." +#endif +} + +/// device manager +class dev_mgr { +public: + device_ext ¤t_device() { + unsigned int dev_id = current_device_id(); + check_id(dev_id); + return *_devs[dev_id]; + } + device_ext &cpu_device() const { + std::lock_guard lock(m_mutex); + if (_cpu_device == -1) { + throw std::runtime_error("[Compat] No valid cpu device"); + } else { + return *_devs[_cpu_device]; + } + } + device_ext &get_device(unsigned int id) const { + std::lock_guard lock(m_mutex); + check_id(id); + return *_devs[id]; + } + unsigned int current_device_id() const { + std::lock_guard lock(m_mutex); + auto it = _thread2dev_map.find(get_tid()); + if (it != _thread2dev_map.end()) + return it->second; + return _default_device_id; + } + + /// Select device with a device ID. + /// \param [in] id The id of the device which can + /// be obtained through get_device_id(const sycl::device). + void select_device(unsigned int id) { + std::lock_guard lock(m_mutex); + check_id(id); + _thread2dev_map[get_tid()] = id; + } + unsigned int device_count() { return _devs.size(); } + + unsigned int get_device_id(const sycl::device &dev) { + if (!_devs.size()) { + throw std::runtime_error( + "[Compat] No SYCL devices found in the device list. 
Device list " + "may have been filtered by compat::filter_device"); + } + unsigned int id = 0; + for (auto dev_item : _devs) { + if (*dev_item == dev) { + return id; + } + id++; + } + throw std::runtime_error("[Compat] The device[" + + dev.get_info() + + "] is filtered out by compat::filter_device " + "in current device list!"); + } + + /// List all the devices with its id in dev_mgr. + void list_devices() const { + for (size_t i = 0; i < _devs.size(); ++i) { + std::cout << "Device " << i << ": " + << _devs[i]->get_info() << std::endl; + } + } + + /// Filter out devices; only keep the device whose name contains one of the + /// subname in \p dev_subnames. + /// May break device id mapping and change current device. It's better to be + /// called before other Compat/SYCL APIs. + void filter(const std::vector &dev_subnames) { + std::lock_guard lock(m_mutex); + auto iter = _devs.begin(); + while (iter != _devs.end()) { + std::string dev_name = (*iter)->get_info(); + bool matched = false; + for (const auto &name : dev_subnames) { + if (dev_name.find(name) != std::string::npos) { + matched = true; + break; + } + } + if (matched) + ++iter; + else + iter = _devs.erase(iter); + } + _cpu_device = -1; + for (unsigned i = 0; i < _devs.size(); ++i) { + if (_devs[i]->is_cpu()) { + _cpu_device = i; + break; + } + } + _thread2dev_map.clear(); +#ifdef COMPAT_VERBOSE + list_devices(); +#endif + } + + /// Select device with a Device Selector + /// \param selector device selector to get the device id from. Defaults to + /// sycl::gpu_selector_v + template + std::enable_if_t< + std::is_invocable_r_v> + select_device(const DeviceSelector &selector = sycl::gpu_selector_v) { + sycl::device selected_device = sycl::device(selector); + unsigned int selected_device_id = get_device_id(selected_device); + select_device(selected_device_id); + } + + /// Returns the instance of device manager singleton. + static dev_mgr &instance() { + static dev_mgr d_m; + return d_m; + } + dev_mgr(const dev_mgr &) = delete; + dev_mgr &operator=(const dev_mgr &) = delete; + dev_mgr(dev_mgr &&) = delete; + dev_mgr &operator=(dev_mgr &&) = delete; + +private: + mutable std::mutex m_mutex; + + dev_mgr() { + sycl::device default_device = sycl::device(sycl::default_selector_v); + _devs.push_back(std::make_shared(default_device)); + + std::vector sycl_all_devs = + sycl::device::get_devices(sycl::info::device_type::all); + // Collect other devices except for the default device. + if (default_device.is_cpu()) + _cpu_device = 0; + for (auto &dev : sycl_all_devs) { + if (dev == default_device) { + continue; + } + _devs.push_back(std::make_shared(dev)); + if (_cpu_device == -1 && dev.is_cpu()) { + _cpu_device = _devs.size() - 1; + } + } +#ifdef COMPAT_VERBOSE + list_devices(); +#endif + } + void check_id(unsigned int id) const { + if (id >= _devs.size()) { + throw std::runtime_error("invalid device id"); + } + } + std::vector> _devs; + /// DEFAULT_DEVICE_ID is used, if current_device_id() can not find current + /// thread id in _thread2dev_map, which means default device should be used + /// for the current thread. + const unsigned int _default_device_id = 0; + /// thread-id to device-id map. 
+ std::map _thread2dev_map; + int _cpu_device = -1; +}; + +} // namespace detail + +static inline sycl::queue create_queue(bool print_on_async_exceptions = false, + bool in_order = true) { + return *detail::dev_mgr::instance().current_device().create_queue( + print_on_async_exceptions, in_order); +} + +/// Util function to get the default queue of current device in +/// device manager. +static inline sycl::queue get_default_queue() { + return *detail::dev_mgr::instance().current_device().default_queue(); +} + +/// Util function to change the default queue of the current device in the +/// device manager +/// If the device extension saved queue is the default queue, +/// the previous saved queue will be overwritten as well. +/// This function will be blocking if there are submitted kernels in the +/// previous default queue. +/// @param q New user-defined queue +static inline void set_default_queue(const sycl::queue &q) { + detail::dev_mgr::instance().current_device().set_default_queue(q); +} + +static inline void wait(sycl::queue q = get_default_queue()) { q.wait(); } + +static inline void wait_and_throw(sycl::queue q = get_default_queue()) { + q.wait_and_throw(); +} + +/// Util function to get the id of current device in +/// device manager. +static inline unsigned int get_current_device_id() { + return detail::dev_mgr::instance().current_device_id(); +} + +/// Util function to get the current device. +static inline device_ext &get_current_device() { + return detail::dev_mgr::instance().current_device(); +} + +/// Util function to get a device by id. +static inline device_ext &get_device(unsigned int id) { + return detail::dev_mgr::instance().get_device(id); +} + +/// Util function to get the context of the default queue of current +/// device in device manager. +static inline sycl::context get_default_context() { + return get_current_device().get_context(); +} + +/// Util function to get a CPU device. +static inline device_ext &cpu_device() { + return detail::dev_mgr::instance().cpu_device(); +} + +/// Filter out devices; only keep the device whose name contains one of the +/// subname in \p dev_subnames. +/// May break device id mapping and change current device. It's better to be +/// called before other Compat or SYCL APIs. +static inline void filter_device(const std::vector &dev_subnames) { + detail::dev_mgr::instance().filter(dev_subnames); +} + +/// List all the devices with its id in dev_mgr. +static inline void list_devices() { + detail::dev_mgr::instance().list_devices(); +} + +static inline unsigned int select_device(unsigned int id) { + detail::dev_mgr::instance().select_device(id); + return id; +} + +template +static inline std::enable_if_t< + std::is_invocable_r_v> +select_device(const DeviceSelector &selector = sycl::gpu_selector_v) { + detail::dev_mgr::instance().select_device(selector); +} + +static inline unsigned int get_device_id(const sycl::device &dev) { + return detail::dev_mgr::instance().get_device_id(dev); +} + +static inline unsigned int device_count() { + return detail::dev_mgr::instance().device_count(); +} +} // namespace compat diff --git a/tools/util/include/compat/dims.hpp b/tools/util/include/compat/dims.hpp new file mode 100644 index 0000000000..8da01a39e6 --- /dev/null +++ b/tools/util/include/compat/dims.hpp @@ -0,0 +1,74 @@ +/*************************************************************************** + * + * Copyright (C) Codeplay Software Ltd. + * Copyright (C) 2025 Intel Corporation, All rights reserved. 
+ * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM + * Exceptions. See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Compat + * + * dims.hpp + * + * Description: + * dim3 functionality for Compat + **************************************************************************/ + +#pragma once + +#include + +#include + +namespace compat { + +class dim3 { +public: + unsigned int x, y, z; + + dim3(const sycl::range<3> &r) : x(r[2]), y(r[1]), z(r[0]) {} + + dim3(const sycl::range<2> &r) : x(r[1]), y(r[0]), z(1) {} + + dim3(const sycl::range<1> &r) : x(r[0]), y(1), z(1) {} + + constexpr dim3(unsigned int x = 1, unsigned int y = 1, unsigned int z = 1) + : x(x), y(y), z(z) {} + + constexpr size_t size() const { return x * y * z; } + + operator sycl::range<3>() const { return sycl::range<3>(z, y, x); } + operator sycl::range<2>() const { + if (z != 1) + throw std::invalid_argument( + "Attempting to convert a 3D dim3 into sycl::range<2>"); + return sycl::range<2>(y, x); + } + operator sycl::range<1>() const { + if (z != 1 || y != 1) + throw std::invalid_argument( + "Attempting to convert a 2D or 3D dim3 into sycl::range<1>"); + return sycl::range<1>(x); + } +}; // namespace dim3 + +inline dim3 operator*(const dim3 &a, const dim3 &b) { + return dim3{a.x * b.x, a.y * b.y, a.z * b.z}; +} + +inline dim3 operator+(const dim3 &a, const dim3 &b) { + return dim3{a.x + b.x, a.y + b.y, a.z + b.z}; +} + +inline dim3 operator-(const dim3 &a, const dim3 &b) { + return dim3{a.x - b.x, a.y - b.y, a.z - b.z}; +} + +} // namespace compat diff --git a/tools/util/include/compat/group_utils.hpp b/tools/util/include/compat/group_utils.hpp new file mode 100644 index 0000000000..a473b5f59b --- /dev/null +++ b/tools/util/include/compat/group_utils.hpp @@ -0,0 +1,1270 @@ +/*************************************************************************** + * + * Copyright (C) Codeplay Software Ltd. + * Copyright (C) 2025 Intel Corporation, All rights reserved. + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM + * Exceptions. See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SYCL compatibility extension + * + * group_utils.hpp + * + * Description: + * Group util functionality for the SYCL compatibility extension + **************************************************************************/ + +// The original source was under the license below: +//==---- group_utils.hpp ------------------*- C++ -*--------------------==// +// +// Copyright (C) Intel Corporation +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// See https://llvm.org/LICENSE.txt for license information. 
+// +//===------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include + +#include +#include + +namespace compat { +namespace group { +namespace detail { + +template +constexpr auto __reduce_over_group(_Args... __args) { + return sycl::reduce_over_group(__args...); +} + +template constexpr auto __group_broadcast(_Args... __args) { + return sycl::group_broadcast(__args...); +} + +template +constexpr auto __exclusive_scan_over_group(_Args... __args) { + return sycl::exclusive_scan_over_group(__args...); +} + +template +constexpr auto __inclusive_scan_over_group(_Args... __args) { + return sycl::inclusive_scan_over_group(__args...); +} + +template +__compat_inline__ T +exclusive_scan(const Item &item, T input, BinaryOperation binary_op, + GroupPrefixCallbackOperation &prefix_callback_op) { + T group_aggregate; + + T output = + detail::__exclusive_scan_over_group(item.get_group(), input, binary_op); + if (item.get_local_linear_id() == item.get_local_range().size() - 1) { + group_aggregate = binary_op(output, input); + } + + group_aggregate = detail::__group_broadcast( + item.get_group(), group_aggregate, item.get_local_range().size() - 1); + + T group_prefix = prefix_callback_op(group_aggregate); + if (item.get_local_linear_id() == 0) { + output = group_prefix; + } else { + output = binary_op(group_prefix, output); + } + + return output; +} + +typedef uint16_t digit_counter_type; +typedef uint32_t packed_counter_type; + +template struct log2 { + enum { VALUE = log2> 1), COUNT + 1>::VALUE }; +}; + +template struct log2 { + enum { VALUE = (1 << (COUNT - 1) < N) ? COUNT : COUNT - 1 }; +}; + +template class radix_rank { +public: + static size_t get_local_memory_size(size_t group_threads) { + return group_threads * PADDED_COUNTER_LANES * sizeof(packed_counter_type); + } + + radix_rank(uint8_t *local_memory) : _local_memory(local_memory) {} + + template + __compat_inline__ void + rank_keys(const Item &item, uint32_t (&keys)[VALUES_PER_THREAD], + int (&ranks)[VALUES_PER_THREAD], int current_bit, int num_bits) { + + digit_counter_type thread_prefixes[VALUES_PER_THREAD]; + digit_counter_type *digit_counters[VALUES_PER_THREAD]; + digit_counter_type *buffer = + reinterpret_cast(_local_memory); + auto g = item.get_group(); + reset_local_memory(item); + + sycl::group_barrier(g, sycl::memory_scope::work_group); + +#pragma unroll + for (int i = 0; i < VALUES_PER_THREAD; ++i) { + uint32_t digit = + ::compat::detail::bfe(keys[i], current_bit, num_bits); + uint32_t sub_counter = digit >> LOG_COUNTER_LANES; + uint32_t counter_lane = digit & (COUNTER_LANES - 1); + + if (DESCENDING) { + sub_counter = PACKING_RATIO - 1 - sub_counter; + counter_lane = COUNTER_LANES - 1 - counter_lane; + } + + digit_counters[i] = + &buffer[counter_lane * item.get_local_range().size() * PACKING_RATIO + + item.get_local_linear_id() * PACKING_RATIO + sub_counter]; + thread_prefixes[i] = *digit_counters[i]; + *digit_counters[i] = thread_prefixes[i] + 1; + } + + sycl::group_barrier(g, sycl::memory_scope::work_group); + + scan_counters(item); + + sycl::group_barrier(g, sycl::memory_scope::work_group); + + for (int i = 0; i < VALUES_PER_THREAD; ++i) { + ranks[i] = thread_prefixes[i] + *digit_counters[i]; + } + } + +private: + template + __compat_inline__ void reset_local_memory(const Item &item) { + packed_counter_type *ptr = + reinterpret_cast(_local_memory); + +#pragma unroll + for (int i = 0; i < PADDED_COUNTER_LANES; ++i) { + ptr[i * item.get_local_range().size() + 
item.get_local_linear_id()] = 0; + } + } + + template + __compat_inline__ packed_counter_type upsweep(const Item &item) { + packed_counter_type sum = 0; + packed_counter_type *ptr = + reinterpret_cast(_local_memory); + +#pragma unroll + for (int i = 0; i < PADDED_COUNTER_LANES; i++) { + cached_segment[i] = + ptr[item.get_local_linear_id() * PADDED_COUNTER_LANES + i]; + } + +#pragma unroll + for (int i = 0; i < PADDED_COUNTER_LANES; ++i) { + sum += cached_segment[i]; + } + + return sum; + } + + template + __compat_inline__ void + exclusive_downsweep(const Item &item, packed_counter_type raking_partial) { + packed_counter_type *ptr = + reinterpret_cast(_local_memory); + packed_counter_type sum = raking_partial; + +#pragma unroll + for (int i = 0; i < PADDED_COUNTER_LANES; ++i) { + packed_counter_type value = cached_segment[i]; + cached_segment[i] = sum; + sum += value; + } + +#pragma unroll + for (int i = 0; i < PADDED_COUNTER_LANES; ++i) { + ptr[item.get_local_linear_id() * PADDED_COUNTER_LANES + i] = + cached_segment[i]; + } + } + + struct prefix_callback { + __compat_inline__ packed_counter_type + operator()(packed_counter_type block_aggregate) { + packed_counter_type block_prefix = 0; + +#pragma unroll + for (int packed = 1; packed < PACKING_RATIO; packed++) { + block_prefix += block_aggregate + << (sizeof(digit_counter_type) * 8 * packed); + } + + return block_prefix; + } + }; + + template + __compat_inline__ void scan_counters(const Item &item) { + packed_counter_type raking_partial = upsweep(item); + + prefix_callback callback; + packed_counter_type exclusive_partial = exclusive_scan( + item, raking_partial, sycl::ext::oneapi::plus(), + callback); + + exclusive_downsweep(item, exclusive_partial); + } + +private: + static constexpr int PACKING_RATIO = + sizeof(packed_counter_type) / sizeof(digit_counter_type); + static constexpr int LOG_PACKING_RATIO = log2::VALUE; + static constexpr int LOG_COUNTER_LANES = RADIX_BITS - LOG_PACKING_RATIO; + static constexpr int COUNTER_LANES = 1 << LOG_COUNTER_LANES; + static constexpr int PADDED_COUNTER_LANES = COUNTER_LANES + 1; + + packed_counter_type cached_segment[PADDED_COUNTER_LANES]; + uint8_t *_local_memory; +}; + +template struct base_traits { + + static __compat_inline__ U twiddle_in(U key) { + throw std::runtime_error("Not implemented"); + } + static __compat_inline__ U twiddle_out(U key) { + throw std::runtime_error("Not implemented"); + } +}; + +template struct base_traits { + static __compat_inline__ U twiddle_in(U key) { return key; } + static __compat_inline__ U twiddle_out(U key) { return key; } +}; + +template struct base_traits { + static constexpr U HIGH_BIT = U(1) << ((sizeof(U) * 8) - 1); + static __compat_inline__ U twiddle_in(U key) { return key ^ HIGH_BIT; } + static __compat_inline__ U twiddle_out(U key) { return key ^ HIGH_BIT; } +}; + +template struct base_traits { + static constexpr U HIGH_BIT = U(1) << ((sizeof(U) * 8) - 1); + static __compat_inline__ U twiddle_in(U key) { + U mask = (key & HIGH_BIT) ? U(-1) : HIGH_BIT; + return key ^ mask; + } + static __compat_inline__ U twiddle_out(U key) { + U mask = (key & HIGH_BIT) ? 
HIGH_BIT : U(-1); + return key ^ mask; + } +}; + +template struct traits : base_traits {}; +template <> struct traits : base_traits {}; +template <> struct traits : base_traits {}; +template <> struct traits : base_traits {}; + +template struct power_of_two { + enum { VALUE = ((N & (N - 1)) == 0) }; +}; + +__compat_inline__ uint32_t shr_add(uint32_t x, uint32_t shift, + uint32_t addend) { + return (x >> shift) + addend; +} + +} // namespace detail + +/// Rearranging data partitioned across a work-group. +/// +/// \tparam T The type of the data elements. +/// \tparam ElementsPerWorkItem The number of data elements assigned to a +/// work-item. +template class exchange { +public: + static size_t get_local_memory_size(size_t group_threads) { + size_t padding_values = + (INSERT_PADDING) + ? ((group_threads * ElementsPerWorkItem) >> LOG_LOCAL_MEMORY_BANKS) + : 0; + return (group_threads * ElementsPerWorkItem + padding_values) * sizeof(T); + } + + exchange(uint8_t *local_memory) : _local_memory(local_memory) {} + + // TODO: Investigate if padding is required for performance, + // and if specializations are required for specific target hardware. + static size_t adjust_by_padding(size_t offset) { + + if constexpr (INSERT_PADDING) { + offset = detail::shr_add(offset, LOG_LOCAL_MEMORY_BANKS, offset); + } + return offset; + } + + struct blocked_offset { + template size_t operator()(Item item, size_t i) { + size_t offset = item.get_local_linear_id() * ElementsPerWorkItem + i; + return adjust_by_padding(offset); + } + }; + + struct striped_offset { + template size_t operator()(Item item, size_t i) { + size_t offset = i * item.get_local_range(2) * item.get_local_range(1) * + item.get_local_range(0) + + item.get_local_linear_id(); + return adjust_by_padding(offset); + } + }; + + template struct scatter_offset { + Iterator begin; + scatter_offset(const int (&ranks)[ElementsPerWorkItem]) { + begin = std::begin(ranks); + } + template size_t operator()(Item item, size_t i) const { + // iterator i is expected to be within bounds [0,VALUES_PER_THREAD) + return adjust_by_padding(begin[i]); + } + }; + + /// Inplace rearrange elements from blocked order to striped order. + /// + /// Suppose 512 integer data elements partitioned across 128 work-items, where + /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the + /// blocked \p input across the work-group is: + /// + /// {[0, 1, 2, 3], [4, 5, 6, 7], ..., [508, 509, 510, 511]}. + /// + /// The striped order output is: + /// + /// {[0, 128, 256, 384], [1, 129, 257, 385], ..., [127, 255, 383, 511]}. + /// + /// \tparam Item The work-item identifier type. + /// \param item The work-item identifier. + /// \param input The input data of each work-item. + template + __compat_inline__ void + blocked_to_striped(Item item, T (&input)[ElementsPerWorkItem]) { + striped_offset get_striped_offset; + blocked_offset get_blocked_offset; + helper_exchange(item, input, input, get_blocked_offset, get_striped_offset); + } + + /// Inplace rearrange elements from striped order to blocked order. + /// + /// Suppose 512 integer data elements partitioned across 128 work-items, where + /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the + /// striped \p input across the work-group is: + /// + /// { [0, 128, 256, 384], [1, 129, 257, 385], ..., [127, 255, 383, 511] }. + /// + /// The blocked order output is: + /// + /// { [0, 1, 2, 3], [4, 5, 6, 7], ..., [508, 509, 510, 511] }. + /// + /// \tparam Item The work-item identifier type. 
+ /// \param item The work-item identifier. + /// \param input The input data of each work-item. + template + __compat_inline__ void + striped_to_blocked(Item item, T (&input)[ElementsPerWorkItem]) { + blocked_offset get_blocked_offset; + striped_offset get_striped_offset; + helper_exchange(item, input, input, get_striped_offset, get_blocked_offset); + } + + /// Rearrange elements from blocked order to striped order. + /// + /// Suppose 512 integer data elements partitioned across 128 work-items, where + /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the + /// blocked \p input across the work-group is: + /// + /// { [0, 1, 2, 3], [4, 5, 6, 7], ..., [508, 509, 510, 511] }. + /// + /// The striped order output is: + /// + /// { [0, 128, 256, 384], [1, 129, 257, 385], ..., [127, 255, 383, 511] }. + /// + /// \tparam Item The work-item identifier type. + /// \param item The work-item identifier. + /// \param input The input data of each work-item. + /// \param output The corresponding output data of each work-item. + template + __compat_inline__ void + blocked_to_striped(Item item, T (&input)[ElementsPerWorkItem], + T (&output)[ElementsPerWorkItem]) { + striped_offset get_striped_offset; + blocked_offset get_blocked_offset; + helper_exchange(item, input, output, get_blocked_offset, + get_striped_offset); + } + + /// Rearrange elements from striped order to blocked order. + /// + /// Suppose 512 integer data elements partitioned across 128 work-items, where + /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the + /// striped \p input across the work-group is: + /// + /// { [0, 128, 256, 384], [1, 129, 257, 385], ..., [127, 255, 383, 511] }. + /// + /// The blocked order output is: + /// + /// { [0, 1, 2, 3], [4, 5, 6, 7], ..., [508, 509, 510, 511] }. + /// + /// \tparam Item The work-item identifier type. + /// \param item The work-item identifier. + /// \param input The input data of each work-item. + /// \param output The corresponding output data of each work-item. + template + __compat_inline__ void + striped_to_blocked(Item item, T (&input)[ElementsPerWorkItem], + T (&output)[ElementsPerWorkItem]) { + blocked_offset get_blocked_offset; + striped_offset get_striped_offset; + helper_exchange(item, input, output, get_striped_offset, + get_blocked_offset); + } + + /// Inplace exchanges data items annotated by rank into blocked arrangement. + /// + /// Suppose 512 integer data elements partitioned across 128 work-items, where + /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the + /// striped \p input across the work-group is: + /// + /// { [0, 128, 256, 384], [1, 129, 257, 385], ..., [127, 255, 383, 511] }. + /// + /// The rank across the work-group is: + /// + /// { [0, 1, 2, 3], [4, 5, 6, 7], ..., [508, 509, 510, 511] }. + /// + /// The blocked order output is: + /// + /// { [0, 1, 2, 3], [4, 5, 6, 7], ..., [508, 509, 510, 511] }. + /// + /// \tparam Item The work-item identifier type. + /// \param item The work-item identifier. + /// \param input The input data of each work-item. + /// \param ranks The corresponding rank annotation of each work-item. 
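+  ///
+  /// A minimal usage sketch (illustrative only): it assumes the documented
+  /// <T, ElementsPerWorkItem> template arguments, a work-group of 128
+  /// work-items, and a hypothetical pointer `slm` to at least
+  /// get_local_memory_size(128) bytes of work-group local memory.
+  /// \code
+  ///   int vals[4];   // blocked values held by this work-item
+  ///   int ranks[4];  // destination positions, e.g. produced by radix_rank
+  ///   compat::group::exchange<int, 4> ex(slm);
+  ///   ex.scatter_to_blocked(item, vals, ranks);
+  /// \endcode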
+ template + __compat_inline__ void + scatter_to_blocked(Item item, T (&input)[ElementsPerWorkItem], + int (&ranks)[ElementsPerWorkItem]) { + scatter_offset get_scatter_offset(ranks); + blocked_offset get_blocked_offset; + helper_exchange(item, input, input, get_scatter_offset, get_blocked_offset); + } + + /// Inplace exchanges data items annotated by rank into striped arrangement. + /// + /// Suppose 512 integer data elements partitioned across 128 work-items, where + /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the + /// blocked \p input across the work-group is: + /// + /// { [0, 1, 2, 3], [4, 5, 6, 7], ..., [508, 509, 510, 511] }. + /// + /// The rank across the work-group is: + /// + /// { [16, 20, 24, 28], [32, 36, 40, 44], ..., [499, 503, 507, 511] }. + /// + /// The striped order output of each work-item will be: + /// + /// { [0, 128, 256, 384], [1, 129, 257, 385], ..., [127, 255, 383, 511] }. + /// + /// \tparam Item The work-item identifier type. + /// \param item The work-item identifier. + /// \param input The input data of each work-item. + /// \param ranks The corresponding rank annotation of each work-item. + template + __compat_inline__ void + scatter_to_striped(Item item, T (&input)[ElementsPerWorkItem], + int (&ranks)[ElementsPerWorkItem]) { + scatter_offset get_scatter_offset(ranks); + striped_offset get_striped_offset; + helper_exchange(item, input, input, get_scatter_offset, get_striped_offset); + } + +private: + template + __compat_inline__ void + helper_exchange(Item item, T (&input)[ElementsPerWorkItem], + T (&output)[ElementsPerWorkItem], + offsetFunctorTypeFW &offset_functor_fw, + offsetFunctorTypeRV &offset_functor_rv) { + T *buffer = reinterpret_cast(_local_memory); +#pragma unroll + for (size_t i = 0; i < ElementsPerWorkItem; i++) { + size_t offset = offset_functor_fw(item, i); + buffer[offset] = input[i]; + } + sycl::group_barrier(item.get_group()); +#pragma unroll + for (size_t i = 0; i < ElementsPerWorkItem; i++) { + size_t offset = offset_functor_rv(item, i); + output[i] = buffer[offset]; + } + } + + static constexpr int LOG_LOCAL_MEMORY_BANKS = 4; + static constexpr bool INSERT_PADDING = + (ElementsPerWorkItem > 4) && + (detail::power_of_two::VALUE); + + uint8_t *_local_memory; +}; + +/// The work-group wide radix sort to sort integer data elements +/// assigned to all work-items in the work-group. +/// +/// \tparam T The type of the data elements. +/// \tparam ElementsPerWorkItem The number of data elements assigned to +/// a work-item. +/// \tparam RADIX_BITS The number of radix bits per digit place. 
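+///
+/// A minimal usage sketch (illustrative only): it assumes the documented
+/// <T, ElementsPerWorkItem> template arguments with the default radix
+/// width, a work-group of 128 work-items, and a hypothetical pointer
+/// `slm` to at least get_local_memory_size(128) bytes of local memory.
+/// \code
+///   int keys[4];                                   // blocked keys
+///   compat::group::group_radix_sort<int, 4> sorter(slm);
+///   sorter.sort(item, keys);                       // ascending, blocked
+/// \endcode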
+template +class group_radix_sort { + uint8_t *_local_memory; + +public: + group_radix_sort(uint8_t *local_memory) : _local_memory(local_memory) {} + + static size_t get_local_memory_size(size_t group_threads) { + size_t ranks_size = + detail::radix_rank::get_local_memory_size(group_threads); + size_t exchange_size = + exchange::get_local_memory_size(group_threads); + return sycl::max(ranks_size, exchange_size); + } + +private: + template + __compat_inline__ void + helper_sort(const Item &item, T (&keys)[ElementsPerWorkItem], + int begin_bit = 0, int end_bit = 8 * sizeof(T), + bool is_striped = false) { + + uint32_t(&unsigned_keys)[ElementsPerWorkItem] = + reinterpret_cast(keys); + +#pragma unroll + for (int i = 0; i < ElementsPerWorkItem; ++i) { + unsigned_keys[i] = detail::traits::twiddle_in(unsigned_keys[i]); + } + + for (int i = begin_bit; i < end_bit; i += RADIX_BITS) { + int pass_bits = sycl::min(RADIX_BITS, end_bit - begin_bit); + + int ranks[ElementsPerWorkItem]; + detail::radix_rank(_local_memory) + .template rank_keys(item, unsigned_keys, + ranks, i, pass_bits); + + sycl::group_barrier(item.get_group()); + + bool last_iter = i + RADIX_BITS >= end_bit; + if (last_iter && is_striped) { + exchange(_local_memory) + .scatter_to_striped(item, keys, ranks); + + } else { + exchange(_local_memory) + .scatter_to_blocked(item, keys, ranks); + } + + sycl::group_barrier(item.get_group()); + } + +#pragma unroll + for (int i = 0; i < ElementsPerWorkItem; ++i) { + unsigned_keys[i] = detail::traits::twiddle_out(unsigned_keys[i]); + } + } + +public: + /// Performs an ascending work-group wide radix sort over a blocked + /// arrangement of input elements. + /// + /// Suppose 512 integer data elements partitioned across 128 work-items, where + /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the + /// \p input across the work-group is: + /// + /// { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. + /// + /// The ascending order output is: + /// + /// { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + /// + /// \tparam Item The work-item identifier type. + /// \param item The work-item identifier. + /// \param input The input data of each work-item. + /// \param begin_bit The beginning (least-significant) bit index needed for + /// key comparison. + /// \param end_bit The past-the-end (most-significant) bit + /// index needed for key comparison. + template + __compat_inline__ void + sort(const Item &item, T (&input)[ElementsPerWorkItem], int begin_bit = 0, + int end_bit = 8 * sizeof(T)) { + helper_sort(item, input, begin_bit, end_bit); + } + + /// Performs an descending work-group wide radix sort over a blocked + /// arrangement of input elements. + /// + /// Suppose 512 integer data elements partitioned across 128 work-items, where + /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the + /// \p input across the work-group is: + /// + /// { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. + /// + /// The descending order output is: + /// + /// { [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }. + /// + /// \tparam Item The work-item identifier type. + /// \param item The work-item identifier. + /// \param input The input data of each work-item. + /// \param begin_bit The beginning (least-significant) bit index needed for + /// key comparison. + /// \param end_bit The past-the-end (most-significant) bit + /// index needed for key comparison. 
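+  ///
+  /// Illustrative call sketch, reusing the class-level example setup
+  /// (`slm`, `item`, and a blocked `keys[4]` array are assumed); only the
+  /// low 16 bits take part in the comparison here.
+  /// \code
+  ///   compat::group::group_radix_sort<int, 4>(slm)
+  ///       .sort_descending(item, keys, /*begin_bit=*/0, /*end_bit=*/16);
+  /// \endcode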
+ template + __compat_inline__ void + sort_descending(const Item &item, T (&input)[ElementsPerWorkItem], + int begin_bit = 0, int end_bit = 8 * sizeof(T)) { + helper_sort(item, input, begin_bit, end_bit); + } + + /// Performs an ascending radix sort across a blocked arrangement of input + /// elements, leaving them in a striped arrangement. + /// + /// Suppose 512 integer data elements partitioned across 128 work-items, where + /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the + /// \p input across the work-group is: + /// + /// { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. + /// + /// The corresponding output of each work-item will be: + /// + /// { [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., + /// [127,255,383,511] }. + /// + /// \tparam Item The work-item identifier type. + /// \param item The work-item identifier. + /// \param input The input data of each work-item. + /// \param begin_bit The beginning (least-significant) bit index needed for + /// key comparison. + /// \param end_bit The past-the-end (most-significant) bit + /// index needed for key comparison. + template + __compat_inline__ void + sort_blocked_to_striped(const Item &item, T (&input)[ElementsPerWorkItem], + int begin_bit = 0, int end_bit = 8 * sizeof(T)) { + helper_sort(item, input, begin_bit, end_bit, + /*is_striped=*/true); + } + + /// Performs an descending radix sort across a blocked arrangement of input + /// elements, leaving them in a striped arrangement. + /// + /// Suppose 512 integer data elements partitioned across 128 work-items, where + /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the + /// \p input across the work-group is: + /// + /// { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. + /// + /// The descending striped order output is: + /// + /// { [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., + /// [127,255,383,511] }. + /// + /// \tparam Item The work-item identifier type. + /// \param item The work-item identifier. + /// \param input The input data of each work-item. + /// \param begin_bit The beginning (least-significant) bit index needed for + /// key comparison. + /// \param end_bit The past-the-end (most-significant) bit + /// index needed for key comparison. + template + __compat_inline__ void sort_descending_blocked_to_striped( + const Item &item, T (&input)[ElementsPerWorkItem], int begin_bit = 0, + int end_bit = 8 * sizeof(T)) { + helper_sort(item, input, begin_bit, end_bit, + /*is_striped=*/true); + } +}; + +/// Load linear segment items into block format across threads +/// Helper for Block Load +enum load_algorithm { + BLOCK_LOAD_DIRECT, + BLOCK_LOAD_STRIPED, +}; + +/// Load a linear segment of elements into a blocked arrangement across the +/// work-group. +/// +/// \tparam T The data type to load. +/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned +/// onto each work-item. +/// \tparam InputIteratorT The random-access iterator type for input \iterator. +/// \tparam ItemT The sycl::nd_item index space class. +/// \param item The calling work-item. +/// \param input_iter The work-group's base input iterator for loading from. +/// \param data Data to load. 
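+///
+/// A minimal usage sketch (illustrative only): `in` is a hypothetical
+/// pointer to global memory holding at least work-group-size * 4
+/// elements; each work-item receives 4 consecutive values.
+/// \code
+///   float vals[4];
+///   compat::group::load_direct_blocked(item, in, vals);
+/// \endcode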
+template +__compat_inline__ void load_direct_blocked(const ItemT &item, + InputIteratorT input_iter, + T (&data)[ElementsPerWorkItem]) { + size_t work_item_id = item.get_local_linear_id(); +#pragma unroll + for (size_t i = 0; i < ElementsPerWorkItem; i++) + data[i] = input_iter[(work_item_id * ElementsPerWorkItem) + i]; +} + +/// Load a linear segment of elements into a striped arrangement across the +/// work-group. +/// +/// \tparam T The data type to load. +/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned +/// onto each work-item. +/// \tparam InputIteratorT The random-access iterator type for input \iterator. +/// \tparam ItemT The sycl::nd_item index space class. +/// \param item The calling work-item. +/// \param input_iter The work-group's base input iterator for loading from. +/// \param data Data to load. +template +__compat_inline__ void load_direct_striped(const ItemT &item, + InputIteratorT input_iter, + T (&data)[ElementsPerWorkItem]) { + size_t work_group_size = item.get_group().get_local_linear_range(); + size_t work_item_id = item.get_local_linear_id(); +#pragma unroll + for (size_t i = 0; i < ElementsPerWorkItem; i++) + data[i] = input_iter[work_item_id + i * work_group_size]; +} + +/// Load a linear segment of elements into a blocked arrangement across the +/// work-group, guarded by range. +/// +/// \tparam T The data type to load. +/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned +/// onto each work-item. +/// \tparam InputIteratorT The random-access iterator type for input \iterator. +/// \tparam ItemT The sycl::nd_item index space class. +/// \param item The calling work-item. +/// \param input_iter The work-group's base input iterator for loading from. +/// \param data Data to load. +/// \param valid_items Number of valid items to load +template +__compat_inline__ void +load_direct_blocked(const ItemT &item, InputIteratorT input_iter, + T (&data)[ElementsPerWorkItem], int valid_items) { + size_t work_item_id = item.get_local_linear_id(); +#pragma unroll + for (size_t i = 0; i < ElementsPerWorkItem; i++) + if ((work_item_id * ElementsPerWorkItem) + i < valid_items) + data[i] = input_iter[(work_item_id * ElementsPerWorkItem) + i]; +} + +/// Load a linear segment of elements into a striped arrangement across the +/// work-group, guarded by range. +/// +/// \tparam T The data type to load. +/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned +/// onto each work-item. +/// \tparam InputIteratorT The random-access iterator type for input \iterator. +/// \tparam ItemT The sycl::nd_item index space class. +/// \param item The calling work-item. +/// \param input_iter The work-group's base input iterator for loading from. +/// \param data Data to load. +/// \param valid_items Number of valid items to load +template +__compat_inline__ void +load_direct_striped(const ItemT &item, InputIteratorT input_iter, + T (&data)[ElementsPerWorkItem], int valid_items) { + size_t work_group_size = item.get_group().get_local_linear_range(); + size_t work_item_id = item.get_local_linear_id(); +#pragma unroll + for (size_t i = 0; i < ElementsPerWorkItem; i++) + if (work_item_id + (i * work_group_size) < valid_items) + data[i] = input_iter[work_item_id + i * work_group_size]; +} + +/// Store a blocked arrangement of items across a work-group into a linear +/// segment of items. +/// +/// \tparam T The data type to store. 
+/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned +/// onto each work-item. +/// \tparam OutputIteratorT The random-access iterator type for output. +/// \iterator. +/// \tparam ItemT The sycl::nd_item index space class. +/// \param item The calling work-item. +/// \param output_iter The work-group's base output iterator for writing. +/// \param data Data to store. +template +__compat_inline__ void +store_direct_blocked(const ItemT &item, OutputIteratorT output_iter, + T (&data)[ElementsPerWorkItem]) { + size_t work_item_id = item.get_local_linear_id(); + OutputIteratorT work_item_iter = + output_iter + (work_item_id * ElementsPerWorkItem); +#pragma unroll + for (size_t i = 0; i < ElementsPerWorkItem; i++) + work_item_iter[i] = data[i]; +} + +/// Store a striped arrangement of items across a work-group into a linear +/// segment of items. +/// +/// \tparam T The data type to store. +/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned +/// onto each work-item. +/// \tparam OutputIteratorT The random-access iterator type for output. +/// \iterator. +/// \tparam ItemT The sycl::nd_item index space class. +/// \param item The calling work-item. +/// \param output_iter The work-group's base output iterator for writing. +/// \param items Data to store. +template +__compat_inline__ void +store_direct_striped(const ItemT &item, OutputIteratorT output_iter, + T (&data)[ElementsPerWorkItem]) { + size_t work_group_size = item.get_group().get_local_linear_range(); + size_t work_item_id = item.get_local_linear_id(); + OutputIteratorT work_item_iter = output_iter + work_item_id; +#pragma unroll + for (size_t i = 0; i < ElementsPerWorkItem; i++) + work_item_iter[i * work_group_size] = data[i]; +} + +/// Store a blocked arrangement of items across a work-group into a linear +/// segment of items, guarded by range. +/// +/// \tparam T The data type to store. +/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned +/// onto each work-item. +/// \tparam OutputIteratorT The random-access iterator type for output. +/// \iterator. +/// \tparam ItemT The sycl::nd_item index space class. +/// \param item The calling work-item. +/// \param output_iter The work-group's base output iterator for writing. +/// \param data Data to store. +/// \param valid_items Number of valid items to load +template +__compat_inline__ void +store_direct_blocked(const ItemT &item, OutputIteratorT output_iter, + T (&data)[ElementsPerWorkItem], size_t valid_items) { + size_t work_item_id = item.get_local_linear_id(); + OutputIteratorT work_item_iter = + output_iter + (work_item_id * ElementsPerWorkItem); +#pragma unroll + for (size_t i = 0; i < ElementsPerWorkItem; i++) + if (i + (work_item_id * ElementsPerWorkItem) < valid_items) + work_item_iter[i] = data[i]; +} + +/// Store a striped arrangement of items across a work-group into a linear +/// segment of items, guarded by range. +/// +/// \tparam T The data type to store. +/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned +/// onto each work-item. +/// \tparam OutputIteratorT The random-access iterator type for output. +/// \iterator. +/// \tparam ItemT The sycl::nd_item index space class. +/// \param item The calling work-item. +/// \param output_iter The work-group's base output iterator for writing. +/// \param items Data to store. 
+/// \param valid_items Number of valid items to load +template +__compat_inline__ void +store_direct_striped(const ItemT &item, OutputIteratorT output_iter, + T (&data)[ElementsPerWorkItem], size_t valid_items) { + size_t work_group_size = item.get_group().get_local_linear_range(); + size_t work_item_id = item.get_local_linear_id(); + OutputIteratorT work_item_iter = output_iter + work_item_id; +#pragma unroll + for (size_t i = 0; i < ElementsPerWorkItem; i++) + if ((i * work_group_size) + work_item_id < valid_items) + work_item_iter[i * work_group_size] = data[i]; +} + +/// Enumerates alternative algorithms for compat::group::group_load to read +/// a linear segment of data from memory into a blocked arrangement across a +/// work-group. +enum class group_load_algorithm { + /// A blocked arrangement of data is read directly from memory. + blocked, + + /// A striped arrangement of data is read directly from memory. + striped +}; + +/// Provide methods for loading a linear segment of items from memory into a +/// blocked arrangement across a work-group. +/// +/// \tparam T The input data type. +/// \tparam ElementsPerWorkItem The number of data elements assigned to a +/// work-item. +/// \tparam LoadAlgorithm The data movement strategy, default is blocked. +template +class group_load { +public: + static size_t get_local_memory_size([[maybe_unused]] size_t work_group_size) { + return 0; + } + group_load(uint8_t *) {} + + /// Load a linear segment of items from memory. + /// + /// Suppose 512 integer data elements partitioned across 128 work-items, where + /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the + /// \p input across the work-group is: + /// + /// 1, 2, 3, 4, 5, 6, 7, ..., 508, 509, 510, 511. + /// + /// The blocked order \p data of each work-item will be: + /// + /// {[0,1,2,3], [4,5,6,7], ..., [508,509,510,511]}. + /// + /// The striped order \p output of each work-item will be: + /// + /// {[0,128,256,384], [1,129,257,385], ..., [127,255,383,511]}. + /// + /// \tparam ItemT The sycl::nd_item index space class. + /// \tparam InputIteratorT The random-access iterator type for input + /// \iterator. + /// \param item The work-item identifier. + /// \param input_iter The work-group's base input iterator for loading from. + /// \param data The data to load. + template + __compat_inline__ void load(const ItemT &item, InputIteratorT input_iter, + T (&data)[ElementsPerWorkItem]) { + if constexpr (LoadAlgorithm == group_load_algorithm::blocked) { + load_direct_blocked( + item, input_iter, data); + } else if constexpr (LoadAlgorithm == group_load_algorithm::striped) { + load_direct_striped( + item, input_iter, data); + } + } + + /// Load a linear segment of items from memory, guarded by range. + /// + /// Suppose 512 integer data elements partitioned across 128 work-items, where + /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and + /// valid_items is 5, the \p input across the work-group is: + /// + /// 0, 1, 2, 3, 4, 5, 6, 7, ..., 508, 509, 510, 511. + /// + /// The blocked order \p data of each work-item will be: + /// + /// {[0,1,2,3], [4,?,?,?], ..., [?,?,?,?]}. + /// + /// The striped order \p output of each work-item will be: + /// + /// {[0,?,?,?], [1,?,?,?], [2,?,?,?], [3,?,?,?] ..., [?,?,?,?]}. + /// + /// \tparam ItemT The sycl::nd_item index space class. + /// \tparam InputIteratorT The random-access iterator type for input + /// \iterator. + /// \param item The work-item identifier. 
+ /// \param input_iter The work-group's base input iterator for loading from. + /// \param data The data to load. + /// \param valid_items Number of valid items to load + template + __compat_inline__ void load(const ItemT &item, InputIteratorT input_iter, + T (&data)[ElementsPerWorkItem], + int valid_items) { + if constexpr (LoadAlgorithm == group_load_algorithm::blocked) { + load_direct_blocked( + item, input_iter, data, valid_items); + } else if constexpr (LoadAlgorithm == group_load_algorithm::striped) { + load_direct_striped( + item, input_iter, data, valid_items); + } + } +}; + +/// Enumerates alternative algorithms for compat::group::group_load to write +/// a blocked arrangement of items across a work-group to a linear segment of +/// memory. +enum class group_store_algorithm { + /// A blocked arrangement of data is written directly to memory. + blocked, + + /// A striped arrangement of data is written directly to memory. + striped, +}; + +/// Provide methods for writing a blocked arrangement of elements partitioned +/// across a work-group to a linear segment of memory. +/// +/// \tparam T The output data type. +/// \tparam ElementsPerWorkItem The number of data elements assigned to a +/// work-item. +/// \tparam StoreAlgorithm The data movement strategy, default is blocked. +template +class group_store { +public: + static size_t get_local_memory_size([[maybe_unused]] size_t work_group_size) { + return 0; + } + group_store(uint8_t *) {} + + /// Store items into a linear segment of memory. + /// + /// Suppose 512 integer data elements partitioned across 128 work-items, where + /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the + /// \p input across the work-group is: + /// + /// {[0,1,2,3], [4,5,6,7], ..., [508,509,510,511]}. + /// + /// The blocked order \p output will be: + /// + /// 1, 2, 3, 4, 5, 6, 7, ..., 508, 509, 510, 511. + /// + /// The striped order \p output will be: + /// + /// 0, 128, 256, 384, 1, 129, 257, 385, ..., 127, 255, 383, 511. + /// + /// \tparam ItemT The sycl::nd_item index space class. + /// \tparam OutputIteratorT The random-access iterator type for \p output + /// iterator. + /// \param item The work-item identifier. + /// \param input The input data of each work-item. + /// \param data The data to store. + template + __compat_inline__ void store(const ItemT &item, + OutputIteratorT output_iter, + T (&data)[ElementsPerWorkItem]) { + if constexpr (StoreAlgorithm == group_store_algorithm::blocked) { + store_direct_blocked( + item, output_iter, data); + } else if constexpr (StoreAlgorithm == group_store_algorithm::striped) { + store_direct_striped( + item, output_iter, data); + } + } + + /// Store items into a linear segment of memory, guarded by range. + /// + /// Suppose 512 integer data elements partitioned across 128 work-items, where + /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and + /// \p valid_items is 5, the \p output across the work-group is: + /// + /// {[0,0,0,0], [0,0,0,0], ..., [0,0,0,0]}. + /// + /// The blocked order \p output will be: + /// + /// 0, 1, 2, 3, 4, 5, 0, 0, ..., 0, 0, 0, 0. + /// + /// The striped order \p output will be: + /// + /// 0, 4, 8, 12, 16, 0, 0, 0, ..., 0, 0, 0, 0. + /// + /// \tparam ItemT The sycl::nd_item index space class. + /// \tparam OutputIteratorT The random-access iterator type for \p output + /// iterator. + /// \param item The work-item identifier. + /// \param input The input data of each work-item. + /// \param data The data to store. 
+ /// \param valid_items Number of valid items to load + template + __compat_inline__ void + store(const ItemT &item, OutputIteratorT output_iter, + T (&data)[ElementsPerWorkItem], size_t valid_items) { + if constexpr (StoreAlgorithm == group_store_algorithm::blocked) { + store_direct_blocked( + item, output_iter, data, valid_items); + } else if constexpr (StoreAlgorithm == group_store_algorithm::striped) { + store_direct_striped( + item, output_iter, data, valid_items); + } + } +}; + +/// The work-group wide shuffle operations that allow work-items to exchange +/// data elements with other work-items within the same work-group. +/// +/// \tparam T The type of the data elements. +/// \tparam group_dim_0 The first dimension size of the work-group. +/// \tparam group_dim_1 The second dimension size of the work-group. +/// \tparam group_dim_2 The third dimension size of the work-group. +template +class group_shuffle { + T *_local_memory = nullptr; + static constexpr size_t group_work_items = + group_dim_0 * group_dim_1 * group_dim_2; + +public: + static constexpr size_t get_local_memory_size(size_t work_group_size) { + return sizeof(T) * work_group_size; + } + group_shuffle(uint8_t *local_memory) : _local_memory((T *)local_memory) {} + + /// Selects a value from a work-item at a given distance in the work-group + /// and stores the value in the output. + /// + /// \tparam ItemT The work-item identifier type. + /// \param item The work-item identifier. + /// \param input The input from the calling work-item. + /// \param output The output where the selected data will be stored. + /// \param distance The distance of work-items to look ahead or behind in the + /// work-group. + template + __compat_inline__ void select(const ItemT &item, T input, T &output, + int distance = 1) { + auto g = item.get_group(); + size_t id = g.get_local_linear_id(); + _local_memory[id] = input; + + sycl::group_barrier(g, sycl::memory_scope::work_group); + + const int target_id = static_cast(id) + distance; + if ((target_id >= 0) && (target_id < group_work_items)) { + output = _local_memory[static_cast(target_id)]; + } + } + /// Selects a value from a work-item at a given distance in the work-group + /// and stores the value in the output, using a wrapped index to handle + /// overflow. + /// + /// \tparam ItemT The work-item identifier type. + /// \param item The work-item identifier. + /// \param input The input data to be selected. + /// \param output The output where the selected data will be stored. + /// \param distance The number of work-items to look ahead in the + /// work-group. + template + __compat_inline__ void select2(const ItemT &item, T input, T &output, + unsigned int distance = 1) { + auto g = item.get_group(); + size_t id = g.get_local_linear_id(); + _local_memory[id] = input; + + sycl::group_barrier(g, sycl::memory_scope::work_group); + + unsigned int offset = id + distance; + if (offset >= group_work_items) + offset -= group_work_items; + + output = _local_memory[offset]; + } + /// Performs a shuffle operation to move data to the right across the + /// work-items, shifting elements in a work-item array by one position to the + /// right. + /// + /// \tparam ElementsPerWorkItem The number of data elements per work-item. + /// \tparam ItemT The work-item identifier type. + /// \param item The work-item identifier. + /// \param input The input data to be shuffled. + /// \param output The array that will store the shuffle result. 
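+  ///
+  /// A minimal usage sketch (illustrative only): it assumes a (128, 1, 1)
+  /// work-group and a hypothetical pointer `slm` to at least
+  /// get_local_memory_size(128) bytes of work-group local memory.
+  /// \code
+  ///   float in[4], out[4];
+  ///   compat::group::group_shuffle<float, 128, 1, 1> sh(slm);
+  ///   sh.shuffle_right(item, in, out);
+  ///   // out[0] now holds the previous work-item's last input element
+  /// \endcode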
+ template + __compat_inline__ void shuffle_right(const ItemT &item, + T (&input)[ElementsPerWorkItem], + T (&output)[ElementsPerWorkItem]) { + auto g = item.get_group(); + size_t id = g.get_local_linear_id(); + _local_memory[id] = input[ElementsPerWorkItem - 1]; + + sycl::group_barrier(g, sycl::memory_scope::work_group); + +#pragma unroll + for (int index = ElementsPerWorkItem - 1; index > 0; --index) + output[index] = input[index - 1]; + + if (id > 0) + output[0] = _local_memory[id - 1]; + } + /// Performs a shuffle operation to move data to the right across the + /// work-items, storing the suffix of the group after the shuffle operation. + /// + /// \tparam ElementsPerWorkItem The number of data elements per work-item. + /// \tparam ItemT The work-item identifier type. + /// \param item The work-item identifier. + /// \param input The input data to be shuffled. + /// \param output The array that will store the shuffle result. + /// \param group_suffix The suffix of the group after the shuffle. + template + __compat_inline__ void + shuffle_right(const ItemT &item, T (&input)[ElementsPerWorkItem], + T (&output)[ElementsPerWorkItem], T &group_suffix) { + shuffle_right(item, input, output); + group_suffix = _local_memory[group_work_items - 1]; + } + /// Performs a shuffle operation to move data to the left across the + /// work-items, shifting elements in a work-item array by one position to the + /// left. + /// + /// \tparam ElementsPerWorkItem The number of data elements per work-item. + /// \tparam ItemT The work-item identifier type. + /// \param item The work-item identifier. + /// \param input The input data to be shuffled. + /// \param output The array that will store the shuffle result. + template + __compat_inline__ void shuffle_left(const ItemT &item, + T (&input)[ElementsPerWorkItem], + T (&output)[ElementsPerWorkItem]) { + auto g = item.get_group(); + size_t id = g.get_local_linear_id(); + _local_memory[id] = input[0]; + + sycl::group_barrier(g, sycl::memory_scope::work_group); + +#pragma unroll + for (int index = 0; index < ElementsPerWorkItem - 1; index++) + output[index] = input[index + 1]; + + if (id < group_work_items - 1) + output[ElementsPerWorkItem - 1] = _local_memory[id + 1]; + } + /// Performs a shuffle operation to move data to the left across the + /// work-items, storing the prefix of the group before the shuffle operation. + /// + /// \tparam ElementsPerWorkItem The number of data elements per work-item. + /// \tparam ItemT The work-item identifier type. + /// \param item The work-item identifier. + /// \param input The input data to be shuffled. + /// \param output The array that will store the shuffle result. + /// \param group_prefix The prefix of the group before the shuffle. + template + __compat_inline__ void + shuffle_left(const ItemT &item, T (&input)[ElementsPerWorkItem], + T (&output)[ElementsPerWorkItem], T &group_prefix) { + shuffle_left(item, input, output); + group_prefix = _local_memory[0]; + } +}; +} // namespace group +} // namespace compat diff --git a/tools/util/include/compat/id_query.hpp b/tools/util/include/compat/id_query.hpp new file mode 100644 index 0000000000..120b1a5b29 --- /dev/null +++ b/tools/util/include/compat/id_query.hpp @@ -0,0 +1,71 @@ +/*************************************************************************** + * + * Copyright (C) Codeplay Software Ltd. + * Copyright (C) 2025 Intel Corporation, All rights reserved. + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM + * Exceptions. 
See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SYCL compatibility extension + * + * id_query.hpp + * + * Description: + * id_query functionality for the SYCL compatibility extension + **************************************************************************/ + +#pragma once + +#include +#include + +namespace compat { + +using sycl::ext::oneapi::this_work_item::get_nd_item; + +inline void wg_barrier() { get_nd_item<3>().barrier(); } + +namespace local_id { +inline size_t x() { return get_nd_item<3>().get_local_id(2); } +inline size_t y() { return get_nd_item<3>().get_local_id(1); } +inline size_t z() { return get_nd_item<3>().get_local_id(0); } +} // namespace local_id + +namespace local_range { +inline size_t x() { return get_nd_item<3>().get_local_range(2); } +inline size_t y() { return get_nd_item<3>().get_local_range(1); } +inline size_t z() { return get_nd_item<3>().get_local_range(0); } +} // namespace local_range + +namespace work_group_id { +inline size_t x() { return get_nd_item<3>().get_group(2); } +inline size_t y() { return get_nd_item<3>().get_group(1); } +inline size_t z() { return get_nd_item<3>().get_group(0); } +} // namespace work_group_id + +namespace work_group_range { +inline size_t x() { return get_nd_item<3>().get_group_range(2); } +inline size_t y() { return get_nd_item<3>().get_group_range(1); } +inline size_t z() { return get_nd_item<3>().get_group_range(0); } +} // namespace work_group_range + +namespace global_range { +inline size_t x() { return get_nd_item<3>().get_global_range(2); } +inline size_t y() { return get_nd_item<3>().get_global_range(1); } +inline size_t z() { return get_nd_item<3>().get_global_range(0); } +} // namespace global_range + +namespace global_id { +inline size_t x() { return get_nd_item<3>().get_global_id(2); } +inline size_t y() { return get_nd_item<3>().get_global_id(1); } +inline size_t z() { return get_nd_item<3>().get_global_id(0); } +} // namespace global_id + +} // namespace compat diff --git a/tools/util/include/compat/kernel.hpp b/tools/util/include/compat/kernel.hpp new file mode 100644 index 0000000000..b9851f1d10 --- /dev/null +++ b/tools/util/include/compat/kernel.hpp @@ -0,0 +1,470 @@ +/*************************************************************************** + * + * Copyright (C) Codeplay Software Ltd. + * Copyright (C) 2025 Intel Corporation, All rights reserved. + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM + * Exceptions. See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SYCL compatibility extension + * + * kernel.hpp + * + * Description: + * kernel functionality for the SYCL compatibility extension. 
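+ *
+ *     Typical flow (illustrative sketch; the library file name, kernel
+ *     name, queue q and argument pointers below are hypothetical, not part
+ *     of this header):
+ *
+ *       compat::kernel_library lib =
+ *           compat::load_kernel_library("my_kernels.bin");
+ *       compat::kernel_function fn =
+ *           compat::get_kernel_function(lib, "my_kernel");
+ *       void *args[] = {&device_ptr};  // pointers to the kernel arguments
+ *       compat::invoke_kernel_function(fn, q, {1, 1, 4}, {1, 1, 64},
+ *                                      0, args, nullptr);  // no local memory
+ *       compat::unload_kernel_library(lib);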
+ **************************************************************************/ + +// The original source was under the license below: +//==---- kernel.hpp -------------------------------*- C++ -*----------------==// +// +// Copyright (C) Intel Corporation +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#ifdef _WIN32 +#include +#include +#else +#include +#endif + +#if defined(__has_include) && __has_include() +#include +#elif defined(__has_include) && __has_include() +#include +#else +#error "SYCLomatic runtime requires C++ filesystem support" +#endif + +#include +#include + +#include +#include +#include +#include + +namespace compat { + +typedef void (*kernel_functor)(sycl::queue &, const sycl::nd_range<3> &, + unsigned int, void **, void **); + +struct kernel_function_info { + int max_work_group_size = 0; +}; + +static inline void get_kernel_function_info(kernel_function_info *kernel_info, + const void *function) { + kernel_info->max_work_group_size = + detail::dev_mgr::instance() + .current_device() + .get_info(); +} + +static inline kernel_function_info +get_kernel_function_info(const void *function) { + kernel_function_info kernel_info; + kernel_info.max_work_group_size = + detail::dev_mgr::instance() + .current_device() + .get_info(); + return kernel_info; +} + +namespace detail { + +#if defined(__has_include) && __has_include() +namespace fs = std::filesystem; +#else +namespace fs = std::experimental::filesystem; +#endif + +/// Write data to temporary file and return absolute path to temporary file. +/// Temporary file is created in a temporary directory both of which have random +/// names with only the user having access permissions. Only one temporary file +/// will be created in the temporary directory. 
+static inline fs::path write_data_to_file(char const *const data, size_t size) { + std::error_code ec; + + if (sizeof(size_t) >= sizeof(std::streamsize) && + size > (std::numeric_limits::max)()) + throw std::runtime_error("[Compat] data file too large"); + + // random number generator + std::random_device dev; + std::mt19937 prng(dev()); + std::uniform_int_distribution rand(0); + + // find temporary directory + auto tmp_dir = fs::temp_directory_path(ec); + if (ec) + throw std::runtime_error("[Compat] could not find temporary directory"); + + // create private directory + std::stringstream directory; + fs::path directory_path; + constexpr int max_attempts = 5; + int i; + + for (i = 0; i < max_attempts; i++) { + directory << std::hex << rand(prng); + directory_path = tmp_dir / directory.str(); + if (fs::create_directory(directory_path)) { + break; + } + } + if (i == max_attempts) + throw std::runtime_error("[Compat] could not create directory"); + + // only allow owner permissions to private directory + fs::permissions(directory_path, fs::perms::owner_all, ec); + if (ec) + throw std::runtime_error( + "[Compat] could not set directory permissions"); + + // random filename in private directory + std::stringstream filename; + filename << std::hex << rand(prng); +#ifdef _WIN32 + auto filepath = directory_path / (filename.str() + ".dll"); +#else + auto filepath = directory_path / filename.str(); +#endif + + // write data to temporary file + auto outfile = std::ofstream(filepath, std::ios::out | std::ios::binary); + if (outfile) { + // only allow program to write file + fs::permissions(filepath, fs::perms::owner_write, ec); + if (ec) + throw std::runtime_error("[Compat] could not set permissions"); + + outfile.write(data, size); + if (!outfile.good()) + throw std::runtime_error("[Compat] could not write data"); + outfile.close(); + + // only allow program to read/execute file + fs::permissions(filepath, fs::perms::owner_read | fs::perms::owner_exec, + ec); + if (ec) + throw std::runtime_error("[Compat] could not set permissions"); + } else + throw std::runtime_error("[Compat] could not write data"); + + // check temporary file contents + auto infile = std::ifstream(filepath, std::ios::in | std::ios::binary); + if (infile) { + bool mismatch = false; + size_t cnt = 0; + + while (1) { + char c; + infile.get(c); + if (infile.eof()) + break; + if (c != data[cnt++]) + mismatch = true; + } + if (cnt != size || mismatch) + throw std::runtime_error( + "[Compat] file contents not written correctly"); + } else + throw std::runtime_error("[Compat] could not validate file"); + + if (!filepath.is_absolute()) + throw std::runtime_error("[Compat] temporary filepath is not absolute"); + + return filepath; +} + +static inline uint16_t extract16(unsigned char const *const ptr) { + uint16_t ret = 0; + + ret |= static_cast(ptr[0]) << 0; + ret |= static_cast(ptr[1]) << 8; + + return (ret); +} + +static inline uint32_t extract32(unsigned char const *const ptr) { + uint32_t ret = 0; + + ret |= static_cast(ptr[0]) << 0; + ret |= static_cast(ptr[1]) << 8; + ret |= static_cast(ptr[2]) << 16; + ret |= static_cast(ptr[3]) << 24; + + return (ret); +} + +static inline uint64_t extract64(unsigned char const *const ptr) { + uint64_t ret = 0; + + ret |= static_cast(ptr[0]) << 0; + ret |= static_cast(ptr[1]) << 8; + ret |= static_cast(ptr[2]) << 16; + ret |= static_cast(ptr[3]) << 24; + ret |= static_cast(ptr[4]) << 32; + ret |= static_cast(ptr[5]) << 40; + ret |= static_cast(ptr[6]) << 48; + ret |= static_cast(ptr[7]) << 56; + + 
return (ret); +} + +static inline uint64_t get_lib_size(char const *const blob) { +#ifdef _WIN32 + /////////////////////////////////////////////////////////////////////// + // Analyze DOS stub + unsigned char const *const ublob = + reinterpret_cast(blob); + if (ublob[0] != 0x4d || ublob[1] != 0x5a) { + throw std::runtime_error("[Compat] blob is not a Windows DLL."); + } + uint32_t pe_header_offset = extract32(ublob + 0x3c); + + /////////////////////////////////////////////////////////////////////// + // Ananlyze PE-header + unsigned char const *const pe_header = ublob + pe_header_offset; + + // signature + uint32_t pe_signature = extract32(pe_header + 0); + if (pe_signature != 0x00004550) { + throw std::runtime_error( + "[Compat] PE-header signature is not 0x00004550"); + } + + // machine + uint16_t machine = extract16(pe_header + 4); + if (machine != 0x8664) { + throw std::runtime_error("[Compat] only DLLs for x64 supported"); + } + + // number of sections + uint16_t number_of_sections = extract16(pe_header + 6); + + // sizeof optional header + uint16_t sizeof_optional_header = extract16(pe_header + 20); + + // magic + uint16_t magic = extract16(pe_header + 24); + if (magic != 0x10b && magic != 0x20b) { + throw std::runtime_error("[Compat] MAGIC is not 0x010b or 0x020b"); + } + + /////////////////////////////////////////////////////////////////////// + // Analyze tail of optional header + constexpr int coff_header_size = 24; + + unsigned char const *const tail_of_optional_header = + pe_header + coff_header_size + sizeof_optional_header; + if (extract64(tail_of_optional_header - 8) != 0) { + throw std::runtime_error("Optional header not zero-padded"); + } + + /////////////////////////////////////////////////////////////////////// + // Analyze last section header + constexpr int section_header_size = 40; + unsigned char const *const last_section_header = + tail_of_optional_header + section_header_size * (number_of_sections - 1); + + uint32_t sizeof_raw_data = extract32(last_section_header + 16); + uint32_t pointer_to_raw_data = extract32(last_section_header + 20); + + return sizeof_raw_data + pointer_to_raw_data; +#else + if (blob[0] != 0x7F || blob[1] != 'E' || blob[2] != 'L' || blob[3] != 'F') + throw std::runtime_error("[Compat] blob is not in ELF format"); + + if (blob[4] != 0x02) + throw std::runtime_error("[Compat] only 64-bit headers are supported"); + + if (blob[5] != 0x01) + throw std::runtime_error( + "[Compat] only little-endian headers are supported"); + + unsigned char const *const ublob = + reinterpret_cast(blob); + uint64_t e_shoff = extract64(ublob + 0x28); + uint16_t e_shentsize = extract16(ublob + 0x3A); + uint16_t e_shnum = extract16(ublob + 0x3C); + + return e_shoff + (e_shentsize * e_shnum); +#endif +} + +#ifdef _WIN32 +class path_lib_record { +public: + void operator=(const path_lib_record &) = delete; + ~path_lib_record() { + for (auto entry : lib_to_path) { + FreeLibrary(static_cast(entry.first)); + fs::permissions(entry.second, fs::perms::owner_all); + fs::remove_all(entry.second.remove_filename()); + } + } + static void record_lib_path(fs::path path, void *library) { + lib_to_path[library] = path; + } + static void remove_lib(void *library) { + auto path = lib_to_path[library]; + std::error_code ec; + + FreeLibrary(static_cast(library)); + fs::permissions(path, fs::perms::owner_all); + if (fs::remove_all(path.remove_filename(), ec) != 2 || ec) + // one directory and one temporary file should have been deleted + throw std::runtime_error("[Compat] directory delete 
failed"); + + lib_to_path.erase(library); + } + +private: + static inline std::unordered_map lib_to_path; +}; +#endif + +} // namespace detail + +class kernel_library { +public: + constexpr kernel_library() : ptr{nullptr} {} + constexpr kernel_library(void *ptr) : ptr{ptr} {} + + operator void *() const { return ptr; } + +private: + void *ptr; +#ifdef _WIN32 + static inline detail::path_lib_record single_instance_to_trigger_destructor; +#endif +}; + +namespace detail { + +static inline kernel_library load_dl_from_data(char const *const data, + size_t size) { + fs::path filename = write_data_to_file(data, size); +#ifdef _WIN32 + void *so = LoadLibraryW(filename.wstring().c_str()); +#else + void *so = dlopen(filename.c_str(), RTLD_LAZY); +#endif + if (so == nullptr) + throw std::runtime_error("[Compat] failed to load kernel library"); + +#ifdef _WIN32 + detail::path_lib_record::record_lib_path(filename, so); +#else + std::error_code ec; + + // Windows DLL cannot be deleted while in use + if (fs::remove_all(filename.remove_filename(), ec) != 2 || ec) + // one directory and one temporary file should have been deleted + throw std::runtime_error("[Compat] directory delete failed"); +#endif + + return so; +} + +} // namespace detail + +/// Load kernel library and return a handle to use the library. +/// \param [in] name The name of the library. +static inline kernel_library load_kernel_library(const std::string &name) { + std::ifstream ifs; + ifs.open(name, std::ios::in | std::ios::binary); + + std::stringstream buffer; + buffer << ifs.rdbuf(); + + const std::string buffer_string = buffer.str(); + return detail::load_dl_from_data(buffer_string.c_str(), buffer_string.size()); +} + +/// Load kernel library whose image is alreay in memory and return a handle to +/// use the library. +/// \param [in] image A pointer to the image in memory. +static inline kernel_library load_kernel_library_mem(char const *const image) { + const size_t size = detail::get_lib_size(image); + + return detail::load_dl_from_data(image, size); +} + +/// Unload kernel library. +/// \param [in,out] library Handle to the library to be closed. +static inline void unload_kernel_library(const kernel_library &library) { +#ifdef _WIN32 + detail::path_lib_record::remove_lib(library); +#else + dlclose(library); +#endif +} + +class kernel_function { +public: + constexpr kernel_function() : ptr{nullptr} {} + constexpr kernel_function(kernel_functor ptr) : ptr{ptr} {} + + operator void *() const { return ((void *)ptr); } + + void operator()(sycl::queue &q, const sycl::nd_range<3> &range, + unsigned int local_mem_size, void **args, void **extra) { + ptr(q, range, local_mem_size, args, extra); + } + +private: + kernel_functor ptr; +}; + +/// Find kernel function in a kernel library and return its address. +/// \param [in] library Handle to the kernel library. +/// \param [in] name Name of the kernel function. +static inline kernel_function get_kernel_function(kernel_library &library, + const std::string &name) { +#ifdef _WIN32 + kernel_functor fn = reinterpret_cast( + GetProcAddress(static_cast(static_cast(library)), + (name + std::string("_wrapper")).c_str())); +#else + kernel_functor fn = reinterpret_cast( + dlsym(library, (name + std::string("_wrapper")).c_str())); +#endif + if (fn == nullptr) + throw std::runtime_error("[Compat] failed to get function"); + return fn; +} + +/// Invoke a kernel function. +/// \param [in] function kernel function. 
+/// \param [in] queue SYCL queue used to execute kernel +/// \param [in] group_range SYCL group range +/// \param [in] local_range SYCL local range +/// \param [in] local_mem_size The size of local memory required by the kernel +/// function. +/// \param [in] kernel_params Array of pointers to kernel arguments. +/// \param [in] extra Extra arguments. +static inline void invoke_kernel_function(kernel_function &function, + sycl::queue &queue, + sycl::range<3> group_range, + sycl::range<3> local_range, + unsigned int local_mem_size, + void **kernel_params, void **extra) { + function(queue, sycl::nd_range<3>(group_range * local_range, local_range), + local_mem_size, kernel_params, extra); +} + +} // namespace compat diff --git a/tools/util/include/compat/launch.hpp b/tools/util/include/compat/launch.hpp new file mode 100644 index 0000000000..0e0d84fa15 --- /dev/null +++ b/tools/util/include/compat/launch.hpp @@ -0,0 +1,165 @@ +/*************************************************************************** + * + * Copyright (C) Codeplay Software Ltd. + * Copyright (C) 2025 Intel Corporation, All rights reserved. + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM + * Exceptions. See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SYCL compatibility extension + * + * launch.hpp + * + * Description: + * launch functionality for the SYCL compatibility extension + **************************************************************************/ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace compat { + +namespace detail { + +template +constexpr size_t getArgumentCount(R (*f)(Types...)) { + return sizeof...(Types); +} + +template +sycl::nd_range<3> transform_nd_range(const sycl::nd_range &range) { + sycl::range global_range = range.get_global_range(); + sycl::range local_range = range.get_local_range(); + if constexpr (Dim == 3) { + return range; + } else if constexpr (Dim == 2) { + return sycl::nd_range<3>{{1, global_range[0], global_range[1]}, + {1, local_range[0], local_range[1]}}; + } + return sycl::nd_range<3>{{1, 1, global_range[0]}, {1, 1, local_range[0]}}; +} + +template +std::enable_if_t, sycl::event> +launch(const sycl::nd_range<3> &range, sycl::queue q, Args... 
args) { + static_assert(detail::getArgumentCount(F) == sizeof...(args), + "Wrong number of arguments to SYCL kernel"); + static_assert( + std::is_same, void>::value, + "SYCL kernels should return void"); + + return q.parallel_for( + range, [=](sycl::nd_item<3>) { [[clang::always_inline]] F(args...); }); +} + +} // namespace detail + +template +inline sycl::nd_range compute_nd_range(sycl::range global_size_in, + sycl::range work_group_size) { + + if (global_size_in.size() == 0 || work_group_size.size() == 0) { + throw std::invalid_argument("Global or local size is zero!"); + } + for (size_t i = 0; i < Dim; ++i) { + if (global_size_in[i] < work_group_size[i]) + throw std::invalid_argument("Work group size larger than global size"); + } + + auto global_size = + ((global_size_in + work_group_size - 1) / work_group_size) * + work_group_size; + return {global_size, work_group_size}; +} + +inline sycl::nd_range<1> compute_nd_range(int global_size_in, + int work_group_size) { + return compute_nd_range<1>(global_size_in, work_group_size); +} + +template +std::enable_if_t, sycl::event> +launch(const sycl::nd_range &range, sycl::queue q, Args... args) { + return detail::launch(detail::transform_nd_range(range), q, args...); +} + +template +std::enable_if_t, sycl::event> +launch(const sycl::nd_range &range, Args... args) { + return launch(range, get_default_queue(), args...); +} + +// Alternative launch through dim3 objects +template +std::enable_if_t, sycl::event> +launch(const dim3 &grid, const dim3 &threads, sycl::queue q, Args... args) { + return launch(sycl::nd_range<3>{grid * threads, threads}, q, args...); +} + +template +std::enable_if_t, sycl::event> +launch(const dim3 &grid, const dim3 &threads, Args... args) { + return launch(grid, threads, get_default_queue(), args...); +} + +} // namespace compat + +namespace compat::experimental { + +namespace detail { + +template +sycl::event launch(LaunchPolicy launch_policy, sycl::queue q, Args... args) { + static_assert(compat::args_compatible, + "Mismatch between device function signature and supplied " + "arguments. Have you correctly handled local memory/char*?"); + + sycl_exp::launch_config config(launch_policy.get_range(), + launch_policy.get_launch_properties()); + + return sycl_exp::submit_with_event(q, [&](sycl::handler &cgh) { + auto KernelFunctor = build_kernel_functor(cgh, launch_policy, args...); + if constexpr (compat::detail::is_range_v< + typename LaunchPolicy::RangeT>) { + parallel_for(cgh, config, KernelFunctor); + } else { + static_assert( + compat::detail::is_nd_range_v); + nd_launch(cgh, config, KernelFunctor); + } + }); +} + +} + + +template +sycl::event launch(LaunchPolicy launch_policy, sycl::queue q, Args... args) { + static_assert(detail::is_launch_policy_v); + return detail::launch(launch_policy, q, args...); +} + +template +sycl::event launch(LaunchPolicy launch_policy, Args... args) { + static_assert(detail::is_launch_policy_v); + return launch(launch_policy, get_default_queue(), args...); +} + +} // namespace compat::experimental diff --git a/tools/util/include/compat/launch_policy.hpp b/tools/util/include/compat/launch_policy.hpp new file mode 100644 index 0000000000..b7b7a01da2 --- /dev/null +++ b/tools/util/include/compat/launch_policy.hpp @@ -0,0 +1,273 @@ +/*************************************************************************** + * + * Copyright (C) Codeplay Software Ltd. + * Copyright (C) 2025 Intel Corporation, All rights reserved. 
+ * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM + * Exceptions. See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SYCL compatibility extension + * + * launch.hpp + * + * Description: + * launch functionality for the SYCL compatibility extension + **************************************************************************/ + +#pragma once + +#include "sycl/ext/oneapi/experimental/enqueue_functions.hpp" +#include "sycl/ext/oneapi/properties/properties.hpp" +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace compat { +namespace experimental { + +namespace sycl_exp = sycl::ext::oneapi::experimental; + +// Wrapper for kernel sycl_exp::properties +template struct kernel_properties { + static_assert(sycl_exp::is_property_list_v); + using Props = Properties; + + template + kernel_properties(Props... properties) : props{properties...} {} + + template + kernel_properties(sycl_exp::properties properties) + : props{properties} {} + + Properties props; +}; + +template ::value, void>> +kernel_properties(Props... props) + -> kernel_properties; + +template +kernel_properties(sycl_exp::properties props) + -> kernel_properties>; + +// Wrapper for launch sycl_exp::properties +template struct launch_properties { + static_assert(sycl_exp::is_property_list_v); + using Props = Properties; + + template + launch_properties(Props... properties) : props{properties...} {} + + template + launch_properties(sycl_exp::properties properties) + : props{properties} {} + + Properties props; +}; + +template ::value, void>> +launch_properties(Props... props) + -> launch_properties; + +template +launch_properties(sycl_exp::properties props) + -> launch_properties>; + +// Wrapper for local memory size +struct local_mem_size { + local_mem_size(size_t size = 0) : size{size} {}; + size_t size; +}; + +// launch_policy is constructed by the user & passed to `compat_exp::launch` +template +class launch_policy { + static_assert(sycl_exp::is_property_list_v); + static_assert(sycl_exp::is_property_list_v); + static_assert(compat::detail::is_range_or_nd_range_v); + static_assert(compat::detail::is_nd_range_v || !LocalMem, + "sycl::range kernel launches are incompatible with local " + "memory usage!"); + +public: + using KPropsT = KProps; + using LPropsT = LProps; + using RangeT = Range; + static constexpr bool HasLocalMem = LocalMem; + +private: + launch_policy() = default; + + template + launch_policy(Ts... ts) + : _kernel_properties{detail::property_getter< + kernel_properties, kernel_properties, std::tuple>()( + std::tuple(ts...))}, + _launch_properties{detail::property_getter< + launch_properties, launch_properties, std::tuple>()( + std::tuple(ts...))}, + _local_mem_size{ + detail::local_mem_getter>()( + std::tuple(ts...))} { + check_variadic_args(ts...); + } + + template void check_variadic_args(Ts...) { + static_assert( + std::conjunction_v, + detail::is_launch_properties, + detail::is_local_mem_size>...>, + "Received an unexpected argument to ctor. 
Did you forget to wrap " + "in " + "compat::kernel_properties, launch_properties, local_mem_size?"); + } + +public: + template + launch_policy(Range range, Ts... ts) : launch_policy(ts...) { + _range = range; + check_variadic_args(ts...); + } + + template + launch_policy(dim3 global_range, Ts... ts) : launch_policy(ts...) { + _range = Range{global_range}; + check_variadic_args(ts...); + } + + template + launch_policy(dim3 global_range, dim3 local_range, Ts... ts) + : launch_policy(ts...) { + _range = Range{global_range * local_range, local_range}; + check_variadic_args(ts...); + } + + KProps get_kernel_properties() { return _kernel_properties.props; } + LProps get_launch_properties() { return _launch_properties.props; } + size_t get_local_mem_size() { return _local_mem_size.size; } + Range get_range() { return _range; } + +private: + Range _range; + kernel_properties _kernel_properties; + launch_properties _launch_properties; + local_mem_size _local_mem_size; +}; + +// Deduction guides for launch_policy +template +launch_policy(Range, Ts...) -> launch_policy< + Range, detail::properties_or_empty, + detail::properties_or_empty, + detail::has_type>::value>; + +template +launch_policy(sycl::range, sycl::range, Ts...) -> launch_policy< + sycl::nd_range, detail::properties_or_empty, + detail::properties_or_empty, + detail::has_type>::value>; + +template +launch_policy(dim3, Ts...) -> launch_policy< + sycl::range<3>, detail::properties_or_empty, + detail::properties_or_empty, + detail::has_type>::value>; + +template +launch_policy(dim3, dim3, Ts...) -> launch_policy< + sycl::nd_range<3>, detail::properties_or_empty, + detail::properties_or_empty, + detail::has_type>::value>; + +namespace detail { +// Custom std::apply helpers to enable inlining +template +__compat_inline__ constexpr void apply_expand(F &&f, Tuple &&t, + std::index_sequence) { + [[clang::always_inline]] std::forward(f)( + get(std::forward(t))...); +} + +template +__compat_inline__ constexpr void apply_helper(F &&f, Tuple &&t) { + apply_expand( + std::forward(f), std::forward(t), + std::make_index_sequence>>{}); +} + +template +struct KernelFunctor { + KernelFunctor(KProps kernel_props, Args... args) + : _kernel_properties{kernel_props}, + _argument_tuple(std::make_tuple(args...)) {} + + KernelFunctor(KProps kernel_props, sycl::local_accessor local_acc, + Args... args) + : _kernel_properties{kernel_props}, _local_acc{local_acc}, + _argument_tuple(std::make_tuple(args...)) {} + + auto get(sycl_exp::properties_tag) const { return _kernel_properties; } + + __compat_inline__ void + operator()(compat::detail::range_to_item_t) const { + if constexpr (HasLocalMem) { + char *local_mem_ptr = static_cast( + _local_acc.template get_multi_ptr() + .get()); + apply_helper( + [lmem_ptr = local_mem_ptr](auto &&...args) { + [[clang::always_inline]] F(args..., lmem_ptr); + }, + _argument_tuple); + } else { + apply_helper([](auto &&...args) { [[clang::always_inline]] F(args...); }, + _argument_tuple); + } + } + + KProps _kernel_properties; + std::tuple _argument_tuple; + std::conditional_t, std::monostate> + _local_acc; // monostate for empty type +}; + +//==================================================================== +// This helper function avoids 2 nested `if constexpr` in detail::launch +template +auto build_kernel_functor(sycl::handler &cgh, LaunchPolicy launch_policy, + Args... 
args) + -> KernelFunctor { + if constexpr (LaunchPolicy::HasLocalMem) { + sycl::local_accessor local_memory( + launch_policy.get_local_mem_size(), cgh); + return KernelFunctor( + launch_policy.get_kernel_properties(), local_memory, args...); + } else { + return KernelFunctor( + launch_policy.get_kernel_properties(), args...); + } +} + +} // namespace detail +} // namespace experimental +} // namespace compat diff --git a/tools/util/include/compat/math.hpp b/tools/util/include/compat/math.hpp new file mode 100644 index 0000000000..536a17a005 --- /dev/null +++ b/tools/util/include/compat/math.hpp @@ -0,0 +1,2386 @@ +/*************************************************************************** + * + * Copyright (C) Codeplay Software Ltd. + * Copyright (C) 2025 Intel Corporation, All rights reserved. + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM + * Exceptions. See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SYCL compatibility extension + * + * math.hpp + * + * Description: + * math utilities for the SYCL compatibility extension. + **************************************************************************/ + +// The original source was under the license below: +//==---- math.hpp ---------------------------------*- C++ -*----------------==// +// +// Copyright (C) Intel Corporation +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include + +// TODO(compat-lib-reviewers): this should not be required +#ifndef SYCL_EXT_ONEAPI_COMPLEX +#define SYCL_EXT_ONEAPI_COMPLEX +#endif + +#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS +#include +#endif +#include +#include + +namespace compat { +namespace detail { + +namespace complex_namespace = sycl::ext::oneapi::experimental; + +template +using complex_type = detail::complex_namespace::complex; + +template +constexpr bool is_int32_type = std::is_same_v, int32_t> || + std::is_same_v, uint32_t>; + +// Helper constexpr bool to avoid ugly macros where possible +#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS +constexpr bool support_bfloat16_math = true; +#else +constexpr bool support_bfloat16_math = false; +#endif + +template +inline ValueT clamp(ValueT val, ValueT min_val, ValueT max_val) { + return sycl::clamp(val, min_val, max_val); +} +#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS +// TODO(compat-lib-reviewers): Follow the process to add this (& other math +// fns) to the bfloat16 math function extension. If added, remove this +// functionality from the header. 
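+// The specializations below clamp bfloat16 scalars directly and expand
+// sycl::vec / sycl::marray arguments element-wise via an index sequence.
+// Illustrative use (hypothetical values; requires bfloat16 math support):
+//
+//   using bf16 = sycl::ext::oneapi::bfloat16;
+//   sycl::marray<bf16, 2> v{bf16(-2.0f), bf16(3.0f)};
+//   sycl::marray<bf16, 2> lo{bf16(0.0f), bf16(0.0f)};
+//   sycl::marray<bf16, 2> hi{bf16(1.0f), bf16(1.0f)};
+//   auto clamped = compat::detail::clamp(v, lo, hi);  // {0.0f, 1.0f}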
+template <> +inline sycl::ext::oneapi::bfloat16 clamp(sycl::ext::oneapi::bfloat16 val, + sycl::ext::oneapi::bfloat16 min_val, + sycl::ext::oneapi::bfloat16 max_val) { + if (val < min_val) + return min_val; + if (val > max_val) + return max_val; + return val; +} + +template +inline std::enable_if_t, + sycl::vec> +clamp(sycl::vec val, sycl::vec min_val, + sycl::vec max_val) { + return [&val, &min_val, &max_val](std::integer_sequence) { + return sycl::vec{ + clamp(val[I], min_val[I], max_val[I])...}; + }(std::make_integer_sequence{}); +} + +template +inline std::enable_if_t, + sycl::marray> +clamp(sycl::marray val, sycl::marray min_val, + sycl::marray max_val) { + return [&val, &min_val, &max_val](std::index_sequence) { + return sycl::marray{ + clamp(val[I], min_val[I], max_val[I])...}; + }(std::make_index_sequence{}); +} +#endif + +template +class vectorized_binary { +public: + inline VecT operator()(VecT a, VecT b, const BinaryOperation binary_op) { + VecT v4; + for (size_t i = 0; i < v4.size(); ++i) { + v4[i] = binary_op(a[i], b[i]); + } + return v4; + } +}; + +template +class vectorized_binary< + VecT, BinaryOperation, + std::void_t>> { +public: + inline VecT operator()(VecT a, VecT b, const BinaryOperation binary_op) { + return binary_op(a, b).template as(); + } +}; + +/// Extend the 'val' to 'bit' size, zero extend for unsigned int and signed +/// extend for signed int. Returns a signed integer type. +template +inline auto zero_or_signed_extend(ValueT val, unsigned bit) { + static_assert(std::is_integral_v); + if constexpr (sizeof(ValueT) == 4) { + assert(bit < 64 && + "When extending int32 value, bit must be smaller than 64."); + if constexpr (std::is_signed_v) + return int64_t(val) << (64 - bit) >> (64 - bit); + else + return int64_t(val); + } else if constexpr (sizeof(ValueT) == 2) { + assert(bit < 32 && + "When extending int16 value, bit must be smaller than 32."); + if constexpr (std::is_signed_v) + return int32_t(val) << (32 - bit) >> (32 - bit); + else + return int32_t(val); + } else if constexpr (sizeof(ValueT) == 1) { + assert(bit < 16 && + "When extending int8 value, bit must be smaller than 16."); + if constexpr (std::is_signed_v) + return int16_t(val) << (16 - bit) >> (16 - bit); + else + return int16_t(val); + } else { + static_assert(sizeof(ValueT) == 8); + assert(bit < 64 && "Cannot extend int64 value."); + return static_cast(val); + } +} + +template +inline constexpr RetT extend_binary(AT a, BT b, BinaryOperation binary_op) { + const int64_t extend_a = zero_or_signed_extend(a, 33); + const int64_t extend_b = zero_or_signed_extend(b, 33); + const int64_t ret = binary_op(extend_a, extend_b); + if constexpr (needSat) + return detail::clamp(ret, std::numeric_limits::min(), + std::numeric_limits::max()); + return ret; +} + +template +inline constexpr RetT extend_binary(AT a, BT b, CT c, + BinaryOperation1 binary_op, + BinaryOperation2 second_op) { + const int64_t extend_a = zero_or_signed_extend(a, 33); + const int64_t extend_b = zero_or_signed_extend(b, 33); + int64_t extend_temp = + zero_or_signed_extend(binary_op(extend_a, extend_b), 34); + if constexpr (needSat) + extend_temp = + detail::clamp(extend_temp, std::numeric_limits::min(), + std::numeric_limits::max()); + const int64_t extend_c = zero_or_signed_extend(c, 33); + return second_op(extend_temp, extend_c); +} + +template sycl::vec extract_and_extend2(T a) { + sycl::vec ret; + sycl::vec va{a}; + using IntT = std::conditional_t, int16_t, uint16_t>; + auto v = va.template as>(); + ret[0] = zero_or_signed_extend(v[0], 
17); + ret[1] = zero_or_signed_extend(v[1], 17); + return ret; +} + +template sycl::vec extract_and_extend4(T a) { + sycl::vec ret; + sycl::vec va{a}; + using IntT = std::conditional_t, int8_t, uint8_t>; + auto v = va.template as>(); + ret[0] = zero_or_signed_extend(v[0], 9); + ret[1] = zero_or_signed_extend(v[1], 9); + ret[2] = zero_or_signed_extend(v[2], 9); + ret[3] = zero_or_signed_extend(v[3], 9); + return ret; +} + +template +inline constexpr RetT extend_vbinary2(AT a, BT b, RetT c, + BinaryOperation binary_op) { + static_assert(is_int32_type && is_int32_type && is_int32_type); + sycl::vec extend_a = extract_and_extend2(a); + sycl::vec extend_b = extract_and_extend2(b); + sycl::vec temp{binary_op(extend_a[0], extend_b[0]), + binary_op(extend_a[1], extend_b[1])}; + using IntT = std::conditional_t, int16_t, uint16_t>; + + if constexpr (NeedSat) { + int32_t min_val = 0, max_val = 0; + min_val = std::numeric_limits::min(); + max_val = std::numeric_limits::max(); + temp = detail::clamp(temp, sycl::vec(min_val), + sycl::vec(max_val)); + } + if constexpr (NeedAdd) { + return temp[0] + temp[1] + c; + } + return sycl::vec{temp[0], temp[1]}.template as>(); +} + +template +inline constexpr RetT extend_vbinary4(AT a, BT b, RetT c, + BinaryOperation binary_op) { + static_assert(is_int32_type && is_int32_type && is_int32_type); + sycl::vec extend_a = extract_and_extend4(a); + sycl::vec extend_b = extract_and_extend4(b); + sycl::vec temp{ + binary_op(extend_a[0], extend_b[0]), binary_op(extend_a[1], extend_b[1]), + binary_op(extend_a[2], extend_b[2]), binary_op(extend_a[3], extend_b[3])}; + using IntT = std::conditional_t, int8_t, uint8_t>; + + if constexpr (NeedSat) { + int16_t min_val = 0, max_val = 0; + min_val = std::numeric_limits::min(); + max_val = std::numeric_limits::max(); + temp = detail::clamp(temp, sycl::vec(min_val), + sycl::vec(max_val)); + } + if constexpr (NeedAdd) { + return temp[0] + temp[1] + temp[2] + temp[3] + c; + } + + return sycl::vec{temp[0], temp[1], temp[2], temp[3]} + .template as>(); +} + +template inline bool isnan(const ValueT a) { + if constexpr (std::is_same_v) { + static_assert(detail::support_bfloat16_math); + return sycl::ext::oneapi::experimental::isnan(a); + } else { + return sycl::isnan(a); + } +} + +// FIXME(compat-lib-reviewers): move bfe outside detail once perf is +// improved & semantics understood +/// Bitfield-extract. +/// +/// \tparam T The type of \param source value, must be an integer. +/// \param source The source value to extracting. +/// \param bit_start The position to start extracting. +/// \param num_bits The number of bits to extracting. +template +inline T bfe(const T source, const uint32_t bit_start, + const uint32_t num_bits) { + static_assert(std::is_unsigned_v); + // FIXME(compat-lib-reviewers): This ternary was added to catch a case + // which may be undefined anyway. Consider that we are losing perf here. + const T mask = + num_bits >= std::numeric_limits::digits * sizeof(T) + ? static_cast(-1) + : ((static_cast(1) << num_bits) - 1); + return (source >> bit_start) & mask; +} + +} // namespace detail + +/// Bitfield-extract with boundary checking. +/// +/// Extract bit field from \param source and return the zero or sign-extended +/// result. Source \param bit_start gives the bit field starting bit position, +/// and source \param num_bits gives the bit field length in bits. +/// +/// The result is padded with the sign bit of the extracted field. If `num_bits` +/// is zero, the result is zero. 
If the start position is beyond the msb of the +/// input, the result is filled with the replicated sign bit of the extracted +/// field. +/// +/// \tparam T The type of \param source value, must be an integer. +/// \param source The source value to extracting. +/// \param bit_start The position to start extracting. +/// \param num_bits The number of bits to extracting. +template +inline T bfe_safe(const T source, const uint32_t bit_start, + const uint32_t num_bits) { + static_assert(std::is_integral_v); +#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) + if constexpr (std::is_same_v || std::is_same_v || + std::is_same_v) { + int32_t res{}; + asm volatile("bfe.s32 %0, %1, %2, %3;" + : "=r"(res) + : "r"((int32_t)source), "r"(bit_start), "r"(num_bits)); + return res; + } else if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v) { + uint32_t res{}; + asm volatile("bfe.u32 %0, %1, %2, %3;" + : "=r"(res) + : "r"((uint32_t)source), "r"(bit_start), "r"(num_bits)); + return res; + } else if constexpr (std::is_same_v) { + T res{}; + asm volatile("bfe.s64 %0, %1, %2, %3;" + : "=l"(res) + : "l"(source), "r"(bit_start), "r"(num_bits)); + return res; + } else if constexpr (std::is_same_v) { + T res{}; + asm volatile("bfe.u64 %0, %1, %2, %3;" + : "=l"(res) + : "l"(source), "r"(bit_start), "r"(num_bits)); + return res; + } +#endif + const uint32_t bit_width = + std::numeric_limits::digits * sizeof(T); + const uint32_t pos = std::min(bit_start, bit_width); + const uint32_t len = std::min(pos + num_bits, bit_width) - pos; + if constexpr (std::is_signed_v) { + // FIXME(compat-lib-reviewers): As above, catching a case whose result + // is undefined and likely losing perf. + const T mask = len >= bit_width ? T{-1} : static_cast((T{1} << len) - 1); + + // Find the sign-bit, the result is padded with the sign bit of the + // extracted field. + // Note if requested num_bits==0, we return zero via sign_bit=0 + const uint32_t sign_bit_pos = std::min(pos + len - 1, bit_width - 1); + const T sign_bit = num_bits != 0 && ((source >> sign_bit_pos) & 1); + const T sign_bit_padding = (-sign_bit & ~mask); + return ((source >> pos) & mask) | sign_bit_padding; + } else { + return compat::detail::bfe(source, pos, len); + } +} + +namespace detail { +// FIXME(compat-lib-reviewers): move bfi outside detail once perf is +// improved & semantics understood +/// Bitfield-insert. +/// +/// \tparam T The type of \param x and \param y , must be an unsigned integer. +/// \param x The source of the bitfield. +/// \param y The source where bitfield is inserted. +/// \param bit_start The position to start insertion. +/// \param num_bits The number of bits to insertion. +template +inline T bfi(const T x, const T y, const uint32_t bit_start, + const uint32_t num_bits) { + static_assert(std::is_unsigned_v); + constexpr unsigned bit_width = + std::numeric_limits::digits * sizeof(T); + + // if bit_start > bit_width || len == 0, should return y. + const T ignore_bfi = static_cast(bit_start > bit_width || num_bits == 0); + T extract_bitfield_mask = (static_cast(~T{0}) >> (bit_width - num_bits)) + << bit_start; + T clean_bitfield_mask = ~extract_bitfield_mask; + return (y & (-ignore_bfi | clean_bitfield_mask)) | + (~-ignore_bfi & ((x << bit_start) & extract_bitfield_mask)); +} +} // namespace detail + +/// Bitfield-insert with boundary checking. +/// +/// Align and insert a bit field from \param x into \param y . 
Source \param +/// bit_start gives the starting bit position for the insertion, and source +/// \param num_bits gives the bit field length in bits. +/// +/// \tparam T The type of \param x and \param y , must be an unsigned integer. +/// \param x The source of the bitfield. +/// \param y The source where bitfield is inserted. +/// \param bit_start The position to start insertion. +/// \param num_bits The number of bits to insertion. +template +inline T bfi_safe(const T x, const T y, const uint32_t bit_start, + const uint32_t num_bits) { + static_assert(std::is_unsigned_v); +#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) + if constexpr (std::is_same_v || std::is_same_v || + std::is_same_v) { + uint32_t res{}; + asm volatile("bfi.b32 %0, %1, %2, %3, %4;" + : "=r"(res) + : "r"((uint32_t)x), "r"((uint32_t)y), "r"(bit_start), + "r"(num_bits)); + return res; + } else if constexpr (std::is_same_v) { + uint64_t res{}; + asm volatile("bfi.b64 %0, %1, %2, %3, %4;" + : "=l"(res) + : "l"(x), "l"(y), "r"(bit_start), "r"(num_bits)); + return res; + } +#endif + constexpr unsigned bit_width = + std::numeric_limits::digits * sizeof(T); + const uint32_t pos = std::min(bit_start, bit_width); + const uint32_t len = std::min(pos + num_bits, bit_width) - pos; + return compat::detail::bfi(x, y, pos, len); +} + +/// Emulated function for __funnelshift_l +inline unsigned int funnelshift_l(unsigned int low, unsigned int high, + unsigned int shift) { + return (sycl::upsample(high, low) << (shift & 31U)) >> 32; +} + +/// Emulated function for __funnelshift_lc +inline unsigned int funnelshift_lc(unsigned int low, unsigned int high, + unsigned int shift) { + return (sycl::upsample(high, low) << sycl::min(shift, 32U)) >> 32; +} + +/// Emulated function for __funnelshift_r +inline unsigned int funnelshift_r(unsigned int low, unsigned int high, + unsigned int shift) { + return (sycl::upsample(high, low) >> (shift & 31U)) & 0xFFFFFFFF; +} + +/// Emulated function for __funnelshift_rc +inline unsigned int funnelshift_rc(unsigned int low, unsigned int high, + unsigned int shift) { + return (sycl::upsample(high, low) >> sycl::min(shift, 32U)) & 0xFFFFFFFF; +} + +/// Compute fast_length for variable-length array +/// \param [in] a The array +/// \param [in] len Length of the array +/// \returns The computed fast_length +inline float fast_length(const float *a, int len) { + switch (len) { + case 1: + return sycl::fast_length(a[0]); + case 2: + return sycl::fast_length(sycl::float2(a[0], a[1])); + case 3: + return sycl::fast_length(sycl::float3(a[0], a[1], a[2])); + case 4: + return sycl::fast_length(sycl::float4(a[0], a[1], a[2], a[3])); + case 0: + return 0; + default: + float f = 0; + for (int i = 0; i < len; ++i) + f += a[i] * a[i]; + return sycl::sqrt(f); + } +} + +/// Calculate the square root of the input array. +/// \param [in] a The array pointer +/// \param [in] len Length of the array +/// \returns The square root +template +inline ValueT length(const ValueT *a, const int len) { + switch (len) { + case 1: + return a[0]; + case 2: + return sycl::length(sycl::vec(a[0], a[1])); + case 3: + return sycl::length(sycl::vec(a[0], a[1], a[2])); + case 4: + return sycl::length(sycl::vec(a[0], a[1], a[2], a[3])); + default: + ValueT ret = 0; + for (int i = 0; i < len; ++i) + ret += a[i] * a[i]; + return sycl::sqrt(ret); + } +} + +/// Performs comparison. 
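+/// Example (illustrative; the values are arbitrary). Note that the
+/// std::not_equal_to specialization below treats a NaN operand as not
+/// comparable and returns false:
+/// \code
+/// sycl::half a(1.0f), b(2.0f);
+/// bool lt = compat::compare(a, b, std::less<>());                       // true
+/// bool ne = compat::compare(a, sycl::half(NAN), std::not_equal_to<>()); // false
+/// \endcode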
+/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] binary_op functor that implements the binary operation +/// \returns the comparison result +template +inline std::enable_if_t< + std::is_same_v, bool>, + bool> +compare(const ValueT a, const ValueT b, const BinaryOperation binary_op) { + return binary_op(a, b); +} +template +inline std::enable_if_t< + std::is_same_v, ValueT, ValueT>, + bool>, + bool> +compare(const ValueT a, const ValueT b, const std::not_equal_to<> binary_op) { + return !detail::isnan(a) && !detail::isnan(b) && binary_op(a, b); +} + +/// Performs 2 element comparison. +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] binary_op functor that implements the binary operation +/// \returns the comparison result +template +inline std::enable_if_t +compare(const ValueT a, const ValueT b, const BinaryOperation binary_op) { + return {compare(a[0], b[0], binary_op), compare(a[1], b[1], binary_op)}; +} + +/// Performs unordered comparison. +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] binary_op functor that implements the binary operation +/// \returns the comparison result +template +inline std::enable_if_t< + std::is_same_v, bool>, + bool> +unordered_compare(const ValueT a, const ValueT b, + const BinaryOperation binary_op) { + return detail::isnan(a) || detail::isnan(b) || binary_op(a, b); +} + +/// Performs 2 element unordered comparison. +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] binary_op functor that implements the binary operation +/// \returns the comparison result +template +inline std::enable_if_t +unordered_compare(const ValueT a, const ValueT b, + const BinaryOperation binary_op) { + return {unordered_compare(a[0], b[0], binary_op), + unordered_compare(a[1], b[1], binary_op)}; +} + +/// Performs 2 element comparison and return true if both results are true. +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] binary_op functor that implements the binary operation +/// \returns the comparison result +template +inline std::enable_if_t +compare_both(const ValueT a, const ValueT b, const BinaryOperation binary_op) { + return compare(a[0], b[0], binary_op) && compare(a[1], b[1], binary_op); +} + +/// Performs 2 element unordered comparison and return true if both results are +/// true. +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] binary_op functor that implements the binary operation +/// \returns the comparison result +template +inline std::enable_if_t +unordered_compare_both(const ValueT a, const ValueT b, + const BinaryOperation binary_op) { + return unordered_compare(a[0], b[0], binary_op) && + unordered_compare(a[1], b[1], binary_op); +} + +/// Performs 2 elements comparison, compare result of each element is 0 (false) +/// or 0xffff (true), returns an unsigned int by composing compare result of two +/// elements. 
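+/// Example (illustrative; the values and the sycl::half2 operand type are
+/// assumptions):
+/// \code
+/// sycl::half2 a{1.0f, 5.0f}, b{2.0f, 3.0f};
+/// unsigned m = compat::compare_mask(a, b, std::less<>());
+/// // element 0: 1.0 < 2.0 -> 0xffff; element 1: 5.0 < 3.0 -> 0x0000
+/// // m == 0x0000ffff
+/// \endcode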
+/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] binary_op functor that implements the binary operation +/// \returns the comparison result +template +inline std::enable_if_t +compare_mask(const ValueT a, const ValueT b, const BinaryOperation binary_op) { + // Since compare returns 0 or 1, -compare will be 0x00000000 or 0xFFFFFFFF + return ((-compare(a[0], b[0], binary_op)) & 0xFFFF) | + ((-compare(a[1], b[1], binary_op)) << 16u); +} + +/// Performs 2 elements unordered comparison, compare result of each element is +/// 0 (false) or 0xffff (true), returns an unsigned int by composing compare +/// result of two elements. +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] binary_op functor that implements the binary operation +/// \returns the comparison result +template +inline std::enable_if_t +unordered_compare_mask(const ValueT a, const ValueT b, + const BinaryOperation binary_op) { + return ((-unordered_compare(a[0], b[0], binary_op)) & 0xFFFF) | + ((-unordered_compare(a[1], b[1], binary_op)) << 16); +} + +/// Compute vectorized max for two values, with each value treated as a vector +/// type \p S +/// \param [in] S The type of the vector +/// \param [in] T The type of the original values +/// \param [in] a The first value +/// \param [in] b The second value +/// \returns The vectorized max of the two values +template inline T vectorized_max(T a, T b) { + sycl::vec v0{a}, v1{b}; + auto v2 = v0.template as(); + auto v3 = v1.template as(); + v2 = sycl::max(v2, v3); + v0 = v2.template as>(); + return v0; +} + +/// Compute vectorized min for two values, with each value treated as a vector +/// type \p S +/// \param [in] S The type of the vector +/// \param [in] T The type of the original values +/// \param [in] a The first value +/// \param [in] b The second value +/// \returns The vectorized min of the two values +template inline T vectorized_min(T a, T b) { + sycl::vec v0{a}, v1{b}; + auto v2 = v0.template as(); + auto v3 = v1.template as(); + v2 = sycl::min(v2, v3); + v0 = v2.template as>(); + return v0; +} + +/// Compute vectorized unary operation for a value, with the value treated as a +/// vector type \p VecT. +/// \tparam [in] VecT The type of the vector +/// \tparam [in] UnaryOperation The unary operation class +/// \param [in] a The input value +/// \returns The vectorized unary operation value of the input value +template +inline unsigned vectorized_unary(unsigned a, const UnaryOperation unary_op) { + sycl::vec v0{a}; + auto v1 = v0.as(); + auto v2 = unary_op(v1); + v0 = v2.template as>(); + return v0; +} + +/// Compute vectorized absolute difference for two values without modulo +/// overflow, with each value treated as a vector type \p VecT. +/// \tparam [in] VecT The type of the vector +/// \param [in] a The first value +/// \param [in] b The second value +/// \returns The vectorized absolute difference of the two values +template +inline unsigned vectorized_sum_abs_diff(unsigned a, unsigned b) { + sycl::vec v0{a}, v1{b}; + // Need convert element type to wider signed type to avoid overflow. 
+ auto v2 = v0.as().template convert(); + auto v3 = v1.as().template convert(); + auto v4 = sycl::abs_diff(v2, v3); + unsigned sum = 0; + for (size_t i = 0; i < v4.size(); ++i) { + sum += v4[i]; + } + return sum; +} + +/// Compute vectorized isgreater for two values, with each value treated as a +/// vector type \p S +/// \param [in] S The type of the vector +/// \param [in] T The type of the original values +/// \param [in] a The first value +/// \param [in] b The second value +/// \returns The vectorized greater than of the two values +template inline T vectorized_isgreater(T a, T b) { + sycl::vec v0{a}, v1{b}; + auto v2 = v0.template as(); + auto v3 = v1.template as(); + auto v4 = sycl::isgreater(v2, v3); + v0 = v4.template as>(); + return v0; +} + +/// Compute vectorized isgreater for two unsigned int values, with each value +/// treated as a vector of two unsigned short +/// \param [in] a The first value +/// \param [in] b The second value +/// \returns The vectorized greater than of the two values +template <> +inline unsigned vectorized_isgreater(unsigned a, + unsigned b) { + sycl::vec v0{a}, v1{b}; + auto v2 = v0.template as(); + auto v3 = v1.template as(); + sycl::ushort2 v4; + v4[0] = v2[0] > v3[0]; + v4[1] = v2[1] > v3[1]; + v0 = v4.template as>(); + return v0; +} + +/// Returns min(max(val, min_val), max_val) +/// \param [in] val The input value +/// \param [in] min_val The minimum value +/// \param [in] max_val The maximum value +/// \returns the value between min_val and max_val +template +inline ValueT clamp(ValueT val, ValueT min_val, ValueT max_val) { + return detail::clamp(val, min_val, max_val); +} + +/// Determine whether 2 element value is NaN. +/// \param [in] a The input value +/// \returns the comparison result +template +inline std::enable_if_t isnan(const ValueT a) { + return {detail::isnan(a[0]), detail::isnan(a[1])}; +} + +/// cbrt function wrapper. +template +inline std::enable_if_t || + std::is_same_v, + ValueT> +cbrt(ValueT val) { + return sycl::cbrt(static_cast(val)); +} + +// min/max function overloads. +// For floating-point types, `float` or `double` arguments are acceptable. +// For integer types, `std::uint32_t`, `std::int32_t`, `std::uint64_t` or +// `std::int64_t` type arguments are acceptable. +// sycl::half supported as well, and sycl::ext::oneapi::bfloat16 if available. +template +inline std::enable_if_t && + std::is_integral_v, + std::common_type_t> +min(ValueT a, ValueU b) { + return sycl::min(static_cast>(a), + static_cast>(b)); +} + +template +inline std::enable_if_t && + compat::is_floating_point_v, + std::common_type_t> +min(ValueT a, ValueU b) { + if constexpr (std::is_same_v, + sycl::ext::oneapi::bfloat16>) { + static_assert(detail::support_bfloat16_math); + return sycl::ext::oneapi::experimental::fmin( + static_cast>(a), + static_cast>(b)); + } else { + return sycl::fmin(static_cast>(a), + static_cast>(b)); + } +} + +template +inline std::enable_if_t && + std::is_integral_v, + std::common_type_t> +max(ValueT a, ValueU b) { + return sycl::max(static_cast>(a), + static_cast>(b)); +} +template +inline std::enable_if_t && + compat::is_floating_point_v, + std::common_type_t> +max(ValueT a, ValueU b) { + if constexpr (std::is_same_v, + sycl::ext::oneapi::bfloat16>) { + static_assert(detail::support_bfloat16_math); + return sycl::ext::oneapi::experimental::fmax( + static_cast>(a), + static_cast>(b)); + } else { + return sycl::fmax(static_cast>(a), + static_cast>(b)); + } +} + +/// Performs 2 elements comparison and returns the bigger one. 
If either of +/// inputs is NaN, then return NaN. +/// \param [in] a The first value +/// \param [in] b The second value +/// \returns the bigger value +template +inline std::common_type_t fmax_nan(const ValueT a, + const ValueU b) { + if (detail::isnan(a) || detail::isnan(b)) + return NAN; + return compat::max(a, b); +} + +template +inline sycl::vec, 2> +fmax_nan(const sycl::vec a, const sycl::vec b) { + return {fmax_nan(a[0], b[0]), fmax_nan(a[1], b[1])}; +} + +template +inline sycl::marray, 2> +fmax_nan(const sycl::marray a, const sycl::marray b) { + return {fmax_nan(a[0], b[0]), fmax_nan(a[1], b[1])}; +} + +/// Performs 2 elements comparison and returns the smaller one. If either of +/// inputs is NaN, then return NaN. +/// \param [in] a The first value +/// \param [in] b The second value +/// \returns the smaller value +template +inline std::common_type_t fmin_nan(const ValueT a, + const ValueU b) { + if (detail::isnan(a) || detail::isnan(b)) + return NAN; + return compat::min(a,b); +} + +template +inline sycl::vec, 2> +fmin_nan(const sycl::vec a, const sycl::vec b) { + return {fmin_nan(a[0], b[0]), fmin_nan(a[1], b[1])}; +} + +template +inline sycl::marray, 2> +fmin_nan(const sycl::marray a, const sycl::marray b) { + return {fmin_nan(a[0], b[0]), fmin_nan(a[1], b[1])}; +} + +// pow functions overload. +inline float pow(const float a, const int b) { return sycl::pown(a, b); } +inline double pow(const double a, const int b) { return sycl::pown(a, b); } + +template +inline typename std::enable_if_t, ValueT> +pow(const ValueT a, const ValueU b) { + return sycl::pow(a, static_cast(b)); +} +// TODO(compat-lib-reviewers) calling pow with non-floating point values +// is currently defaulting to double, which fails on devices without +// aspect::fp64. This has to be properly documented, and maybe changed to +// support all devices. +template +inline typename std::enable_if_t, double> +pow(const ValueT a, const ValueU b) { + return sycl::pow(static_cast(a), static_cast(b)); +} + +/// Performs relu saturation. +/// \param [in] a The input value +/// \returns the relu saturation result +template inline ValueT relu(const ValueT a) { + if constexpr (compat::is_floating_point_v) + if (detail::isnan(a)) + return a; + if (a < ValueT(0)) + return ValueT(0); + return a; +} +template +inline sycl::vec +relu(const sycl::vec a) { + sycl::vec ret; + for (int i = 0; i < NumElements; ++i) + ret[i] = relu(a[i]); + return ret; +} +template +inline sycl::marray relu(const sycl::marray a) { + return {relu(a[0]), relu(a[1])}; +} + +/// Computes the multiplication of two complex numbers. +/// \tparam T Complex element type +/// \param [in] x The first input complex number +/// \param [in] y The second input complex number +/// \returns The result +template +sycl::vec cmul(sycl::vec x, sycl::vec y) { + sycl::ext::oneapi::experimental::complex t1(x[0], x[1]), t2(y[0], y[1]); + t1 = t1 * t2; + return sycl::vec(t1.real(), t1.imag()); +} + +/// Computes the division of two complex numbers. +/// \tparam T Complex element type +/// \param [in] x The first input complex number +/// \param [in] y The second input complex number +/// \returns The result +template +sycl::vec cdiv(sycl::vec x, sycl::vec y) { + sycl::ext::oneapi::experimental::complex t1(x[0], x[1]), t2(y[0], y[1]); + t1 = t1 / t2; + return sycl::vec(t1.real(), t1.imag()); +} + +/// Computes the magnitude of a complex number. 
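// [Editor's note] Illustrative usage sketch, not part of the original patch.
// The complex helpers above treat a two-element vector as (real, imaginary).
// Made-up values, with sycl::float2 as the carrier type:
//
//   sycl::float2 x{1.0f, 2.0f};           // 1 + 2i
//   sycl::float2 y{3.0f, -1.0f};          // 3 - 1i
//   sycl::float2 p = compat::cmul(x, y);  // (5, 5):    (1 + 2i)(3 - 1i) = 5 + 5i
//   sycl::float2 q = compat::cdiv(x, y);  // (0.1, 0.7): (1 + 2i)/(3 - 1i)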
+/// \tparam T Complex element type +/// \param [in] x The input complex number +/// \returns The result +template T cabs(sycl::vec x) { + sycl::ext::oneapi::experimental::complex t(x[0], x[1]); + return sycl::ext::oneapi::experimental::abs(t); +} + +/// Computes the complex conjugate of a complex number. +/// \tparam T Complex element type +/// \param [in] x The input complex number +/// \returns The result +template sycl::vec conj(sycl::vec x) { + sycl::ext::oneapi::experimental::complex t(x[0], x[1]); + t = conj(t); + return sycl::vec(t.real(), t.imag()); +} + +/// Performs complex number multiply addition. +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns the operation result +template +inline sycl::vec cmul_add(const sycl::vec a, + const sycl::vec b, + const sycl::vec c) { + sycl::ext::oneapi::experimental::complex t(a[0], a[1]); + sycl::ext::oneapi::experimental::complex u(b[0], b[1]); + sycl::ext::oneapi::experimental::complex v(c[0], c[1]); + t = t * u + v; + return sycl::vec{t.real(), t.imag()}; +} +template +inline sycl::marray cmul_add(const sycl::marray a, + const sycl::marray b, + const sycl::marray c) { + sycl::ext::oneapi::experimental::complex t(a[0], a[1]); + sycl::ext::oneapi::experimental::complex u(b[0], b[1]); + sycl::ext::oneapi::experimental::complex v(c[0], c[1]); + t = t * u + v; + return sycl::marray{t.real(), t.imag()}; +} + +/// A sycl::abs wrapper functors. +struct abs { + template auto operator()(const ValueT x) const { + return sycl::abs(x); + } +}; + +/// A sycl::abs_diff wrapper functors. +struct abs_diff { + template + auto operator()(const ValueT x, const ValueT y) const { + return sycl::abs_diff(x, y); + } +}; + +/// A sycl::add_sat wrapper functors. +struct add_sat { + template + auto operator()(const ValueT x, const ValueT y) const { + return sycl::add_sat(x, y); + } +}; + +/// A sycl::rhadd wrapper functors. +struct rhadd { + template + auto operator()(const ValueT x, const ValueT y) const { + return sycl::rhadd(x, y); + } +}; + +/// A sycl::hadd wrapper functors. +struct hadd { + template + auto operator()(const ValueT x, const ValueT y) const { + return sycl::hadd(x, y); + } +}; + +/// A sycl::max wrapper functors. +struct maximum { + template + auto operator()(const ValueT x, const ValueT y) const { + return sycl::max(x, y); + } + template + auto operator()(const ValueT x, const ValueT y, bool *pred) const { + return (x >= y) ? ((*pred = true), x) : ((*pred = false), y); + } +}; + +/// A sycl::min wrapper functors. +struct minimum { + template + auto operator()(const ValueT x, const ValueT y) const { + return sycl::min(x, y); + } + template + auto operator()(const ValueT x, const ValueT y, bool *pred) const { + return (x <= y) ? ((*pred = true), x) : ((*pred = false), y); + } +}; + +/// A sycl::sub_sat wrapper functors. +struct sub_sat { + template + auto operator()(const ValueT x, const ValueT y) const { + return sycl::sub_sat(x, y); + } +}; + +namespace detail { +struct shift_left { + template + auto operator()(const T x, const uint32_t offset) const { + return x << offset; + } +}; + +struct shift_right { + template + auto operator()(const T x, const uint32_t offset) const { + return x >> offset; + } +}; + +struct average { + template auto operator()(const T x, const T y) const { + return (x + y + (x + y >= 0)) >> 1; + } +}; + +} // namespace detail + +/// Compute vectorized binary operation value for two/four values, with each +/// treated as a vector type \p VecT. 
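// [Editor's note] Illustrative usage sketch, not part of the original patch.
// The wrapper functors above are primarily meant to be passed into the
// vectorized_* and extend_* helpers in this header, but they can also be
// called directly. Made-up values:
//
//   bool pred = false;
//   auto hi = compat::maximum()(3, 7, &pred); // hi == 7, pred == false (3 < 7)
//   auto lo = compat::sub_sat()(2u, 5u);      // == 0u, unsigned subtraction saturates at 0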
+/// \tparam [in] VecT The type of the vector +/// \tparam [in] BinaryOperation The binary operation class +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] binary_op The operation to do with the two values +/// \param [in] need_relu Whether the result need relu saturation +/// \returns The vectorized binary operation value of the two values +template +inline unsigned vectorized_binary(unsigned a, unsigned b, + const BinaryOperation binary_op, + [[maybe_unused]] bool need_relu = false) { + sycl::vec v0{a}, v1{b}; + auto v2 = v0.as(); + auto v3 = v1.as(); + auto v4 = + detail::vectorized_binary()(v2, v3, binary_op); + if (need_relu) + v4 = relu(v4); + v0 = v4.template as>(); + return v0; +} + +/// Compute two vectorized binary operation value with pred for three values, +/// with each value treated as a 2 \p T type elements vector type. +/// +/// \tparam [in] VecT The type of the vector +/// \tparam [in] BinaryOperation1 The first binary operation class +/// \tparam [in] BinaryOperation2 The second binary operation class +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \param [in] binary_op1 The first operation to do with the first two values +/// \param [in] binary_op2 The second operation to do with the third values +/// \param [in] need_relu Whether the result need relu saturation +/// \returns The two vectorized binary operation value of the three values +template +inline unsigned vectorized_ternary(unsigned a, unsigned b, unsigned c, + const BinaryOperation1 binary_op1, + const BinaryOperation2 binary_op2, + bool need_relu = false) { + const auto v1 = sycl::vec(a).as(); + const auto v2 = sycl::vec(b).as(); + const auto v3 = sycl::vec(c).as(); + auto v4 = + detail::vectorized_binary()(v1, v2, binary_op1); + v4 = detail::vectorized_binary()(v4, v3, binary_op2); + if (need_relu) + v4 = relu(v4); + return v4.template as>(); +} + +/// Compute vectorized binary operation value with pred for two values, with +/// each value treated as a 2 \p T type elements vector type. +/// +/// \tparam [in] VecT The type of the vector +/// \tparam [in] BinaryOperation The binary operation class +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] binary_op The operation with pred to do with the two values +/// \param [out] pred_hi The pred pointer that pass into high halfword operation +/// \param [out] pred_lo The pred pointer that pass into low halfword operation +/// \returns The vectorized binary operation value of the two values +template +inline unsigned vectorized_binary_with_pred(unsigned a, unsigned b, + const BinaryOperation binary_op, + bool *pred_hi, bool *pred_lo) { + auto v1 = sycl::vec(a).as(); + auto v2 = sycl::vec(b).as(); + VecT ret; + ret[0] = binary_op(v1[0], v2[0], pred_lo); + ret[1] = binary_op(v1[1], v2[1], pred_hi); + return ret.template as>(); +} + +template +using dot_product_acc_t = + std::conditional_t && std::is_unsigned_v, + uint32_t, int32_t>; + +namespace detail { + +template sycl::vec extract_and_sign_or_zero_extend4(T val) { + return sycl::vec(val) + .template as, int8_t, uint8_t>, 4>>() + .template convert(); +} + +template sycl::vec extract_and_sign_or_zero_extend2(T val) { + return sycl::vec(val) + .template as, int16_t, uint16_t>, 2>>() + .template convert(); +} + +} // namespace detail + +/// Two-way dot product-accumulate. 
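// [Editor's note] Illustrative usage sketch, not part of the original patch.
// vectorized_binary applies a functor lane-wise to a value reinterpreted as a
// short SYCL vector; the packed vector type is supplied as a template argument
// (assumed here, following the upstream SYCLcompat interface). Made-up values,
// two signed 16-bit lanes packed into an unsigned:
//
//   unsigned a = 0x7FFF0001u; // lanes {0x0001, 0x7FFF}
//   unsigned b = 0x00010002u; // lanes {0x0002, 0x0001}
//   unsigned r = compat::vectorized_binary<sycl::short2>(a, b, compat::add_sat());
//   // r == 0x7FFF0003u: the high lane saturates at INT16_MAX, the low lane is 3.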
Calculate and return integer_vector2( +/// \param a) dot product integer_vector2(low16_bit( \param b)) + \param c +/// +/// \tparam [in] T1 The type of first value. +/// \tparam [in] T2 The type of second value. +/// \param [in] a The first value. +/// \param [in] b The second value. +/// \param [in] c The third value. It has type uint32_t if both T1 and T1 are +/// uint32_t else has type int32_t. +/// \return Two-way 16-bit to 8-bit dot product which is accumulated in 32-bit +/// result. +template +inline dot_product_acc_t dp2a_lo(T1 a, T2 b, + dot_product_acc_t c) { + static_assert(detail::is_int32_type && detail::is_int32_type, + "[Compat] dp2a_lo expects 32-bit integers as operands."); +#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && \ + defined(__SYCL_CUDA_ARCH__) && __SYCL_CUDA_ARCH__ >= 610 + dot_product_acc_t res; + if constexpr (std::is_signed_v && std::is_signed_v) { + asm volatile("dp2a.lo.s32.s32 %0, %1, %2, %3;" + : "=r"(res) + : "r"(a), "r"(b), "r"(c)); + } else if constexpr (std::is_signed_v && std::is_unsigned_v) { + asm volatile("dp2a.lo.s32.u32 %0, %1, %2, %3;" + : "=r"(res) + : "r"(a), "r"(b), "r"(c)); + } else if constexpr (std::is_unsigned_v && std::is_signed_v) { + asm volatile("dp2a.lo.u32.s32 %0, %1, %2, %3;" + : "=r"(res) + : "r"(a), "r"(b), "r"(c)); + } else { + asm volatile("dp2a.lo.u32.u32 %0, %1, %2, %3;" + : "=r"(res) + : "r"(a), "r"(b), "r"(c)); + } + return res; +#else + dot_product_acc_t res = c; + auto va = detail::extract_and_sign_or_zero_extend2(a); + auto vb = detail::extract_and_sign_or_zero_extend4(b); + res += va[0] * vb[0]; + res += va[1] * vb[1]; + return res; +#endif +} + +/// Two-way dot product-accumulate. Calculate and return integer_vector2( +/// \param a) dot product integer_vector2(high_16bit( \param b)) + \param c +/// +/// \tparam [in] T1 The type of first value. +/// \tparam [in] T2 The type of second value. +/// \param [in] a The first value. +/// \param [in] b The second value. +/// \param [in] c The third value. uint32_t if both T1 and T1 are +/// uint32_t else has type int32_t. +/// \return Two-way 16-bit to 8-bit dot product which is accumulated in 32-bit +/// result. +template +inline dot_product_acc_t dp2a_hi(T1 a, T2 b, + dot_product_acc_t c) { + static_assert(detail::is_int32_type && detail::is_int32_type, + "[Compat] dp2a_hi expects 32-bit integers as operands."); +#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && \ + defined(__SYCL_CUDA_ARCH__) && __SYCL_CUDA_ARCH__ >= 610 + dot_product_acc_t res; + if constexpr (std::is_signed_v && std::is_signed_v) { + asm volatile("dp2a.hi.s32.s32 %0, %1, %2, %3;" + : "=r"(res) + : "r"(a), "r"(b), "r"(c)); + } else if constexpr (std::is_signed_v && std::is_unsigned_v) { + asm volatile("dp2a.hi.s32.u32 %0, %1, %2, %3;" + : "=r"(res) + : "r"(a), "r"(b), "r"(c)); + } else if constexpr (std::is_unsigned_v && std::is_signed_v) { + asm volatile("dp2a.hi.u32.s32 %0, %1, %2, %3;" + : "=r"(res) + : "r"(a), "r"(b), "r"(c)); + } else { + asm volatile("dp2a.hi.u32.u32 %0, %1, %2, %3;" + : "=r"(res) + : "r"(a), "r"(b), "r"(c)); + } + return res; +#else + dot_product_acc_t res = c; + auto va = detail::extract_and_sign_or_zero_extend2(a); + auto vb = detail::extract_and_sign_or_zero_extend4(b); + res += va[0] * vb[2]; + res += va[1] * vb[3]; + return res; +#endif +} + +/// Four-way byte dot product-accumulate. Calculate and return integer_vector4( +/// \param a) dot product integer_vector4( \param b) + \param c +/// +/// \tparam [in] T1 The type of first value. 
+/// \tparam [in] T2 The type of second value. +/// \param [in] a The first value. +/// \param [in] b The second value. +/// \param [in] c The third value. It has type uint32_t if both T1 and T1 are +/// uint32_t else has type int32_t. +/// \return Four-way byte dot product which is accumulated in 32-bit result. +template +inline dot_product_acc_t dp4a(T1 a, T2 b, dot_product_acc_t c) { + static_assert(detail::is_int32_type && detail::is_int32_type, + "[Compat] dp4a expects 32-bit integers as operands."); +#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && \ + defined(__SYCL_CUDA_ARCH__) && __SYCL_CUDA_ARCH__ >= 610 + dot_product_acc_t res; + if constexpr (std::is_signed_v && std::is_signed_v) { + asm volatile("dp4a.s32.s32 %0, %1, %2, %3;" + : "=r"(res) + : "r"(a), "r"(b), "r"(c)); + } else if constexpr (std::is_signed_v && std::is_unsigned_v) { + asm volatile("dp4a.s32.u32 %0, %1, %2, %3;" + : "=r"(res) + : "r"(a), "r"(b), "r"(c)); + } else if constexpr (std::is_unsigned_v && std::is_signed_v) { + asm volatile("dp4a.u32.s32 %0, %1, %2, %3;" + : "=r"(res) + : "r"(a), "r"(b), "r"(c)); + } else { + asm volatile("dp4a.u32.u32 %0, %1, %2, %3;" + : "=r"(res) + : "r"(a), "r"(b), "r"(c)); + } + return res; +#else + dot_product_acc_t res = c; + auto va = detail::extract_and_sign_or_zero_extend4(a); + auto vb = detail::extract_and_sign_or_zero_extend4(b); + res += va[0] * vb[0]; + res += va[1] * vb[1]; + res += va[2] * vb[2]; + res += va[3] * vb[3]; + return res; +#endif +} + +/// Extend \p a and \p b to 33 bit and add them. +/// \tparam [in] RetT The type of the return value +/// \tparam [in] AT The type of the first value +/// \tparam [in] BT The type of the second value +/// \param [in] a The first value +/// \param [in] b The second value +/// \returns The extend addition of the two values +template +inline constexpr RetT extend_add(AT a, BT b) { + return detail::extend_binary(a, b, std::plus()); +} + +/// Extend Inputs to 33 bit, add \p a, \p b, then do \p second_op with \p c. +/// \tparam [in] RetT The type of the return value +/// \tparam [in] AT The type of the first value +/// \tparam [in] BT The type of the second value +/// \tparam [in] CT The type of the third value +/// \tparam [in] BinaryOperation The type of the second operation +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \param [in] second_op The operation to do with the third value +/// \returns The extend addition of \p a, \p b and \p second_op with \p c +template +inline constexpr RetT extend_add(AT a, BT b, CT c, BinaryOperation second_op) { + return detail::extend_binary(a, b, c, std::plus(), second_op); +} + +/// Extend \p a and \p b to 33 bit and add them with saturation. +/// \tparam [in] RetT The type of the return value +/// \tparam [in] AT The type of the first value +/// \tparam [in] BT The type of the second value +/// \param [in] a The first value +/// \param [in] b The second value +/// \returns The extend addition of the two values with saturation +template +inline constexpr RetT extend_add_sat(AT a, BT b) { + return detail::extend_binary(a, b, std::plus()); +} + +/// Extend Inputs to 33 bit, add \p a, \p b with saturation, then do \p +/// second_op with \p c. 
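// [Editor's note] Illustrative usage sketch, not part of the original patch.
// dp4a treats each 32-bit operand as four packed bytes and accumulates their
// dot product into c; dp2a_lo pairs the two 16-bit halves of a with the low
// two bytes of b. Made-up values:
//
//   unsigned a = 0x01020304u;          // bytes {4, 3, 2, 1}
//   unsigned b = 0x01010101u;          // bytes {1, 1, 1, 1}
//   auto r4 = compat::dp4a(a, b, 0u);              // 4 + 3 + 2 + 1 + 0 == 10
//   auto r2 = compat::dp2a_lo(0x00020001u, b, 0u); // 1*1 + 2*1 + 0 == 3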
+/// \tparam [in] RetT The type of the return value +/// \tparam [in] AT The type of the first value +/// \tparam [in] BT The type of the second value +/// \tparam [in] CT The type of the third value +/// \tparam [in] BinaryOperation The type of the second operation +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \param [in] second_op The operation to do with the third value +/// \returns The extend addition of \p a, \p b with saturation and \p second_op +/// with \p c +template +inline constexpr RetT extend_add_sat(AT a, BT b, CT c, + BinaryOperation second_op) { + return detail::extend_binary(a, b, c, std::plus(), second_op); +} + +/// Extend \p a and \p b to 33 bit and minus them. +/// \tparam [in] RetT The type of the return value +/// \tparam [in] AT The type of the first value +/// \tparam [in] BT The type of the second value +/// \param [in] a The first value +/// \param [in] b The second value +/// \returns The extend subtraction of the two values +template +inline constexpr RetT extend_sub(AT a, BT b) { + return detail::extend_binary(a, b, std::minus()); +} + +/// Extend Inputs to 33 bit, minus \p a, \p b, then do \p second_op with \p c. +/// \tparam [in] RetT The type of the return value +/// \tparam [in] AT The type of the first value +/// \tparam [in] BT The type of the second value +/// \tparam [in] CT The type of the third value +/// \tparam [in] BinaryOperation The type of the second operation +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \param [in] second_op The operation to do with the third value +/// \returns The extend subtraction of \p a, \p b and \p second_op with \p c +template +inline constexpr RetT extend_sub(AT a, BT b, CT c, BinaryOperation second_op) { + return detail::extend_binary(a, b, c, std::minus(), second_op); +} + +/// Extend \p a and \p b to 33 bit and minus them with saturation. +/// \tparam [in] RetT The type of the return value +/// \tparam [in] AT The type of the first value +/// \tparam [in] BT The type of the second value +/// \param [in] a The first value +/// \param [in] b The second value +/// \returns The extend subtraction of the two values with saturation +template +inline constexpr RetT extend_sub_sat(AT a, BT b) { + return detail::extend_binary(a, b, std::minus()); +} + +/// Extend Inputs to 33 bit, minus \p a, \p b with saturation, then do \p +/// second_op with \p c. +/// \tparam [in] RetT The type of the return value +/// \tparam [in] AT The type of the first value +/// \tparam [in] BT The type of the second value +/// \tparam [in] CT The type of the third value +/// \tparam [in] BinaryOperation The type of the second operation +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \param [in] second_op The operation to do with the third value +/// \returns The extend subtraction of \p a, \p b with saturation and \p +/// second_op with \p c +template +inline constexpr RetT extend_sub_sat(AT a, BT b, CT c, + BinaryOperation second_op) { + return detail::extend_binary(a, b, c, std::minus(), second_op); +} + +/// Extend \p a and \p b to 33 bit and do abs_diff. 
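// [Editor's note] Illustrative usage sketch, not part of the original patch.
// The extend_* helpers widen both operands before the operation so the
// intermediate result cannot wrap, and the *_sat forms clamp the result to the
// range of the requested return type (passed as the leading template argument,
// an assumption carried over from the upstream interface). Made-up values:
//
//   auto d = compat::extend_sub<int>(10, 42);          // == -32
//   auto s = compat::extend_add_sat<int>(INT_MAX, 1);  // clamped to INT_MAX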
+/// \tparam [in] RetT The type of the return value +/// \tparam [in] AT The type of the first value +/// \tparam [in] BT The type of the second value +/// \param [in] a The first value +/// \param [in] b The second value +/// \returns The extend abs_diff of the two values +template +inline constexpr RetT extend_absdiff(AT a, BT b) { + return detail::extend_binary(a, b, abs_diff()); +} + +/// Extend Inputs to 33 bit, abs_diff \p a, \p b, then do \p second_op with \p +/// c. +/// \tparam [in] RetT The type of the return value +/// \tparam [in] AT The type of the first value +/// \tparam [in] BT The type of the second value +/// \tparam [in] CT The type of the third value +/// \tparam [in] BinaryOperation The type of the second operation +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \param [in] second_op The operation to do with the third value +/// \returns The extend abs_diff of \p a, \p b and \p second_op with \p c +template +inline constexpr RetT extend_absdiff(AT a, BT b, CT c, + BinaryOperation second_op) { + return detail::extend_binary(a, b, c, abs_diff(), second_op); +} + +/// Extend \p a and \p b to 33 bit and do abs_diff with saturation. +/// \tparam [in] RetT The type of the return value +/// \tparam [in] AT The type of the first value +/// \tparam [in] BT The type of the second value +/// \param [in] a The first value +/// \param [in] b The second value +/// \returns The extend abs_diff of the two values with saturation +template +inline constexpr RetT extend_absdiff_sat(AT a, BT b) { + return detail::extend_binary(a, b, abs_diff()); +} + +/// Extend Inputs to 33 bit, abs_diff \p a, \p b with saturation, then do \p +/// second_op with \p c. +/// \tparam [in] RetT The type of the return value +/// \tparam [in] AT The type of the first value +/// \tparam [in] BT The type of the second value +/// \tparam [in] CT The type of the third value +/// \tparam [in] BinaryOperation The type of the second operation +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \param [in] second_op The operation to do with the third value +/// \returns The extend abs_diff of \p a, \p b with saturation and \p +/// second_op with \p c +template +inline constexpr RetT extend_absdiff_sat(AT a, BT b, CT c, + BinaryOperation second_op) { + return detail::extend_binary(a, b, c, abs_diff(), second_op); +} + +/// Extend \p a and \p b to 33 bit and return smaller one. +/// \tparam [in] RetT The type of the return value +/// \tparam [in] AT The type of the first value +/// \tparam [in] BT The type of the second value +/// \param [in] a The first value +/// \param [in] b The second value +/// \returns The smaller one of the two extended values +template +inline constexpr RetT extend_min(AT a, BT b) { + return detail::extend_binary(a, b, minimum()); +} + +/// Extend Inputs to 33 bit, find the smaller one in \p a, \p b, then do \p +/// second_op with \p c. 
+/// \tparam [in] RetT The type of the return value +/// \tparam [in] AT The type of the first value +/// \tparam [in] BT The type of the second value +/// \tparam [in] CT The type of the third value +/// \tparam [in] BinaryOperation The type of the second operation +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \param [in] second_op The operation to do with the third value +/// \returns The smaller one of \p a, \p b and \p second_op with \p c +template +inline constexpr RetT extend_min(AT a, BT b, CT c, BinaryOperation second_op) { + return detail::extend_binary(a, b, c, minimum(), second_op); +} + +/// Extend \p a and \p b to 33 bit and return smaller one with saturation. +/// \tparam [in] RetT The type of the return value +/// \tparam [in] AT The type of the first value +/// \tparam [in] BT The type of the second value +/// \param [in] a The first value +/// \param [in] b The second value +/// \returns The smaller one of the two extended values with saturation +template +inline constexpr RetT extend_min_sat(AT a, BT b) { + return detail::extend_binary(a, b, minimum()); +} + +/// Extend Inputs to 33 bit, find the smaller one in \p a, \p b with saturation, +/// then do \p second_op with \p c. +/// \tparam [in] RetT The type of the return value +/// \tparam [in] AT The type of the first value +/// \tparam [in] BT The type of the second value +/// \tparam [in] CT The type of the third value +/// \tparam [in] BinaryOperation The type of the second operation +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \param [in] second_op The operation to do with the third value +/// \returns The smaller one of \p a, \p b with saturation and \p +/// second_op with \p c +template +inline constexpr RetT extend_min_sat(AT a, BT b, CT c, + BinaryOperation second_op) { + return detail::extend_binary(a, b, c, minimum(), second_op); +} + +/// Extend \p a and \p b to 33 bit and return bigger one. +/// \tparam [in] RetT The type of the return value +/// \tparam [in] AT The type of the first value +/// \tparam [in] BT The type of the second value +/// \param [in] a The first value +/// \param [in] b The second value +/// \returns The bigger one of the two extended values +template +inline constexpr RetT extend_max(AT a, BT b) { + return detail::extend_binary(a, b, maximum()); +} + +/// Extend Inputs to 33 bit, find the bigger one in \p a, \p b, then do \p +/// second_op with \p c. +/// \tparam [in] RetT The type of the return value +/// \tparam [in] AT The type of the first value +/// \tparam [in] BT The type of the second value +/// \tparam [in] CT The type of the third value +/// \tparam [in] BinaryOperation The type of the second operation +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \param [in] second_op The operation to do with the third value +/// \returns The bigger one of \p a, \p b and \p second_op with \p c +template +inline constexpr RetT extend_max(AT a, BT b, CT c, BinaryOperation second_op) { + return detail::extend_binary(a, b, c, maximum(), second_op); +} + +/// Extend \p a and \p b to 33 bit and return bigger one with saturation. 
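// [Editor's note] Illustrative usage sketch, not part of the original patch.
// Because each operand is widened with its own signedness before the
// comparison, mixing signed and unsigned inputs gives the mathematically
// expected result rather than the usual unsigned-promotion surprise.
// Made-up values; the explicit return-type argument is an assumption:
//
//   auto d = compat::extend_absdiff<int>(-3, 4u);  // == 7
//   auto m = compat::extend_min<int>(-3, 4u);      // == -3 (plain std::min would promote -3 to unsigned)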
+/// \tparam [in] RetT The type of the return value +/// \tparam [in] AT The type of the first value +/// \tparam [in] BT The type of the second value +/// \param [in] a The first value +/// \param [in] b The second value +/// \returns The bigger one of the two extended values with saturation +template +inline constexpr RetT extend_max_sat(AT a, BT b) { + return detail::extend_binary(a, b, maximum()); +} + +/// Extend Inputs to 33 bit, find the bigger one in \p a, \p b with saturation, +/// then do \p second_op with \p c. +/// \tparam [in] RetT The type of the return value +/// \tparam [in] AT The type of the first value +/// \tparam [in] BT The type of the second value +/// \tparam [in] CT The type of the third value +/// \tparam [in] BinaryOperation The type of the second operation +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \param [in] second_op The operation to do with the third value +/// \returns The bigger one of \p a, \p b with saturation and \p +/// second_op with \p c +template +inline constexpr RetT extend_max_sat(AT a, BT b, CT c, + BinaryOperation second_op) { + return detail::extend_binary(a, b, c, maximum(), second_op); +} + +/// Extend \p a and \p b to 33 bit and return a << clamp(b, 0, 32). +/// \param [in] a The source value +/// \param [in] b The offset to shift +/// \returns a << clamp(b, 0, 32) +template +inline constexpr RetT extend_shl_clamp(T a, uint32_t b) { + return detail::extend_binary(a, sycl::clamp(b, 0u, 32u), + detail::shift_left()); +} + +/// Extend Inputs to 33 bit, and return second_op(a << clamp(b, 0, 32), c). +/// \param [in] a The source value +/// \param [in] b The offset to shift +/// \param [in] c The value to merge +/// \param [in] second_op The operation to do with the third value +/// \returns second_op(a << clamp(b, 0, 32), c) +template +inline constexpr RetT extend_shl_clamp(T a, uint32_t b, uint32_t c, + BinaryOperation second_op) { + return detail::extend_binary(a, sycl::clamp(b, 0u, 32u), c, + detail::shift_left(), second_op); +} + +/// Extend \p a and \p b to 33 bit and return sat(a << clamp(b, 0, 32)). +/// \param [in] a The source value +/// \param [in] b The offset to shift +/// \returns sat(a << clamp(b, 0, 32)) +template +inline constexpr RetT extend_shl_sat_clamp(T a, uint32_t b) { + return detail::extend_binary(a, sycl::clamp(b, 0u, 32u), + detail::shift_left()); +} + +/// Extend Inputs to 33 bit, and return second_op(sat(a << clamp(b, 0, 32)), c). +/// \param [in] a The source value +/// \param [in] b The offset to shift +/// \param [in] c The value to merge +/// \param [in] second_op The operation to do with the third value +/// \returns second_op(sat(a << clamp(b, 0, 32)), c) +template +inline constexpr RetT extend_shl_sat_clamp(T a, uint32_t b, uint32_t c, + BinaryOperation second_op) { + return detail::extend_binary(a, sycl::clamp(b, 0u, 32u), c, + detail::shift_left(), second_op); +} + +/// Extend \p a and \p b to 33 bit and return a << (b & 0x1F). +/// \param [in] a The source value +/// \param [in] b The offset to shift +/// \returns a << (b & 0x1F) +template +inline constexpr RetT extend_shl_wrap(T a, uint32_t b) { + return detail::extend_binary(a, b & 0x1F, detail::shift_left()); +} + +/// Extend Inputs to 33 bit, and return second_op(a << (b & 0x1F), c). 
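// [Editor's note] Illustrative usage sketch, not part of the original patch.
// The *_clamp shifts limit the shift amount to [0, 32], while the *_wrap
// variants keep only its low five bits (b & 0x1F). Made-up values:
//
//   auto w = compat::extend_shl_wrap<unsigned>(1u, 40u);  // shifts by 40 & 0x1F == 8, result 256
//   auto c = compat::extend_shl_clamp<unsigned>(1u, 40u); // shift amount clamped to 32,
//                                                         // so the set bit leaves the 32-bit result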
+/// \param [in] a The source value +/// \param [in] b The offset to shift +/// \param [in] c The value to merge +/// \param [in] second_op The operation to do with the third value +/// \returns second_op(a << (b & 0x1F), c) +template +inline constexpr RetT extend_shl_wrap(T a, uint32_t b, uint32_t c, + BinaryOperation second_op) { + return detail::extend_binary(a, b & 0x1F, c, + detail::shift_left(), second_op); +} + +/// Extend \p a and \p b to 33 bit and return sat(a << (b & 0x1F)). +/// \param [in] a The source value +/// \param [in] b The offset to shift +/// \returns sat(a << (b & 0x1F)) +template +inline constexpr RetT extend_shl_sat_wrap(T a, uint32_t b) { + return detail::extend_binary(a, b & 0x1F, detail::shift_left()); +} + +/// Extend Inputs to 33 bit, and return second_op(sat(a << (b & 0x1F)), c). +/// \param [in] a The source value +/// \param [in] b The offset to shift +/// \param [in] c The value to merge +/// \param [in] second_op The operation to do with the third value +/// \returns second_op(sat(a << (b & 0x1F)), c) +template +inline constexpr RetT extend_shl_sat_wrap(T a, uint32_t b, uint32_t c, + BinaryOperation second_op) { + return detail::extend_binary(a, b & 0x1F, c, detail::shift_left(), + second_op); +} + +/// Extend \p a and \p b to 33 bit and return a >> clamp(b, 0, 32). +/// \param [in] a The source value +/// \param [in] b The offset to shift +/// \returns a >> clamp(b, 0, 32) +template +inline constexpr RetT extend_shr_clamp(T a, uint32_t b) { + return detail::extend_binary(a, sycl::clamp(b, 0u, 32u), + detail::shift_right()); +} + +/// Extend Inputs to 33 bit, and return second_op(a >> clamp(b, 0, 32), c). +/// \param [in] a The source value +/// \param [in] b The offset to shift +/// \param [in] c The value to merge +/// \param [in] second_op The operation to do with the third value +/// \returns second_op(a >> clamp(b, 0, 32), c) +template +inline constexpr RetT extend_shr_clamp(T a, uint32_t b, uint32_t c, + BinaryOperation second_op) { + return detail::extend_binary(a, sycl::clamp(b, 0u, 32u), c, + detail::shift_right(), second_op); +} + +/// Extend \p a and \p b to 33 bit and return sat(a >> clamp(b, 0, 32)). +/// \param [in] a The source value +/// \param [in] b The offset to shift +/// \returns sat(a >> clamp(b, 0, 32)) +template +inline constexpr RetT extend_shr_sat_clamp(T a, uint32_t b) { + return detail::extend_binary(a, sycl::clamp(b, 0u, 32u), + detail::shift_right()); +} + +/// Extend Inputs to 33 bit, and return second_op(sat(a >> clamp(b, 0, 32)), c). +/// \param [in] a The source value +/// \param [in] b The offset to shift +/// \param [in] c The value to merge +/// \param [in] second_op The operation to do with the third value +/// \returns second_op(sat(a >> clamp(b, 0, 32)), c) +template +inline constexpr RetT extend_shr_sat_clamp(T a, uint32_t b, uint32_t c, + BinaryOperation second_op) { + return detail::extend_binary(a, sycl::clamp(b, 0u, 32u), c, + detail::shift_right(), second_op); +} + +/// Extend \p a and \p b to 33 bit and return a >> (b & 0x1F). +/// \param [in] a The source value +/// \param [in] b The offset to shift +/// \returns a >> (b & 0x1F) +template +inline constexpr RetT extend_shr_wrap(T a, uint32_t b) { + return detail::extend_binary(a, b & 0x1F, detail::shift_right()); +} + +/// Extend Inputs to 33 bit, and return second_op(a >> (b & 0x1F), c). 
+/// \param [in] a The source value +/// \param [in] b The offset to shift +/// \param [in] c The value to merge +/// \param [in] second_op The operation to do with the third value +/// \returns second_op(a >> (b & 0x1F), c) +template +inline constexpr RetT extend_shr_wrap(T a, uint32_t b, uint32_t c, + BinaryOperation second_op) { + return detail::extend_binary(a, b & 0x1F, c, + detail::shift_right(), second_op); +} + +/// Extend \p a and \p b to 33 bit and return sat(a >> (b & 0x1F)). +/// \param [in] a The source value +/// \param [in] b The offset to shift +/// \returns sat(a >> (b & 0x1F)) +template +inline constexpr RetT extend_shr_sat_wrap(T a, uint32_t b) { + return detail::extend_binary(a, b & 0x1F, detail::shift_right()); +} + +/// Extend Inputs to 33 bit, and return second_op(sat(a >> (b & 0x1F)), c). +/// \param [in] a The source value +/// \param [in] b The offset to shift +/// \param [in] c The value to merge +/// \param [in] second_op The operation to do with the third value +/// \returns second_op(sat(a >> (b & 0x1F)), c) +template +inline constexpr RetT extend_shr_sat_wrap(T a, uint32_t b, uint32_t c, + BinaryOperation second_op) { + return detail::extend_binary(a, b & 0x1F, c, + detail::shift_right(), second_op); +} + +/// Compute vectorized addition of \p a and \p b, with each value treated as a +/// 2 elements vector type and extend each element to 17 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized addition of the two values +template +inline constexpr RetT extend_vadd2(AT a, BT b, RetT c) { + return detail::extend_vbinary2(a, b, c, std::plus()); +} + +/// Compute vectorized addition of \p a and \p b, with each value treated as a 2 +/// elements vector type and extend each element to 17 bit. Then add each half +/// of the result and add with \p c. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The addition of each half of extend vectorized addition of the two +/// values and the third value +template +inline constexpr RetT extend_vadd2_add(AT a, BT b, RetT c) { + return detail::extend_vbinary2(a, b, c, std::plus()); +} + +/// Compute vectorized addition of \p a and \p b with saturation, with each +/// value treated as a 2 elements vector type and extend each element to 17 bit. 
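// [Editor's note] Illustrative usage sketch, not part of the original patch.
// extend_vadd2 adds the two 16-bit halves lane-wise (each lane is widened, so
// it cannot carry into its neighbour); extend_vadd2_add instead sums both lane
// results together with c. Made-up values, unsigned lanes:
//
//   unsigned a = 0x00030005u;                     // lanes {5, 3}
//   unsigned b = 0x00010002u;                     // lanes {2, 1}
//   auto v = compat::extend_vadd2(a, b, 0u);      // == 0x00040007u (lane-wise packing)
//   auto s = compat::extend_vadd2_add(a, b, 10u); // == (5 + 2) + (3 + 1) + 10 == 21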
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized addition of the two values with saturation +template +inline constexpr RetT extend_vadd2_sat(AT a, BT b, RetT c) { + return detail::extend_vbinary2(a, b, c, std::plus()); +} + +/// Compute vectorized subtraction of \p a and \p b, with each value treated as +/// a 2 elements vector type and extend each element to 17 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized subtraction of the two values +template +inline constexpr RetT extend_vsub2(AT a, BT b, RetT c) { + return detail::extend_vbinary2(a, b, c, std::minus()); +} + +/// Compute vectorized subtraction of \p a and \p b, with each value treated as +/// a 2 elements vector type and extend each element to 17 bit. Then add each +/// half of the result and add with \p c. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The addition of each half of extend vectorized subtraction of the +/// two values and the third value +template +inline constexpr RetT extend_vsub2_add(AT a, BT b, RetT c) { + return detail::extend_vbinary2(a, b, c, std::minus()); +} + +/// Compute vectorized subtraction of \p a and \p b with saturation, with each +/// value treated as a 2 elements vector type and extend each element to 17 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized subtraction of the two values with saturation +template +inline constexpr RetT extend_vsub2_sat(AT a, BT b, RetT c) { + return detail::extend_vbinary2(a, b, c, std::minus()); +} + +/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a 2 +/// elements vector type and extend each element to 17 bit. 
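// [Editor's note] Illustrative usage sketch, not part of the original patch.
// The same lane-wise scheme applies to subtraction; with signed operands each
// 16-bit lane is sign-extended first. Made-up values:
//
//   int a = 0x00030001; // lanes {1, 3}
//   int b = 0x00010005; // lanes {5, 1}
//   auto d = compat::extend_vsub2(a, b, 0);     // lane results {1 - 5, 3 - 1} == {-4, 2}
//   auto s = compat::extend_vsub2_add(a, b, 0); // (1 - 5) + (3 - 1) == -2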
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized abs_diff of the two values +template +inline constexpr RetT extend_vabsdiff2(AT a, BT b, RetT c) { + return detail::extend_vbinary2(a, b, c, abs_diff()); +} + +/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a 2 +/// elements vector type and extend each element to 17 bit. Then add each half +/// of the result and add with \p c. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The addition of each half of extend vectorized abs_diff of the +/// two values and the third value +template +inline constexpr RetT extend_vabsdiff2_add(AT a, BT b, RetT c) { + return detail::extend_vbinary2(a, b, c, abs_diff()); +} + +/// Compute vectorized abs_diff of \p a and \p b with saturation, with each +/// value treated as a 2 elements vector type and extend each element to 17 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized abs_diff of the two values with saturation +template +inline constexpr RetT extend_vabsdiff2_sat(AT a, BT b, RetT c) { + return detail::extend_vbinary2(a, b, c, abs_diff()); +} + +/// Compute vectorized minimum of \p a and \p b, with each value treated as a 2 +/// elements vector type and extend each element to 17 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized minimum of the two values +template +inline constexpr RetT extend_vmin2(AT a, BT b, RetT c) { + return detail::extend_vbinary2(a, b, c, minimum()); +} + +/// Compute vectorized minimum of \p a and \p b, with each value treated as a 2 +/// elements vector type and extend each element to 17 bit. Then add each half +/// of the result and add with \p c. 
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The addition of each half of extend vectorized minimum of the +/// two values and the third value +template +inline constexpr RetT extend_vmin2_add(AT a, BT b, RetT c) { + return detail::extend_vbinary2(a, b, c, minimum()); +} + +/// Compute vectorized minimum of \p a and \p b with saturation, with each value +/// treated as a 2 elements vector type and extend each element to 17 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized minimum of the two values with saturation +template +inline constexpr RetT extend_vmin2_sat(AT a, BT b, RetT c) { + return detail::extend_vbinary2(a, b, c, minimum()); +} + +/// Compute vectorized maximum of \p a and \p b, with each value treated as a 2 +/// elements vector type and extend each element to 17 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized maximum of the two values +template +inline constexpr RetT extend_vmax2(AT a, BT b, RetT c) { + return detail::extend_vbinary2(a, b, c, maximum()); +} + +/// Compute vectorized maximum of \p a and \p b, with each value treated as a 2 +/// elements vector type and extend each element to 17 bit. Then add each half +/// of the result and add with \p c. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The addition of each half of extend vectorized maximum of the +/// two values and the third value +template +inline constexpr RetT extend_vmax2_add(AT a, BT b, RetT c) { + return detail::extend_vbinary2(a, b, c, maximum()); +} + +/// Compute vectorized maximum of \p a and \p b with saturation, with each value +/// treated as a 2 elements vector type and extend each element to 17 bit. 
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized maximum of the two values with saturation
+template
+inline constexpr RetT extend_vmax2_sat(AT a, BT b, RetT c) {
+  return detail::extend_vbinary2(a, b, c, maximum());
+}
+
+/// Compute vectorized average of \p a and \p b, with each value treated as a 2
+/// elements vector type and extend each element to 17 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized average of the two values
+template
+inline constexpr RetT extend_vavrg2(AT a, BT b, RetT c) {
+  return detail::extend_vbinary2(a, b, c,
+                                 detail::average());
+}
+
+/// Compute vectorized average of \p a and \p b, with each value treated as a 2
+/// elements vector type and extend each element to 17 bit. Then add each half
+/// of the result and add with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The addition of each half of the extend vectorized average of the
+/// two values and the third value
+template
+inline constexpr RetT extend_vavrg2_add(AT a, BT b, RetT c) {
+  return detail::extend_vbinary2(a, b, c, detail::average());
+}
+
+/// Compute vectorized average of \p a and \p b with saturation, with each value
+/// treated as a 2 elements vector type and extend each element to 17 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized average of the two values with saturation
+template
+inline constexpr RetT extend_vavrg2_sat(AT a, BT b, RetT c) {
+  return detail::extend_vbinary2(a, b, c, detail::average());
+}
+
+/// Extend \p a and \p b to 33 bit and vectorized compare input values using
+/// specified comparison \p cmp .
+///
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \tparam [in] BinaryOperation The type of the compare operation
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] cmp The comparison operator
+/// \returns The comparison result of the two extended values.
+template
+inline constexpr unsigned extend_vcompare2(AT a, BT b, BinaryOperation cmp) {
+  return detail::extend_vbinary2(a, b, 0, cmp);
+}
+
+/// Extend Inputs to 33 bit, and vectorized compare input values using specified
+/// comparison \p cmp , then add the result with \p c .
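// [Editor's note] Illustrative usage sketch, not part of the original patch.
// extend_vcompare2 evaluates the comparison per 16-bit lane and packs one
// boolean (0 or 1) per lane, following the upstream behaviour; the *_add form
// accumulates the lane results onto c instead. Made-up values, using
// std::greater<> as the comparison:
//
//   unsigned a = 0x00050002u;  // lanes {2, 5}
//   unsigned b = 0x00030003u;  // lanes {3, 3}
//   auto m = compat::extend_vcompare2(a, b, std::greater<>());         // == 0x00010000u
//   auto n = compat::extend_vcompare2_add(a, b, 7u, std::greater<>()); // == 7 + 0 + 1 == 8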
+///
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \tparam [in] BinaryOperation The type of the compare operation
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \param [in] cmp The comparison operator
+/// \returns The comparison result of the two extended values, and add the
+/// result with \p c .
+template
+inline constexpr unsigned extend_vcompare2_add(AT a, BT b, unsigned c,
+                                               BinaryOperation cmp) {
+  return detail::extend_vbinary2(a, b, c, cmp);
+}
+
+/// Compute vectorized addition of \p a and \p b, with each value treated as a
+/// 4 elements vector type and extend each element to 9 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized addition of the two values
+template
+inline constexpr RetT extend_vadd4(AT a, BT b, RetT c) {
+  return detail::extend_vbinary4(a, b, c, std::plus());
+}
+
+/// Compute vectorized addition of \p a and \p b, with each value treated as a 4
+/// elements vector type and extend each element to 9 bit. Then add each half
+/// of the result and add with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The addition of each half of extend vectorized addition of the two
+/// values and the third value
+template
+inline constexpr RetT extend_vadd4_add(AT a, BT b, RetT c) {
+  return detail::extend_vbinary4(a, b, c, std::plus());
+}
+
+/// Compute vectorized addition of \p a and \p b with saturation, with each
+/// value treated as a 4 elements vector type and extend each element to 9 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized addition of the two values with saturation
+template
+inline constexpr RetT extend_vadd4_sat(AT a, BT b, RetT c) {
+  return detail::extend_vbinary4(a, b, c, std::plus());
+}
+
+/// Compute vectorized subtraction of \p a and \p b, with each value treated as
+/// a 4 elements vector type and extend each element to 9 bit.
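// [Editor's note] Illustrative usage sketch, not part of the original patch.
// The *4 variants work on four packed bytes, each widened to 9 bits so a lane
// cannot overflow into its neighbour. Made-up values:
//
//   unsigned a = 0x01020304u;                     // bytes {4, 3, 2, 1}
//   unsigned b = 0x01010101u;                     // bytes {1, 1, 1, 1}
//   auto v = compat::extend_vadd4(a, b, 0u);      // == 0x02030405u (byte-wise packing)
//   auto s = compat::extend_vadd4_add(a, b, 5u);  // == (5 + 4 + 3 + 2) + 5 == 19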
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized subtraction of the two values +template +inline constexpr RetT extend_vsub4(AT a, BT b, RetT c) { + return detail::extend_vbinary4(a, b, c, std::minus()); +} + +/// Compute vectorized subtraction of \p a and \p b, with each value treated as +/// a 4 elements vector type and extend each element to 9 bit. Then add each +/// half of the result and add with \p c. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The addition of each half of extend vectorized subtraction of the +/// two values and the third value +template +inline constexpr RetT extend_vsub4_add(AT a, BT b, RetT c) { + return detail::extend_vbinary4(a, b, c, std::minus()); +} + +/// Compute vectorized subtraction of \p a and \p b with saturation, with each +/// value treated as a 4 elements vector type and extend each element to 9 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized subtraction of the two values with saturation +template +inline constexpr RetT extend_vsub4_sat(AT a, BT b, RetT c) { + return detail::extend_vbinary4(a, b, c, std::minus()); +} + +/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a 4 +/// elements vector type and extend each element to 9 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized abs_diff of the two values +template +inline constexpr RetT extend_vabsdiff4(AT a, BT b, RetT c) { + return detail::extend_vbinary4(a, b, c, abs_diff()); +} + +/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a 4 +/// elements vector type and extend each element to 9 bit. Then add each half +/// of the result and add with \p c. 
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The addition of each half of extend vectorized abs_diff of the +/// two values and the third value +template +inline constexpr RetT extend_vabsdiff4_add(AT a, BT b, RetT c) { + return detail::extend_vbinary4(a, b, c, abs_diff()); +} + +/// Compute vectorized abs_diff of \p a and \p b with saturation, with each +/// value treated as a 4 elements vector type and extend each element to 9 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized abs_diff of the two values with saturation +template +inline constexpr RetT extend_vabsdiff4_sat(AT a, BT b, RetT c) { + return detail::extend_vbinary4(a, b, c, abs_diff()); +} + +/// Compute vectorized minimum of \p a and \p b, with each value treated as a 4 +/// elements vector type and extend each element to 9 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized minimum of the two values +template +inline constexpr RetT extend_vmin4(AT a, BT b, RetT c) { + return detail::extend_vbinary4(a, b, c, minimum()); +} + +/// Compute vectorized minimum of \p a and \p b, with each value treated as a 4 +/// elements vector type and extend each element to 9 bit. Then add each half +/// of the result and add with \p c. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The addition of each half of extend vectorized minimum of the +/// two values and the third value +template +inline constexpr RetT extend_vmin4_add(AT a, BT b, RetT c) { + return detail::extend_vbinary4(a, b, c, minimum()); +} + +/// Compute vectorized minimum of \p a and \p b with saturation, with each value +/// treated as a 4 elements vector type and extend each element to 9 bit. 
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized minimum of the two values with saturation +template +inline constexpr RetT extend_vmin4_sat(AT a, BT b, RetT c) { + return detail::extend_vbinary4(a, b, c, minimum()); +} + +/// Compute vectorized maximum of \p a and \p b, with each value treated as a 4 +/// elements vector type and extend each element to 9 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized maximum of the two values +template +inline constexpr RetT extend_vmax4(AT a, BT b, RetT c) { + return detail::extend_vbinary4(a, b, c, maximum()); +} + +/// Compute vectorized maximum of \p a and \p b, with each value treated as a 4 +/// elements vector type and extend each element to 9 bit. Then add each half +/// of the result and add with \p c. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The addition of each half of extend vectorized maximum of the +/// two values and the third value +template +inline constexpr RetT extend_vmax4_add(AT a, BT b, RetT c) { + return detail::extend_vbinary4(a, b, c, maximum()); +} + +/// Compute vectorized maximum of \p a and \p b with saturation, with each value +/// treated as a 4 elements vector type and extend each element to 9 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized maximum of the two values with saturation +template +inline constexpr RetT extend_vmax4_sat(AT a, BT b, RetT c) { + return detail::extend_vbinary4(a, b, c, maximum()); +} + +/// Compute vectorized average of \p a and \p b, with each value treated as a 4 +/// elements vector type and extend each element to 9 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized average of the two values +template +inline constexpr RetT extend_vavrg4(AT a, BT b, RetT c) { + return detail::extend_vbinary4(a, b, c, + detail::average()); +} + +/// Compute vectorized average of \p a and \p b, with each value treated as a 4 +/// elements vector type and extend each element to 9 bit. 
Then add each half +/// of the result and add with \p c. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The addition of each half of extend vectorized average of the +/// two values and the third value +template +inline constexpr RetT extend_vavrg4_add(AT a, BT b, RetT c) { + return detail::extend_vbinary4(a, b, c, detail::average()); +} + +/// Compute vectorized average of \p a and \p b with saturation, with each value +/// treated as a 4 elements vector type and extend each element to 9 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized average of the two values with saturation +template +inline constexpr RetT extend_vavrg4_sat(AT a, BT b, RetT c) { + return detail::extend_vbinary4(a, b, c, detail::average()); +} + +/// Extend \p a and \p b to 33 bit and vectorized compare input values using +/// specified comparison \p cmp . +/// +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \tparam [in] BinaryOperation The type of the compare operation +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] cmp The comparsion operator +/// \returns The comparison result of the two extended values. +template +inline constexpr unsigned extend_vcompare4(AT a, BT b, BinaryOperation cmp) { + return detail::extend_vbinary4(a, b, 0, cmp); +} + +/// Extend Inputs to 33 bit, and vectorized compare input values using specified +/// comparison \p cmp , then add the result with \p c . +/// +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \tparam [in] BinaryOperation The type of the compare operation +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \param [in] cmp The comparsion operator +/// \returns The comparison result of the two extended values, and add the +/// result with \p c . +template +inline constexpr unsigned extend_vcompare4_add(AT a, BT b, unsigned c, + BinaryOperation cmp) { + return detail::extend_vbinary4(a, b, c, cmp); +} + +} // namespace compat diff --git a/tools/util/include/compat/memory.hpp b/tools/util/include/compat/memory.hpp new file mode 100644 index 0000000000..d50e1ed92c --- /dev/null +++ b/tools/util/include/compat/memory.hpp @@ -0,0 +1,1762 @@ +/*************************************************************************** + * + * Copyright (C) Codeplay Software Ltd. + * Copyright (C) 2025 Intel Corporation, All rights reserved. + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM + * Exceptions. See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SYCL compatibility extension + * + * memory.hpp + * + * Description: + * memory functionality for the SYCL compatibility extension + **************************************************************************/ + +// The original source was under the license below: +//==---- memory.hpp -------------------------------*- C++ -*----------------==// +// +// Copyright (C) Intel Corporation +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifdef SYCL_EXT_ONEAPI_USM_DEVICE_READ_ONLY +#include +#endif + +#include +#include +#include + +#if defined(__linux__) +#include +#elif defined(_WIN64) +#ifndef NOMINMAX +#define NOMINMAX +#endif +#include +#else +#error "Only support Windows and Linux." +#endif + +namespace compat { + +template +#ifdef __SYCL_DEVICE_ONLY__ +[[__sycl_detail__::add_ir_attributes_function("sycl-forceinline", true)]] +#endif +__SYCL_ALWAYS_INLINE auto *local_mem() { + sycl::multi_ptr + As_multi_ptr = + sycl::ext::oneapi::group_local_memory_for_overwrite( + sycl::ext::oneapi::this_work_item::get_work_group<3>()); + auto *As = *As_multi_ptr; + return As; +} + +namespace detail { +enum memcpy_direction { + host_to_host, + host_to_device, + device_to_host, + device_to_device, + automatic +}; +} // namespace detail + +template +__compat_inline__ + std::enable_if_t || std::is_same_v, + T> + ptr_to_int(void *ptr) { +#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) + if constexpr (std::is_same_v) { + return (intptr_t)(sycl::decorated_local_ptr::pointer)ptr; + } else { + return (size_t)(sycl::decorated_local_ptr::pointer)ptr; + } +#else + throw sycl::exception(make_error_code(sycl::errc::runtime), + "ptr_to_int is only supported on Nvidia devices."); +#endif +} + +enum class memory_region { + global = 0, // device global memory + constant, // device read-only memory + local, // device local memory + usm_shared, // memory which can be accessed by host and device +}; + +using byte_t = uint8_t; + +/// Buffer type to be used in Memory Management runtime. +typedef sycl::buffer buffer_t; + +/// Pitched 2D/3D memory data. 
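+/// Illustrative usage sketch (the extents below are example values, not part
+/// of the API): a pitched allocation pads each row out to the returned pitch,
+/// so the byte offset of element (x, y) is y * pitch + x.
+/// \code
+/// size_t pitch = 0, width_bytes = 64, height = 32;
+/// void *base = compat::malloc(pitch, width_bytes, height); // pitch >= width_bytes
+/// compat::pitched_data data(base, pitch, width_bytes, height);
+/// \endcode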
+class pitched_data { +public: + pitched_data() : pitched_data(nullptr, 0, 0, 0) {} + pitched_data(void *data, size_t pitch, size_t x, size_t y) + : _data(data), _pitch(pitch), _x(x), _y(y) {} + + void *get_data_ptr() { return _data; } + void set_data_ptr(void *data) { _data = data; } + + size_t get_pitch() { return _pitch; } + void set_pitch(size_t pitch) { _pitch = pitch; } + + size_t get_x() { return _x; } + void set_x(size_t x) { _x = x; }; + + size_t get_y() { return _y; } + void set_y(size_t y) { _y = y; } + +private: + void *_data; + size_t _pitch, _x, _y; +}; + +namespace experimental { +#ifdef SYCL_EXT_ONEAPI_BINDLESS_IMAGES +class image_mem_wrapper; +namespace detail { +static sycl::event memcpy(const image_mem_wrapper *src, + const sycl::id<3> &src_id, pitched_data &dest, + const sycl::id<3> &dest_id, + const sycl::range<3> ©_extend, sycl::queue q); +static sycl::event memcpy(const pitched_data src, const sycl::id<3> &src_id, + image_mem_wrapper *dest, const sycl::id<3> &dest_id, + const sycl::range<3> ©_extend, sycl::queue q); +} // namespace detail +#endif +class image_matrix; +namespace detail { +static pitched_data to_pitched_data(image_matrix *image); +} + +/// Memory copy parameters for 2D/3D memory data. +struct memcpy_parameter { + struct data_wrapper { + pitched_data pitched{}; + sycl::id<3> pos{}; +#ifdef SYCL_EXT_ONEAPI_BINDLESS_IMAGES + experimental::image_mem_wrapper *image_bindless{nullptr}; +#endif + image_matrix *image{nullptr}; + }; + data_wrapper from{}; + data_wrapper to{}; + sycl::range<3> size{}; +}; +} // namespace experimental + +namespace detail { +class mem_mgr { + mem_mgr() { + // Reserved address space, no real memory allocation happens here. +#if defined(__linux__) + mapped_address_space = + (byte_t *)mmap(nullptr, mapped_region_size, PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); +#elif defined(_WIN64) + mapped_address_space = (byte_t *)VirtualAlloc( + NULL, // NULL specified as the base address parameter + mapped_region_size, // Size of allocation + MEM_RESERVE, // Allocate reserved pages + PAGE_NOACCESS); // Protection = no access +#else +#error "Only support Windows and Linux." +#endif + next_free = mapped_address_space; + }; + +public: + using buffer_id_t = int; + + struct allocation { + buffer_t buffer; + byte_t *alloc_ptr; + size_t size; + }; + + ~mem_mgr() { +#if defined(__linux__) + munmap(mapped_address_space, mapped_region_size); +#elif defined(_WIN64) + VirtualFree(mapped_address_space, 0, MEM_RELEASE); +#else +#error "Only support Windows and Linux." +#endif + }; + + mem_mgr(const mem_mgr &) = delete; + mem_mgr &operator=(const mem_mgr &) = delete; + mem_mgr(mem_mgr &&) = delete; + mem_mgr &operator=(mem_mgr &&) = delete; + + /// Allocate + void *mem_alloc(size_t size) { + if (!size) + return nullptr; + std::lock_guard lock(m_mutex); + if (next_free + size > mapped_address_space + mapped_region_size) { + throw std::runtime_error( + "[Compat] malloc: out of memory for virtual memory pool"); + } + // Allocation + sycl::range<1> buffer_range(size); + buffer_t buf(buffer_range); + allocation alloc{buf, next_free, size}; + // Map allocation to device pointer + void *result = next_free; + m_map.emplace(next_free + size, alloc); + // Update pointer to the next free space. 
+ next_free += (size + extra_padding + alignment - 1) & ~(alignment - 1); + + return result; + } + + /// Deallocate + void mem_free(const void *ptr) { + if (!ptr) + return; + std::lock_guard lock(m_mutex); + auto it = get_map_iterator(ptr); + m_map.erase(it); + } + + /// map: device pointer -> allocation(buffer, alloc_ptr, size) + allocation translate_ptr(const void *ptr) { + std::lock_guard lock(m_mutex); + auto it = get_map_iterator(ptr); + return it->second; + } + + /// Check if the pointer represents device pointer or not. + bool is_device_ptr(const void *ptr) const { + std::lock_guard lock(m_mutex); + return (mapped_address_space <= ptr) && + (ptr < mapped_address_space + mapped_region_size); + } + + /// Returns the instance of memory manager singleton. + static mem_mgr &instance() { + static mem_mgr m; + return m; + } + +private: + std::map m_map; + mutable std::mutex m_mutex; + byte_t *mapped_address_space; + byte_t *next_free; + const size_t mapped_region_size = 128ull * 1024 * 1024 * 1024; + const size_t alignment = 256; + /// This padding may be defined to some positive value to debug + /// out of bound accesses. + const size_t extra_padding = 0; + + std::map::iterator get_map_iterator(const void *ptr) { + auto it = m_map.upper_bound((byte_t *)ptr); + if (it == m_map.end()) { + // Not a virtual pointer. + throw std::runtime_error("[Compat] can not get buffer from non-virtual pointer"); + } + const allocation &alloc = it->second; + if (ptr < alloc.alloc_ptr) { + // Out of bound. + // This may happen if there's a gap between allocations due to alignment + // or extra padding and pointer points to this gap. + throw std::runtime_error("[Compat] invalid virtual pointer"); + } + return it; + } +}; + +template class accessor; +template class memory_traits { +public: + static constexpr sycl::access::address_space asp = + (Memory == memory_region::local) + ? sycl::access::address_space::local_space + : sycl::access::address_space::global_space; + static constexpr sycl::target target = (Memory == memory_region::local) + ? sycl::target::local + : sycl::target::device; + static constexpr sycl::access_mode mode = (Memory == memory_region::constant) + ? sycl::access_mode::read + : sycl::access_mode::read_write; + static constexpr size_t type_size = sizeof(T); + using element_t = + typename std::conditional_t; + using value_t = typename std::remove_cv_t; + template + using accessor_t = + typename std::conditional_t, + sycl::accessor>; + using pointer_t = + typename std::conditional_t; +}; + +static inline void *malloc(size_t size, sycl::queue q) { +#ifdef COMPAT_USM_LEVEL_NONE + return mem_mgr::instance().mem_alloc(size * sizeof(byte_t)); +#else + return sycl::malloc_device(size, q.get_device(), q.get_context()); +#endif // COMPAT_USM_LEVEL_NONE +} + +/// Calculate pitch (padded length of major dimension \p x) by rounding up to +/// multiple of 32. +/// \param x The dimension to be padded (in bytes) +/// \returns size_t representing pitched length of dimension x (in bytes). +static inline constexpr size_t get_pitch(size_t x) { + return ((x) + 31) & ~(0x1F); +} + +/// \brief Malloc pitched 3D data +/// \param [out] pitch returns the calculated pitch (in bytes) +/// \param [in] x width of the allocation (in bytes) +/// \param [in] y height of the allocation +/// \param [in] z depth of the allocation +/// \param [in] q The queue in which the operation is done. 
+/// \returns A pointer to the allocated memory +static inline void *malloc(size_t &pitch, size_t x, size_t y, size_t z, + sycl::queue q) { + pitch = get_pitch(x); + return malloc(pitch * y * z, q); +} + +/// \brief Set \p pattern to the first \p count elements of type \p T +/// starting from \p dev_ptr. +/// +/// \tparam T Datatype of the pattern to be set. +/// \param q The queue in which the operation is done. +/// \param dev_ptr Pointer to the device memory address. +/// \param pattern Pattern of type T to be set. +/// \param count Number of elements to be set to the patten. +/// \returns An event representing the fill operation. +template +static inline sycl::event fill(sycl::queue q, void *dev_ptr, const T &pattern, + size_t count) { +#ifdef COMPAT_USM_LEVEL_NONE + auto &mm = mem_mgr::instance(); + assert(mm.is_device_ptr(dev_ptr)); + auto alloc = mm.translate_ptr(dev_ptr); + size_t offset = (T *)dev_ptr - (T *)alloc.alloc_ptr; + + return q.submit([&](sycl::handler &cgh) { + auto r = sycl::range<1>(count); + auto o = sycl::id<1>(offset); + auto new_buffer = + alloc.buffer.reinterpret(sycl::range<1>(alloc.size / sizeof(T))); + sycl::accessor + acc(new_buffer, cgh, r, o); + cgh.fill(acc, pattern); + }); +#else + return q.fill(dev_ptr, pattern, count); +#endif +} + +/// Set \p value to the first \p size bytes starting from \p dev_ptr in \p q. +/// +/// \param q The queue in which the operation is done. +/// \param dev_ptr Pointer to the device memory address. +/// \param value Value to be set. +/// \param size Number of bytes to be set to the value. +/// \returns An event representing the memset operation. +static inline sycl::event memset(sycl::queue q, void *dev_ptr, int value, + size_t size) { +#ifdef COMPAT_USM_LEVEL_NONE + auto &mm = mem_mgr::instance(); + assert(mm.is_device_ptr(dev_ptr)); + auto alloc = mm.translate_ptr(dev_ptr); + size_t offset = (byte_t *)dev_ptr - (byte_t *)alloc.alloc_ptr; + + return q.submit([&](sycl::handler &cgh) { + auto r = sycl::range<1>(size); + auto o = sycl::id<1>(offset); + auto new_buffer = alloc.buffer.reinterpret( + sycl::range<1>(alloc.size / sizeof(byte_t))); + sycl::accessor + acc(new_buffer, cgh, r, o); + cgh.fill(acc, static_cast(value)); + }); +#else + return q.memset(dev_ptr, value, size); +#endif // COMPAT_USM_LEVEL_NONE +} + +/// \brief Sets \p value to the 3D memory region pointed by \p data in \p q. +/// \tparam T The type of the element to be set. +/// \param [in] q The queue in which the operation is done. +/// \param [in] data Pointer to the pitched device memory region. +/// \param [in] value The value to be set. +/// \param [in] size 3D memory region by number of elements. +/// \return An event list representing the memset operations. +template +static inline std::vector +memset(sycl::queue q, pitched_data data, const T &value, sycl::range<3> size) { + std::vector event_list; + size_t slice = data.get_pitch() * data.get_y(); + unsigned char *data_surface = (unsigned char *)data.get_data_ptr(); + for (size_t z = 0; z < size.get(2); ++z) { + unsigned char *data_ptr = data_surface; + for (size_t y = 0; y < size.get(1); ++y) { + event_list.push_back(detail::fill(q, data_ptr, value, size.get(0))); + data_ptr += data.get_pitch(); + } + data_surface += slice; + } + return event_list; +} + +/// \brief Sets \p val to the pitched 2D memory region pointed by \p ptr in \p +/// q. +/// \tparam T The type of the element to be set. +/// \param [in] q The queue in which the operation is done. 
+/// \param [in] ptr Pointer to the virtual device memory. +/// \param [in] pitch The pitch size by number of elements, including padding. +/// \param [in] value The value to be set. +/// \param [in] x The width of memory region by number of elements. +/// \param [in] y The height of memory region by number of elements. +/// \return An event list representing the memset operations. +template +static inline std::vector memset(sycl::queue q, void *ptr, + size_t pitch, const T &value, + size_t x, size_t y) { + return memset(q, pitched_data(ptr, pitch, x, 1), value, + sycl::range<3>(x, y, 1)); +} + +enum class pointer_access_attribute { + host_only = 0, + device_only, + host_device, + end +}; + +static pointer_access_attribute get_pointer_attribute(sycl::queue q, + const void *ptr) { +#ifdef COMPAT_USM_LEVEL_NONE + return mem_mgr::instance().is_device_ptr(ptr) + ? pointer_access_attribute::device_only + : pointer_access_attribute::host_only; +#else + switch (sycl::get_pointer_type(ptr, q.get_context())) { + case sycl::usm::alloc::unknown: + return pointer_access_attribute::host_only; + case sycl::usm::alloc::device: + return pointer_access_attribute::device_only; + case sycl::usm::alloc::shared: + case sycl::usm::alloc::host: + return pointer_access_attribute::host_device; + } +#endif // COMPAT_USM_LEVEL_NONE +} + +static memcpy_direction +deduce_memcpy_direction(sycl::queue q, void *to_ptr, const void *from_ptr) { + // table[to_attribute][from_attribute] + static const memcpy_direction + direction_table[static_cast(pointer_access_attribute::end)] + [static_cast(pointer_access_attribute::end)] = { + {host_to_host, device_to_host, host_to_host}, + {host_to_device, device_to_device, device_to_device}, + {host_to_host, device_to_device, device_to_device}}; + return direction_table[static_cast(get_pointer_attribute( + q, to_ptr))][static_cast(get_pointer_attribute(q, from_ptr))]; +} + +static sycl::event memcpy(sycl::queue q, void *to_ptr, const void *from_ptr, + size_t size, + const std::vector &dep_events = {}) { + if (!size) + return sycl::event{}; +#ifdef COMPAT_USM_LEVEL_NONE + auto &mm = mem_mgr::instance(); + auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr); + + switch (real_direction) { + case host_to_host: + return q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dep_events); + cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); }); + }); + case host_to_device: { + auto alloc = mm.translate_ptr(to_ptr); + size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr; + return q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dep_events); + auto r = sycl::range<1>(size); + auto o = sycl::id<1>(offset); + sycl::accessor + acc(alloc.buffer, cgh, r, o); + cgh.copy(from_ptr, acc); + }); + } + case device_to_host: { + auto alloc = mm.translate_ptr(from_ptr); + size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr; + return q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dep_events); + auto r = sycl::range<1>(size); + auto o = sycl::id<1>(offset); + sycl::accessor + acc(alloc.buffer, cgh, r, o); + cgh.copy(acc, to_ptr); + }); + } + case device_to_device: { + auto to_alloc = mm.translate_ptr(to_ptr); + auto from_alloc = mm.translate_ptr(from_ptr); + size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr; + size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr; + return q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dep_events); + auto r = sycl::range<1>(size); + auto to_o = sycl::id<1>(to_offset); + auto from_o = sycl::id<1>(from_offset); + sycl::accessor + 
to_acc(to_alloc.buffer, cgh, r, to_o); + sycl::accessor + from_acc(from_alloc.buffer, cgh, r, from_o); + cgh.copy(from_acc, to_acc); + }); + } + default: + throw std::runtime_error("[Compat] memcpy: invalid direction value"); + } +#else + return q.memcpy(to_ptr, from_ptr, size, dep_events); +#endif // COMPAT_USM_LEVEL_NONE +} + +// Get actual copy range and make sure it will not exceed range. +static inline size_t get_copy_range(sycl::range<3> size, size_t slice, + size_t pitch) { + return slice * (size.get(2) - 1) + pitch * (size.get(1) - 1) + size.get(0); +} + +static inline size_t get_offset(sycl::id<3> id, size_t slice, size_t pitch) { + return slice * id.get(2) + pitch * id.get(1) + id.get(0); +} + +// RAII for host pointer +class host_buffer { + void *_buf; + size_t _size; + sycl::queue _q; + const std::vector &_deps; // free operation depends + +public: + host_buffer(size_t size, sycl::queue q, const std::vector &deps) + : _buf(std::malloc(size)), _size(size), _q(q), _deps(deps) {} + void *get_ptr() const { return _buf; } + size_t get_size() const { return _size; } + ~host_buffer() { + if (_buf) { + _q.submit([&](sycl::handler &cgh) { + cgh.depends_on(_deps); + cgh.host_task([buf = _buf] { std::free(buf); }); + }); + } + } +}; + +/// copy 3D matrix specified by \p size from 3D matrix specified by \p from_ptr +/// and \p from_range to another specified by \p to_ptr and \p to_range. +static inline std::vector +memcpy(sycl::queue q, void *to_ptr, const void *from_ptr, + sycl::range<3> to_range, sycl::range<3> from_range, sycl::id<3> to_id, + sycl::id<3> from_id, sycl::range<3> size, + const std::vector &dep_events = {}) { + + std::vector event_list; + + size_t to_slice = to_range.get(1) * to_range.get(0); + size_t from_slice = from_range.get(1) * from_range.get(0); + unsigned char *to_surface = + (unsigned char *)to_ptr + get_offset(to_id, to_slice, to_range.get(0)); + const unsigned char *from_surface = + (const unsigned char *)from_ptr + + get_offset(from_id, from_slice, from_range.get(0)); + + if (to_slice == from_slice && to_slice == size.get(1) * size.get(0)) { + return {memcpy(q, to_surface, from_surface, to_slice * size.get(2), + dep_events)}; + } + using namespace experimental; // for memcpy_direction + memcpy_direction direction = deduce_memcpy_direction(q, to_ptr, from_ptr); + size_t size_slice = size.get(1) * size.get(0); + switch (direction) { + case host_to_host: + for (size_t z = 0; z < size.get(2); ++z) { + unsigned char *to_ptr = to_surface; + const unsigned char *from_ptr = from_surface; + if (to_range.get(0) == from_range.get(0) && + to_range.get(0) == size.get(0)) { + event_list.push_back( + memcpy(q, to_ptr, from_ptr, size_slice, dep_events)); + } else { + for (size_t y = 0; y < size.get(1); ++y) { + event_list.push_back( + memcpy(q, to_ptr, from_ptr, size.get(0), dep_events)); + to_ptr += to_range.get(0); + from_ptr += from_range.get(0); + } + } + to_surface += to_slice; + from_surface += from_slice; + } + break; + case host_to_device: { + host_buffer buf(get_copy_range(size, to_slice, to_range.get(0)), q, + event_list); + std::vector host_events; + if (to_slice == size_slice) { + // Copy host data to a temp host buffer with the shape of target. + host_events = + memcpy(q, buf.get_ptr(), from_surface, to_range, from_range, + sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, dep_events); + } else { + // Copy host data to a temp host buffer with the shape of target. 
+ host_events = + memcpy(q, buf.get_ptr(), from_surface, to_range, from_range, + sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, + // If has padding data, not sure whether it is useless. So fill + // temp buffer with it. + std::vector{memcpy(q, buf.get_ptr(), to_surface, + buf.get_size(), dep_events)}); + } + // Copy from temp host buffer to device with only one submit. + event_list.push_back( + memcpy(q, to_surface, buf.get_ptr(), buf.get_size(), host_events)); + break; + } + case device_to_host: { + host_buffer buf(get_copy_range(size, from_slice, from_range.get(0)), q, + event_list); + // Copy from host temp buffer to host target with reshaping. + event_list = + memcpy(q, to_surface, buf.get_ptr(), to_range, from_range, + sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, + // Copy from device to temp host buffer with only one submit. + std::vector{memcpy(q, buf.get_ptr(), from_surface, + buf.get_size(), dep_events)}); + break; + } + case device_to_device: +#ifdef COMPAT_USM_LEVEL_NONE + { + auto &mm = mem_mgr::instance(); + auto to_alloc = mm.translate_ptr(to_surface); + auto from_alloc = mm.translate_ptr(from_surface); + size_t to_offset = (byte_t *)to_surface - to_alloc.alloc_ptr; + size_t from_offset = (byte_t *)from_surface - from_alloc.alloc_ptr; + event_list.push_back(q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dep_events); + auto to_o = sycl::id<1>(to_offset); + auto from_o = sycl::id<1>(from_offset); + sycl::accessor + to_acc(to_alloc.buffer, cgh, + get_copy_range(size, to_slice, to_range.get(0)), to_o); + sycl::accessor + from_acc(from_alloc.buffer, cgh, + get_copy_range(size, from_slice, from_range.get(0)), from_o); + cgh.parallel_for( + size, [=](sycl::id<3> id) { + to_acc[get_offset(id, to_slice, to_range.get(0))] = + from_acc[get_offset(id, from_slice, from_range.get(0))]; + }); + })); + } +#else + event_list.push_back(q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dep_events); + cgh.parallel_for(size, [=](sycl::id<3> id) { + to_surface[get_offset(id, to_slice, to_range.get(0))] = + from_surface[get_offset(id, from_slice, from_range.get(0))]; + }); + })); +#endif // COMPAT_USM_LEVEL_NONE + break; + default: + throw std::runtime_error("[Compat] memcpy: invalid direction value"); + } + return event_list; +} + +/// memcpy 2D/3D matrix specified by pitched_data. +static inline std::vector +memcpy(sycl::queue q, pitched_data to, sycl::id<3> to_id, pitched_data from, + sycl::id<3> from_id, sycl::range<3> size) { + return memcpy(q, to.get_data_ptr(), from.get_data_ptr(), + sycl::range<3>(to.get_pitch(), to.get_y(), 1), + sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id, + from_id, size); +} + +/// memcpy 2D matrix with pitch. +static inline std::vector +memcpy(sycl::queue q, void *to_ptr, const void *from_ptr, size_t to_pitch, + size_t from_pitch, size_t x, size_t y) { + return memcpy(q, to_ptr, from_ptr, sycl::range<3>(to_pitch, y, 1), + sycl::range<3>(from_pitch, y, 1), sycl::id<3>(0, 0, 0), + sycl::id<3>(0, 0, 0), sycl::range<3>(x, y, 1)); +} + +// Takes a std::vector & returns a single event +// which simply depends on all of them +static sycl::event combine_events(std::vector &events, + sycl::queue q) { + return q.submit([&events](sycl::handler &cgh) { + cgh.depends_on(events); + cgh.host_task([]() {}); + }); +} + +} // namespace detail + +#ifdef COMPAT_USM_LEVEL_NONE +/// Check if the pointer \p ptr represents device pointer or not. +/// +/// \param ptr The pointer to be checked. +/// \returns true if \p ptr is a device pointer. 
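+/// Illustrative usage sketch (only meaningful when COMPAT_USM_LEVEL_NONE is
+/// defined; the allocation size is an arbitrary example value):
+/// \code
+/// void *dev = compat::malloc(1024);          // virtual device pointer
+/// int host_var = 0;
+/// bool a = compat::is_device_ptr(dev);       // true
+/// bool b = compat::is_device_ptr(&host_var); // false
+/// compat::free(dev);
+/// \endcode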
+template static inline bool is_device_ptr(T ptr) { + if constexpr (std::is_pointer::value) { + return detail::mem_mgr::instance().is_device_ptr(ptr); + } + return false; +} +#endif + +/// Get the buffer and the offset of a piece of memory pointed to by \p ptr. +/// +/// \param ptr Pointer to a piece of memory. +/// If NULL is passed as an argument, an exception will be thrown. +/// \returns a pair containing both the buffer and the offset. +static std::pair get_buffer_and_offset(const void *ptr) { + if (ptr) { + auto alloc = detail::mem_mgr::instance().translate_ptr(ptr); + size_t offset = (byte_t *)ptr - alloc.alloc_ptr; + return std::make_pair(alloc.buffer, offset); + } else { + throw std::runtime_error( + "[Compat] NULL pointer argument in get_buffer_and_offset function is invalid"); + } +} + +/// Get the data pointed from \p ptr as a 1D buffer reinterpreted as type T. +template static sycl::buffer get_buffer(const void *ptr) { + if (!ptr) + return sycl::buffer(sycl::range<1>(0)); + auto alloc = detail::mem_mgr::instance().translate_ptr(ptr); + return alloc.buffer.reinterpret(sycl::range<1>(alloc.size / sizeof(T))); +} + +/// Get the buffer of a piece of memory pointed to by \p ptr. +/// +/// \param ptr Pointer to a piece of memory. +/// \returns the buffer. +static buffer_t get_buffer(const void *ptr) { + return detail::mem_mgr::instance().translate_ptr(ptr).buffer; +} + +/// Get the host pointer from a buffer that is mapped to virtual pointer ptr. +/// \param ptr Virtual Pointer mapped to device buffer +/// \returns A host pointer +template static inline T *get_host_ptr(const void *ptr) { + auto BufferOffset = get_buffer_and_offset(ptr); + auto host_ptr = BufferOffset.first.get_host_access() + .get_multi_ptr(); + return (T *)(host_ptr + BufferOffset.second); +} + +/// A wrapper class contains an accessor and an offset. +template +class access_wrapper { + sycl::accessor accessor; + size_t offset; + +public: + /// Construct the accessor wrapper for memory pointed by \p ptr. + /// + /// \param ptr Pointer to memory. + /// \param cgh The command group handler. + access_wrapper(const void *ptr, sycl::handler &cgh) + : accessor(get_buffer(ptr).get_access(cgh)), offset(0) { + auto alloc = detail::mem_mgr::instance().translate_ptr(ptr); + offset = (byte_t *)ptr - alloc.alloc_ptr; + } + + /// Get the device pointer. + /// + /// \returns a device pointer with offset. + dataT get_raw_pointer() const { return (dataT)(&accessor[0] + offset); } +}; + +/// Get the accessor for memory pointed by \p ptr. +/// +/// \param ptr Pointer to memory. +/// If NULL is passed as an argument, an exception will be thrown. +/// \param cgh The command group handler. +/// \returns an accessor. +template +static sycl::accessor get_access(const void *ptr, + sycl::handler &cgh) { + if (ptr) { + auto alloc = detail::mem_mgr::instance().translate_ptr(ptr); + return alloc.buffer.get_access(cgh); + } else { + throw std::runtime_error( + "[Compat] NULL pointer argument in get_access function is invalid"); + } +} + +namespace experimental { +namespace detail { +static inline std::vector +memcpy(sycl::queue q, const experimental::memcpy_parameter ¶m) { + auto to = param.to.pitched; + auto from = param.from.pitched; +#ifdef SYCL_EXT_ONEAPI_BINDLESS_IMAGES + if (param.to.image_bindless != nullptr && + param.from.image_bindless != nullptr) { + throw std::runtime_error( + "[Compat] memcpy: Unsupported bindless_image API."); + // TODO: Need change logic when sycl support image_mem to image_mem copy. 
+ std::vector event_list; + compat::detail::host_buffer buf(param.size.size(), q, event_list); + to.set_data_ptr(buf.get_ptr()); + experimental::detail::memcpy(param.from.image_bindless, param.from.pos, to, + sycl::id<3>(0, 0, 0), param.size, q); + from.set_data_ptr(buf.get_ptr()); + event_list.push_back(experimental::detail::memcpy( + from, sycl::id<3>(0, 0, 0), param.to.image_bindless, param.to.pos, + param.size, q)); + return event_list; + } else if (param.to.image_bindless != nullptr) { + throw std::runtime_error( + "[Compat] memcpy: Unsupported bindless_image API."); + return {experimental::detail::memcpy(from, param.from.pos, + param.to.image_bindless, param.to.pos, + param.size, q)}; + } else if (param.from.image_bindless != nullptr) { + throw std::runtime_error( + "[Compat] memcpy: Unsupported bindless_image API."); + return {experimental::detail::memcpy(param.from.image_bindless, + param.from.pos, to, param.to.pos, + param.size, q)}; + } +#endif + if (param.to.image != nullptr) { + throw std::runtime_error("[Compat] memcpy: Unsupported image API."); + to = experimental::detail::to_pitched_data(param.to.image); + } + if (param.from.image != nullptr) { + throw std::runtime_error("[Compat] memcpy: Unsupported image API."); + from = experimental::detail::to_pitched_data(param.from.image); + } + return compat::detail::memcpy(q, to, param.to.pos, from, param.from.pos, + param.size); +} +} // namespace detail +} // namespace experimental + +/// Allocate memory block on the device. +/// \param num_bytes Number of bytes to allocate. +/// \param q Queue to execute the allocate task. +/// \returns A pointer to the newly allocated memory. +static inline void *malloc(size_t num_bytes, + sycl::queue q = get_default_queue()) { + return detail::malloc(num_bytes, q); +} + +/// Allocate memory block on the device. +/// \param T Datatype to allocate +/// \param count Number of elements to allocate. +/// \param q Queue to execute the allocate task. +/// \returns A pointer to the newly allocated memory. +template +static inline T *malloc(size_t count, sycl::queue q = get_default_queue()) { + return static_cast(detail::malloc(count * sizeof(T), q)); +} + +/// Allocate memory block on the host. +/// \param num_bytes Number of bytes to allocate. +/// \param q Queue to execute the allocate task. +/// \returns A pointer to the newly allocated memory. +static inline void *malloc_host(size_t num_bytes, + sycl::queue q = get_default_queue()) { + return sycl::malloc_host(num_bytes, q); +} + +/// Allocate memory block on the host. +/// \param T Datatype to allocate +/// \param num_bytes Number of bytes to allocate. +/// \param q Queue to execute the allocate task. +/// \returns A pointer to the newly allocated memory. +template +static inline T *malloc_host(size_t count, + sycl::queue q = get_default_queue()) { + return static_cast(sycl::malloc_host(count * sizeof(T), q)); +} + +/// Allocate memory block of usm_shared memory. +/// \param num_bytes Number of bytes to allocate. +/// \param q Queue to execute the allocate task. +/// \returns A pointer to the newly allocated memory. +static inline void *malloc_shared(size_t num_bytes, + sycl::queue q = get_default_queue()) { + return sycl::malloc_shared(num_bytes, q); +} + +/// Allocate memory block of usm_shared memory. +/// \param num_bytes Number of bytes to allocate. +/// \param q Queue to execute the allocate task. +/// \returns A pointer to the newly allocated memory. 
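+/// Illustrative usage sketch for the typed allocation helpers (the element
+/// count and explicit queue are example assumptions):
+/// \code
+/// sycl::queue q = compat::get_default_queue();
+/// float *dev  = compat::malloc<float>(256, q);         // device allocation
+/// float *host = compat::malloc_host<float>(256, q);    // host USM
+/// float *shrd = compat::malloc_shared<float>(256, q);  // shared USM
+/// compat::free(dev, q);
+/// sycl::free(host, q);
+/// sycl::free(shrd, q);
+/// \endcode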
+template
+static inline T *malloc_shared(size_t count,
+                               sycl::queue q = get_default_queue()) {
+  return static_cast(sycl::malloc_shared(count * sizeof(T), q));
+}
+
+/// Allocate memory block for 3D array on the device.
+/// \param size Size of the memory block, in bytes.
+/// \param q Queue to execute the allocate task.
+/// \returns A pitched_data object which stores the memory info.
+static inline pitched_data malloc(sycl::range<3> size,
+                                  sycl::queue q = get_default_queue()) {
+  pitched_data pitch(nullptr, 0, size.get(0), size.get(1));
+  size_t pitch_size;
+  pitch.set_data_ptr(
+      detail::malloc(pitch_size, size.get(0), size.get(1), size.get(2), q));
+  pitch.set_pitch(pitch_size);
+  return pitch;
+}
+
+/// Allocate memory block for 2D array on the device.
+/// \param [out] pitch Aligned size of x in bytes.
+/// \param x Range in dim x.
+/// \param y Range in dim y.
+/// \param q Queue to execute the allocate task.
+/// \returns A pointer to the newly allocated memory.
+static inline void *malloc(size_t &pitch, size_t x, size_t y,
+                           sycl::queue q = get_default_queue()) {
+  return detail::malloc(pitch, x, y, 1, q);
+}
+
+namespace detail {
+
+inline void free(void *ptr, const sycl::queue &q) {
+  if (ptr) {
+#ifdef COMPAT_USM_LEVEL_NONE
+    detail::mem_mgr::instance().mem_free(ptr);
+#else
+    sycl::free(ptr, q.get_context());
+#endif // COMPAT_USM_LEVEL_NONE
+  }
+}
+} // namespace detail
+
+/// Wait on the queue \p q and free the memory \p ptr.
+/// \param ptr Pointer to free.
+/// \param q Queue to execute the free task.
+/// \returns no return value.
+static inline void wait_and_free(void *ptr,
+                                 sycl::queue q = get_default_queue()) {
+  get_current_device().queues_wait_and_throw();
+  q.wait();
+  if (ptr) {
+    detail::free(ptr, q);
+  }
+}
+
+// Anonymous namespace to disable ADL for functions which might clash (memcpy,
+// memset, free)
+namespace {
+/// Free the memory \p ptr on the default queue without synchronizing.
+/// \param ptr Pointer to free.
+/// \returns no return value.
+static inline void free(void *ptr, sycl::queue q = get_default_queue()) {
+  detail::free(ptr, q);
+}
+} // namespace
+
+/// Enqueues the release of all pointers in \p pointers on the queue \p q.
+/// The command waits on all passed \p events and returns an event that
+/// tracks the command's execution on the queue.
+///
+/// \param pointers The pointers point to the device memory requested to be
+/// freed.
+/// \param events The events to be waited on.
+/// \param q The sycl::queue the memory relates to.
+// Can't be static due to the friend declaration in the memory header.
+inline sycl::event enqueue_free(const std::vector &pointers,
+                                const std::vector &events,
+                                sycl::queue q = get_default_queue()) {
+  auto event = q.submit(
+      [&pointers, &events, &q](sycl::handler &cgh) {
+        cgh.depends_on(events);
+        cgh.host_task([=]() {
+          for (auto p : pointers)
+            detail::free(p, q);
+        });
+      });
+  get_current_device().add_event(event);
+  return event;
+}
+
+namespace {
+/// Synchronously copies \p size bytes from the address specified by \p from_ptr
+/// to the address specified by \p to_ptr. The function will
+/// return after the copy is completed.
+///
+/// \param to_ptr Pointer to destination memory address.
+/// \param from_ptr Pointer to source memory address.
+/// \param size Number of bytes to be copied.
+/// \param q Queue to execute the copy task.
+/// \returns no return value.
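+/// Illustrative usage sketch (buffer size and contents are example values):
+/// \code
+/// std::vector<float> src(256, 1.0f);
+/// float *dst = compat::malloc<float>(src.size());
+/// compat::memcpy(dst, src.data(), src.size() * sizeof(float)); // blocks until done
+/// compat::free(dst);
+/// \endcode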
+static void memcpy(void *to_ptr, const void *from_ptr, size_t size,
+                   sycl::queue q = get_default_queue()) {
+  detail::memcpy(q, to_ptr, from_ptr, size).wait();
+}
+
+} // namespace
+
+/// Asynchronously copies \p size bytes from the address specified by \p
+/// from_ptr to the address specified by \p to_ptr. The return of the function
+/// does NOT guarantee the copy is completed.
+///
+/// \param to_ptr Pointer to destination memory address.
+/// \param from_ptr Pointer to source memory address.
+/// \param size Number of bytes to be copied.
+/// \param q Queue to execute the copy task.
+/// \returns An event representing the memcpy operation.
+static sycl::event memcpy_async(void *to_ptr, const void *from_ptr, size_t size,
+                                sycl::queue q = get_default_queue()) {
+  return detail::memcpy(q, to_ptr, from_ptr, size);
+}
+
+/// Asynchronously copies \p count T's from the address specified by \p
+/// from_ptr to the address specified by \p to_ptr. The return of the function
+/// does NOT guarantee the copy is completed.
+///
+/// \tparam T Datatype to be copied.
+/// \param to_ptr Pointer to destination memory address.
+/// \param from_ptr Pointer to source memory address.
+/// \param count Number of T to be copied.
+/// \param q Queue to execute the copy task.
+/// \returns An event representing the memcpy operation.
+template
+static sycl::event
+memcpy_async(type_identity_t *to_ptr, const type_identity_t *from_ptr,
+             size_t count, sycl::queue q = get_default_queue()) {
+  return detail::memcpy(q, static_cast(to_ptr),
+                        static_cast(from_ptr), count * sizeof(T));
+}
+
+namespace {
+/// Synchronously copies \p count T's from the address specified by \p from_ptr
+/// to the address specified by \p to_ptr. The function will
+/// return after the copy is completed.
+///
+/// \tparam T Datatype to be copied.
+/// \param to_ptr Pointer to destination memory address.
+/// \param from_ptr Pointer to source memory address.
+/// \param count Number of T to be copied.
+/// \param q Queue to execute the copy task.
+/// \returns no return value.
+template
+static void memcpy(type_identity_t *to_ptr,
+                   const type_identity_t *from_ptr, size_t count,
+                   sycl::queue q = get_default_queue()) {
+  detail::memcpy(q, static_cast(to_ptr),
+                 static_cast(from_ptr), count * sizeof(T))
+      .wait();
+}
+
+/// Synchronously copies 2D matrix specified by \p x and \p y from the address
+/// specified by \p from_ptr to the address specified by \p to_ptr, while \p
+/// from_pitch and \p to_pitch are the range of dim x in bytes of the matrix
+/// specified by \p from_ptr and \p to_ptr. The function will return after the
+/// copy is completed.
+///
+/// \param to_ptr Pointer to destination memory address.
+/// \param to_pitch Range of dim x in bytes of destination matrix.
+/// \param from_ptr Pointer to source memory address.
+/// \param from_pitch Range of dim x in bytes of source matrix.
+/// \param x Range of dim x of matrix to be copied.
+/// \param y Range of dim y of matrix to be copied.
+/// \param q Queue to execute the copy task.
+/// \returns no return value.
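+/// Illustrative usage sketch (extents are example values): copy a tightly
+/// packed 64 x 32 float matrix from the host into a pitched device allocation:
+/// \code
+/// size_t w = 64 * sizeof(float), h = 32, pitch = 0;
+/// void *dst = compat::malloc(pitch, w, h);
+/// std::vector<float> src(64 * 32, 0.f);
+/// compat::memcpy(dst, pitch, src.data(), /*from_pitch=*/w, /*x=*/w, /*y=*/h);
+/// \endcode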
+static inline void memcpy(void *to_ptr, size_t to_pitch, const void *from_ptr,
+                          size_t from_pitch, size_t x, size_t y,
+                          sycl::queue q = get_default_queue()) {
+  sycl::event::wait(
+      detail::memcpy(q, to_ptr, from_ptr, to_pitch, from_pitch, x, y));
+}
+
+} // namespace
+
+/// Asynchronously copies 2D matrix specified by \p x and \p y from the address
+/// specified by \p from_ptr to the address specified by \p to_ptr, while
+/// \p from_pitch and \p to_pitch are the range of dim x in bytes of the matrix
+/// specified by \p from_ptr and \p to_ptr. The return of the function does NOT
+/// guarantee the copy is completed.
+///
+/// \param to_ptr Pointer to destination memory address.
+/// \param to_pitch Range of dim x in bytes of destination matrix.
+/// \param from_ptr Pointer to source memory address.
+/// \param from_pitch Range of dim x in bytes of source matrix.
+/// \param x Range of dim x of matrix to be copied.
+/// \param y Range of dim y of matrix to be copied.
+/// \param q Queue to execute the copy task.
+/// \returns An event representing the memcpy operation.
+static inline sycl::event memcpy_async(void *to_ptr, size_t to_pitch,
+                                       const void *from_ptr, size_t from_pitch,
+                                       size_t x, size_t y,
+                                       sycl::queue q = get_default_queue()) {
+  auto events = detail::memcpy(q, to_ptr, from_ptr, to_pitch, from_pitch, x, y);
+  return detail::combine_events(events, q);
+}
+
+namespace {
+/// Synchronously copies a subset of the 3D matrix specified by \p from to the
+/// 3D matrix specified by \p to. The source and destination positions are
+/// specified by \p from_pos and \p to_pos. The copied matrix size is specified
+/// by \p size. The function will return after the copy is completed.
+///
+/// \param to Destination matrix info.
+/// \param to_pos Position of destination.
+/// \param from Source matrix info.
+/// \param from_pos Position of source.
+/// \param size Range of the submatrix to be copied.
+/// \param q Queue to execute the copy task.
+/// \returns no return value.
+static inline void memcpy(pitched_data to, sycl::id<3> to_pos,
+                          pitched_data from, sycl::id<3> from_pos,
+                          sycl::range<3> size,
+                          sycl::queue q = get_default_queue()) {
+  sycl::event::wait(detail::memcpy(q, to, to_pos, from, from_pos, size));
+}
+} // namespace
+
+/// Asynchronously copies a subset of the 3D matrix specified by \p from to the
+/// 3D matrix specified by \p to. The source and destination positions are
+/// specified by \p from_pos and \p to_pos. The copied matrix size is specified
+/// by \p size. The return of the function does NOT guarantee the copy is
+/// completed.
+///
+/// \param to Destination matrix info.
+/// \param to_pos Position of destination.
+/// \param from Source matrix info.
+/// \param from_pos Position of source.
+/// \param size Range of the submatrix to be copied.
+/// \param q Queue to execute the copy task.
+/// \returns An event representing the memcpy operation.
+static inline sycl::event memcpy_async(pitched_data to, sycl::id<3> to_pos,
+                                       pitched_data from, sycl::id<3> from_pos,
+                                       sycl::range<3> size,
+                                       sycl::queue q = get_default_queue()) {
+  auto events = detail::memcpy(q, to, to_pos, from, from_pos, size);
+  return detail::combine_events(events, q);
+}
+
+namespace {
+/// Synchronously sets \p pattern to the first \p count elements starting from
+/// \p dev_ptr. The function will return after the fill operation is completed.
+///
+/// \tparam T Datatype of the value to be set.
+/// \param dev_ptr Pointer to the device memory address.
+/// \param pattern Pattern of type \p T to be set.
+/// \param count Number of elements to be set to the pattern.
+/// \param q The queue in which the operation is done.
+/// \returns no return value.
+template
+static void inline fill(void *dev_ptr, const T &pattern, size_t count,
+                        sycl::queue q = get_default_queue()) {
+  detail::fill(q, dev_ptr, pattern, count).wait();
+}
+} // namespace
+
+/// Asynchronously sets \p pattern to the first \p count elements starting from
+/// \p dev_ptr.
+/// The return of the function does NOT guarantee the fill operation is
+/// completed.
+///
+/// \tparam T Datatype of the pattern to be set.
+/// \param dev_ptr Pointer to the device memory address.
+/// \param pattern Pattern of type \p T to be set.
+/// \param count Number of elements to be set to the pattern.
+/// \param q The queue in which the operation is done.
+/// \returns An event representing the fill operation.
+template
+static sycl::event inline fill_async(void *dev_ptr, const T &pattern,
+                                     size_t count,
+                                     sycl::queue q = get_default_queue()) {
+  return detail::fill(q, dev_ptr, pattern, count);
+}
+
+namespace experimental {
+
+/// [UNSUPPORTED] Synchronously copies 2D/3D memory data specified by \p param .
+/// The function will return after the copy is completed.
+///
+/// \param param Memory copy parameters.
+/// \param q Queue to execute the copy task.
+/// \returns no return value.
+static inline void memcpy(const memcpy_parameter &param,
+                          sycl::queue q = get_default_queue()) {
+  sycl::event::wait(compat::experimental::detail::memcpy(q, param));
+}
+
+/// [UNSUPPORTED] Asynchronously copies 2D/3D memory data specified by \p param
+/// . The return of the function does NOT guarantee the copy is completed.
+///
+/// \param param Memory copy parameters.
+/// \param q Queue to execute the copy task.
+/// \returns no return value.
+static inline void memcpy_async(const memcpy_parameter &param,
+                                sycl::queue q = get_default_queue()) {
+  compat::experimental::detail::memcpy(q, param);
+}
+} // namespace experimental
+
+namespace {
+/// Synchronously sets \p value to the first \p size bytes starting from \p
+/// dev_ptr. The function will return after the memset operation is completed.
+///
+/// \param dev_ptr Pointer to the device memory address.
+/// \param value Value to be set.
+/// \param size Number of bytes to be set to the value.
+/// \param q The queue in which the operation is done.
+/// \returns no return value.
+static void memset(void *dev_ptr, int value, size_t size,
+                   sycl::queue q = get_default_queue()) {
+  detail::memset(q, dev_ptr, value, size).wait();
+}
+} // namespace
+
+/// \brief Sets 2 bytes data \p value to the first \p size elements starting
+/// from \p dev_ptr in \p q synchronously.
+/// \param [in] dev_ptr Pointer to the virtual device memory address.
+/// \param [in] value The value to be set.
+/// \param [in] size Number of elements to be set to the value.
+/// \param [in] q The queue in which the operation is done.
+static inline void memset_d16(void *dev_ptr, unsigned short value, size_t size,
+                              sycl::queue q = get_default_queue()) {
+  detail::fill(q, dev_ptr, value, size).wait();
+}
+
+/// \brief Sets 4 bytes data \p value to the first \p size elements starting
+/// from \p dev_ptr in \p q synchronously.
+/// \param [in] dev_ptr Pointer to the virtual device memory address.
+/// \param [in] value The value to be set.
+/// \param [in] size Number of elements to be set to the value.
+/// \param [in] q The queue in which the operation is done.
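+/// Illustrative usage sketch (element count and fill value are example
+/// values; \p size counts 4-byte elements, not bytes):
+/// \code
+/// unsigned int *buf = compat::malloc<unsigned int>(128);
+/// compat::memset_d32(buf, 0xDEADBEEFu, 128); // fill 128 uint32 elements
+/// compat::free(buf);
+/// \endcode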
+static inline void memset_d32(void *dev_ptr, unsigned int value, size_t size, + sycl::queue q = get_default_queue()) { + detail::fill(q, dev_ptr, value, size).wait(); +} + +/// \brief Sets 1 byte data \p value to the first \p size elements starting +/// from \p dev_ptr in \p q asynchronously. +/// \param dev_ptr Pointer to the device memory address. +/// \param value Value to be set. +/// \param size Number of bytes to be set to the value. +/// \returns An event representing the memset operation. +static inline sycl::event memset_async(void *dev_ptr, int value, size_t size, + sycl::queue q = get_default_queue()) { + return detail::memset(q, dev_ptr, value, size); +} + +/// \brief Sets 2 bytes data \p value to the first \p size elements starting +/// from \p dev_ptr in \p q asynchronously. +/// \param [in] dev_ptr Pointer to the virtual device memory address. +/// \param [in] value The value to be set. +/// \param [in] size Number of elements to be set to the value. +/// \param [in] q The queue in which the operation is done. +/// \returns An event representing the memset operation. +static inline sycl::event +memset_d16_async(void *dev_ptr, unsigned short value, size_t size, + sycl::queue q = get_default_queue()) { + return detail::fill(q, dev_ptr, value, size); +} + +/// \brief Sets 4 bytes data \p value to the first \p size elements starting +/// from \p dev_ptr in \p q asynchronously. +/// \param [in] dev_ptr Pointer to the virtual device memory address. +/// \param [in] value The value to be set. +/// \param [in] size Number of elements to be set to the value. +/// \param [in] q The queue in which the operation is done. +/// \returns An event representing the memset operation. +static inline sycl::event +memset_d32_async(void *dev_ptr, unsigned int value, size_t size, + sycl::queue q = get_default_queue()) { + return detail::fill(q, dev_ptr, value, size); +} + +namespace { +/// \brief Sets 1 byte data \p val to the pitched 2D memory region pointed by \p +/// ptr in \p q synchronously. +/// \param [in] ptr Pointer to the virtual device memory. +/// \param [in] pitch The pitch size by number of elements, including padding. +/// \param [in] val The value to be set. +/// \param [in] x The width of memory region by number of elements. +/// \param [in] y The height of memory region by number of elements. +/// \param [in] q The queue in which the operation is done. +static inline void memset(void *ptr, size_t pitch, int val, size_t x, size_t y, + sycl::queue q = get_default_queue()) { + sycl::event::wait(detail::memset(q, ptr, pitch, val, x, y)); +} +} // namespace + +/// \brief Sets 2 bytes data \p val to the pitched 2D memory region pointed by +/// ptr in \p q synchronously. +/// \param [in] ptr Pointer to the virtual device memory. +/// \param [in] pitch The pitch size by number of elements, including padding. +/// \param [in] val The value to be set. +/// \param [in] x The width of memory region by number of elements. +/// \param [in] y The height of memory region by number of elements. +/// \param [in] q The queue in which the operation is done. +static inline void memset_d16(void *ptr, size_t pitch, unsigned short val, + size_t x, size_t y, + sycl::queue q = get_default_queue()) { + sycl::event::wait(detail::memset(q, ptr, pitch, val, x, y)); +} + +/// \brief Sets 4 bytes data \p val to the pitched 2D memory region pointed by +/// ptr in \p q synchronously. +/// \param [in] ptr Pointer to the virtual device memory. 
+/// \param [in] pitch The pitch size by number of elements, including padding.
+/// \param [in] val The value to be set.
+/// \param [in] x The width of memory region by number of elements.
+/// \param [in] y The height of memory region by number of elements.
+/// \param [in] q The queue in which the operation is done.
+static inline void memset_d32(void *ptr, size_t pitch, unsigned int val,
+                              size_t x, size_t y,
+                              sycl::queue q = get_default_queue()) {
+  sycl::event::wait(detail::memset(q, ptr, pitch, val, x, y));
+}
+
+/// \brief Sets 1 byte data \p val to the pitched 2D memory region pointed by \p
+/// ptr in \p q asynchronously.
+/// \param [in] ptr Pointer to the virtual device memory.
+/// \param [in] pitch The pitch size by number of elements, including padding.
+/// \param [in] val The value to be set.
+/// \param [in] x The width of memory region by number of elements.
+/// \param [in] y The height of memory region by number of elements.
+/// \param [in] q The queue in which the operation is done.
+/// \returns An event representing the memset operation.
+static inline sycl::event memset_async(void *ptr, size_t pitch, int val,
+                                       size_t x, size_t y,
+                                       sycl::queue q = get_default_queue()) {
+  auto events = detail::memset(q, ptr, pitch, val, x, y);
+  return detail::combine_events(events, q);
+}
+
+/// \brief Sets 2 bytes data \p val to the pitched 2D memory region pointed by
+/// \p ptr in \p q asynchronously.
+/// \param [in] ptr Pointer to the virtual device memory.
+/// \param [in] pitch The pitch size by number of elements, including padding.
+/// \param [in] val The value to be set.
+/// \param [in] x The width of memory region by number of elements.
+/// \param [in] y The height of memory region by number of elements.
+/// \param [in] q The queue in which the operation is done.
+/// \returns An event representing the memset operation.
+static inline sycl::event
+memset_d16_async(void *ptr, size_t pitch, unsigned short val, size_t x,
+                 size_t y, sycl::queue q = get_default_queue()) {
+  auto events = detail::memset(q, ptr, pitch, val, x, y);
+  return detail::combine_events(events, q);
+}
+
+/// \brief Sets 4 bytes data \p val to the pitched 2D memory region pointed by
+/// \p ptr in \p q asynchronously.
+/// \param [in] ptr Pointer to the virtual device memory.
+/// \param [in] pitch The pitch size by number of elements, including padding.
+/// \param [in] val The value to be set.
+/// \param [in] x The width of memory region by number of elements.
+/// \param [in] y The height of memory region by number of elements.
+/// \param [in] q The queue in which the operation is done.
+/// \returns An event representing the memset operation.
+static inline sycl::event
+memset_d32_async(void *ptr, size_t pitch, unsigned int val, size_t x, size_t y,
+                 sycl::queue q = get_default_queue()) {
+  auto events = detail::memset(q, ptr, pitch, val, x, y);
+  return detail::combine_events(events, q);
+}
+
+namespace {
+/// Sets \p val to the 3D memory region specified by \p pitch in \p q. \p size
+/// specifies the size of the 3D memory region to set. The function will return
+/// after the memset operation is completed.
+///
+/// \param pitch Specify the 3D memory region.
+/// \param val Value to be set.
+/// \param size The size of the 3D memory region to set.
+/// \param q The queue in which the operation is done.
+/// \returns no return value.
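+/// Illustrative usage sketch (the extents are example values only):
+/// \code
+/// compat::pitched_data vol = compat::malloc(sycl::range<3>(64, 8, 4));
+/// compat::memset(vol, 0, sycl::range<3>(64, 8, 4)); // blocking
+/// \endcode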
+static inline void memset(pitched_data pitch, int val, sycl::range<3> size,
+                          sycl::queue q = get_default_queue()) {
+  sycl::event::wait(detail::memset(q, pitch, val, size));
+}
+} // namespace
+
+/// Sets \p val to the 3D memory region specified by \p pitch in \p q. \p size
+/// specifies the extent of the 3D region to be set. The return of this
+/// function does NOT guarantee that the memset operation has completed.
+///
+/// \param pitch Specifies the 3D memory region.
+/// \param val The value to be set.
+/// \param size The extent of the 3D memory region to be set.
+/// \param q The queue in which the operation is done.
+/// \returns An event representing the memset operation.
+static inline sycl::event memset_async(pitched_data pitch, int val,
+                                       sycl::range<3> size,
+                                       sycl::queue q = get_default_queue()) {
+  auto events = detail::memset(q, pitch, val, size);
+  return detail::combine_events(events, q);
+}
+
+/// Accessor used as a device function parameter.
+template <class T, memory_region Memory, size_t Dimension> class accessor;
+template <class T, memory_region Memory> class accessor<T, Memory, 3> {
+public:
+  using memory_t = detail::memory_traits<Memory, T>;
+  using element_t = typename memory_t::element_t;
+  using pointer_t = typename memory_t::pointer_t;
+  using accessor_t = typename memory_t::template accessor_t<3>;
+  accessor(pointer_t data, const sycl::range<3> &in_range)
+      : _data(data), _range(in_range) {}
+  template <memory_region M = Memory>
+  accessor(typename std::enable_if<M != memory_region::local,
+                                   const accessor_t>::type &acc)
+      : accessor(acc, acc.get_range()) {}
+  accessor(const accessor_t &acc, const sycl::range<3> &in_range)
+      : accessor(
+            acc.template get_multi_ptr<sycl::access::decorated::no>().get(),
+            in_range) {}
+  accessor<T, Memory, 2> operator[](size_t index) const {
+    sycl::range<2> sub(_range.get(1), _range.get(2));
+    return accessor<T, Memory, 2>(_data + index * sub.size(), sub);
+  }
+
+  pointer_t get_ptr() const { return _data; }
+
+private:
+  pointer_t _data;
+  sycl::range<3> _range;
+};
+template <class T, memory_region Memory> class accessor<T, Memory, 2> {
+public:
+  using memory_t = detail::memory_traits<Memory, T>;
+  using element_t = typename memory_t::element_t;
+  using pointer_t = typename memory_t::pointer_t;
+  using accessor_t = typename memory_t::template accessor_t<2>;
+  accessor(pointer_t data, const sycl::range<2> &in_range)
+      : _data(data), _range(in_range) {}
+  template <memory_region M = Memory>
+  accessor(typename std::enable_if<M != memory_region::local,
+                                   const accessor_t>::type &acc)
+      : accessor(acc, acc.get_range()) {}
+  accessor(const accessor_t &acc, const sycl::range<2> &in_range)
+      : accessor(
+            acc.template get_multi_ptr<sycl::access::decorated::no>().get(),
+            in_range) {}
+
+  pointer_t operator[](size_t index) const {
+    return _data + _range.get(1) * index;
+  }
+
+  pointer_t get_ptr() const { return _data; }
+
+private:
+  pointer_t _data;
+  sycl::range<2> _range;
+};
+
+/// Device variable with address space of shared or global.
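+///
+/// A minimal usage sketch (illustrative only; the pattern below is an
+/// assumption about typical use, not something mandated by this header):
+/// \code
+///   sycl::queue q = compat::get_default_queue();
+///   compat::global_memory<float, 1> vec(sycl::range<1>(4),
+///                                       {1.f, 2.f, 3.f, 4.f}, q);
+///   vec.init(q);                     // allocate device USM, copy initial values
+///   float *dev_ptr = vec.get_ptr(q); // device pointer usable inside kernels
+/// \endcode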
+// TODO(compat-lib-reviewers): This doesn't yet support multi-device (ptr
+// per device)
+template <class T, memory_region Memory, size_t Dimension> class device_memory {
+public:
+  using accessor_t =
+      typename detail::memory_traits<Memory, T>::template accessor_t<Dimension>;
+  using value_t = typename detail::memory_traits<Memory, T>::value_t;
+  using compat_accessor_t = compat::accessor<T, Memory, Dimension>;
+
+  device_memory(sycl::queue q = get_default_queue())
+      : device_memory(sycl::range<Dimension>(1), q) {}
+
+  /// Constructor of 1-D array with initializer list
+  device_memory(const sycl::range<Dimension> &in_range,
+                std::initializer_list<value_t> &&init_list,
+                sycl::queue q = get_default_queue())
+      : device_memory(in_range, q) {
+    assert(init_list.size() <= in_range.size());
+    _host_ptr = (value_t *)std::malloc(_size);
+    std::memset(_host_ptr, 0, _size);
+    std::memcpy(_host_ptr, init_list.begin(), init_list.size() * sizeof(T));
+  }
+
+  /// Constructor of 2-D array with initializer list
+  template <size_t Dim = Dimension>
+  device_memory(
+      const typename std::enable_if<Dim == 2, sycl::range<2>>::type &in_range,
+      std::initializer_list<std::initializer_list<value_t>> &&init_list,
+      sycl::queue q = get_default_queue())
+      : device_memory(in_range, q) {
+    assert(init_list.size() <= in_range[0]);
+    _host_ptr = (value_t *)std::malloc(_size);
+    std::memset(_host_ptr, 0, _size);
+    auto tmp_data = _host_ptr;
+    for (auto sub_list : init_list) {
+      assert(sub_list.size() <= in_range[1]);
+      std::memcpy(tmp_data, sub_list.begin(), sub_list.size() * sizeof(T));
+      tmp_data += in_range[1];
+    }
+  }
+
+  /// Constructor with range
+  device_memory(const sycl::range<Dimension> &range_in,
+                sycl::queue q = get_default_queue())
+      : _size(range_in.size() * sizeof(T)), _range(range_in), _reference(false),
+        _host_ptr(nullptr), _device_ptr(nullptr), _q(q) {
+    static_assert((Memory == memory_region::global) ||
+                      (Memory == memory_region::constant) ||
+                      (Memory == memory_region::usm_shared),
+                  "device memory region should be global, constant or shared");
+    // Make sure that singleton class dev_mgr will destruct later than this.
+    detail::dev_mgr::instance();
+#ifdef COMPAT_USM_LEVEL_NONE
+    detail::mem_mgr::instance();
+#endif
+  }
+
+  /// Constructor with range
+  // enable_if_t SFINAE to avoid ambiguity with
+  // device_memory(Args... Arguments, sycl::queue q)
+  template <class... Args,
+            class = std::enable_if_t<sizeof...(Args) == Dimension>>
+  device_memory(Args... Arguments)
+      : device_memory(sycl::range<Dimension>(Arguments...),
+                      get_default_queue()) {}
+
+  /// Constructor with range and queue
+  template <class... Args>
+  device_memory(Args... Arguments, sycl::queue q)
+      : device_memory(sycl::range<Dimension>(Arguments...), q) {}
+
+  ~device_memory() {
+    if (_device_ptr && !_reference)
+      compat::free(_device_ptr, _q);
+    if (_host_ptr)
+      std::free(_host_ptr);
+  }
+
+  /// Allocate memory with the queue specified in the constructor, and
+  /// initialize the memory with the initial value, if one was provided.
+  void init() { init(_q); }
+  /// Allocate memory with the specified queue, and initialize the memory with
+  /// the initial value, if one was provided.
+  void init(sycl::queue q) {
+    if (_device_ptr)
+      return;
+    if (!_size)
+      return;
+    allocate_device(q);
+    if (_host_ptr)
+      detail::memcpy(q, _device_ptr, _host_ptr, _size);
+  }
+
+  /// Assign an existing device pointer to the variable.
+  void assign(value_t *src, size_t size) {
+    this->~device_memory();
+    new (this) device_memory(src, size, _q);
+  }
+
+  /// Get the memory pointer of the memory object, a device USM pointer.
+  value_t *get_ptr() { return get_ptr(_q); }
+
+  /// Get the memory pointer of the memory object, a device USM pointer.
+  value_t *get_ptr(sycl::queue q) {
+    init(q);
+    return _device_ptr;
+  }
+
+  /// Get the device memory object size in bytes.
+  size_t get_size() { return _size; }
+
+  template <size_t D = Dimension>
+  typename std::enable_if<D == 1, T>::type &operator[](size_t index) {
+    init();
+#ifdef COMPAT_USM_LEVEL_NONE
+    return compat::get_buffer<typename std::enable_if<D == 1, T>::type>(
+               _device_ptr)
+        .template get_access<sycl::access_mode::read_write>()[index];
+#else
+    return _device_ptr[index];
+#endif // COMPAT_USM_LEVEL_NONE
+  }
+
+#ifdef COMPAT_USM_LEVEL_NONE
+  /// Get sycl::accessor for the device memory object when usm is not used.
+  accessor_t get_access(sycl::handler &cgh) {
+    return get_buffer(_device_ptr)
+        .template reinterpret<T, Dimension>(_range)
+        .template get_access<detail::memory_traits<Memory, T>::mode,
+                             detail::memory_traits<Memory, T>::target>(cgh);
+  }
+#else
+  /// Get compat_accessor with dimension info for the device memory object
+  /// when usm is used and dimension is greater than 1.
+  template <size_t D = Dimension>
+  typename std::enable_if<D != 1, compat_accessor_t>::type
+  get_access(sycl::handler &cgh) {
+    return compat_accessor_t((T *)_device_ptr, _range);
+  }
+#endif // COMPAT_USM_LEVEL_NONE
+
+private:
+  device_memory(value_t *memory_ptr, size_t size,
+                sycl::queue q = get_default_queue())
+      : _size(size), _range(size / sizeof(T)), _reference(true),
+        _device_ptr(memory_ptr), _q(q) {}
+
+  void allocate_device(sycl::queue q) {
+#ifndef COMPAT_USM_LEVEL_NONE
+    if (Memory == memory_region::usm_shared) {
+      _device_ptr = (value_t *)sycl::malloc_shared(_size, q.get_device(),
+                                                   q.get_context());
+      return;
+    }
+#ifdef SYCL_EXT_ONEAPI_USM_DEVICE_READ_ONLY
+    if (Memory == memory_region::constant) {
+      _device_ptr = (value_t *)sycl::malloc_device(
+          _size, q.get_device(), q.get_context(),
+          sycl::ext::oneapi::property::usm::device_read_only());
+      return;
+    }
+#endif
+#endif
+    _device_ptr = (value_t *)detail::malloc(_size, q);
+  }
+
+  size_t _size;
+  sycl::range<Dimension> _range;
+  bool _reference;
+  value_t *_host_ptr;
+  value_t *_device_ptr;
+  sycl::queue _q;
+};
+template <class T, memory_region Memory>
+class device_memory<T, Memory, 0> : public device_memory<T, Memory, 1> {
+public:
+  using base = device_memory<T, Memory, 1>;
+  using value_t = typename base::value_t;
+  using accessor_t =
+      typename detail::memory_traits<Memory, T>::template accessor_t<0>;
+
+  /// Constructor with initial value.
+  device_memory(const value_t &val, sycl::queue q = get_default_queue())
+      : base(sycl::range<1>(1), {val}, q) {}
+
+  /// Default constructor
+  device_memory(sycl::queue q = get_default_queue()) : base(1, q) {}
+#ifdef COMPAT_USM_LEVEL_NONE
+  /// Get sycl::accessor for the device memory object when usm is not used.
+  accessor_t get_access(sycl::handler &cgh) {
+    auto buf = get_buffer(base::get_ptr())
+                   .template reinterpret<T, 1>(sycl::range<1>(1));
+    return accessor_t(buf, cgh);
+  }
+#endif // COMPAT_USM_LEVEL_NONE
+};
+
+template <class T, size_t Dimension = 1>
+using global_memory = device_memory<T, memory_region::global, Dimension>;
+template <class T, size_t Dimension = 1>
+using constant_memory = device_memory<T, memory_region::constant, Dimension>;
+template <class T, size_t Dimension = 1>
+using shared_memory = device_memory<T, memory_region::usm_shared, Dimension>;
+
+class pointer_attributes {
+public:
+  void init(const void *ptr, sycl::queue q = get_default_queue()) {
+#ifdef COMPAT_USM_LEVEL_NONE
+    throw std::runtime_error(
+        "[Compat] pointer_attributes: only works for USM pointer.");
+#else
+    memory_type = sycl::get_pointer_type(ptr, q.get_context());
+    device_pointer = (memory_type != sycl::usm::alloc::unknown) ? ptr : nullptr;
+    host_pointer = (memory_type != sycl::usm::alloc::unknown) &&
+                           (memory_type != sycl::usm::alloc::device)
+                       ? ptr
+                       : nullptr;
+    sycl::device device_obj = sycl::get_pointer_device(ptr, q.get_context());
+    device_id = detail::dev_mgr::instance().get_device_id(device_obj);
+#endif // COMPAT_USM_LEVEL_NONE
+  }
+
+  sycl::usm::alloc get_memory_type() { return memory_type; }
+
+  const void *get_device_pointer() { return device_pointer; }
+
+  const void *get_host_pointer() { return host_pointer; }
+
+  bool is_memory_shared() { return memory_type == sycl::usm::alloc::shared; }
+
+  unsigned int get_device_id() { return device_id; }
+
+private:
+  sycl::usm::alloc memory_type = sycl::usm::alloc::unknown;
+  const void *device_pointer = nullptr;
+  const void *host_pointer = nullptr;
+  unsigned int device_id = 0;
+};
+
+} // namespace compat
diff --git a/tools/util/include/compat/traits.hpp b/tools/util/include/compat/traits.hpp
new file mode 100644
index 0000000000..a4c293822c
--- /dev/null
+++ b/tools/util/include/compat/traits.hpp
@@ -0,0 +1,294 @@
+/***************************************************************************
+ *
+ * Copyright (C) Codeplay Software Ltd.
+ * Copyright (C) 2025 Intel Corporation, All rights reserved.
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM
+ * Exceptions. See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SYCL compatibility extension
+ *
+ * traits.hpp
+ *
+ * Description:
+ *    Type traits for the SYCL compatibility extension
+ **************************************************************************/
+
+#pragma once
+
+#include
+#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
+#include
+#endif
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace compat {
+
+// Equivalent to C++20's std::type_identity (used to create non-deduced
+// contexts)
+template <class T> struct type_identity {
+  using type = T;
+};
+template <class T> using type_identity_t = typename type_identity<T>::type;
+
+// Defines the operand type for arithmetic operations on T. This is identity
+// for all types except pointers, for which it is std::ptrdiff_t
+template <class T> struct arith {
+  using type = std::conditional_t<std::is_pointer_v<T>, std::ptrdiff_t, T>;
+};
+template <class T> using arith_t = typename arith<T>::type;
+
+// Traits to check that a device function signature matches the args (with or
+// without local mem)
+template <typename F, typename... Args>
+struct device_fn_invocable : std::is_invocable<F, Args...> {};
+
+template <typename F, typename... Args>
+struct device_fn_lmem_invocable
+    : std::is_invocable<F, Args..., char *> {};
+
+template <bool HasLocalMem, typename F, typename... Args>
+constexpr inline bool args_compatible =
+    std::conditional_t<HasLocalMem, device_fn_lmem_invocable<F, Args...>,
+                       device_fn_invocable<F, Args...>>::value;
+
+namespace detail {
+
+// Trait for identifying sycl::range and sycl::nd_range.
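+// For instance (illustrative), is_range_v<sycl::range<2>> is true, while
+// is_nd_range_v<sycl::range<2>> is false.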
+template <typename T> struct is_range : std::false_type {};
+template <int Dim> struct is_range<sycl::range<Dim>> : std::true_type {};
+
+template <typename T> constexpr bool is_range_v = is_range<T>::value;
+
+template <typename T> struct is_nd_range : std::false_type {};
+template <int Dim> struct is_nd_range<sycl::nd_range<Dim>> : std::true_type {};
+
+template <typename T> constexpr bool is_nd_range_v = is_nd_range<T>::value;
+
+template <typename T>
+constexpr bool is_range_or_nd_range_v =
+    std::disjunction_v<is_range<T>, is_nd_range<T>>;
+
+// Trait range_to_item_t to convert nd_range -> nd_item, range -> item
+template <typename RangeT> struct range_to_item_map;
+template <int Dim> struct range_to_item_map<sycl::nd_range<Dim>> {
+  using ItemT = sycl::nd_item<Dim>;
+};
+template <int Dim> struct range_to_item_map<sycl::range<Dim>> {
+  using ItemT = sycl::item<Dim>;
+};
+
+template <typename RangeT>
+using range_to_item_t = typename range_to_item_map<RangeT>::ItemT;
+
+} // namespace detail
+
+// Forward decls
+namespace experimental {
+
+template <typename... Props> struct kernel_properties;
+template <typename... Props> struct launch_properties;
+struct local_mem_size;
+
+template <typename Range, typename... Options>
+class launch_policy;
+} // namespace experimental
+
+namespace experimental::detail {
+
+// Helper for tuple_template_index
+template