intel · airMeng · Feb 23, 2024 · Feb 5, 2024 · Feb 5, 2024 · Feb 22, 2024
diff --git a/include/common/core/common.hpp b/include/common/core/common.hpp
@@ -112,20 +112,17 @@ enum class data_size : uint8_t {
 /// The specific LSC shared function to fence with xetla_fence
 enum class memory_kind : uint8_t {
     untyped_global = 0, /// untyped global memory
-    untyped_global_low_pri = 1, /// low-priority untyped global memory
-    typed_global = 2, /// typed global memory
-    shared_local = 3, /// shared local memory
+    typed_global = 1, /// typed global memory
+    shared_local = 2, /// shared local memory
 };
 
 /// The xetla_fence operation to apply to caches
 enum class fence_op : uint8_t {
     none = 0, /// no operation
     evict = 1, /// dirty lines evicted and invalidated from L1
     invalidate = 2, /// invalidate all clean lines
-    discard = 3, /// direct and clean lines are discarded w/o eviction
+
     clean = 4, /// dirty lines are written to memory, but retained in cache
-    /// in clean state
-    flushl2 = 5, /// flush only L2
 };
 /// The scope that xetla_fence operation should apply to
 enum class fence_scope : uint8_t {

diff --git a/include/common/core/memory.hpp b/include/common/core/memory.hpp
@@ -72,55 +72,49 @@ constexpr __ESIMD_ENS::lsc_data_size get_data_size(gpu::xetla::data_size ds) {
 /// @brief lookup table for memory kind.
 ///
 ///
-constexpr __ESIMD_ENS::lsc_memory_kind get_memory_kind(
+constexpr sycl::ext::intel::esimd::memory_kind get_memory_kind(
         gpu::xetla::memory_kind mk) {
     switch (mk) {
         case gpu::xetla::memory_kind::untyped_global:
-            return __ESIMD_ENS::lsc_memory_kind::untyped_global;
-        case gpu::xetla::memory_kind::untyped_global_low_pri:
-            return __ESIMD_ENS::lsc_memory_kind::untyped_global_low_pri;
+            return sycl::ext::intel::esimd::memory_kind::image;
         case gpu::xetla::memory_kind::typed_global:
-            return __ESIMD_ENS::lsc_memory_kind::typed_global;
+            return sycl::ext::intel::esimd::memory_kind::global;
         case gpu::xetla::memory_kind::shared_local:
-            return __ESIMD_ENS::lsc_memory_kind::shared_local;
+            return sycl::ext::intel::esimd::memory_kind::local;
     }
 }
 
 /// @brief lookup table for fence op.
 ///
 ///
-constexpr __ESIMD_ENS::lsc_fence_op get_fence_op(gpu::xetla::fence_op fo) {
+constexpr sycl::ext::intel::esimd::fence_flush_op get_fence_op(gpu::xetla::fence_op fo) {
     switch (fo) {
-        case gpu::xetla::fence_op::none: return __ESIMD_ENS::lsc_fence_op::none;
+        case gpu::xetla::fence_op::none: return sycl::ext::intel::esimd::fence_flush_op::none;
         case gpu::xetla::fence_op::evict:
-            return __ESIMD_ENS::lsc_fence_op::evict;
+            return sycl::ext::intel::esimd::fence_flush_op::evict;
         case gpu::xetla::fence_op::invalidate:
-            return __ESIMD_ENS::lsc_fence_op::invalidate;
-        case gpu::xetla::fence_op::discard:
-            return __ESIMD_ENS::lsc_fence_op::discard;
+            return sycl::ext::intel::esimd::fence_flush_op::invalidate;
         case gpu::xetla::fence_op::clean:
-            return __ESIMD_ENS::lsc_fence_op::clean;
-        case gpu::xetla::fence_op::flushl2:
-            return __ESIMD_ENS::lsc_fence_op::flushl3;
+            return sycl::ext::intel::esimd::fence_flush_op::clean;
     }
 }
 
 /// @brief lookup table for fence scope.
 ///
 ///
-constexpr __ESIMD_ENS::lsc_scope get_fence_scope(gpu::xetla::fence_scope fs) {
+constexpr sycl::ext::intel::esimd::fence_scope get_fence_scope(gpu::xetla::fence_scope fs) {
     switch (fs) {
         case gpu::xetla::fence_scope::group:
-            return __ESIMD_ENS::lsc_scope::group;
+            return sycl::ext::intel::esimd::fence_scope::group;
         case gpu::xetla::fence_scope::local:
-            return __ESIMD_ENS::lsc_scope::local;
-        case gpu::xetla::fence_scope::tile: return __ESIMD_ENS::lsc_scope::tile;
-        case gpu::xetla::fence_scope::gpu: return __ESIMD_ENS::lsc_scope::gpu;
-        case gpu::xetla::fence_scope::gpus: return __ESIMD_ENS::lsc_scope::gpus;
+            return sycl::ext::intel::esimd::fence_scope::local;
+        case gpu::xetla::fence_scope::tile: return sycl::ext::intel::esimd::fence_scope::tile;
+        case gpu::xetla::fence_scope::gpu: return sycl::ext::intel::esimd::fence_scope::gpu;
+        case gpu::xetla::fence_scope::gpus: return sycl::ext::intel::esimd::fence_scope::gpus;
         case gpu::xetla::fence_scope::system:
-            return __ESIMD_ENS::lsc_scope::system;
+            return sycl::ext::intel::esimd::fence_scope::system;
         case gpu::xetla::fence_scope::sysacq:
-            return __ESIMD_ENS::lsc_scope::sysacq;
+            return sycl::ext::intel::esimd::fence_scope::system_acquire;
     }
 }
 
@@ -635,10 +629,10 @@ __XETLA_API xetla_vector<T, N> xetla_atomic_local(
 template <memory_kind Kind = memory_kind::untyped_global,
         fence_op FenceOp = fence_op::none,
         fence_scope Scope = fence_scope::group, int N = 16>
-__XETLA_API void xetla_fence(xetla_mask<N> pred = 1) {
-    __ESIMD_ENS::lsc_fence<gpu::xetla::detail::get_memory_kind(Kind),
+__XETLA_API void xetla_fence() {
+    sycl::ext::intel::esimd::fence<gpu::xetla::detail::get_memory_kind(Kind),
             gpu::xetla::detail::get_fence_op(FenceOp),
-            gpu::xetla::detail::get_fence_scope(Scope), N>(pred);
+            gpu::xetla::detail::get_fence_scope(Scope)>();
 }
 
 /// @} xetla_core_memory

diff --git a/include/subgroup/tile/impl/tile_op_functor.hpp b/include/subgroup/tile/impl/tile_op_functor.hpp
@@ -719,7 +719,7 @@ struct elemwise_reduce_op_t {};
 /// @brief Is the element-wise reduce op functor, specialized for Xe architecture.
 template <reduce_op reduce_kind_, typename dtype_in_, gpu_arch arch_tag>
 struct elemwise_reduce_op_t<reduce_kind_, dtype_in_, arch_tag,
-        std::enable_if_t<(arch_tag == gpu_arch::Xe)>> {
+        std::enable_if_t<(arch_tag <= gpu_arch::Xe)>> {
     using dtype_in = dtype_in_;
     using mem_desc_in_t
             = mem_desc_t<dtype_in, mem_layout::row_major, mem_space::global>;
@@ -1148,7 +1148,7 @@ struct linear_op_t {};
 /// @brief Is the linear_op functor, specialized for Xe architecture.
 template <typename dtype_in_, gpu_arch arch_tag>
 struct linear_op_t<dtype_in_, arch_tag,
-        std::enable_if_t<(arch_tag == gpu_arch::Xe)>> {
+        std::enable_if_t<(arch_tag <= gpu_arch::Xe)>> {
     using dtype_in = dtype_in_;
     using mem_desc_in_t
             = mem_desc_t<dtype_in, mem_layout::row_major, mem_space::global>;

diff --git a/tests/utils/profiling.hpp b/tests/utils/profiling.hpp
@@ -103,7 +103,7 @@ class profiling_helper {
         //time mean square error
         for (int i = 1; i < iter; i++) {
 #if (__LIBSYCL_MAJOR_VERSION >= 7) && (__LIBSYCL_MINOR_VERSION >= 1)
-            stat.variance += sycl::pow(time[i] - stat.mean, 2);
+            stat.variance += sycl::pow(time[i] - stat.mean, (double)2);
 #else
             stat.variance += pow(time[i] - stat.mean, 2);
 #endif