From 95bd50ff0388ce79657b381480aecb0fa1968766 Mon Sep 17 00:00:00 2001
From: "Ding, Yi1" <yi1.ding@intel.com>
Date: Thu, 29 Feb 2024 04:14:51 +0000
Subject: [PATCH 1/2] 2024.0 compatibility

---
 include/common/core/common.hpp |  5 +-
 include/common/core/memory.hpp | 84 ++++++++++++++++++++++++++--------
 tests/utils/profiling.hpp      |  2 +-
 3 files changed, 69 insertions(+), 22 deletions(-)

diff --git a/include/common/core/common.hpp b/include/common/core/common.hpp
index 3a63ec898..f27c4fcfc 100644
--- a/include/common/core/common.hpp
+++ b/include/common/core/common.hpp
@@ -112,8 +112,9 @@ enum class data_size : uint8_t {
 /// The specific LSC shared function to fence with xetla_fence
 enum class memory_kind : uint8_t {
     untyped_global = 0, /// untyped global memory
-    typed_global = 1, /// typed global memory
-    shared_local = 2, /// shared local memory
+    // "1" reserved for low-priority untyped global memory
+    typed_global = 2, /// typed global memory
+    shared_local = 3, /// shared local memory
 };
 
 /// The xetla_fence operation to apply to caches
diff --git a/include/common/core/memory.hpp b/include/common/core/memory.hpp
index 5012896a6..37ad98207 100644
--- a/include/common/core/memory.hpp
+++ b/include/common/core/memory.hpp
@@ -72,49 +72,89 @@ constexpr __ESIMD_ENS::lsc_data_size get_data_size(gpu::xetla::data_size ds) {
 /// @brief lookup table for memory kind.
 ///
 ///
-constexpr sycl::ext::intel::esimd::memory_kind get_memory_kind(
-        gpu::xetla::memory_kind mk) {
+constexpr auto get_memory_kind(gpu::xetla::memory_kind mk) {
     switch (mk) {
+#if __INTEL_LLVM_COMPILER >= 20240100
         case gpu::xetla::memory_kind::untyped_global:
-            return sycl::ext::intel::esimd::memory_kind::image;
+            return __ESIMD_NS::memory_kind::global;
         case gpu::xetla::memory_kind::typed_global:
-            return sycl::ext::intel::esimd::memory_kind::global;
+            return __ESIMD_NS::memory_kind::image;
         case gpu::xetla::memory_kind::shared_local:
-            return sycl::ext::intel::esimd::memory_kind::local;
+            return __ESIMD_NS::memory_kind::local;
+#else // legacy experimental api
+        case gpu::xetla::memory_kind::untyped_global:
+            return __ESIMD_ENS::lsc_memory_kind::untyped_global;
+        case gpu::xetla::memory_kind::typed_global:
+            return __ESIMD_ENS::lsc_memory_kind::typed_global;
+        case gpu::xetla::memory_kind::shared_local:
+            return __ESIMD_ENS::lsc_memory_kind::shared_local;
+#endif
     }
 }
 
 /// @brief lookup table for fence op.
 ///
 ///
-constexpr sycl::ext::intel::esimd::fence_flush_op get_fence_op(gpu::xetla::fence_op fo) {
+constexpr auto get_fence_op(gpu::xetla::fence_op fo) {
     switch (fo) {
-        case gpu::xetla::fence_op::none: return sycl::ext::intel::esimd::fence_flush_op::none;
+#if __INTEL_LLVM_COMPILER >= 20240100
+        case gpu::xetla::fence_op::none:
+            return __ESIMD_NS::fence_flush_op::none;
+        case gpu::xetla::fence_op::evict:
+            return __ESIMD_NS::fence_flush_op::evict;
+        case gpu::xetla::fence_op::invalidate:
+            return __ESIMD_NS::fence_flush_op::invalidate;
+        case gpu::xetla::fence_op::clean:
+            return __ESIMD_NS::fence_flush_op::clean;
+#else // legacy experimental api
+        case gpu::xetla::fence_op::none: //
+            return __ESIMD_ENS::lsc_fence_op::none;
         case gpu::xetla::fence_op::evict:
-            return sycl::ext::intel::esimd::fence_flush_op::evict;
+            return __ESIMD_ENS::lsc_fence_op::evict;
         case gpu::xetla::fence_op::invalidate:
-            return sycl::ext::intel::esimd::fence_flush_op::invalidate;
+            return __ESIMD_ENS::lsc_fence_op::invalidate;
         case gpu::xetla::fence_op::clean:
-            return sycl::ext::intel::esimd::fence_flush_op::clean;
+            return __ESIMD_ENS::lsc_fence_op::clean;
+#endif
     }
 }
 
 /// @brief lookup table for fence scope.
 ///
 ///
-constexpr sycl::ext::intel::esimd::fence_scope get_fence_scope(gpu::xetla::fence_scope fs) {
+constexpr auto get_fence_scope(gpu::xetla::fence_scope fs) {
     switch (fs) {
+#if __INTEL_LLVM_COMPILER >= 20240100
         case gpu::xetla::fence_scope::group:
-            return sycl::ext::intel::esimd::fence_scope::group;
+            return __ESIMD_NS::fence_scope::group;
         case gpu::xetla::fence_scope::local:
-            return sycl::ext::intel::esimd::fence_scope::local;
-        case gpu::xetla::fence_scope::tile: return sycl::ext::intel::esimd::fence_scope::tile;
-        case gpu::xetla::fence_scope::gpu: return sycl::ext::intel::esimd::fence_scope::gpu;
-        case gpu::xetla::fence_scope::gpus: return sycl::ext::intel::esimd::fence_scope::gpus;
+            return __ESIMD_NS::fence_scope::local;
+        case gpu::xetla::fence_scope::tile:
+            return __ESIMD_NS::fence_scope::tile;
+        case gpu::xetla::fence_scope::gpu: //
+            return __ESIMD_NS::fence_scope::gpu;
+        case gpu::xetla::fence_scope::gpus:
+            return __ESIMD_NS::fence_scope::gpus;
         case gpu::xetla::fence_scope::system:
-            return sycl::ext::intel::esimd::fence_scope::system;
+            return __ESIMD_NS::fence_scope::system;
         case gpu::xetla::fence_scope::sysacq:
-            return sycl::ext::intel::esimd::fence_scope::system_acquire;
+            return __ESIMD_NS::fence_scope::system_acquire;
+#else // legacy experimental api
+        case gpu::xetla::fence_scope::group:
+            return __ESIMD_ENS::lsc_scope::group;
+        case gpu::xetla::fence_scope::local:
+            return __ESIMD_ENS::lsc_scope::local;
+        case gpu::xetla::fence_scope::tile: //
+            return __ESIMD_ENS::lsc_scope::tile;
+        case gpu::xetla::fence_scope::gpu: //
+            return __ESIMD_ENS::lsc_scope::gpu;
+        case gpu::xetla::fence_scope::gpus: //
+            return __ESIMD_ENS::lsc_scope::gpus;
+        case gpu::xetla::fence_scope::system:
+            return __ESIMD_ENS::lsc_scope::system;
+        case gpu::xetla::fence_scope::sysacq:
+            return __ESIMD_ENS::lsc_scope::sysacq;
+#endif
     }
 }
 
@@ -630,9 +670,15 @@ template <memory_kind Kind = memory_kind::untyped_global,
         fence_op FenceOp = fence_op::none,
         fence_scope Scope = fence_scope::group, int N = 16>
 __XETLA_API void xetla_fence() {
-    sycl::ext::intel::esimd::fence<gpu::xetla::detail::get_memory_kind(Kind),
+#if __INTEL_LLVM_COMPILER >= 20240100
+    __ESIMD_NS::fence<gpu::xetla::detail::get_memory_kind(Kind),
             gpu::xetla::detail::get_fence_op(FenceOp),
             gpu::xetla::detail::get_fence_scope(Scope)>();
+#else
+    __ESIMD_ENS::lsc_fence<gpu::xetla::detail::get_memory_kind(Kind),
+            gpu::xetla::detail::get_fence_op(FenceOp),
+            gpu::xetla::detail::get_fence_scope(Scope), N>(xetla_mask<N>(1));
+#endif
 }
 
 /// @} xetla_core_memory
diff --git a/tests/utils/profiling.hpp b/tests/utils/profiling.hpp
index 60ef682ab..9ec7bbee6 100644
--- a/tests/utils/profiling.hpp
+++ b/tests/utils/profiling.hpp
@@ -103,7 +103,7 @@ class profiling_helper {
         //time mean square error
         for (int i = 1; i < iter; i++) {
 #if (__LIBSYCL_MAJOR_VERSION >= 7) && (__LIBSYCL_MINOR_VERSION >= 1)
-            stat.variance += sycl::pow(time[i] - stat.mean, (double)2);
+            stat.variance += sycl::pow(time[i] - stat.mean, 2.);
 #else
             stat.variance += pow(time[i] - stat.mean, 2);
 #endif

From e0a304d9ace39524f82356e9f7276a67fe502f59 Mon Sep 17 00:00:00 2001
From: "Ding, Yi1" <yi1.ding@intel.com>
Date: Wed, 6 Mar 2024 04:05:32 +0000
Subject: [PATCH 2/2] fix more warnings on 2024.1

---
 include/common/core/base_types.hpp            |  6 +++++
 include/common/core/math_general.hpp          |  8 +++++++
 tests/integration/data_transformer/common.hpp | 22 ++++++++++++++-----
 3 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/include/common/core/base_types.hpp b/include/common/core/base_types.hpp
index ed3bdae09..fc7ff6fe7 100644
--- a/include/common/core/base_types.hpp
+++ b/include/common/core/base_types.hpp
@@ -68,6 +68,8 @@ struct is_internal_type {
     static constexpr bool value = std::is_same<remove_const_t<T>, bf16>::value
             || std::is_same<remove_const_t<T>, tf32>::value;
 };
+template <typename T>
+inline constexpr bool is_internal_type_v = is_internal_type<T>::value;
 
 /// @brief Used to check if the type is floating_point.
 /// @tparam T is the data type
@@ -79,6 +81,8 @@ struct is_floating_point {
             || std::is_same<remove_const_t<T>, float>::value
             || std::is_same<remove_const_t<T>, double>::value;
 };
+template <typename T>
+inline constexpr bool is_floating_point_v = is_floating_point<T>::value;
 
 /// @brief Used to check if the type is floating_point.
 /// @tparam T is the data type
@@ -93,6 +97,8 @@ struct is_integral {
             || std::is_same<remove_const_t<T>, int64_t>::value
             || std::is_same<remove_const_t<T>, uint64_t>::value;
 };
+template <typename T>
+inline constexpr bool is_integral_v = is_integral<T>::value;
 
 /// @brief Set the native data type of T
 /// @tparam T is the data type
diff --git a/include/common/core/math_general.hpp b/include/common/core/math_general.hpp
index 5ae8e41c1..439b463b9 100644
--- a/include/common/core/math_general.hpp
+++ b/include/common/core/math_general.hpp
@@ -462,7 +462,11 @@ __XETLA_API xetla_vector<T, SZ> xetla_add_c(xetla_vector<T, SZ> src0,
     static_assert((std::is_same<remove_const_t<T>, uint32_t>::value),
             "For addc, only uint32_t is supported");
     xetla_vector<T, SZ> carry_tmp;
+#if __INTEL_LLVM_COMPILER >= 20240100
+    xetla_vector<T, SZ> out = __ESIMD_NS::addc(carry_tmp, src0, src1);
+#else
     xetla_vector<T, SZ> out = __ESIMD_ENS::addc(carry_tmp, src0, src1);
+#endif
     carry = carry_tmp;
     return out;
 }
@@ -480,7 +484,11 @@ __XETLA_API xetla_vector<T, SZ> xetla_add_c(xetla_vector<T, SZ> src0, T src1,
     static_assert((std::is_same<remove_const_t<T>, uint32_t>::value),
             "For addc, only uint32_t is supported");
     xetla_vector<T, SZ> carry_tmp;
+#if __INTEL_LLVM_COMPILER >= 20240100
+    xetla_vector<T, SZ> out = __ESIMD_NS::addc(carry_tmp, src0, src1);
+#else
     xetla_vector<T, SZ> out = __ESIMD_ENS::addc(carry_tmp, src0, src1);
+#endif
     carry = carry_tmp;
     return out;
 }
diff --git a/tests/integration/data_transformer/common.hpp b/tests/integration/data_transformer/common.hpp
index a1789bffd..54c0cebe8 100644
--- a/tests/integration/data_transformer/common.hpp
+++ b/tests/integration/data_transformer/common.hpp
@@ -15,12 +15,24 @@
 *******************************************************************************/
 #pragma once
 
-#include <utils/common.hpp>
 #include "xetla.hpp"
+#include <utils/common.hpp>
 
 using namespace gpu::xetla;
 using namespace cl::sycl;
 
+namespace {
+// abs for floating point types is non-standard and has been deprecated.
+// Please use fabs instead. [-Wdeprecated-declarations]
+template <typename T>
+inline T _abs(const T &v) {
+    if constexpr (is_floating_point_v<T>)
+        return fabs(v);
+    else
+        return abs(v);
+};
+} // namespace
+
 template <typename data_type_in, typename data_type_out, typename data_type_acc>
 int data_transformer_result_validate(data_type_in *in_device,
         data_type_out *out_device, size_t mat_m, size_t mat_n,
@@ -42,8 +54,8 @@ int data_transformer_result_validate(data_type_in *in_device,
         for (uint32_t j = 0; j < mat_n; j++) {
             int idx = i * mat_n + j;
 
-            cpu_max = (cpu_max > abs(in[idx])) ? cpu_max
-                                               : abs((data_type_acc)in[idx]);
+            cpu_max = (cpu_max > _abs(in[idx])) ? cpu_max
+                                                : _abs((data_type_acc)in[idx]);
 
             res = out[idx];
 
@@ -56,7 +68,7 @@ int data_transformer_result_validate(data_type_in *in_device,
                         : (data_type_out)(in[j * mat_m + i]);
             }
 
-            if (abs(res - ref) > abs(0.01 * res)) {
+            if (_abs(res - ref) > _abs(0.01 * res)) {
                 std::cout << "i: " << i << " j: " << j << " idx: " << idx
                           << " in: " << in[idx] << " cpu: " << ref
                           << " gpu: " << res << std::endl;
@@ -69,7 +81,7 @@ int data_transformer_result_validate(data_type_in *in_device,
     cpu_max = cpu_max * scale[0];
 
     if (need_fp8_op) {
-        if (abs(cpu_max - amax_ptr[0]) > abs(0.01 * cpu_max)) {
+        if (_abs(cpu_max - amax_ptr[0]) > _abs(0.01 * cpu_max)) {
             std::cout << "cpu_max: " << cpu_max << " gpu_max: " << amax_ptr[0]
                       << std::endl;
             return 1;