From 95bd50ff0388ce79657b381480aecb0fa1968766 Mon Sep 17 00:00:00 2001 From: "Ding, Yi1" Date: Thu, 29 Feb 2024 04:14:51 +0000 Subject: [PATCH 1/2] 2024.0 compatibility --- include/common/core/common.hpp | 5 +- include/common/core/memory.hpp | 84 ++++++++++++++++++++++++++-------- tests/utils/profiling.hpp | 2 +- 3 files changed, 69 insertions(+), 22 deletions(-) diff --git a/include/common/core/common.hpp b/include/common/core/common.hpp index 3a63ec898..f27c4fcfc 100644 --- a/include/common/core/common.hpp +++ b/include/common/core/common.hpp @@ -112,8 +112,9 @@ enum class data_size : uint8_t { /// The specific LSC shared function to fence with xetla_fence enum class memory_kind : uint8_t { untyped_global = 0, /// untyped global memory - typed_global = 1, /// typed global memory - shared_local = 2, /// shared local memory + // "1" reserved for low-priority untyped global memory + typed_global = 2, /// typed global memory + shared_local = 3, /// shared local memory }; /// The xetla_fence operation to apply to caches diff --git a/include/common/core/memory.hpp b/include/common/core/memory.hpp index 5012896a6..37ad98207 100644 --- a/include/common/core/memory.hpp +++ b/include/common/core/memory.hpp @@ -72,49 +72,89 @@ constexpr __ESIMD_ENS::lsc_data_size get_data_size(gpu::xetla::data_size ds) { /// @brief lookup table for memory kind. /// /// -constexpr sycl::ext::intel::esimd::memory_kind get_memory_kind( - gpu::xetla::memory_kind mk) { +constexpr auto get_memory_kind(gpu::xetla::memory_kind mk) { switch (mk) { +#if __INTEL_LLVM_COMPILER >= 20240100 case gpu::xetla::memory_kind::untyped_global: - return sycl::ext::intel::esimd::memory_kind::image; + return __ESIMD_NS::memory_kind::global; case gpu::xetla::memory_kind::typed_global: - return sycl::ext::intel::esimd::memory_kind::global; + return __ESIMD_NS::memory_kind::image; case gpu::xetla::memory_kind::shared_local: - return sycl::ext::intel::esimd::memory_kind::local; + return __ESIMD_NS::memory_kind::local; +#else // legacy experimental api + case gpu::xetla::memory_kind::untyped_global: + return __ESIMD_ENS::lsc_memory_kind::untyped_global; + case gpu::xetla::memory_kind::typed_global: + return __ESIMD_ENS::lsc_memory_kind::typed_global; + case gpu::xetla::memory_kind::shared_local: + return __ESIMD_ENS::lsc_memory_kind::shared_local; +#endif } } /// @brief lookup table for fence op. /// /// -constexpr sycl::ext::intel::esimd::fence_flush_op get_fence_op(gpu::xetla::fence_op fo) { +constexpr auto get_fence_op(gpu::xetla::fence_op fo) { switch (fo) { - case gpu::xetla::fence_op::none: return sycl::ext::intel::esimd::fence_flush_op::none; +#if __INTEL_LLVM_COMPILER >= 20240100 + case gpu::xetla::fence_op::none: + return __ESIMD_NS::fence_flush_op::none; + case gpu::xetla::fence_op::evict: + return __ESIMD_NS::fence_flush_op::evict; + case gpu::xetla::fence_op::invalidate: + return __ESIMD_NS::fence_flush_op::invalidate; + case gpu::xetla::fence_op::clean: + return __ESIMD_NS::fence_flush_op::clean; +#else // legacy experimental api + case gpu::xetla::fence_op::none: // + return __ESIMD_ENS::lsc_fence_op::none; case gpu::xetla::fence_op::evict: - return sycl::ext::intel::esimd::fence_flush_op::evict; + return __ESIMD_ENS::lsc_fence_op::evict; case gpu::xetla::fence_op::invalidate: - return sycl::ext::intel::esimd::fence_flush_op::invalidate; + return __ESIMD_ENS::lsc_fence_op::invalidate; case gpu::xetla::fence_op::clean: - return sycl::ext::intel::esimd::fence_flush_op::clean; + return __ESIMD_ENS::lsc_fence_op::clean; +#endif } } /// @brief lookup table for fence scope. /// /// -constexpr sycl::ext::intel::esimd::fence_scope get_fence_scope(gpu::xetla::fence_scope fs) { +constexpr auto get_fence_scope(gpu::xetla::fence_scope fs) { switch (fs) { +#if __INTEL_LLVM_COMPILER >= 20240100 case gpu::xetla::fence_scope::group: - return sycl::ext::intel::esimd::fence_scope::group; + return __ESIMD_NS::fence_scope::group; case gpu::xetla::fence_scope::local: - return sycl::ext::intel::esimd::fence_scope::local; - case gpu::xetla::fence_scope::tile: return sycl::ext::intel::esimd::fence_scope::tile; - case gpu::xetla::fence_scope::gpu: return sycl::ext::intel::esimd::fence_scope::gpu; - case gpu::xetla::fence_scope::gpus: return sycl::ext::intel::esimd::fence_scope::gpus; + return __ESIMD_NS::fence_scope::local; + case gpu::xetla::fence_scope::tile: + return __ESIMD_NS::fence_scope::tile; + case gpu::xetla::fence_scope::gpu: // + return __ESIMD_NS::fence_scope::gpu; + case gpu::xetla::fence_scope::gpus: + return __ESIMD_NS::fence_scope::gpus; case gpu::xetla::fence_scope::system: - return sycl::ext::intel::esimd::fence_scope::system; + return __ESIMD_NS::fence_scope::system; case gpu::xetla::fence_scope::sysacq: - return sycl::ext::intel::esimd::fence_scope::system_acquire; + return __ESIMD_NS::fence_scope::system_acquire; +#else // legacy experimental api + case gpu::xetla::fence_scope::group: + return __ESIMD_ENS::lsc_scope::group; + case gpu::xetla::fence_scope::local: + return __ESIMD_ENS::lsc_scope::local; + case gpu::xetla::fence_scope::tile: // + return __ESIMD_ENS::lsc_scope::tile; + case gpu::xetla::fence_scope::gpu: // + return __ESIMD_ENS::lsc_scope::gpu; + case gpu::xetla::fence_scope::gpus: // + return __ESIMD_ENS::lsc_scope::gpus; + case gpu::xetla::fence_scope::system: + return __ESIMD_ENS::lsc_scope::system; + case gpu::xetla::fence_scope::sysacq: + return __ESIMD_ENS::lsc_scope::sysacq; +#endif } } @@ -630,9 +670,15 @@ template __XETLA_API void xetla_fence() { - sycl::ext::intel::esimd::fence= 20240100 + __ESIMD_NS::fence(); +#else + __ESIMD_ENS::lsc_fence(xetla_mask(1)); +#endif } /// @} xetla_core_memory diff --git a/tests/utils/profiling.hpp b/tests/utils/profiling.hpp index 60ef682ab..9ec7bbee6 100644 --- a/tests/utils/profiling.hpp +++ b/tests/utils/profiling.hpp @@ -103,7 +103,7 @@ class profiling_helper { //time mean square error for (int i = 1; i < iter; i++) { #if (__LIBSYCL_MAJOR_VERSION >= 7) && (__LIBSYCL_MINOR_VERSION >= 1) - stat.variance += sycl::pow(time[i] - stat.mean, (double)2); + stat.variance += sycl::pow(time[i] - stat.mean, 2.); #else stat.variance += pow(time[i] - stat.mean, 2); #endif From e0a304d9ace39524f82356e9f7276a67fe502f59 Mon Sep 17 00:00:00 2001 From: "Ding, Yi1" Date: Wed, 6 Mar 2024 04:05:32 +0000 Subject: [PATCH 2/2] fix more warnings on 2024.1 --- include/common/core/base_types.hpp | 6 +++++ include/common/core/math_general.hpp | 8 +++++++ tests/integration/data_transformer/common.hpp | 22 ++++++++++++++----- 3 files changed, 31 insertions(+), 5 deletions(-) diff --git a/include/common/core/base_types.hpp b/include/common/core/base_types.hpp index ed3bdae09..fc7ff6fe7 100644 --- a/include/common/core/base_types.hpp +++ b/include/common/core/base_types.hpp @@ -68,6 +68,8 @@ struct is_internal_type { static constexpr bool value = std::is_same, bf16>::value || std::is_same, tf32>::value; }; +template +inline constexpr bool is_internal_type_v = is_internal_type::value; /// @brief Used to check if the type is floating_point. /// @tparam T is the data type @@ -79,6 +81,8 @@ struct is_floating_point { || std::is_same, float>::value || std::is_same, double>::value; }; +template +inline constexpr bool is_floating_point_v = is_floating_point::value; /// @brief Used to check if the type is floating_point. /// @tparam T is the data type @@ -93,6 +97,8 @@ struct is_integral { || std::is_same, int64_t>::value || std::is_same, uint64_t>::value; }; +template +inline constexpr bool is_integral_v = is_integral::value; /// @brief Set the native data type of T /// @tparam T is the data type diff --git a/include/common/core/math_general.hpp b/include/common/core/math_general.hpp index 5ae8e41c1..439b463b9 100644 --- a/include/common/core/math_general.hpp +++ b/include/common/core/math_general.hpp @@ -462,7 +462,11 @@ __XETLA_API xetla_vector xetla_add_c(xetla_vector src0, static_assert((std::is_same, uint32_t>::value), "For addc, only uint32_t is supported"); xetla_vector carry_tmp; +#if __INTEL_LLVM_COMPILER >= 20240100 + xetla_vector out = __ESIMD_NS::addc(carry_tmp, src0, src1); +#else xetla_vector out = __ESIMD_ENS::addc(carry_tmp, src0, src1); +#endif carry = carry_tmp; return out; } @@ -480,7 +484,11 @@ __XETLA_API xetla_vector xetla_add_c(xetla_vector src0, T src1, static_assert((std::is_same, uint32_t>::value), "For addc, only uint32_t is supported"); xetla_vector carry_tmp; +#if __INTEL_LLVM_COMPILER >= 20240100 + xetla_vector out = __ESIMD_NS::addc(carry_tmp, src0, src1); +#else xetla_vector out = __ESIMD_ENS::addc(carry_tmp, src0, src1); +#endif carry = carry_tmp; return out; } diff --git a/tests/integration/data_transformer/common.hpp b/tests/integration/data_transformer/common.hpp index a1789bffd..54c0cebe8 100644 --- a/tests/integration/data_transformer/common.hpp +++ b/tests/integration/data_transformer/common.hpp @@ -15,12 +15,24 @@ *******************************************************************************/ #pragma once -#include #include "xetla.hpp" +#include using namespace gpu::xetla; using namespace cl::sycl; +namespace { +// abs for floating point types is non-standard and has been deprecated. +// Please use fabs instead. [-Wdeprecated-declarations] +template +inline T _abs(const T &v) { + if constexpr (is_floating_point_v) + return fabs(v); + else + return abs(v); +}; +} // namespace + template int data_transformer_result_validate(data_type_in *in_device, data_type_out *out_device, size_t mat_m, size_t mat_n, @@ -42,8 +54,8 @@ int data_transformer_result_validate(data_type_in *in_device, for (uint32_t j = 0; j < mat_n; j++) { int idx = i * mat_n + j; - cpu_max = (cpu_max > abs(in[idx])) ? cpu_max - : abs((data_type_acc)in[idx]); + cpu_max = (cpu_max > _abs(in[idx])) ? cpu_max + : _abs((data_type_acc)in[idx]); res = out[idx]; @@ -56,7 +68,7 @@ int data_transformer_result_validate(data_type_in *in_device, : (data_type_out)(in[j * mat_m + i]); } - if (abs(res - ref) > abs(0.01 * res)) { + if (_abs(res - ref) > _abs(0.01 * res)) { std::cout << "i: " << i << " j: " << j << " idx: " << idx << " in: " << in[idx] << " cpu: " << ref << " gpu: " << res << std::endl; @@ -69,7 +81,7 @@ int data_transformer_result_validate(data_type_in *in_device, cpu_max = cpu_max * scale[0]; if (need_fp8_op) { - if (abs(cpu_max - amax_ptr[0]) > abs(0.01 * cpu_max)) { + if (_abs(cpu_max - amax_ptr[0]) > _abs(0.01 * cpu_max)) { std::cout << "cpu_max: " << cpu_max << " gpu_max: " << amax_ptr[0] << std::endl; return 1;