Skip to content
This repository was archived by the owner on Aug 30, 2024. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions include/common/core/base_types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ struct is_internal_type {
static constexpr bool value = std::is_same<remove_const_t<T>, bf16>::value
|| std::is_same<remove_const_t<T>, tf32>::value;
};
template <typename T>
inline constexpr bool is_internal_type_v = is_internal_type<T>::value;

/// @brief Used to check if the type is floating_point.
/// @tparam T is the data type
Expand All @@ -79,6 +81,8 @@ struct is_floating_point {
|| std::is_same<remove_const_t<T>, float>::value
|| std::is_same<remove_const_t<T>, double>::value;
};
template <typename T>
inline constexpr bool is_floating_point_v = is_floating_point<T>::value;

/// @brief Used to check if the type is floating_point.
/// @tparam T is the data type
Expand All @@ -93,6 +97,8 @@ struct is_integral {
|| std::is_same<remove_const_t<T>, int64_t>::value
|| std::is_same<remove_const_t<T>, uint64_t>::value;
};
template <typename T>
inline constexpr bool is_integral_v = is_integral<T>::value;

/// @brief Set the native data type of T
/// @tparam T is the data type
Expand Down
5 changes: 3 additions & 2 deletions include/common/core/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,8 +112,9 @@ enum class data_size : uint8_t {
/// The specific LSC shared function to fence with xetla_fence
enum class memory_kind : uint8_t {
untyped_global = 0, /// untyped global memory
typed_global = 1, /// typed global memory
shared_local = 2, /// shared local memory
// "1" reserved for low-priority untyped global memory
typed_global = 2, /// typed global memory
shared_local = 3, /// shared local memory
};

/// The xetla_fence operation to apply to caches
Expand Down
8 changes: 8 additions & 0 deletions include/common/core/math_general.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -462,7 +462,11 @@ __XETLA_API xetla_vector<T, SZ> xetla_add_c(xetla_vector<T, SZ> src0,
static_assert((std::is_same<remove_const_t<T>, uint32_t>::value),
"For addc, only uint32_t is supported");
xetla_vector<T, SZ> carry_tmp;
#if __INTEL_LLVM_COMPILER >= 20240100
xetla_vector<T, SZ> out = __ESIMD_NS::addc(carry_tmp, src0, src1);
#else
xetla_vector<T, SZ> out = __ESIMD_ENS::addc(carry_tmp, src0, src1);
#endif
carry = carry_tmp;
return out;
}
Expand All @@ -480,7 +484,11 @@ __XETLA_API xetla_vector<T, SZ> xetla_add_c(xetla_vector<T, SZ> src0, T src1,
static_assert((std::is_same<remove_const_t<T>, uint32_t>::value),
"For addc, only uint32_t is supported");
xetla_vector<T, SZ> carry_tmp;
#if __INTEL_LLVM_COMPILER >= 20240100
xetla_vector<T, SZ> out = __ESIMD_NS::addc(carry_tmp, src0, src1);
#else
xetla_vector<T, SZ> out = __ESIMD_ENS::addc(carry_tmp, src0, src1);
#endif
carry = carry_tmp;
return out;
}
Expand Down
84 changes: 65 additions & 19 deletions include/common/core/memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,49 +72,89 @@ constexpr __ESIMD_ENS::lsc_data_size get_data_size(gpu::xetla::data_size ds) {
/// @brief lookup table for memory kind.
///
///
constexpr sycl::ext::intel::esimd::memory_kind get_memory_kind(
gpu::xetla::memory_kind mk) {
constexpr auto get_memory_kind(gpu::xetla::memory_kind mk) {
switch (mk) {
#if __INTEL_LLVM_COMPILER >= 20240100
case gpu::xetla::memory_kind::untyped_global:
return sycl::ext::intel::esimd::memory_kind::image;
return __ESIMD_NS::memory_kind::global;
case gpu::xetla::memory_kind::typed_global:
return sycl::ext::intel::esimd::memory_kind::global;
return __ESIMD_NS::memory_kind::image;
case gpu::xetla::memory_kind::shared_local:
return sycl::ext::intel::esimd::memory_kind::local;
return __ESIMD_NS::memory_kind::local;
#else // legacy experimental api
case gpu::xetla::memory_kind::untyped_global:
return __ESIMD_ENS::lsc_memory_kind::untyped_global;
case gpu::xetla::memory_kind::typed_global:
return __ESIMD_ENS::lsc_memory_kind::typed_global;
case gpu::xetla::memory_kind::shared_local:
return __ESIMD_ENS::lsc_memory_kind::shared_local;
#endif
}
}

/// @brief lookup table for fence op.
///
///
constexpr sycl::ext::intel::esimd::fence_flush_op get_fence_op(gpu::xetla::fence_op fo) {
constexpr auto get_fence_op(gpu::xetla::fence_op fo) {
switch (fo) {
case gpu::xetla::fence_op::none: return sycl::ext::intel::esimd::fence_flush_op::none;
#if __INTEL_LLVM_COMPILER >= 20240100
case gpu::xetla::fence_op::none:
return __ESIMD_NS::fence_flush_op::none;
case gpu::xetla::fence_op::evict:
return __ESIMD_NS::fence_flush_op::evict;
case gpu::xetla::fence_op::invalidate:
return __ESIMD_NS::fence_flush_op::invalidate;
case gpu::xetla::fence_op::clean:
return __ESIMD_NS::fence_flush_op::clean;
#else // legacy experimental api
case gpu::xetla::fence_op::none: //
return __ESIMD_ENS::lsc_fence_op::none;
case gpu::xetla::fence_op::evict:
return sycl::ext::intel::esimd::fence_flush_op::evict;
return __ESIMD_ENS::lsc_fence_op::evict;
case gpu::xetla::fence_op::invalidate:
return sycl::ext::intel::esimd::fence_flush_op::invalidate;
return __ESIMD_ENS::lsc_fence_op::invalidate;
case gpu::xetla::fence_op::clean:
return sycl::ext::intel::esimd::fence_flush_op::clean;
return __ESIMD_ENS::lsc_fence_op::clean;
#endif
}
}

/// @brief lookup table for fence scope.
///
///
constexpr sycl::ext::intel::esimd::fence_scope get_fence_scope(gpu::xetla::fence_scope fs) {
constexpr auto get_fence_scope(gpu::xetla::fence_scope fs) {
switch (fs) {
#if __INTEL_LLVM_COMPILER >= 20240100
case gpu::xetla::fence_scope::group:
return sycl::ext::intel::esimd::fence_scope::group;
return __ESIMD_NS::fence_scope::group;
case gpu::xetla::fence_scope::local:
return sycl::ext::intel::esimd::fence_scope::local;
case gpu::xetla::fence_scope::tile: return sycl::ext::intel::esimd::fence_scope::tile;
case gpu::xetla::fence_scope::gpu: return sycl::ext::intel::esimd::fence_scope::gpu;
case gpu::xetla::fence_scope::gpus: return sycl::ext::intel::esimd::fence_scope::gpus;
return __ESIMD_NS::fence_scope::local;
case gpu::xetla::fence_scope::tile:
return __ESIMD_NS::fence_scope::tile;
case gpu::xetla::fence_scope::gpu: //
return __ESIMD_NS::fence_scope::gpu;
case gpu::xetla::fence_scope::gpus:
return __ESIMD_NS::fence_scope::gpus;
case gpu::xetla::fence_scope::system:
return sycl::ext::intel::esimd::fence_scope::system;
return __ESIMD_NS::fence_scope::system;
case gpu::xetla::fence_scope::sysacq:
return sycl::ext::intel::esimd::fence_scope::system_acquire;
return __ESIMD_NS::fence_scope::system_acquire;
#else // legacy experimental api
case gpu::xetla::fence_scope::group:
return __ESIMD_ENS::lsc_scope::group;
case gpu::xetla::fence_scope::local:
return __ESIMD_ENS::lsc_scope::local;
case gpu::xetla::fence_scope::tile: //
return __ESIMD_ENS::lsc_scope::tile;
case gpu::xetla::fence_scope::gpu: //
return __ESIMD_ENS::lsc_scope::gpu;
case gpu::xetla::fence_scope::gpus: //
return __ESIMD_ENS::lsc_scope::gpus;
case gpu::xetla::fence_scope::system:
return __ESIMD_ENS::lsc_scope::system;
case gpu::xetla::fence_scope::sysacq:
return __ESIMD_ENS::lsc_scope::sysacq;
#endif
}
}

Expand Down Expand Up @@ -630,9 +670,15 @@ template <memory_kind Kind = memory_kind::untyped_global,
fence_op FenceOp = fence_op::none,
fence_scope Scope = fence_scope::group, int N = 16>
__XETLA_API void xetla_fence() {
sycl::ext::intel::esimd::fence<gpu::xetla::detail::get_memory_kind(Kind),
#if __INTEL_LLVM_COMPILER >= 20240100
__ESIMD_NS::fence<gpu::xetla::detail::get_memory_kind(Kind),
gpu::xetla::detail::get_fence_op(FenceOp),
gpu::xetla::detail::get_fence_scope(Scope)>();
#else
__ESIMD_ENS::lsc_fence<gpu::xetla::detail::get_memory_kind(Kind),
gpu::xetla::detail::get_fence_op(FenceOp),
gpu::xetla::detail::get_fence_scope(Scope), N>(xetla_mask<N>(1));
#endif
}

/// @} xetla_core_memory
Expand Down
22 changes: 17 additions & 5 deletions tests/integration/data_transformer/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,24 @@
*******************************************************************************/
#pragma once

#include <utils/common.hpp>
#include "xetla.hpp"
#include <utils/common.hpp>

using namespace gpu::xetla;
using namespace cl::sycl;

namespace {
// abs for floating point types is non-standard and has been deprecated.
// Please use fabs instead. [-Wdeprecated-declarations]
template <typename T>
inline T _abs(const T &v) {
if constexpr (is_floating_point_v<T>)
return fabs(v);
else
return abs(v);
};
} // namespace

template <typename data_type_in, typename data_type_out, typename data_type_acc>
int data_transformer_result_validate(data_type_in *in_device,
data_type_out *out_device, size_t mat_m, size_t mat_n,
Expand All @@ -42,8 +54,8 @@ int data_transformer_result_validate(data_type_in *in_device,
for (uint32_t j = 0; j < mat_n; j++) {
int idx = i * mat_n + j;

cpu_max = (cpu_max > abs(in[idx])) ? cpu_max
: abs((data_type_acc)in[idx]);
cpu_max = (cpu_max > _abs(in[idx])) ? cpu_max
: _abs((data_type_acc)in[idx]);

res = out[idx];

Expand All @@ -56,7 +68,7 @@ int data_transformer_result_validate(data_type_in *in_device,
: (data_type_out)(in[j * mat_m + i]);
}

if (abs(res - ref) > abs(0.01 * res)) {
if (_abs(res - ref) > _abs(0.01 * res)) {
std::cout << "i: " << i << " j: " << j << " idx: " << idx
<< " in: " << in[idx] << " cpu: " << ref
<< " gpu: " << res << std::endl;
Expand All @@ -69,7 +81,7 @@ int data_transformer_result_validate(data_type_in *in_device,
cpu_max = cpu_max * scale[0];

if (need_fp8_op) {
if (abs(cpu_max - amax_ptr[0]) > abs(0.01 * cpu_max)) {
if (_abs(cpu_max - amax_ptr[0]) > _abs(0.01 * cpu_max)) {
std::cout << "cpu_max: " << cpu_max << " gpu_max: " << amax_ptr[0]
<< std::endl;
return 1;
Expand Down
2 changes: 1 addition & 1 deletion tests/utils/profiling.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ class profiling_helper {
//time mean square error
for (int i = 1; i < iter; i++) {
#if (__LIBSYCL_MAJOR_VERSION >= 7) && (__LIBSYCL_MINOR_VERSION >= 1)
stat.variance += sycl::pow(time[i] - stat.mean, (double)2);
stat.variance += sycl::pow(time[i] - stat.mean, 2.);
#else
stat.variance += pow(time[i] - stat.mean, 2);
#endif
Expand Down