From 0bb59521b385909d831d263396ba104b67b73e4b Mon Sep 17 00:00:00 2001 From: "Ahn, Dongkyun" Date: Mon, 25 Jul 2022 14:06:47 -0700 Subject: [PATCH 01/15] [SYCL][ESIMD][EMU] Enabling dpas with ESIMD_EMULATOR backend - +fix for 'saturate<>' for ESIMD_EMULATOR backend --- .../intel/experimental/esimd/detail/math_intrin.hpp | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp index 0ad3475156e6a..3f2b5c228f044 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp @@ -647,7 +647,8 @@ __esimd_dpas_inner(const __ESIMD_DNS::vector_type_t *src0, static_cast(reinterpret_cast(tmpUint) >> 16); } else retv[r * SIMDSize + n] = - __ESIMD_EMU_DNS::satur::saturate(simdAcc[n], sat1); + __ESIMD_EMU_DNS::satur::template saturate(simdAcc[n], + (int)sat1); } } // Repeat. @@ -663,14 +664,9 @@ inline __ESIMD_DNS::vector_type_t __esimd_dpas(__ESIMD_DNS::vector_type_t src0, __ESIMD_DNS::vector_type_t src1, __ESIMD_DNS::vector_type_t src2) { -#ifdef __SYCL_EXPLICIT_SIMD_PLUGIN__ return __esimd_dpas_inner( std::addressof(src0), src1, src2); -#else // __SYCL_EXPLICIT_SIMD_PLUGIN__ - __ESIMD_UNSUPPORTED_ON_HOST; - return __ESIMD_DNS::vector_type_t(); -#endif // __SYCL_EXPLICIT_SIMD_PLUGIN__ } template <__ESIMD_ENS::argument_type src1_precision, @@ -680,14 +676,9 @@ template <__ESIMD_ENS::argument_type src1_precision, inline __ESIMD_DNS::vector_type_t __esimd_dpas2(__ESIMD_DNS::vector_type_t src1, __ESIMD_DNS::vector_type_t src2) { -#ifdef __SYCL_EXPLICIT_SIMD_PLUGIN__ return __esimd_dpas_inner(nullptr, src1, src2); -#else // __SYCL_EXPLICIT_SIMD_PLUGIN__ - __ESIMD_UNSUPPORTED_ON_HOST; - return __ESIMD_DNS::vector_type_t(); -#endif // __SYCL_EXPLICIT_SIMD_PLUGIN__ } template <__ESIMD_ENS::argument_type src1_precision, From b28d750439b973d711272e3997ec32bf8acd4c2e Mon Sep 17 00:00:00 2001 From: "Ahn, Dongkyun" Date: Tue, 26 Jul 2022 10:28:30 -0700 Subject: [PATCH 02/15] Working version for dpas.cpp - First argument for '__esimd_dpas_inner' is changed from 'const __ESIMD_DNS::vector_type_t' to 'const void *' - This is because 'std::addressof()' from caller generates 'short * __attribute__((ext_vector_type(16)))' that cannot be converted into 'vector_type_t<>' while 'dpas.cpp' test is compiled. --- .../ext/intel/experimental/esimd/detail/math_intrin.hpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp index 3f2b5c228f044..d3ae871520914 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp @@ -457,12 +457,13 @@ __esimd_dpas_bits_precision(__ESIMD_ENS::argument_type precisionType) { : 1; } +// TODO : 'src0' argument type other than 'void*'? template <__ESIMD_ENS::argument_type src1_precision, __ESIMD_ENS::argument_type src2_precision, int systolic_depth, int repeat_count, typename RT, typename T1, typename T2, __ESIMD_NS::uint SZ, __ESIMD_NS::uint N1, __ESIMD_NS::uint N2> inline __ESIMD_DNS::vector_type_t -__esimd_dpas_inner(const __ESIMD_DNS::vector_type_t *src0, +__esimd_dpas_inner(const void *src0, const __ESIMD_DNS::vector_type_t &src1, const __ESIMD_DNS::vector_type_t &src2) { __ESIMD_DNS::vector_type_t retv; @@ -575,7 +576,9 @@ __esimd_dpas_inner(const __ESIMD_DNS::vector_type_t *src0, for (uint n = 0; n < SIMDSize; n++) { if (src0 != nullptr) { - auto src0El = src0[0][r * SIMDSize + n]; + __ESIMD_DNS::vector_type_t *src0Casted = + (__ESIMD_DNS::vector_type_t *)(src0); + auto src0El = src0Casted[0][r * SIMDSize + n]; if (pvcBfDest) { const auto tmp = (uint32_t)(src0El) << 16; @@ -666,7 +669,7 @@ __esimd_dpas(__ESIMD_DNS::vector_type_t src0, __ESIMD_DNS::vector_type_t src2) { return __esimd_dpas_inner( - std::addressof(src0), src1, src2); + (void *)std::addressof(src0), src1, src2); } template <__ESIMD_ENS::argument_type src1_precision, From 64d375bf0bda4455bc6c37e75ff20b21a5dd6a4a Mon Sep 17 00:00:00 2001 From: "Ahn, Dongkyun" Date: Tue, 26 Jul 2022 13:54:41 -0700 Subject: [PATCH 03/15] Separating definitions of __esimd_dpas* per __SYCL_DEVICE_ONLY__ - As template and runtime arguments are different (e.g. 'dpas_info' is not used in ESIMD_EMULATOR backend (hostmode)) --- .../sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp index d3ae871520914..8f02c19c2130b 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp @@ -110,6 +110,8 @@ __ESIMD_INTRIN __ESIMD_raw_vec_t(T, N) } #endif // __SYCL_DEVICE_ONLY__ +#ifdef __SYCL_DEVICE_ONLY__ + template SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __ESIMD_DNS::vector_type_t @@ -135,7 +137,7 @@ SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __ESIMD_DNS::vector_type_t __esimd_dpasw2(__ESIMD_DNS::vector_type_t src1, __ESIMD_DNS::vector_type_t src2, int dpas_info); -#ifndef __SYCL_DEVICE_ONLY__ +#else // __SYCL_DEVICE_ONLY__ template __ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ) From fc3bd073da5b1f6c93a3581cc55c726f058c5d19 Mon Sep 17 00:00:00 2001 From: "Ahn, Dongkyun" Date: Thu, 28 Jul 2022 11:15:59 -0700 Subject: [PATCH 04/15] Unifying __esimd_dpas* functions - Template arguments for dpas operations (repeat count, systolic depth, src1_precision, src2_precision) are converted into runtime arguments - __esimd_dpas* functions have same signature regardless of __SYCL_DEVICE_ONLY__ --- .../experimental/esimd/detail/math_intrin.hpp | 233 ++++++++++-------- .../ext/intel/experimental/esimd/math.hpp | 51 +--- 2 files changed, 142 insertions(+), 142 deletions(-) diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp index 8f02c19c2130b..a118237e88501 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp @@ -12,6 +12,42 @@ /// @cond ESIMD_DETAIL +#define ARG_UNUSED(x) (void)x + +inline int __esimd_encode_dpas_info(int repeat_count, int systolic_depth, + __ESIMD_ENS::argument_type src1_precision, + __ESIMD_ENS::argument_type src2_precision) { + return (repeat_count << 24) + (systolic_depth << 16) + + (((int)src2_precision + 1) << 8) + ((int)src1_precision + 1); +} + +inline int __esimd_decode_repeat_count(int dpas_info) { + return (dpas_info >> 24); +} + +inline int __esimd_decode_systolic_depth(int dpas_info) { + return (dpas_info >> 16); +} + +inline __ESIMD_ENS::argument_type +__esimd_decode_src1_precision(const int dpas_info) { + int decoded = dpas_info; + decoded &= 0xFF; + decoded -= 1; + + return (__ESIMD_ENS::argument_type)decoded; +} + +inline __ESIMD_ENS::argument_type +__esimd_decode_src2_precision(const int dpas_info) { + int decoded = dpas_info; + decoded >>= 8; + decoded &= 0xFF; + decoded -= 1; + + return (__ESIMD_ENS::argument_type)decoded; +} + #include #define __ESIMD_raw_vec_t(T, SZ) \ @@ -110,34 +146,7 @@ __ESIMD_INTRIN __ESIMD_raw_vec_t(T, N) } #endif // __SYCL_DEVICE_ONLY__ -#ifdef __SYCL_DEVICE_ONLY__ - -template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __ESIMD_DNS::vector_type_t -__esimd_dpas(__ESIMD_DNS::vector_type_t src0, - __ESIMD_DNS::vector_type_t src1, - __ESIMD_DNS::vector_type_t src2, int src1_precision, - int src2_precision, int depth, int repeat, int sign_res, - int sign_acc); - -template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __ESIMD_DNS::vector_type_t -__esimd_dpas2(__ESIMD_DNS::vector_type_t src1, - __ESIMD_DNS::vector_type_t src2, int dpas_info); - -template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __ESIMD_DNS::vector_type_t -__esimd_dpasw(__ESIMD_DNS::vector_type_t src0, - __ESIMD_DNS::vector_type_t src1, - __ESIMD_DNS::vector_type_t src2, int dpas_info); - -template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __ESIMD_DNS::vector_type_t -__esimd_dpasw2(__ESIMD_DNS::vector_type_t src1, - __ESIMD_DNS::vector_type_t src2, int dpas_info); - -#else // __SYCL_DEVICE_ONLY__ +#ifndef __SYCL_DEVICE_ONLY__ template __ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ) @@ -441,7 +450,7 @@ __ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ) return retv; } -inline constexpr __ESIMD_NS::uint +inline __ESIMD_NS::uint __esimd_dpas_bits_precision(__ESIMD_ENS::argument_type precisionType) { return precisionType == __ESIMD_ENS::argument_type::TF32 ? 32 : precisionType == __ESIMD_ENS::argument_type::BF16 || @@ -460,14 +469,15 @@ __esimd_dpas_bits_precision(__ESIMD_ENS::argument_type precisionType) { } // TODO : 'src0' argument type other than 'void*'? -template <__ESIMD_ENS::argument_type src1_precision, - __ESIMD_ENS::argument_type src2_precision, int systolic_depth, - int repeat_count, typename RT, typename T1, typename T2, - __ESIMD_NS::uint SZ, __ESIMD_NS::uint N1, __ESIMD_NS::uint N2> +template inline __ESIMD_DNS::vector_type_t __esimd_dpas_inner(const void *src0, const __ESIMD_DNS::vector_type_t &src1, - const __ESIMD_DNS::vector_type_t &src2) { + const __ESIMD_DNS::vector_type_t &src2, + __ESIMD_ENS::argument_type src1_precision, + __ESIMD_ENS::argument_type src2_precision, + int systolic_depth, int repeat_count) { __ESIMD_DNS::vector_type_t retv; __ESIMD_NS::uint sat1 = @@ -476,7 +486,7 @@ __esimd_dpas_inner(const void *src0, __ESIMD_EMU_DNS::SetSatur::value>::set(); - constexpr __ESIMD_NS::uint ops_per_chan = + __ESIMD_NS::uint ops_per_chan = src1_precision == __ESIMD_ENS::argument_type::BF16 || src1_precision == __ESIMD_ENS::argument_type::FP16 || src2_precision == __ESIMD_ENS::argument_type::BF16 || @@ -491,8 +501,8 @@ __esimd_dpas_inner(const void *src0, __ESIMD_NS::uint V = 0, U = 0, k = 0, temp = 0, src1_ops_per_dword = 0, p = 0; - constexpr auto src1_el_bits = __esimd_dpas_bits_precision(src1_precision); - constexpr auto src2_el_bits = __esimd_dpas_bits_precision(src2_precision); + auto src1_el_bits = __esimd_dpas_bits_precision(src1_precision); + auto src2_el_bits = __esimd_dpas_bits_precision(src2_precision); uint32_t src1_signed = src1_precision == __ESIMD_ENS::argument_type::S2 || @@ -516,54 +526,55 @@ __esimd_dpas_inner(const void *src0, constexpr size_t SIMDSize = 8; #endif - constexpr bool - pvcHfDest = isPvc && std::is_same::value, - pvcBfDest = isPvc && std::is_same::value, - pvcBfOrHfDest = pvcBfDest || pvcHfDest, + constexpr bool pvcHfDest = + isPvc && std::is_same::value, + pvcBfDest = isPvc && std::is_same::value, + pvcBfOrHfDest = pvcBfDest || pvcHfDest; - pvcBfDestChecks = pvcBfDest && - src1_precision == __ESIMD_ENS::argument_type::BF16 && - src2_precision == __ESIMD_ENS::argument_type::BF16, + bool pvcBfDestChecks = pvcBfDest && + src1_precision == __ESIMD_ENS::argument_type::BF16 && + src2_precision == __ESIMD_ENS::argument_type::BF16, - pvcHfDestChecks = - pvcHfDest && ((src1_precision == __ESIMD_ENS::argument_type::FP16 && - src2_precision == __ESIMD_ENS::argument_type::FP16) || - (src1_precision == __ESIMD_ENS::argument_type::BF16 && - src2_precision == __ESIMD_ENS::argument_type::BF16)), + pvcHfDestChecks = + pvcHfDest && ((src1_precision == __ESIMD_ENS::argument_type::FP16 && + src2_precision == __ESIMD_ENS::argument_type::FP16) || + (src1_precision == __ESIMD_ENS::argument_type::BF16 && + src2_precision == __ESIMD_ENS::argument_type::BF16)), - destTypeChk = - (!pvcBfOrHfDest && __ESIMD_EMU_DNS::is_fp_or_dword_type::value) || - (pvcBfOrHfDest && (pvcBfDestChecks || pvcHfDestChecks)), + destTypeChk = (!pvcBfOrHfDest && + __ESIMD_EMU_DNS::is_fp_or_dword_type::value) || + (pvcBfOrHfDest && (pvcBfDestChecks || pvcHfDestChecks)), - srcTypeChk = __ESIMD_EMU_DNS::is_dword_type::value && - __ESIMD_EMU_DNS::is_dword_type::value, + srcTypeChk = __ESIMD_EMU_DNS::is_dword_type::value && + __ESIMD_EMU_DNS::is_dword_type::value, - destSizeChk = SZ >= /*TODO: ==*/SIMDSize * repeat_count, + destSizeChk = SZ >= /*TODO: ==*/SIMDSize * repeat_count, - systolicDepthAndRepeatCountChk = - systolic_depth == 8 && repeat_count >= 1 && repeat_count <= 8, + systolicDepthAndRepeatCountChk = + systolic_depth == 8 && repeat_count >= 1 && repeat_count <= 8, - src1CountChk = - N1 == ((src1_el_bits * systolic_depth * ops_per_chan * SZ) / - (repeat_count * sizeof(T1) * 8)), - src2CountChk = - N2 >= ((src2_el_bits * systolic_depth * ops_per_chan * repeat_count) / - (sizeof(T2) * 8)) + src1CountChk = + N1 == ((src1_el_bits * systolic_depth * ops_per_chan * SZ) / + (repeat_count * sizeof(T1) * 8)), + src2CountChk = + N2 >= + ((src2_el_bits * systolic_depth * ops_per_chan * repeat_count) / + (sizeof(T2) * 8)) /*TODO: ==; fix PVCIGEMM24*/ ; if constexpr (!isPvc) static_assert(!pvcBfOrHfDest, "dpas: hfloat and bfloat16 destination " "element type is only supported on PVC."); - static_assert(destTypeChk, "dpas: unsupported dest and accumulator type."); - static_assert(srcTypeChk, "dpas: unsupported src element type."); - static_assert(destSizeChk, - "dpas: destination size must be SIMDSize x repeat_count."); - static_assert(systolicDepthAndRepeatCountChk, - "dpas: only systolic_depth = 8 and repeat_count of 1 to 8 are " - "supported."); - static_assert(src1CountChk, "dpas: invalid size for src1."); - static_assert(src2CountChk, "dpas: invalid size for src2."); + assert(destTypeChk && "dpas: unsupported dest and accumulator type."); + assert(srcTypeChk && "dpas: unsupported src element type."); + assert(destSizeChk && + "dpas: destination size must be SIMDSize x repeat_count."); + assert(systolicDepthAndRepeatCountChk && + "dpas: only systolic_depth = 8 and repeat_count of 1 to 8 are " + "supported."); + assert(src1CountChk && "dpas: invalid size for src1."); + assert(src2CountChk && "dpas: invalid size for src2."); using TmpAccEl = typename std::conditional< pvcBfOrHfDest, float, @@ -639,7 +650,7 @@ __esimd_dpas_inner(const void *src0, } // Systolic phase. for (uint n = 0; n < SIMDSize; n++) { - if constexpr (pvcBfDest) { + if (pvcBfDest) { // TODO: make abstraction, support saturation, review rounding algo for // corner cases. auto tmpFloat = simdAcc[n]; @@ -661,55 +672,73 @@ __esimd_dpas_inner(const void *src0, return retv; } -template <__ESIMD_ENS::argument_type src1_precision, - __ESIMD_ENS::argument_type src2_precision, int systolic_depth, - int repeat_count, typename T, typename T0, typename T1, typename T2, - int N, int N1, int N2> -inline __ESIMD_DNS::vector_type_t +#endif // #ifndef __SYCL_DEVICE_ONLY__ + +template +SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __ESIMD_DNS::vector_type_t __esimd_dpas(__ESIMD_DNS::vector_type_t src0, __ESIMD_DNS::vector_type_t src1, - __ESIMD_DNS::vector_type_t src2) { - return __esimd_dpas_inner( - (void *)std::addressof(src0), src1, src2); + __ESIMD_DNS::vector_type_t src2, int src1_precision, + int src2_precision, int depth, int repeat, int sign_res, + int sign_acc) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else // __SYCL_DEVICE_ONLY__ +{ + ARG_UNUSED(sign_res); + ARG_UNUSED(sign_acc); + return __esimd_dpas_inner( + (void *)std::addressof(src0), src1, src2, + (__ESIMD_ENS::argument_type)(src1_precision - 1), + (__ESIMD_ENS::argument_type)(src2_precision - 1), depth, repeat); } +#endif // __SYCL_DEVICE_ONLY__ -template <__ESIMD_ENS::argument_type src1_precision, - __ESIMD_ENS::argument_type src2_precision, int systolic_depth, - int repeat_count, typename T, typename T1, typename T2, int N, int N1, - int N2> +template inline __ESIMD_DNS::vector_type_t __esimd_dpas2(__ESIMD_DNS::vector_type_t src1, - __ESIMD_DNS::vector_type_t src2) { - return __esimd_dpas_inner(nullptr, src1, - src2); + __ESIMD_DNS::vector_type_t src2, int dpas_info) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else +{ + return __esimd_dpas_inner( + nullptr, src1, src2, __esimd_decode_src1_precision(dpas_info), + __esimd_decode_src2_precision(dpas_info), + __esimd_decode_systolic_depth(dpas_info), + __esimd_decode_repeat_count(dpas_info)); } +#endif -template <__ESIMD_ENS::argument_type src1_precision, - __ESIMD_ENS::argument_type src2_precision, int systolic_depth, - int repeat_count, typename T, typename T1, typename T2, int N, int N1, - int N2> +template inline __ESIMD_DNS::vector_type_t __esimd_dpasw(__ESIMD_DNS::vector_type_t src0, __ESIMD_DNS::vector_type_t src1, - __ESIMD_DNS::vector_type_t src2) { + __ESIMD_DNS::vector_type_t src2, int dpas_info) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else +{ + ARG_UNUSED(dpas_info); __ESIMD_UNSUPPORTED_ON_HOST; return __ESIMD_DNS::vector_type_t(); } +#endif -template <__ESIMD_ENS::argument_type src1_precision, - __ESIMD_ENS::argument_type src2_precision, int systolic_depth, - int repeat_count, typename T, typename T1, typename T2, int N, int N1, - int N2> +template inline __ESIMD_DNS::vector_type_t __esimd_dpasw2(__ESIMD_DNS::vector_type_t src1, - __ESIMD_DNS::vector_type_t src2) { + __ESIMD_DNS::vector_type_t src2, int dpas_info) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else +{ + ARG_UNUSED(dpas_info); __ESIMD_UNSUPPORTED_ON_HOST; return __ESIMD_DNS::vector_type_t(); } - -#endif // #ifdef __SYCL_DEVICE_ONLY__ +#endif #undef __ESIMD_raw_vec_t #undef __ESIMD_cpp_vec_t diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp index ee188fa671faf..a7c958cff2943 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp @@ -1843,7 +1843,6 @@ dpas(__ESIMD_NS::simd src0, __ESIMD_NS::simd src1, (sizeof(T2) * 8)), "invalid size for Src2"); -#if defined(__SYCL_DEVICE_ONLY__) constexpr int dst_signed = std::is_signed::value; constexpr int src0_signed = std::is_signed::value; __ESIMD_NS::simd result = __esimd_dpas( @@ -1851,13 +1850,6 @@ dpas(__ESIMD_NS::simd src0, __ESIMD_NS::simd src1, (int)src2_precision + 1, systolic_depth, repeat_count, dst_signed, src0_signed); -#else - __ESIMD_NS::simd result = - __esimd_dpas(src0.data(), src1.data(), - src2.data()); -#endif // __SYCL_DEVICE_ONLY__ - if constexpr (std::is_same_v) return result; else @@ -1939,17 +1931,10 @@ __ESIMD_API __ESIMD_NS::simd dpas(__ESIMD_NS::simd src1, (sizeof(T2) * 8)), "invalid size for Src2"); -#if defined(__SYCL_DEVICE_ONLY__) - int dpas_info = (repeat_count << 24) + (systolic_depth << 16) + - (((int)src2_precision + 1) << 8) + ((int)src1_precision + 1); - __ESIMD_NS::simd result = - __esimd_dpas2(src1.data(), src2.data(), dpas_info); -#else - __ESIMD_NS::simd result = - __esimd_dpas2(src1.data(), - src2.data()); -#endif // __SYCL_DEVICE_ONLY__ + __ESIMD_NS::simd result = __esimd_dpas2( + src1.data(), src2.data(), + __esimd_encode_dpas_info(repeat_count, systolic_depth, src1_precision, + src2_precision)); if constexpr (std::is_same_v) return result; @@ -2022,17 +2007,10 @@ dpasw(__ESIMD_NS::simd src0, __ESIMD_NS::simd src1, (sizeof(T2) * 8)), "invalid size for Src2"); -#if defined(__SYCL_DEVICE_ONLY__) - int dpas_info = (repeat_count << 24) + (systolic_depth << 16) + - (((int)src2_precision + 1) << 8) + ((int)src1_precision + 1); __ESIMD_NS::simd result = __esimd_dpasw( - src0.data(), src1.data(), src2.data(), dpas_info); -#else - __ESIMD_NS::simd result = - __esimd_dpasw( - src0.data(), src1.data(), src2.data()); -#endif // __SYCL_DEVICE_ONLY__ + src0.data(), src1.data(), src2.data(), + __esimd_encode_dpas_info(repeat_count, systolic_depth, src1_precision, + src2_precision)); if constexpr (std::is_same_v) return result; @@ -2103,17 +2081,10 @@ __ESIMD_API __ESIMD_NS::simd dpasw2(__ESIMD_NS::simd src1, (sizeof(T2) * 8)), "invalid size for Src2"); -#if defined(__SYCL_DEVICE_ONLY__) - int dpas_info = (repeat_count << 24) + (systolic_depth << 16) + - (((int)src2_precision + 1) << 8) + ((int)src1_precision + 1); - __ESIMD_NS::simd result = - __esimd_dpasw2(src1.data(), src2.data(), dpas_info); -#else - __ESIMD_NS::simd result = - __esimd_dpasw2(src1.data(), - src2.data()); -#endif // __SYCL_DEVICE_ONLY__ + __ESIMD_NS::simd result = __esimd_dpasw2( + src1.data(), src2.data(), + __esimd_encode_dpas_info(repeat_count, systolic_depth, src1_precision, + src2_precision)); if constexpr (std::is_same_v) return result; From a621dd8c08c60a0a5777c9036abc3ee5590c6214 Mon Sep 17 00:00:00 2001 From: "Ahn, Dongkyun" Date: Thu, 28 Jul 2022 15:32:15 -0700 Subject: [PATCH 05/15] Missing changes in unifying __esimd_dpas* functions --- .../ext/intel/experimental/esimd/detail/math_intrin.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp index a118237e88501..e2bc8947fea48 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp @@ -696,7 +696,7 @@ __esimd_dpas(__ESIMD_DNS::vector_type_t src0, #endif // __SYCL_DEVICE_ONLY__ template -inline __ESIMD_DNS::vector_type_t +SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __ESIMD_DNS::vector_type_t __esimd_dpas2(__ESIMD_DNS::vector_type_t src1, __ESIMD_DNS::vector_type_t src2, int dpas_info) #ifdef __SYCL_DEVICE_ONLY__ @@ -712,7 +712,7 @@ __esimd_dpas2(__ESIMD_DNS::vector_type_t src1, #endif template -inline __ESIMD_DNS::vector_type_t +SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __ESIMD_DNS::vector_type_t __esimd_dpasw(__ESIMD_DNS::vector_type_t src0, __ESIMD_DNS::vector_type_t src1, __ESIMD_DNS::vector_type_t src2, int dpas_info) @@ -727,7 +727,7 @@ __esimd_dpasw(__ESIMD_DNS::vector_type_t src0, #endif template -inline __ESIMD_DNS::vector_type_t +SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __ESIMD_DNS::vector_type_t __esimd_dpasw2(__ESIMD_DNS::vector_type_t src1, __ESIMD_DNS::vector_type_t src2, int dpas_info) #ifdef __SYCL_DEVICE_ONLY__ From b69842320c5983b84adabb5caf71af67404ea774 Mon Sep 17 00:00:00 2001 From: "Ahn, Dongkyun" Date: Thu, 28 Jul 2022 15:39:16 -0700 Subject: [PATCH 06/15] systolic_depth decoding bug fix --- .../sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp index e2bc8947fea48..c19f922906721 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp @@ -26,7 +26,7 @@ inline int __esimd_decode_repeat_count(int dpas_info) { } inline int __esimd_decode_systolic_depth(int dpas_info) { - return (dpas_info >> 16); + return ((dpas_info >> 16) & 0xFF); } inline __ESIMD_ENS::argument_type From a47893c7b99fcc3f0a2242becb6649436bec3372 Mon Sep 17 00:00:00 2001 From: "Ahn, Dongkyun" Date: Thu, 28 Jul 2022 16:49:45 -0700 Subject: [PATCH 07/15] Fix for passing dpas_test.cpp --- .../ext/intel/experimental/esimd/common.hpp | 28 +++++++++++++ .../experimental/esimd/detail/math_intrin.hpp | 42 ++----------------- .../ext/intel/experimental/esimd/math.hpp | 12 +++--- 3 files changed, 38 insertions(+), 44 deletions(-) diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp index 9b3fb1945c72b..d8e01b66f01fc 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp @@ -410,6 +410,34 @@ constexpr void check_lsc_cache_hint() { } } +int encode_dpas_info(int repeat_count, int systolic_depth, + argument_type src1_precision, + argument_type src2_precision) { + return (repeat_count << 24) + (systolic_depth << 16) + + (((int)src2_precision + 1) << 8) + ((int)src1_precision + 1); +} + +int decode_repeat_count(int dpas_info) { return (dpas_info >> 24); } + +int decode_systolic_depth(int dpas_info) { return ((dpas_info >> 16) & 0xFF); } + +argument_type decode_src1_precision(const int dpas_info) { + int decoded = dpas_info; + decoded &= 0xFF; + decoded -= 1; + + return (argument_type)decoded; +} + +argument_type decode_src2_precision(const int dpas_info) { + int decoded = dpas_info; + decoded >>= 8; + decoded &= 0xFF; + decoded -= 1; + + return (argument_type)decoded; +} + } // namespace detail /// Represents a split barrier action. diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp index c19f922906721..e2afd1264665b 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp @@ -14,40 +14,6 @@ #define ARG_UNUSED(x) (void)x -inline int __esimd_encode_dpas_info(int repeat_count, int systolic_depth, - __ESIMD_ENS::argument_type src1_precision, - __ESIMD_ENS::argument_type src2_precision) { - return (repeat_count << 24) + (systolic_depth << 16) + - (((int)src2_precision + 1) << 8) + ((int)src1_precision + 1); -} - -inline int __esimd_decode_repeat_count(int dpas_info) { - return (dpas_info >> 24); -} - -inline int __esimd_decode_systolic_depth(int dpas_info) { - return ((dpas_info >> 16) & 0xFF); -} - -inline __ESIMD_ENS::argument_type -__esimd_decode_src1_precision(const int dpas_info) { - int decoded = dpas_info; - decoded &= 0xFF; - decoded -= 1; - - return (__ESIMD_ENS::argument_type)decoded; -} - -inline __ESIMD_ENS::argument_type -__esimd_decode_src2_precision(const int dpas_info) { - int decoded = dpas_info; - decoded >>= 8; - decoded &= 0xFF; - decoded -= 1; - - return (__ESIMD_ENS::argument_type)decoded; -} - #include #define __ESIMD_raw_vec_t(T, SZ) \ @@ -704,10 +670,10 @@ __esimd_dpas2(__ESIMD_DNS::vector_type_t src1, #else { return __esimd_dpas_inner( - nullptr, src1, src2, __esimd_decode_src1_precision(dpas_info), - __esimd_decode_src2_precision(dpas_info), - __esimd_decode_systolic_depth(dpas_info), - __esimd_decode_repeat_count(dpas_info)); + nullptr, src1, src2, __ESIMD_EDNS::decode_src1_precision(dpas_info), + __ESIMD_EDNS::decode_src2_precision(dpas_info), + __ESIMD_EDNS::decode_systolic_depth(dpas_info), + __ESIMD_EDNS::decode_repeat_count(dpas_info)); } #endif diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp index a7c958cff2943..6e1c695d2e208 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp @@ -1933,8 +1933,8 @@ __ESIMD_API __ESIMD_NS::simd dpas(__ESIMD_NS::simd src1, __ESIMD_NS::simd result = __esimd_dpas2( src1.data(), src2.data(), - __esimd_encode_dpas_info(repeat_count, systolic_depth, src1_precision, - src2_precision)); + __ESIMD_EDNS::encode_dpas_info(repeat_count, systolic_depth, + src1_precision, src2_precision)); if constexpr (std::is_same_v) return result; @@ -2009,8 +2009,8 @@ dpasw(__ESIMD_NS::simd src0, __ESIMD_NS::simd src1, __ESIMD_NS::simd result = __esimd_dpasw( src0.data(), src1.data(), src2.data(), - __esimd_encode_dpas_info(repeat_count, systolic_depth, src1_precision, - src2_precision)); + __ESIMD_EDNS::encode_dpas_info(repeat_count, systolic_depth, + src1_precision, src2_precision)); if constexpr (std::is_same_v) return result; @@ -2083,8 +2083,8 @@ __ESIMD_API __ESIMD_NS::simd dpasw2(__ESIMD_NS::simd src1, __ESIMD_NS::simd result = __esimd_dpasw2( src1.data(), src2.data(), - __esimd_encode_dpas_info(repeat_count, systolic_depth, src1_precision, - src2_precision)); + __ESIMD_EDNS::encode_dpas_info(repeat_count, systolic_depth, + src1_precision, src2_precision)); if constexpr (std::is_same_v) return result; From 0e220e93532950d4c6540e34cc05af22abbbb5b6 Mon Sep 17 00:00:00 2001 From: "Ahn, Dongkyun" Date: Thu, 28 Jul 2022 17:26:20 -0700 Subject: [PATCH 08/15] Fixing 'odr' test failure --- sycl/include/sycl/ext/intel/experimental/esimd/common.hpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp index d8e01b66f01fc..6731f85d01971 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp @@ -410,6 +410,7 @@ constexpr void check_lsc_cache_hint() { } } +ESIMD_INLINE int encode_dpas_info(int repeat_count, int systolic_depth, argument_type src1_precision, argument_type src2_precision) { @@ -417,10 +418,13 @@ int encode_dpas_info(int repeat_count, int systolic_depth, (((int)src2_precision + 1) << 8) + ((int)src1_precision + 1); } +ESIMD_INLINE int decode_repeat_count(int dpas_info) { return (dpas_info >> 24); } +ESIMD_INLINE int decode_systolic_depth(int dpas_info) { return ((dpas_info >> 16) & 0xFF); } +ESIMD_INLINE argument_type decode_src1_precision(const int dpas_info) { int decoded = dpas_info; decoded &= 0xFF; @@ -429,6 +433,7 @@ argument_type decode_src1_precision(const int dpas_info) { return (argument_type)decoded; } +ESIMD_INLINE argument_type decode_src2_precision(const int dpas_info) { int decoded = dpas_info; decoded >>= 8; From 1960d87ddb57d378a242a6dd785265a0f54f0fa1 Mon Sep 17 00:00:00 2001 From: "Ahn, Dongkyun" Date: Fri, 29 Jul 2022 09:12:15 -0700 Subject: [PATCH 09/15] Reverting clang-formatting for '__esimd_dpas_inner' --- .../experimental/esimd/detail/math_intrin.hpp | 70 ++++++++++--------- 1 file changed, 36 insertions(+), 34 deletions(-) diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp index e2afd1264665b..631aecefadc5d 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp @@ -492,42 +492,44 @@ __esimd_dpas_inner(const void *src0, constexpr size_t SIMDSize = 8; #endif - constexpr bool pvcHfDest = - isPvc && std::is_same::value, - pvcBfDest = isPvc && std::is_same::value, - pvcBfOrHfDest = pvcBfDest || pvcHfDest; - - bool pvcBfDestChecks = pvcBfDest && - src1_precision == __ESIMD_ENS::argument_type::BF16 && - src2_precision == __ESIMD_ENS::argument_type::BF16, - - pvcHfDestChecks = - pvcHfDest && ((src1_precision == __ESIMD_ENS::argument_type::FP16 && - src2_precision == __ESIMD_ENS::argument_type::FP16) || - (src1_precision == __ESIMD_ENS::argument_type::BF16 && - src2_precision == __ESIMD_ENS::argument_type::BF16)), - - destTypeChk = (!pvcBfOrHfDest && - __ESIMD_EMU_DNS::is_fp_or_dword_type::value) || - (pvcBfOrHfDest && (pvcBfDestChecks || pvcHfDestChecks)), - - srcTypeChk = __ESIMD_EMU_DNS::is_dword_type::value && - __ESIMD_EMU_DNS::is_dword_type::value, - - destSizeChk = SZ >= /*TODO: ==*/SIMDSize * repeat_count, - - systolicDepthAndRepeatCountChk = - systolic_depth == 8 && repeat_count >= 1 && repeat_count <= 8, - - src1CountChk = - N1 == ((src1_el_bits * systolic_depth * ops_per_chan * SZ) / - (repeat_count * sizeof(T1) * 8)), - src2CountChk = - N2 >= - ((src2_el_bits * systolic_depth * ops_per_chan * repeat_count) / - (sizeof(T2) * 8)) + // clang-format off + constexpr bool + pvcHfDest = isPvc && std::is_same::value, + pvcBfDest = isPvc && std::is_same::value, + pvcBfOrHfDest = pvcBfDest || pvcHfDest; + + bool + pvcBfDestChecks = pvcBfDest && + src1_precision == __ESIMD_ENS::argument_type::BF16 && + src2_precision == __ESIMD_ENS::argument_type::BF16, + + pvcHfDestChecks = + pvcHfDest && ((src1_precision == __ESIMD_ENS::argument_type::FP16 && + src2_precision == __ESIMD_ENS::argument_type::FP16) || + (src1_precision == __ESIMD_ENS::argument_type::BF16 && + src2_precision == __ESIMD_ENS::argument_type::BF16)), + + destTypeChk = + (!pvcBfOrHfDest && __ESIMD_EMU_DNS::is_fp_or_dword_type::value) || + (pvcBfOrHfDest && (pvcBfDestChecks || pvcHfDestChecks)), + + srcTypeChk = __ESIMD_EMU_DNS::is_dword_type::value && + __ESIMD_EMU_DNS::is_dword_type::value, + + destSizeChk = SZ >= /*TODO: ==*/SIMDSize * repeat_count, + + systolicDepthAndRepeatCountChk = + systolic_depth == 8 && repeat_count >= 1 && repeat_count <= 8, + + src1CountChk = + N1 == ((src1_el_bits * systolic_depth * ops_per_chan * SZ) / + (repeat_count * sizeof(T1) * 8)), + src2CountChk = + N2 >= ((src2_el_bits * systolic_depth * ops_per_chan * repeat_count) / + (sizeof(T2) * 8)) /*TODO: ==; fix PVCIGEMM24*/ ; + // clang-format on if constexpr (!isPvc) static_assert(!pvcBfOrHfDest, "dpas: hfloat and bfloat16 destination " From 39078160dc07b745de910212e57944ff42497403 Mon Sep 17 00:00:00 2001 From: "Ahn, Dongkyun" Date: Fri, 29 Jul 2022 09:39:35 -0700 Subject: [PATCH 10/15] Reverting (void*) work-around --- .../ext/intel/experimental/esimd/detail/math_intrin.hpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp index 631aecefadc5d..f3f25e01ddfa5 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp @@ -434,11 +434,10 @@ __esimd_dpas_bits_precision(__ESIMD_ENS::argument_type precisionType) { : 1; } -// TODO : 'src0' argument type other than 'void*'? template inline __ESIMD_DNS::vector_type_t -__esimd_dpas_inner(const void *src0, +__esimd_dpas_inner(const __ESIMD_DNS::vector_type_t *src0, const __ESIMD_DNS::vector_type_t &src1, const __ESIMD_DNS::vector_type_t &src2, __ESIMD_ENS::argument_type src1_precision, @@ -557,9 +556,7 @@ __esimd_dpas_inner(const void *src0, for (uint n = 0; n < SIMDSize; n++) { if (src0 != nullptr) { - __ESIMD_DNS::vector_type_t *src0Casted = - (__ESIMD_DNS::vector_type_t *)(src0); - auto src0El = src0Casted[0][r * SIMDSize + n]; + auto src0El = src0[0][r * SIMDSize + n]; if (pvcBfDest) { const auto tmp = (uint32_t)(src0El) << 16; @@ -657,7 +654,7 @@ __esimd_dpas(__ESIMD_DNS::vector_type_t src0, ARG_UNUSED(sign_res); ARG_UNUSED(sign_acc); return __esimd_dpas_inner( - (void *)std::addressof(src0), src1, src2, + std::addressof(src0), src1, src2, (__ESIMD_ENS::argument_type)(src1_precision - 1), (__ESIMD_ENS::argument_type)(src2_precision - 1), depth, repeat); } From 27ab02267cb1375a6dc21ba7e3afbcda6774f5e5 Mon Sep 17 00:00:00 2001 From: "Ahn, Dongkyun" Date: Fri, 29 Jul 2022 18:06:49 -0700 Subject: [PATCH 11/15] Bugfix for dpas_test.cpp test failure --- .../sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp index f3f25e01ddfa5..f18e710810db5 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp @@ -654,7 +654,7 @@ __esimd_dpas(__ESIMD_DNS::vector_type_t src0, ARG_UNUSED(sign_res); ARG_UNUSED(sign_acc); return __esimd_dpas_inner( - std::addressof(src0), src1, src2, + (__ESIMD_DNS::vector_type_t *)std::addressof(src0), src1, src2, (__ESIMD_ENS::argument_type)(src1_precision - 1), (__ESIMD_ENS::argument_type)(src2_precision - 1), depth, repeat); } From 170746082a7f4e9660a7e552ee0019d33199f14f Mon Sep 17 00:00:00 2001 From: "Ahn, Dongkyun" Date: Mon, 8 Aug 2022 11:59:37 -0700 Subject: [PATCH 12/15] Changing dpas_info funtion argument into template argument for host mode - For precision, repeat, depth, and signedness info, runtime function arguments are changed to template argument for host mode compilation - dpas_info encoding/decoding, ARG_UNUSED removed as they are no longer used - TODO : Refactor intrinsic declarations for device mode --- .../ext/intel/experimental/esimd/common.hpp | 33 ---- .../experimental/esimd/detail/math_intrin.hpp | 160 +++++++++--------- .../ext/intel/experimental/esimd/math.hpp | 64 +++++-- 3 files changed, 136 insertions(+), 121 deletions(-) diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp index 6731f85d01971..9b3fb1945c72b 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp @@ -410,39 +410,6 @@ constexpr void check_lsc_cache_hint() { } } -ESIMD_INLINE -int encode_dpas_info(int repeat_count, int systolic_depth, - argument_type src1_precision, - argument_type src2_precision) { - return (repeat_count << 24) + (systolic_depth << 16) + - (((int)src2_precision + 1) << 8) + ((int)src1_precision + 1); -} - -ESIMD_INLINE -int decode_repeat_count(int dpas_info) { return (dpas_info >> 24); } - -ESIMD_INLINE -int decode_systolic_depth(int dpas_info) { return ((dpas_info >> 16) & 0xFF); } - -ESIMD_INLINE -argument_type decode_src1_precision(const int dpas_info) { - int decoded = dpas_info; - decoded &= 0xFF; - decoded -= 1; - - return (argument_type)decoded; -} - -ESIMD_INLINE -argument_type decode_src2_precision(const int dpas_info) { - int decoded = dpas_info; - decoded >>= 8; - decoded &= 0xFF; - decoded -= 1; - - return (argument_type)decoded; -} - } // namespace detail /// Represents a split barrier action. diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp index f18e710810db5..e8d58b301836b 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp @@ -12,8 +12,6 @@ /// @cond ESIMD_DETAIL -#define ARG_UNUSED(x) (void)x - #include #define __ESIMD_raw_vec_t(T, SZ) \ @@ -112,7 +110,37 @@ __ESIMD_INTRIN __ESIMD_raw_vec_t(T, N) } #endif // __SYCL_DEVICE_ONLY__ -#ifndef __SYCL_DEVICE_ONLY__ +#ifdef __SYCL_DEVICE_ONLY__ + +// TODO: For Device compilation, change 'dpas_info' function argument +// (precision, depth, repeat, signedness) into template argument like +// __esimd_dpas* intrinsic declarations for host mode +template +SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __ESIMD_DNS::vector_type_t +__esimd_dpas(__ESIMD_DNS::vector_type_t src0, + __ESIMD_DNS::vector_type_t src1, + __ESIMD_DNS::vector_type_t src2, int src1_precision, + int src2_precision, int depth, int repeat, int sign_res, + int sign_acc); + +template +SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __ESIMD_DNS::vector_type_t +__esimd_dpas2(__ESIMD_DNS::vector_type_t src1, + __ESIMD_DNS::vector_type_t src2, int dpas_info); + +template +SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __ESIMD_DNS::vector_type_t +__esimd_dpasw(__ESIMD_DNS::vector_type_t src0, + __ESIMD_DNS::vector_type_t src1, + __ESIMD_DNS::vector_type_t src2, int dpas_info); + +template +SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __ESIMD_DNS::vector_type_t +__esimd_dpasw2(__ESIMD_DNS::vector_type_t src1, + __ESIMD_DNS::vector_type_t src2, int dpas_info); + +#else // __SYCL_DEVICE_ONLY__ template __ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ) @@ -416,7 +444,7 @@ __ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ) return retv; } -inline __ESIMD_NS::uint +inline constexpr __ESIMD_NS::uint __esimd_dpas_bits_precision(__ESIMD_ENS::argument_type precisionType) { return precisionType == __ESIMD_ENS::argument_type::TF32 ? 32 : precisionType == __ESIMD_ENS::argument_type::BF16 || @@ -434,15 +462,14 @@ __esimd_dpas_bits_precision(__ESIMD_ENS::argument_type precisionType) { : 1; } -template +template <__ESIMD_ENS::argument_type src1_precision, + __ESIMD_ENS::argument_type src2_precision, int systolic_depth, + int repeat_count, typename RT, typename T1, typename T2, + __ESIMD_NS::uint SZ, __ESIMD_NS::uint N1, __ESIMD_NS::uint N2> inline __ESIMD_DNS::vector_type_t __esimd_dpas_inner(const __ESIMD_DNS::vector_type_t *src0, const __ESIMD_DNS::vector_type_t &src1, - const __ESIMD_DNS::vector_type_t &src2, - __ESIMD_ENS::argument_type src1_precision, - __ESIMD_ENS::argument_type src2_precision, - int systolic_depth, int repeat_count) { + const __ESIMD_DNS::vector_type_t &src2) { __ESIMD_DNS::vector_type_t retv; __ESIMD_NS::uint sat1 = @@ -451,7 +478,7 @@ __esimd_dpas_inner(const __ESIMD_DNS::vector_type_t *src0, __ESIMD_EMU_DNS::SetSatur::value>::set(); - __ESIMD_NS::uint ops_per_chan = + constexpr __ESIMD_NS::uint ops_per_chan = src1_precision == __ESIMD_ENS::argument_type::BF16 || src1_precision == __ESIMD_ENS::argument_type::FP16 || src2_precision == __ESIMD_ENS::argument_type::BF16 || @@ -466,8 +493,8 @@ __esimd_dpas_inner(const __ESIMD_DNS::vector_type_t *src0, __ESIMD_NS::uint V = 0, U = 0, k = 0, temp = 0, src1_ops_per_dword = 0, p = 0; - auto src1_el_bits = __esimd_dpas_bits_precision(src1_precision); - auto src2_el_bits = __esimd_dpas_bits_precision(src2_precision); + constexpr auto src1_el_bits = __esimd_dpas_bits_precision(src1_precision); + constexpr auto src2_el_bits = __esimd_dpas_bits_precision(src2_precision); uint32_t src1_signed = src1_precision == __ESIMD_ENS::argument_type::S2 || @@ -491,13 +518,11 @@ __esimd_dpas_inner(const __ESIMD_DNS::vector_type_t *src0, constexpr size_t SIMDSize = 8; #endif - // clang-format off constexpr bool pvcHfDest = isPvc && std::is_same::value, pvcBfDest = isPvc && std::is_same::value, - pvcBfOrHfDest = pvcBfDest || pvcHfDest; + pvcBfOrHfDest = pvcBfDest || pvcHfDest, - bool pvcBfDestChecks = pvcBfDest && src1_precision == __ESIMD_ENS::argument_type::BF16 && src2_precision == __ESIMD_ENS::argument_type::BF16, @@ -528,20 +553,19 @@ __esimd_dpas_inner(const __ESIMD_DNS::vector_type_t *src0, (sizeof(T2) * 8)) /*TODO: ==; fix PVCIGEMM24*/ ; - // clang-format on if constexpr (!isPvc) static_assert(!pvcBfOrHfDest, "dpas: hfloat and bfloat16 destination " "element type is only supported on PVC."); - assert(destTypeChk && "dpas: unsupported dest and accumulator type."); - assert(srcTypeChk && "dpas: unsupported src element type."); - assert(destSizeChk && - "dpas: destination size must be SIMDSize x repeat_count."); - assert(systolicDepthAndRepeatCountChk && - "dpas: only systolic_depth = 8 and repeat_count of 1 to 8 are " - "supported."); - assert(src1CountChk && "dpas: invalid size for src1."); - assert(src2CountChk && "dpas: invalid size for src2."); + static_assert(destTypeChk, "dpas: unsupported dest and accumulator type."); + static_assert(srcTypeChk, "dpas: unsupported src element type."); + static_assert(destSizeChk, + "dpas: destination size must be SIMDSize x repeat_count."); + static_assert(systolicDepthAndRepeatCountChk, + "dpas: only systolic_depth = 8 and repeat_count of 1 to 8 are " + "supported."); + static_assert(src1CountChk, "dpas: invalid size for src1."); + static_assert(src2CountChk, "dpas: invalid size for src2."); using TmpAccEl = typename std::conditional< pvcBfOrHfDest, float, @@ -615,7 +639,7 @@ __esimd_dpas_inner(const __ESIMD_DNS::vector_type_t *src0, } // Systolic phase. for (uint n = 0; n < SIMDSize; n++) { - if (pvcBfDest) { + if constexpr (pvcBfDest) { // TODO: make abstraction, support saturation, review rounding algo for // corner cases. auto tmpFloat = simdAcc[n]; @@ -637,73 +661,55 @@ __esimd_dpas_inner(const __ESIMD_DNS::vector_type_t *src0, return retv; } -#endif // #ifndef __SYCL_DEVICE_ONLY__ - -template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __ESIMD_DNS::vector_type_t +template <__ESIMD_ENS::argument_type src1_precision, + __ESIMD_ENS::argument_type src2_precision, int systolic_depth, + int repeat_count, typename T, typename T0, typename T1, typename T2, + int N, int N1, int N2, int sign_res, int sign_acc> +inline __ESIMD_DNS::vector_type_t __esimd_dpas(__ESIMD_DNS::vector_type_t src0, __ESIMD_DNS::vector_type_t src1, - __ESIMD_DNS::vector_type_t src2, int src1_precision, - int src2_precision, int depth, int repeat, int sign_res, - int sign_acc) -#ifdef __SYCL_DEVICE_ONLY__ - ; -#else // __SYCL_DEVICE_ONLY__ -{ - ARG_UNUSED(sign_res); - ARG_UNUSED(sign_acc); - return __esimd_dpas_inner( - (__ESIMD_DNS::vector_type_t *)std::addressof(src0), src1, src2, - (__ESIMD_ENS::argument_type)(src1_precision - 1), - (__ESIMD_ENS::argument_type)(src2_precision - 1), depth, repeat); + __ESIMD_DNS::vector_type_t src2) { + return __esimd_dpas_inner( + (__ESIMD_DNS::vector_type_t *)std::addressof(src0), src1, src2); } -#endif // __SYCL_DEVICE_ONLY__ -template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __ESIMD_DNS::vector_type_t +template <__ESIMD_ENS::argument_type src1_precision, + __ESIMD_ENS::argument_type src2_precision, int systolic_depth, + int repeat_count, typename T, typename T1, typename T2, int N, int N1, + int N2> +inline __ESIMD_DNS::vector_type_t __esimd_dpas2(__ESIMD_DNS::vector_type_t src1, - __ESIMD_DNS::vector_type_t src2, int dpas_info) -#ifdef __SYCL_DEVICE_ONLY__ - ; -#else -{ - return __esimd_dpas_inner( - nullptr, src1, src2, __ESIMD_EDNS::decode_src1_precision(dpas_info), - __ESIMD_EDNS::decode_src2_precision(dpas_info), - __ESIMD_EDNS::decode_systolic_depth(dpas_info), - __ESIMD_EDNS::decode_repeat_count(dpas_info)); + __ESIMD_DNS::vector_type_t src2) { + return __esimd_dpas_inner(nullptr, src1, + src2); } -#endif -template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __ESIMD_DNS::vector_type_t +template <__ESIMD_ENS::argument_type src1_precision, + __ESIMD_ENS::argument_type src2_precision, int systolic_depth, + int repeat_count, typename T, typename T1, typename T2, int N, int N1, + int N2> +inline __ESIMD_DNS::vector_type_t __esimd_dpasw(__ESIMD_DNS::vector_type_t src0, __ESIMD_DNS::vector_type_t src1, - __ESIMD_DNS::vector_type_t src2, int dpas_info) -#ifdef __SYCL_DEVICE_ONLY__ - ; -#else -{ - ARG_UNUSED(dpas_info); + __ESIMD_DNS::vector_type_t src2) { __ESIMD_UNSUPPORTED_ON_HOST; return __ESIMD_DNS::vector_type_t(); } -#endif -template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __ESIMD_DNS::vector_type_t +template <__ESIMD_ENS::argument_type src1_precision, + __ESIMD_ENS::argument_type src2_precision, int systolic_depth, + int repeat_count, typename T, typename T1, typename T2, int N, int N1, + int N2> +inline __ESIMD_DNS::vector_type_t __esimd_dpasw2(__ESIMD_DNS::vector_type_t src1, - __ESIMD_DNS::vector_type_t src2, int dpas_info) -#ifdef __SYCL_DEVICE_ONLY__ - ; -#else -{ - ARG_UNUSED(dpas_info); + __ESIMD_DNS::vector_type_t src2) { __ESIMD_UNSUPPORTED_ON_HOST; return __ESIMD_DNS::vector_type_t(); } -#endif + +#endif // #ifdef __SYCL_DEVICE_ONLY__ #undef __ESIMD_raw_vec_t #undef __ESIMD_cpp_vec_t diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp index 6e1c695d2e208..caca851a402d1 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp @@ -1845,11 +1845,23 @@ dpas(__ESIMD_NS::simd src0, __ESIMD_NS::simd src1, constexpr int dst_signed = std::is_signed::value; constexpr int src0_signed = std::is_signed::value; + +// TODO: For Device compilation, change 'dpas_info' function argument +// (precision, depth, repeat, signedness) into template argument like +// __esimd_dpas* intrinsic declarations for host mode +#if defined(__SYCL_DEVICE_ONLY__) __ESIMD_NS::simd result = __esimd_dpas( src0.data(), src1.data(), src2.data(), (int)src1_precision + 1, (int)src2_precision + 1, systolic_depth, repeat_count, dst_signed, src0_signed); +#else + __ESIMD_NS::simd result = + __esimd_dpas( + src0.data(), src1.data(), src2.data()); +#endif // __SYCL_DEVICE_ONLY__ + if constexpr (std::is_same_v) return result; else @@ -1931,10 +1943,20 @@ __ESIMD_API __ESIMD_NS::simd dpas(__ESIMD_NS::simd src1, (sizeof(T2) * 8)), "invalid size for Src2"); - __ESIMD_NS::simd result = __esimd_dpas2( - src1.data(), src2.data(), - __ESIMD_EDNS::encode_dpas_info(repeat_count, systolic_depth, - src1_precision, src2_precision)); +// TODO: For Device compilation, change 'dpas_info' function argument +// (precision, depth, repeat) into template argument like +// __esimd_dpas* intrinsic declarations for host mode +#if defined(__SYCL_DEVICE_ONLY__) + int dpas_info = (repeat_count << 24) + (systolic_depth << 16) + + (((int)src2_precision + 1) << 8) + ((int)src1_precision + 1); + __ESIMD_NS::simd result = + __esimd_dpas2(src1.data(), src2.data(), dpas_info); +#else + __ESIMD_NS::simd result = + __esimd_dpas2(src1.data(), + src2.data()); +#endif // __SYCL_DEVICE_ONLY__ if constexpr (std::is_same_v) return result; @@ -2007,10 +2029,20 @@ dpasw(__ESIMD_NS::simd src0, __ESIMD_NS::simd src1, (sizeof(T2) * 8)), "invalid size for Src2"); +// TODO: For Device compilation, change 'dpas_info' function argument +// (precision, depth, repeat) into template argument like +// __esimd_dpas* intrinsic declarations for host mode +#if defined(__SYCL_DEVICE_ONLY__) + int dpas_info = (repeat_count << 24) + (systolic_depth << 16) + + (((int)src2_precision + 1) << 8) + ((int)src1_precision + 1); __ESIMD_NS::simd result = __esimd_dpasw( - src0.data(), src1.data(), src2.data(), - __ESIMD_EDNS::encode_dpas_info(repeat_count, systolic_depth, - src1_precision, src2_precision)); + src0.data(), src1.data(), src2.data(), dpas_info); +#else + __ESIMD_NS::simd result = + __esimd_dpasw( + src0.data(), src1.data(), src2.data()); +#endif // __SYCL_DEVICE_ONLY__ if constexpr (std::is_same_v) return result; @@ -2081,10 +2113,20 @@ __ESIMD_API __ESIMD_NS::simd dpasw2(__ESIMD_NS::simd src1, (sizeof(T2) * 8)), "invalid size for Src2"); - __ESIMD_NS::simd result = __esimd_dpasw2( - src1.data(), src2.data(), - __ESIMD_EDNS::encode_dpas_info(repeat_count, systolic_depth, - src1_precision, src2_precision)); +// TODO: For Device compilation, change 'dpas_info' function argument +// (precision, depth, repeat) into template argument like +// __esimd_dpas* intrinsic declarations for host mode +#if defined(__SYCL_DEVICE_ONLY__) + int dpas_info = (repeat_count << 24) + (systolic_depth << 16) + + (((int)src2_precision + 1) << 8) + ((int)src1_precision + 1); + __ESIMD_NS::simd result = + __esimd_dpasw2(src1.data(), src2.data(), dpas_info); +#else + __ESIMD_NS::simd result = + __esimd_dpasw2(src1.data(), + src2.data()); +#endif // __SYCL_DEVICE_ONLY__ if constexpr (std::is_same_v) return result; From d06821a36975ac0599a01b17734e408601432be7 Mon Sep 17 00:00:00 2001 From: "Ahn, Dongkyun" Date: Fri, 12 Aug 2022 11:26:22 -0700 Subject: [PATCH 13/15] Removing unnecessary changes --- .../experimental/esimd/detail/math_intrin.hpp | 20 ++++++++++++------- .../ext/intel/experimental/esimd/math.hpp | 17 ++-------------- 2 files changed, 15 insertions(+), 22 deletions(-) diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp index e8d58b301836b..081bf1cd39389 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp @@ -12,6 +12,10 @@ /// @cond ESIMD_DETAIL +#ifndef __SYCL_DEVICE_ONLY__ +#define __ESIMD_EMU_ARG_UNUSED(x) (void)x +#endif // __SYCL_DEVICE_ONLY__ + #include #define __ESIMD_raw_vec_t(T, SZ) \ @@ -112,9 +116,8 @@ __ESIMD_INTRIN __ESIMD_raw_vec_t(T, N) #ifdef __SYCL_DEVICE_ONLY__ -// TODO: For Device compilation, change 'dpas_info' function argument -// (precision, depth, repeat, signedness) into template argument like -// __esimd_dpas* intrinsic declarations for host mode +// TODO: __esimd_dpas* should have single declaration for host and device: +// void __esimd_dpas*(...) template SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __ESIMD_DNS::vector_type_t @@ -653,7 +656,7 @@ __esimd_dpas_inner(const __ESIMD_DNS::vector_type_t *src0, } else retv[r * SIMDSize + n] = __ESIMD_EMU_DNS::satur::template saturate(simdAcc[n], - (int)sat1); + sat1); } } // Repeat. @@ -664,14 +667,17 @@ __esimd_dpas_inner(const __ESIMD_DNS::vector_type_t *src0, template <__ESIMD_ENS::argument_type src1_precision, __ESIMD_ENS::argument_type src2_precision, int systolic_depth, int repeat_count, typename T, typename T0, typename T1, typename T2, - int N, int N1, int N2, int sign_res, int sign_acc> + int N, int N1, int N2> inline __ESIMD_DNS::vector_type_t __esimd_dpas(__ESIMD_DNS::vector_type_t src0, __ESIMD_DNS::vector_type_t src1, - __ESIMD_DNS::vector_type_t src2) { + __ESIMD_DNS::vector_type_t src2, int sign_res, + int sign_acc) { + __ESIMD_EMU_ARG_UNUSED(sign_res); + __ESIMD_EMU_ARG_UNUSED(sign_acc); return __esimd_dpas_inner( - (__ESIMD_DNS::vector_type_t *)std::addressof(src0), src1, src2); + std::addressof(src0), src1, src2); } template <__ESIMD_ENS::argument_type src1_precision, diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp index caca851a402d1..42580a69c6b84 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp @@ -1845,10 +1845,6 @@ dpas(__ESIMD_NS::simd src0, __ESIMD_NS::simd src1, constexpr int dst_signed = std::is_signed::value; constexpr int src0_signed = std::is_signed::value; - -// TODO: For Device compilation, change 'dpas_info' function argument -// (precision, depth, repeat, signedness) into template argument like -// __esimd_dpas* intrinsic declarations for host mode #if defined(__SYCL_DEVICE_ONLY__) __ESIMD_NS::simd result = __esimd_dpas( src0.data(), src1.data(), src2.data(), (int)src1_precision + 1, @@ -1858,8 +1854,8 @@ dpas(__ESIMD_NS::simd src0, __ESIMD_NS::simd src1, #else __ESIMD_NS::simd result = __esimd_dpas( - src0.data(), src1.data(), src2.data()); + T, T0, T1, T2, N, N1, N2>( + src0.data(), src1.data(), src2.data(), dst_signed, src0_signed); #endif // __SYCL_DEVICE_ONLY__ if constexpr (std::is_same_v) @@ -1943,9 +1939,6 @@ __ESIMD_API __ESIMD_NS::simd dpas(__ESIMD_NS::simd src1, (sizeof(T2) * 8)), "invalid size for Src2"); -// TODO: For Device compilation, change 'dpas_info' function argument -// (precision, depth, repeat) into template argument like -// __esimd_dpas* intrinsic declarations for host mode #if defined(__SYCL_DEVICE_ONLY__) int dpas_info = (repeat_count << 24) + (systolic_depth << 16) + (((int)src2_precision + 1) << 8) + ((int)src1_precision + 1); @@ -2029,9 +2022,6 @@ dpasw(__ESIMD_NS::simd src0, __ESIMD_NS::simd src1, (sizeof(T2) * 8)), "invalid size for Src2"); -// TODO: For Device compilation, change 'dpas_info' function argument -// (precision, depth, repeat) into template argument like -// __esimd_dpas* intrinsic declarations for host mode #if defined(__SYCL_DEVICE_ONLY__) int dpas_info = (repeat_count << 24) + (systolic_depth << 16) + (((int)src2_precision + 1) << 8) + ((int)src1_precision + 1); @@ -2113,9 +2103,6 @@ __ESIMD_API __ESIMD_NS::simd dpasw2(__ESIMD_NS::simd src1, (sizeof(T2) * 8)), "invalid size for Src2"); -// TODO: For Device compilation, change 'dpas_info' function argument -// (precision, depth, repeat) into template argument like -// __esimd_dpas* intrinsic declarations for host mode #if defined(__SYCL_DEVICE_ONLY__) int dpas_info = (repeat_count << 24) + (systolic_depth << 16) + (((int)src2_precision + 1) << 8) + ((int)src1_precision + 1); From 063b905d5a644582761eeb067fa1c5eb2f17a31c Mon Sep 17 00:00:00 2001 From: "Ahn, Dongkyun" Date: Fri, 12 Aug 2022 15:51:06 -0700 Subject: [PATCH 14/15] Applying suggestions --- .../intel/experimental/esimd/detail/math_intrin.hpp | 11 ++--------- .../sycl/ext/intel/experimental/esimd/math.hpp | 6 +++--- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp index 081bf1cd39389..ef16f470f9212 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp @@ -12,10 +12,6 @@ /// @cond ESIMD_DETAIL -#ifndef __SYCL_DEVICE_ONLY__ -#define __ESIMD_EMU_ARG_UNUSED(x) (void)x -#endif // __SYCL_DEVICE_ONLY__ - #include #define __ESIMD_raw_vec_t(T, SZ) \ @@ -117,7 +113,7 @@ __ESIMD_INTRIN __ESIMD_raw_vec_t(T, N) #ifdef __SYCL_DEVICE_ONLY__ // TODO: __esimd_dpas* should have single declaration for host and device: -// void __esimd_dpas*(...) +// Ret __esimd_dpas*(...) template SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __ESIMD_DNS::vector_type_t @@ -671,10 +667,7 @@ template <__ESIMD_ENS::argument_type src1_precision, inline __ESIMD_DNS::vector_type_t __esimd_dpas(__ESIMD_DNS::vector_type_t src0, __ESIMD_DNS::vector_type_t src1, - __ESIMD_DNS::vector_type_t src2, int sign_res, - int sign_acc) { - __ESIMD_EMU_ARG_UNUSED(sign_res); - __ESIMD_EMU_ARG_UNUSED(sign_acc); + __ESIMD_DNS::vector_type_t src2) { return __esimd_dpas_inner( std::addressof(src0), src1, src2); diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp index 42580a69c6b84..ee188fa671faf 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp @@ -1843,9 +1843,9 @@ dpas(__ESIMD_NS::simd src0, __ESIMD_NS::simd src1, (sizeof(T2) * 8)), "invalid size for Src2"); +#if defined(__SYCL_DEVICE_ONLY__) constexpr int dst_signed = std::is_signed::value; constexpr int src0_signed = std::is_signed::value; -#if defined(__SYCL_DEVICE_ONLY__) __ESIMD_NS::simd result = __esimd_dpas( src0.data(), src1.data(), src2.data(), (int)src1_precision + 1, (int)src2_precision + 1, systolic_depth, repeat_count, dst_signed, @@ -1854,8 +1854,8 @@ dpas(__ESIMD_NS::simd src0, __ESIMD_NS::simd src1, #else __ESIMD_NS::simd result = __esimd_dpas( - src0.data(), src1.data(), src2.data(), dst_signed, src0_signed); + T, T0, T1, T2, N, N1, N2>(src0.data(), src1.data(), + src2.data()); #endif // __SYCL_DEVICE_ONLY__ if constexpr (std::is_same_v) From 1615f4565e1a55d54bfa3850f42440e72996c856 Mon Sep 17 00:00:00 2001 From: "Ahn, Dongkyun" Date: Mon, 15 Aug 2022 10:51:43 -0700 Subject: [PATCH 15/15] Working version --- .../ext/intel/experimental/esimd/detail/math_intrin.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp index ef16f470f9212..938d66b19c5bf 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp @@ -463,10 +463,10 @@ __esimd_dpas_bits_precision(__ESIMD_ENS::argument_type precisionType) { template <__ESIMD_ENS::argument_type src1_precision, __ESIMD_ENS::argument_type src2_precision, int systolic_depth, - int repeat_count, typename RT, typename T1, typename T2, + int repeat_count, typename RT, typename T0, typename T1, typename T2, __ESIMD_NS::uint SZ, __ESIMD_NS::uint N1, __ESIMD_NS::uint N2> inline __ESIMD_DNS::vector_type_t -__esimd_dpas_inner(const __ESIMD_DNS::vector_type_t *src0, +__esimd_dpas_inner(const __ESIMD_DNS::vector_type_t *src0, const __ESIMD_DNS::vector_type_t &src1, const __ESIMD_DNS::vector_type_t &src2) { __ESIMD_DNS::vector_type_t retv; @@ -669,7 +669,7 @@ __esimd_dpas(__ESIMD_DNS::vector_type_t src0, __ESIMD_DNS::vector_type_t src1, __ESIMD_DNS::vector_type_t src2) { return __esimd_dpas_inner( + repeat_count, T, T0, T1, T2, N, N1, N2>( std::addressof(src0), src1, src2); } @@ -681,7 +681,7 @@ inline __ESIMD_DNS::vector_type_t __esimd_dpas2(__ESIMD_DNS::vector_type_t src1, __ESIMD_DNS::vector_type_t src2) { return __esimd_dpas_inner(nullptr, src1, + repeat_count, T, T, T1, T2, N, N1, N2>(nullptr, src1, src2); }