From 79e0a8540b3912e78a757fccf1816bebd996742a Mon Sep 17 00:00:00 2001 From: "Sarnie, Nick" Date: Wed, 22 May 2024 09:26:32 -0700 Subject: [PATCH] [SYCL][ESIMD] Move a few math operations to SPIR-V intrinsics Signed-off-by: Sarnie, Nick --- llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp | 18 --- .../SYCLLowerIR/ESIMD/lower_llvm_intrin.ll | 14 +-- .../ext/intel/esimd/detail/math_intrin.hpp | 23 ++-- sycl/include/sycl/ext/intel/esimd/math.hpp | 10 +- .../experimental/esimd/detail/math_intrin.hpp | 15 ++- .../ext/intel/experimental/esimd/math.hpp | 36 +++++- sycl/test-e2e/ESIMD/clz_ctz.cpp | 116 ++++++++++++++++++ sycl/test-e2e/ESIMD/popcount.cpp | 101 +++++++++++++++ 8 files changed, 292 insertions(+), 41 deletions(-) create mode 100644 sycl/test-e2e/ESIMD/clz_ctz.cpp create mode 100644 sycl/test-e2e/ESIMD/popcount.cpp diff --git a/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp b/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp index 9b856c118a819..8cfcdb314bfae 100644 --- a/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp +++ b/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp @@ -1237,21 +1237,6 @@ static Instruction *addCastInstIfNeeded(Instruction *OldI, Instruction *NewI, return NewI; } -// Translates the following intrinsics: -// %res = call float @llvm.fmuladd.f32(float %a, float %b, float %c) -// %res = call double @llvm.fmuladd.f64(double %a, double %b, double %c) -// To -// %mul = fmul %a, %b -// %res = fadd %mul, %c -// TODO: Remove when newer GPU driver is used in CI. -void translateFmuladd(CallInst *CI) { - assert(CI->getIntrinsicID() == Intrinsic::fmuladd); - IRBuilder<> Bld(CI); - auto *Mul = Bld.CreateFMul(CI->getOperand(0), CI->getOperand(1)); - auto *Res = Bld.CreateFAdd(Mul, CI->getOperand(2)); - CI->replaceAllUsesWith(Res); -} - // Translates an LLVM intrinsic to a form, digestable by the BE. bool translateLLVMIntrinsic(CallInst *CI) { Function *F = CI->getCalledFunction(); @@ -1263,9 +1248,6 @@ bool translateLLVMIntrinsic(CallInst *CI) { // no translation - it will be simply removed. // TODO: make use of 'assume' info in the BE break; - case Intrinsic::fmuladd: - translateFmuladd(CI); - break; default: return false; // "intrinsic wasn't translated, keep the original call" } diff --git a/llvm/test/SYCLLowerIR/ESIMD/lower_llvm_intrin.ll b/llvm/test/SYCLLowerIR/ESIMD/lower_llvm_intrin.ll index 6f797c3b53bc6..5abbfc9f12c81 100644 --- a/llvm/test/SYCLLowerIR/ESIMD/lower_llvm_intrin.ll +++ b/llvm/test/SYCLLowerIR/ESIMD/lower_llvm_intrin.ll @@ -1,7 +1,7 @@ ; RUN: opt -passes=LowerESIMD -S < %s | FileCheck %s -; This test checks that LowerESIMD pass correctly lowers some llvm intrinsics -; which can't be handled by the VC BE. +; This test checks that LowerESIMD pass does not lower some llvm intrinsics +; which can now be handled by the VC BE. target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" target triple = "spir64-unknown-unknown" @@ -10,17 +10,15 @@ declare double @llvm.fmuladd.f64(double %x, double %y, double %z) define spir_func float @test_fmuladd_f32(float %x, float %y, float %z) { %1 = call float @llvm.fmuladd.f32(float %x, float %y, float %z) -; CHECK: %[[A:[0-9a-zA-Z\._]+]] = fmul float %x, %y -; CHECK: %[[B:[0-9a-zA-Z\._]+]] = fadd float %[[A]], %z +; CHECK: %[[A:[0-9a-zA-Z\._]+]] = call float @llvm.fmuladd.f32(float %x, float %y, float %z) ret float %1 -; CHECK: ret float %[[B]] +; CHECK: ret float %[[A]] } define spir_func double @test_fmuladd_f64(double %x, double %y, double %z) { %1 = call double @llvm.fmuladd.f64(double %x, double %y, double %z) -; CHECK: %[[A:[0-9a-zA-Z\._]+]] = fmul double %x, %y -; CHECK: %[[B:[0-9a-zA-Z\._]+]] = fadd double %[[A]], %z +; CHECK: %[[A:[0-9a-zA-Z\._]+]] = call double @llvm.fmuladd.f64(double %x, double %y, double %z) ret double %1 -; CHECK: ret double %[[B]] +; CHECK: ret double %[[A]] } diff --git a/sycl/include/sycl/ext/intel/esimd/detail/math_intrin.hpp b/sycl/include/sycl/ext/intel/esimd/detail/math_intrin.hpp index e5fc036f5275c..2f6584a4bd640 100644 --- a/sycl/include/sycl/ext/intel/esimd/detail/math_intrin.hpp +++ b/sycl/include/sycl/ext/intel/esimd/detail/math_intrin.hpp @@ -72,6 +72,19 @@ template __ESIMD_INTRIN __ESIMD_raw_vec_t(T, N) __spirv_ocl_native_powr(__ESIMD_raw_vec_t(T, N), __ESIMD_raw_vec_t(T, N)); +template +__ESIMD_INTRIN __ESIMD_raw_vec_t(T, N) + __spirv_ocl_fabs(__ESIMD_raw_vec_t(T, N)) __ESIMD_INTRIN_END; + +template +__ESIMD_INTRIN __ESIMD_raw_vec_t(T, N) + __spirv_ocl_s_abs(__ESIMD_raw_vec_t(T, N)) __ESIMD_INTRIN_END; + +template +__ESIMD_INTRIN __ESIMD_raw_vec_t(T, N) + __spirv_ocl_fmin(__ESIMD_raw_vec_t(T, N), + __ESIMD_raw_vec_t(T, N)) __ESIMD_INTRIN_END; + // saturation intrinsics template __ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ) @@ -101,10 +114,6 @@ template __ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ) __esimd_sstrunc_sat(__ESIMD_raw_vec_t(T1, SZ) src) __ESIMD_INTRIN_END; -template -__ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ) - __esimd_abs(__ESIMD_raw_vec_t(T, SZ) src0) __ESIMD_INTRIN_END; - /// 3 kinds of max template __ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ) @@ -119,11 +128,7 @@ __ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ) __esimd_smax(__ESIMD_raw_vec_t(T, SZ) src0, __ESIMD_raw_vec_t(T, SZ) src1) __ESIMD_INTRIN_END; -/// 3 kinds of min -template -__ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ) - __esimd_fmin(__ESIMD_raw_vec_t(T, SZ) src0, - __ESIMD_raw_vec_t(T, SZ) src1) __ESIMD_INTRIN_END; +/// 3 kinds of min, the missing fmin uses spir-v instrinsics above template __ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ) __esimd_umin(__ESIMD_raw_vec_t(T, SZ) src0, diff --git a/sycl/include/sycl/ext/intel/esimd/math.hpp b/sycl/include/sycl/ext/intel/esimd/math.hpp index 99effb3c7a932..096c33a2fda93 100644 --- a/sycl/include/sycl/ext/intel/esimd/math.hpp +++ b/sycl/include/sycl/ext/intel/esimd/math.hpp @@ -97,7 +97,11 @@ namespace detail { template ESIMD_NODEBUG ESIMD_INLINE simd __esimd_abs_common_internal(simd src0) { - simd Result = simd(__esimd_abs(src0.data())); + simd Result; + if constexpr (detail::is_generic_floating_point_v) + Result = simd(__spirv_ocl_fabs(src0.data())); + else + Result = simd(__spirv_ocl_s_abs(src0.data())); return convert(Result); } @@ -266,7 +270,7 @@ __ESIMD_API simd(min)(simd src0, simd src1, Sat sat = {}) { constexpr bool is_sat = std::is_same_v; if constexpr (std::is_floating_point::value) { - auto Result = __esimd_fmin(src0.data(), src1.data()); + auto Result = __spirv_ocl_fmin(src0.data(), src1.data()); if constexpr (is_sat) Result = __esimd_sat(Result); return simd(Result); @@ -1475,7 +1479,7 @@ template struct esimd_apply_reduced_min { template simd operator()(simd v1, simd v2) { if constexpr (std::is_floating_point::value) { - return __esimd_fmin(v1.data(), v2.data()); + return __spirv_ocl_fmin(v1.data(), v2.data()); } else if constexpr (std::is_unsigned::value) { return __esimd_umin(v1.data(), v2.data()); } else { diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp index d42afb54c2ef8..e76cca317e8d4 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp @@ -112,8 +112,19 @@ __ESIMD_INTRIN __ESIMD_raw_vec_t(sycl::half, N) template __ESIMD_INTRIN __ESIMD_raw_vec_t(T, N) - __esimd_fmadd(__ESIMD_raw_vec_t(T, N) a, __ESIMD_raw_vec_t(T, N) b, - __ESIMD_raw_vec_t(T, N) c) __ESIMD_INTRIN_END; + __spirv_ocl_fma(__ESIMD_raw_vec_t(T, N) a, __ESIMD_raw_vec_t(T, N) b, + __ESIMD_raw_vec_t(T, N) c) __ESIMD_INTRIN_END; +template +__ESIMD_INTRIN __ESIMD_raw_vec_t(T, N) + __spirv_ocl_popcount(__ESIMD_raw_vec_t(T, N) src0) __ESIMD_INTRIN_END; + +template +__ESIMD_INTRIN __ESIMD_raw_vec_t(T, N) + __spirv_ocl_ctz(__ESIMD_raw_vec_t(T, N) src0) __ESIMD_INTRIN_END; + +template +__ESIMD_INTRIN __ESIMD_raw_vec_t(T, N) + __spirv_ocl_clz(__ESIMD_raw_vec_t(T, N) src0) __ESIMD_INTRIN_END; #undef __ESIMD_raw_vec_t #undef __ESIMD_cpp_vec_t diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp index 141334ce88eea..1eaa66e8d5f80 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp @@ -330,6 +330,40 @@ __ESIMD_API std::enable_if_t< return __ESIMD_NS::ror(src0, src1); } +/// Count the number of 1-bits. +/// @tparam T element type. +/// @tparam N vector length. +/// @return the popcounted vector. +template +__ESIMD_API std::enable_if_t && sizeof(T) < 8, + __ESIMD_NS::simd> +popcount(__ESIMD_NS::simd vec) { + return __spirv_ocl_popcount(vec.data()); +} + +/// Count the number of leading zeros. +/// If the input is 0, the number of total bits is returned. +/// @tparam T element type. +/// @tparam N vector length. +/// @return vector with number of leading zeros of the input vector. +template +__ESIMD_API std::enable_if_t && sizeof(T) < 8, + __ESIMD_NS::simd> +clz(__ESIMD_NS::simd vec) { + return __spirv_ocl_clz(vec.data()); +} + +/// Count the number of trailing zeros. +/// @tparam T element type. +/// @tparam N vector length. +/// @return vector with number of trailing zeros of the input vector. +template +__ESIMD_API std::enable_if_t && sizeof(T) < 8, + __ESIMD_NS::simd> +ctz(__ESIMD_NS::simd vec) { + return __spirv_ocl_ctz(vec.data()); +} + /// @} sycl_esimd_bitmanip /// @addtogroup sycl_esimd_math @@ -1671,7 +1705,7 @@ ESIMD_INLINE __ESIMD_NS::simd fma(__ESIMD_NS::simd a, static_assert(__ESIMD_DNS::is_generic_floating_point_v, "fma only supports floating point types"); using CppT = __ESIMD_DNS::element_type_traits::EnclosingCppT; - auto Ret = __esimd_fmadd<__ESIMD_DNS::__raw_t, N>( + auto Ret = __spirv_ocl_fma<__ESIMD_DNS::__raw_t, N>( __ESIMD_DNS::convert_vector(a.data()), __ESIMD_DNS::convert_vector(b.data()), __ESIMD_DNS::convert_vector(c.data())); diff --git a/sycl/test-e2e/ESIMD/clz_ctz.cpp b/sycl/test-e2e/ESIMD/clz_ctz.cpp new file mode 100644 index 0000000000000..87bcac8f27528 --- /dev/null +++ b/sycl/test-e2e/ESIMD/clz_ctz.cpp @@ -0,0 +1,116 @@ +//==---------------- clz_ctz.cpp - DPC++ ESIMD on-device test -------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out +// RUN: %{run} %t.out + +#include "esimd_test_utils.hpp" + +#include +#include +#include + +using namespace sycl; + +template bool test(queue &q) { + std::cout << "Running " << (CLZ ? "CLZ " : "CTZ ") + << esimd_test::type_name() << std::endl; + constexpr unsigned VL = 16; + constexpr unsigned Size = 1024 * 128; + + T *A = new T[Size]; + T *B = new T[Size]; + + for (unsigned i = 0; i < Size; ++i) { + A[i] = i; + B[i] = 0; + } + + try { + buffer bufa(A, range<1>(Size)); + buffer bufb(B, range<1>(Size)); + + // We need that many workgroups + range<1> GlobalRange{Size / VL}; + + // We need that many threads in each group + range<1> LocalRange{1}; + + queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler()); + + auto e = q.submit([&](handler &cgh) { + auto PA = bufa.template get_access(cgh); + auto PB = bufb.template get_access(cgh); + cgh.parallel_for(GlobalRange * LocalRange, + [=](id<1> i) SYCL_ESIMD_KERNEL { + using namespace sycl::ext::intel::esimd; + unsigned int offset = i * VL * sizeof(T); + simd va; + va.copy_from(PA, offset); + simd vb; + if constexpr (CLZ) + vb = __ESIMD_ENS::clz(va); + else + vb = __ESIMD_ENS::ctz(va); + vb.copy_to(PB, offset); + }); + }); + e.wait(); + } catch (sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + + delete[] A; + delete[] B; + return false; + } + + int err_cnt = 0; + + for (unsigned i = 0; i < Size; ++i) { + int Expected = + CLZ ? (i == 0 ? sizeof(T) * 8 : __builtin_clz(i)) : __builtin_ctz(i); + int Computed = B[i]; + if (Expected != Computed && ++err_cnt < 10) + std::cout << "Failure at " << std::to_string(i) + << ": Expected: " << std::to_string(Expected) + << " Computed: " << std::to_string(Computed) << std::endl; + } + if (err_cnt > 0) { + std::cout << " pass rate: " + << ((float)(Size - err_cnt) / (float)Size) * 100.0f << "% (" + << (Size - err_cnt) << "/" << Size << ")\n"; + } + + delete[] A; + delete[] B; + + std::cout << (err_cnt > 0 ? "FAILED\n" : "Passed\n"); + return err_cnt == 0; +} + +int main() { + bool Passed = true; + queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler()); + esimd_test::printTestLabel(q); + Passed &= test(q); + Passed &= test(q); + Passed &= test(q); + Passed &= test(q); +// TODO: Enable once GPU driver issue is fixed +#if 0 + Passed &= test(q); + Passed &= test(q); + Passed &= test(q); + Passed &= test(q); + + Passed &= test(q); + Passed &= test(q); + Passed &= test(q); + Passed &= test(q); +#endif + return !Passed; +} diff --git a/sycl/test-e2e/ESIMD/popcount.cpp b/sycl/test-e2e/ESIMD/popcount.cpp new file mode 100644 index 0000000000000..1b66793b46435 --- /dev/null +++ b/sycl/test-e2e/ESIMD/popcount.cpp @@ -0,0 +1,101 @@ +//==---------------- popcount.cpp - DPC++ ESIMD on-device test -------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out +// RUN: %{run} %t.out + +#include "esimd_test_utils.hpp" + +#include +#include +#include + +using namespace sycl; + +template bool test(queue &q) { + std::cout << "Running " << esimd_test::type_name() << std::endl; + constexpr unsigned VL = 16; + constexpr unsigned Size = + std::min(std::numeric_limits::max() - VL + 1, 65536u); + + T *A = new T[Size]; + T *B = new T[Size]; + + for (unsigned i = 0; i < Size; ++i) { + A[i] = i; + B[i] = 0; + } + + try { + buffer bufa(A, range<1>(Size)); + buffer bufb(B, range<1>(Size)); + + // We need that many workgroups + range<1> GlobalRange{Size / VL}; + + // We need that many threads in each group + range<1> LocalRange{1}; + + queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler()); + + auto e = q.submit([&](handler &cgh) { + auto PA = bufa.template get_access(cgh); + auto PB = bufb.template get_access(cgh); + cgh.parallel_for(GlobalRange * LocalRange, + [=](id<1> i) SYCL_ESIMD_KERNEL { + using namespace sycl::ext::intel::esimd; + unsigned int offset = i * VL * sizeof(T); + simd va; + va.copy_from(PA, offset); + simd vb = __ESIMD_ENS::popcount(va); + vb.copy_to(PB, offset); + }); + }); + e.wait(); + } catch (sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + + delete[] A; + delete[] B; + return false; + } + + int err_cnt = 0; + + for (unsigned i = 0; i < Size; ++i) { + int Expected = std::bitset(i).count(); + int Computed = B[i]; + if (Expected != Computed && ++err_cnt < 10) + std::cout << "Failure at " << std::to_string(i) + << ": Expected: " << std::to_string(Expected) + << " Computed: " << std::to_string(Computed) << std::endl; + } + if (err_cnt > 0) { + std::cout << " pass rate: " + << ((float)(Size - err_cnt) / (float)Size) * 100.0f << "% (" + << (Size - err_cnt) << "/" << Size << ")\n"; + } + + delete[] A; + delete[] B; + + std::cout << (err_cnt > 0 ? "FAILED\n" : "Passed\n"); + return err_cnt == 0; +} + +int main() { + bool Passed = true; + queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler()); + esimd_test::printTestLabel(q); + Passed &= test(q); + Passed &= test(q); + Passed &= test(q); + Passed &= test(q); + Passed &= test(q); + Passed &= test(q); + return !Passed; +}