From 102442987931cf0c745f4ad6c06f6c78a081a167 Mon Sep 17 00:00:00 2001 From: Konstantin S Bobrovsky Date: Sat, 29 Jan 2022 20:37:33 -0800 Subject: [PATCH 1/3] [ESIMD] Optimize out loops in popular simd constructors. Signed-off-by: Konstantin S Bobrovsky --- .../esimd/detail/simd_mask_impl.hpp | 2 +- .../esimd/detail/simd_obj_impl.hpp | 54 +++++++++-- sycl/test/esimd/ctor_codegen.cpp | 91 +++++++++++++++++++ 3 files changed, 136 insertions(+), 11 deletions(-) create mode 100644 sycl/test/esimd/ctor_codegen.cpp diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_mask_impl.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_mask_impl.hpp index b85bb4685de1f..7e59085d0836e 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_mask_impl.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_mask_impl.hpp @@ -49,7 +49,7 @@ class simd_mask_impl /// Construct from an array. To allow e.g. simd_mask m({1,0,0,1,...}). template > simd_mask_impl(const raw_element_type (&&Arr)[N1]) { - base_type::template init_from_array(std::move(Arr)); + base_type::init_from_array(std::move(Arr)); } /// Implicit conversion from simd. diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp index 2d02bdc79a96d..23a7ab30c77f1 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp @@ -77,8 +77,36 @@ struct is_simd_flag_type> : std::true_type {}; template static inline constexpr bool is_simd_flag_type_v = is_simd_flag_type::value; +/// @cond ESIMD_DETAIL + namespace detail { +// Functions to support efficient simd constructors - avoiding internal loop +// over elements. +template +constexpr vector_type_t make_vector_impl(const T (&&Arr)[N], + std::index_sequence) { + return vector_type_t{Arr[Is]...}; +} + +template +constexpr vector_type_t make_vector(const T (&&Arr)[N]) { + return make_vector_impl(std::move(Arr), std::make_index_sequence{}); +} + +template +constexpr vector_type_t make_vector_impl(T Base, T Stride, + std::index_sequence) { + return vector_type_t{(Base + ((T)Is) * Stride)...}; +} + +template +constexpr vector_type_t make_vector(T Base, T Stride) { + return make_vector_impl(Base, Stride, std::make_index_sequence{}); +} + +/// @endcond ESIMD_DETAIL + /// This is a base class for all ESIMD simd classes with real storage (simd, /// simd_mask_impl). It wraps a clang vector as the storage for the elements. /// Additionally this class supports region operations that map to Intel GPU @@ -120,10 +148,13 @@ class simd_obj_impl { static constexpr int length = N; protected: - template > - void init_from_array(const RawTy (&&Arr)[N1]) noexcept { - for (auto I = 0; I < N; ++I) { - M_data[I] = Arr[I]; + void init_from_array(const Ty (&&Arr)[N]) noexcept { + if constexpr (is_wrapper_elem_type_v) { + for (auto I = 0; I < N; ++I) { + M_data[I] = bitcast_to_raw_type(Arr[I]); + } + } else { + M_data = make_vector(std::move(Arr)); } } @@ -158,10 +189,13 @@ class simd_obj_impl { /// Initialize a simd_obj_impl object with an initial value and step. simd_obj_impl(Ty Val, Ty Step) noexcept { __esimd_dbg_print(simd_obj_impl(Ty Val, Ty Step)); -#pragma unroll - for (int i = 0; i < N; ++i) { - M_data[i] = bitcast_to_raw_type(Val); - Val = binary_op(Val, Step); + if constexpr (is_wrapper_elem_type_v) { + for (int i = 0; i < N; ++i) { + M_data[i] = bitcast_to_raw_type(Val); + Val = binary_op(Val, Step); + } + } else { + M_data = make_vector(Val, Step); } } @@ -175,8 +209,8 @@ class simd_obj_impl { /// Construct from an array. To allow e.g. simd_mask_type m({1,0,0,1,...}). template > - simd_obj_impl(const RawTy (&&Arr)[N1]) noexcept { - __esimd_dbg_print(simd_obj_impl(const RawTy(&&Arr)[N1])); + simd_obj_impl(const Ty (&&Arr)[N1]) noexcept { + __esimd_dbg_print(simd_obj_impl(const Ty(&&Arr)[N1])); init_from_array(std::move(Arr)); } diff --git a/sycl/test/esimd/ctor_codegen.cpp b/sycl/test/esimd/ctor_codegen.cpp new file mode 100644 index 0000000000000..fba9aa93e8f13 --- /dev/null +++ b/sycl/test/esimd/ctor_codegen.cpp @@ -0,0 +1,91 @@ +// RUN: %clangxx -fsycl -fsycl-device-only -S %s -o - | FileCheck %s + +// Check efficiency of LLVM IR generated for various simd constructors. + +#include +#include + +using namespace sycl; +using namespace sycl::ext::intel::experimental::esimd; + +// clang-format off + +SYCL_EXTERNAL auto foo(int i) SYCL_ESIMD_FUNCTION { +// CHECK: define dso_local spir_func void @_Z3fooi( +// CHECK: {{[^,]*}} %[[RES:[a-zA-Z0-9_\.]+]], +// CHECK: {{[^,]*}} %[[I:[a-zA-Z0-9_\.]+]]){{.*}} { + simd val({ i, i }); + return val; +// CHECK: %[[V0:[a-zA-Z0-9_\.]+]] = insertelement <2 x i32> undef, i32 %[[I]], i64 0 +// CHECK-NEXT: %[[V1:[a-zA-Z0-9_\.]+]] = shufflevector <2 x i32> %[[V0]], <2 x i32> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: %[[MDATA:[a-zA-Z0-9_\.]+]] = getelementptr inbounds {{.*}} %[[RES]], i64 0, i32 0, i32 0 +// CHECK-NEXT: store <2 x i32> %[[V1]], <2 x i32> addrspace(4)* %[[MDATA]] +// CHECK-NEXT: ret void +// CHECK-NEXT: } +} + +SYCL_EXTERNAL auto bar() SYCL_ESIMD_FUNCTION { +// CHECK: define dso_local spir_func void @_Z3barv({{.*}} %[[RES:[a-zA-Z0-9_\.]+]]){{.*}} { + simd val(17, 3); + return val; +// CHECK: %[[MDATA:[a-zA-Z0-9_\.]+]] = getelementptr inbounds {{.*}} %[[RES]], i64 0, i32 0, i32 0 +// CHECK-NEXT: store <2 x double> , <2 x double> addrspace(4)* %[[MDATA]] +// CHECK-NEXT: ret void +// CHECK-NEXT: } +} + +SYCL_EXTERNAL auto gee() SYCL_ESIMD_FUNCTION { +// CHECK: define dso_local spir_func void @_Z3geev({{.*}} %[[RES:[a-zA-Z0-9_\.]+]]){{.*}} { + simd val(-7); + return val; +// CHECK: %[[MDATA:[a-zA-Z0-9_\.]+]] = getelementptr inbounds {{.*}} %[[RES]], i64 0, i32 0, i32 0 +// CHECK-NEXT: store <2 x i8> , <2 x i8> addrspace(4)* %[[MDATA]] +// CHECK-NEXT: ret void +// CHECK-NEXT: } +} + +SYCL_EXTERNAL auto foomask() SYCL_ESIMD_FUNCTION { +// CHECK: define dso_local spir_func void @_Z7foomaskv({{.*}} %[[RES:[a-zA-Z0-9_\.]+]]){{.*}} { + simd_mask<2> val({ 1, 0 }); + return val; +// CHECK: %[[MDATA:[a-zA-Z0-9_\.]+]] = getelementptr inbounds {{.*}} %[[RES]], i64 0, i32 0, i32 0 +// CHECK-NEXT: store <2 x i16> , <2 x i16> addrspace(4)* %[[MDATA]] +// CHECK-NEXT: ret void +// CHECK-NEXT: } +} + +SYCL_EXTERNAL auto geemask() SYCL_ESIMD_FUNCTION { +// CHECK: define dso_local spir_func void @_Z7geemaskv({{.*}} %[[RES:[a-zA-Z0-9_\.]+]]){{.*}} { + simd_mask<2> val(1); + return val; +// CHECK: %[[MDATA:[a-zA-Z0-9_\.]+]] = getelementptr inbounds {{.*}} %[[RES]], i64 0, i32 0, i32 0 +// CHECK-NEXT: store <2 x i16> , <2 x i16> addrspace(4)* %[[MDATA]] +// CHECK-NEXT: ret void +// CHECK-NEXT: } +} + +// The element type is 'half', which requires conversion, so code generation +// is less efficient - has loop over elements. No much reason to check. +SYCL_EXTERNAL auto foohalf(half i) SYCL_ESIMD_FUNCTION { + simd val({ i, i }); + return val; +} + +// The element type is 'half', which requires conversion, so code generation +// is less efficient - has loop over elements. No much reason to check. +SYCL_EXTERNAL auto barhalf() SYCL_ESIMD_FUNCTION { + simd val(17, 3); + return val; +} + +// Here the element is half too, but code generation is efficient because +// no per-element operations are needed - scalar is converted before broadcasting. +SYCL_EXTERNAL auto geehalf() SYCL_ESIMD_FUNCTION { +// CHECK: define dso_local spir_func void @_Z7geehalfv({{.*}} %[[RES:[a-zA-Z0-9_\.]+]]){{.*}} { + simd val(-7); + return val; +// CHECK: %[[MDATA:[a-zA-Z0-9_\.]+]] = getelementptr inbounds {{.*}} %[[RES]], i64 0, i32 0, i32 0 +// CHECK-NEXT: store <2 x half> , <2 x half> addrspace(4)* %[[MDATA]] +// CHECK-NEXT: ret void +// CHECK-NEXT: } +} From 42312eed919051d3e3dd65795e5370eec3921b10 Mon Sep 17 00:00:00 2001 From: Konstantin S Bobrovsky Date: Sun, 30 Jan 2022 19:30:00 -0800 Subject: [PATCH 2/3] Fix corner-case test failures. Signed-off-by: Konstantin S Bobrovsky --- .../ext/intel/experimental/esimd/detail/simd_obj_impl.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp index 23a7ab30c77f1..6feabf9e553d3 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp @@ -97,7 +97,7 @@ constexpr vector_type_t make_vector(const T (&&Arr)[N]) { template constexpr vector_type_t make_vector_impl(T Base, T Stride, std::index_sequence) { - return vector_type_t{(Base + ((T)Is) * Stride)...}; + return vector_type_t{(T)(Base + ((T)Is) * Stride)...}; } template @@ -189,7 +189,7 @@ class simd_obj_impl { /// Initialize a simd_obj_impl object with an initial value and step. simd_obj_impl(Ty Val, Ty Step) noexcept { __esimd_dbg_print(simd_obj_impl(Ty Val, Ty Step)); - if constexpr (is_wrapper_elem_type_v) { + if constexpr (is_wrapper_elem_type_v || !std::is_integral_v) { for (int i = 0; i < N; ++i) { M_data[i] = bitcast_to_raw_type(Val); Val = binary_op(Val, Step); From be4202985673c7ff5e5ebbd93475d285b7710fbf Mon Sep 17 00:00:00 2001 From: Konstantin S Bobrovsky Date: Sun, 30 Jan 2022 20:52:32 -0800 Subject: [PATCH 3/3] Fix unit test Signed-off-by: Konstantin S Bobrovsky --- sycl/test/esimd/ctor_codegen.cpp | 37 +++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/sycl/test/esimd/ctor_codegen.cpp b/sycl/test/esimd/ctor_codegen.cpp index fba9aa93e8f13..7073dfd281096 100644 --- a/sycl/test/esimd/ctor_codegen.cpp +++ b/sycl/test/esimd/ctor_codegen.cpp @@ -10,40 +10,50 @@ using namespace sycl::ext::intel::experimental::esimd; // clang-format off -SYCL_EXTERNAL auto foo(int i) SYCL_ESIMD_FUNCTION { -// CHECK: define dso_local spir_func void @_Z3fooi( +// Array-based constructor, FP element type, no loops exected - check. +SYCL_EXTERNAL auto foo(double i) SYCL_ESIMD_FUNCTION { +// CHECK: define dso_local spir_func void @_Z3food( // CHECK: {{[^,]*}} %[[RES:[a-zA-Z0-9_\.]+]], // CHECK: {{[^,]*}} %[[I:[a-zA-Z0-9_\.]+]]){{.*}} { - simd val({ i, i }); + simd val({ i, i }); return val; -// CHECK: %[[V0:[a-zA-Z0-9_\.]+]] = insertelement <2 x i32> undef, i32 %[[I]], i64 0 -// CHECK-NEXT: %[[V1:[a-zA-Z0-9_\.]+]] = shufflevector <2 x i32> %[[V0]], <2 x i32> poison, <2 x i32> zeroinitializer +// CHECK: %[[V0:[a-zA-Z0-9_\.]+]] = insertelement <2 x double> undef, double %[[I]], i64 0 +// CHECK-NEXT: %[[V1:[a-zA-Z0-9_\.]+]] = shufflevector <2 x double> %[[V0]], <2 x double> poison, <2 x i32> zeroinitializer // CHECK-NEXT: %[[MDATA:[a-zA-Z0-9_\.]+]] = getelementptr inbounds {{.*}} %[[RES]], i64 0, i32 0, i32 0 -// CHECK-NEXT: store <2 x i32> %[[V1]], <2 x i32> addrspace(4)* %[[MDATA]] +// CHECK-NEXT: store <2 x double> %[[V1]], <2 x double> addrspace(4)* %[[MDATA]] // CHECK-NEXT: ret void // CHECK-NEXT: } } +// Base + step constructor, FP element type, loops exected - don't check. SYCL_EXTERNAL auto bar() SYCL_ESIMD_FUNCTION { -// CHECK: define dso_local spir_func void @_Z3barv({{.*}} %[[RES:[a-zA-Z0-9_\.]+]]){{.*}} { simd val(17, 3); return val; -// CHECK: %[[MDATA:[a-zA-Z0-9_\.]+]] = getelementptr inbounds {{.*}} %[[RES]], i64 0, i32 0, i32 0 -// CHECK-NEXT: store <2 x double> , <2 x double> addrspace(4)* %[[MDATA]] -// CHECK-NEXT: ret void -// CHECK-NEXT: } } +// Base + step constructor, integer element type, no loops exected - check. +SYCL_EXTERNAL auto baz() SYCL_ESIMD_FUNCTION { + // CHECK: define dso_local spir_func void @_Z3bazv({{.*}} %[[RES:[a-zA-Z0-9_\.]+]]){{.*}} { + simd val(17, 3); + return val; + // CHECK: %[[MDATA:[a-zA-Z0-9_\.]+]] = getelementptr inbounds {{.*}} %[[RES]], i64 0, i32 0, i32 0 + // CHECK-NEXT: store <2 x i32> , <2 x i32> addrspace(4)* %[[MDATA]] + // CHECK-NEXT: ret void + // CHECK-NEXT: } +} + +// Broadcast constructor, FP element type, no loops exected - check. SYCL_EXTERNAL auto gee() SYCL_ESIMD_FUNCTION { // CHECK: define dso_local spir_func void @_Z3geev({{.*}} %[[RES:[a-zA-Z0-9_\.]+]]){{.*}} { - simd val(-7); + simd val(-7); return val; // CHECK: %[[MDATA:[a-zA-Z0-9_\.]+]] = getelementptr inbounds {{.*}} %[[RES]], i64 0, i32 0, i32 0 -// CHECK-NEXT: store <2 x i8> , <2 x i8> addrspace(4)* %[[MDATA]] +// CHECK-NEXT: store <2 x float> , <2 x float> addrspace(4)* %[[MDATA]] // CHECK-NEXT: ret void // CHECK-NEXT: } } +// Array-based simd_mask constructor, no loops exected - check. SYCL_EXTERNAL auto foomask() SYCL_ESIMD_FUNCTION { // CHECK: define dso_local spir_func void @_Z7foomaskv({{.*}} %[[RES:[a-zA-Z0-9_\.]+]]){{.*}} { simd_mask<2> val({ 1, 0 }); @@ -54,6 +64,7 @@ SYCL_EXTERNAL auto foomask() SYCL_ESIMD_FUNCTION { // CHECK-NEXT: } } +// Broadcast simd_mask constructor, no loops exected - check. SYCL_EXTERNAL auto geemask() SYCL_ESIMD_FUNCTION { // CHECK: define dso_local spir_func void @_Z7geemaskv({{.*}} %[[RES:[a-zA-Z0-9_\.]+]]){{.*}} { simd_mask<2> val(1);