Skip to content

[ESIMD] Optimize out loops in popular simd constructors. #5425

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Feb 1, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ class simd_mask_impl
/// Construct from an array. To allow e.g. simd_mask<N> m({1,0,0,1,...}).
template <int N1, class = std::enable_if_t<N1 == N>>
simd_mask_impl(const raw_element_type (&&Arr)[N1]) {
base_type::template init_from_array<N1>(std::move(Arr));
base_type::init_from_array(std::move(Arr));
}

/// Implicit conversion from simd.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,36 @@ struct is_simd_flag_type<overaligned_tag<N>> : std::true_type {};
template <typename T>
static inline constexpr bool is_simd_flag_type_v = is_simd_flag_type<T>::value;

/// @cond ESIMD_DETAIL

namespace detail {

// Functions to support efficient simd constructors - avoiding internal loop
// over elements.
template <class T, int N, size_t... Is>
constexpr vector_type_t<T, N> make_vector_impl(const T (&&Arr)[N],
std::index_sequence<Is...>) {
return vector_type_t<T, N>{Arr[Is]...};
}

template <class T, int N>
constexpr vector_type_t<T, N> make_vector(const T (&&Arr)[N]) {
return make_vector_impl<T, N>(std::move(Arr), std::make_index_sequence<N>{});
}

template <class T, int N, size_t... Is>
constexpr vector_type_t<T, N> make_vector_impl(T Base, T Stride,
std::index_sequence<Is...>) {
return vector_type_t<T, N>{(T)(Base + ((T)Is) * Stride)...};
}

template <class T, int N>
constexpr vector_type_t<T, N> make_vector(T Base, T Stride) {
return make_vector_impl<T, N>(Base, Stride, std::make_index_sequence<N>{});
}

/// @endcond ESIMD_DETAIL

/// This is a base class for all ESIMD simd classes with real storage (simd,
/// simd_mask_impl). It wraps a clang vector as the storage for the elements.
/// Additionally this class supports region operations that map to Intel GPU
Expand Down Expand Up @@ -120,10 +148,13 @@ class simd_obj_impl {
static constexpr int length = N;

protected:
template <int N1, class = std::enable_if_t<N1 == N>>
void init_from_array(const RawTy (&&Arr)[N1]) noexcept {
for (auto I = 0; I < N; ++I) {
M_data[I] = Arr[I];
void init_from_array(const Ty (&&Arr)[N]) noexcept {
if constexpr (is_wrapper_elem_type_v<Ty>) {
for (auto I = 0; I < N; ++I) {
M_data[I] = bitcast_to_raw_type(Arr[I]);
}
} else {
M_data = make_vector(std::move(Arr));
}
}

Expand Down Expand Up @@ -158,10 +189,13 @@ class simd_obj_impl {
/// Initialize a simd_obj_impl object with an initial value and step.
simd_obj_impl(Ty Val, Ty Step) noexcept {
__esimd_dbg_print(simd_obj_impl(Ty Val, Ty Step));
#pragma unroll
for (int i = 0; i < N; ++i) {
M_data[i] = bitcast_to_raw_type(Val);
Val = binary_op<BinOp::add, Ty>(Val, Step);
if constexpr (is_wrapper_elem_type_v<Ty> || !std::is_integral_v<Ty>) {
for (int i = 0; i < N; ++i) {
M_data[i] = bitcast_to_raw_type(Val);
Val = binary_op<BinOp::add, Ty>(Val, Step);
}
} else {
M_data = make_vector<Ty, N>(Val, Step);
}
}

Expand All @@ -175,8 +209,8 @@ class simd_obj_impl {

/// Construct from an array. To allow e.g. simd_mask_type<N> m({1,0,0,1,...}).
template <int N1, class = std::enable_if_t<N1 == N>>
simd_obj_impl(const RawTy (&&Arr)[N1]) noexcept {
__esimd_dbg_print(simd_obj_impl(const RawTy(&&Arr)[N1]));
simd_obj_impl(const Ty (&&Arr)[N1]) noexcept {
__esimd_dbg_print(simd_obj_impl(const Ty(&&Arr)[N1]));
init_from_array(std::move(Arr));
}

Expand Down
102 changes: 102 additions & 0 deletions sycl/test/esimd/ctor_codegen.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
// RUN: %clangxx -fsycl -fsycl-device-only -S %s -o - | FileCheck %s

// Check efficiency of LLVM IR generated for various simd constructors.

#include <CL/sycl.hpp>
#include <sycl/ext/intel/experimental/esimd.hpp>

using namespace sycl;
using namespace sycl::ext::intel::experimental::esimd;

// clang-format off

// Array-based constructor, FP element type, no loops exected - check.
SYCL_EXTERNAL auto foo(double i) SYCL_ESIMD_FUNCTION {
// CHECK: define dso_local spir_func void @_Z3food(
// CHECK: {{[^,]*}} %[[RES:[a-zA-Z0-9_\.]+]],
// CHECK: {{[^,]*}} %[[I:[a-zA-Z0-9_\.]+]]){{.*}} {
simd<double, 2> val({ i, i });
return val;
// CHECK: %[[V0:[a-zA-Z0-9_\.]+]] = insertelement <2 x double> undef, double %[[I]], i64 0
// CHECK-NEXT: %[[V1:[a-zA-Z0-9_\.]+]] = shufflevector <2 x double> %[[V0]], <2 x double> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: %[[MDATA:[a-zA-Z0-9_\.]+]] = getelementptr inbounds {{.*}} %[[RES]], i64 0, i32 0, i32 0
// CHECK-NEXT: store <2 x double> %[[V1]], <2 x double> addrspace(4)* %[[MDATA]]
// CHECK-NEXT: ret void
// CHECK-NEXT: }
}

// Base + step constructor, FP element type, loops exected - don't check.
SYCL_EXTERNAL auto bar() SYCL_ESIMD_FUNCTION {
simd<double, 2> val(17, 3);
return val;
}

// Base + step constructor, integer element type, no loops exected - check.
SYCL_EXTERNAL auto baz() SYCL_ESIMD_FUNCTION {
// CHECK: define dso_local spir_func void @_Z3bazv({{.*}} %[[RES:[a-zA-Z0-9_\.]+]]){{.*}} {
simd<int, 2> val(17, 3);
return val;
// CHECK: %[[MDATA:[a-zA-Z0-9_\.]+]] = getelementptr inbounds {{.*}} %[[RES]], i64 0, i32 0, i32 0
// CHECK-NEXT: store <2 x i32> <i32 17, i32 20>, <2 x i32> addrspace(4)* %[[MDATA]]
// CHECK-NEXT: ret void
// CHECK-NEXT: }
}

// Broadcast constructor, FP element type, no loops exected - check.
SYCL_EXTERNAL auto gee() SYCL_ESIMD_FUNCTION {
// CHECK: define dso_local spir_func void @_Z3geev({{.*}} %[[RES:[a-zA-Z0-9_\.]+]]){{.*}} {
simd<float, 2> val(-7);
return val;
// CHECK: %[[MDATA:[a-zA-Z0-9_\.]+]] = getelementptr inbounds {{.*}} %[[RES]], i64 0, i32 0, i32 0
// CHECK-NEXT: store <2 x float> <float -7.000000e+00, float -7.000000e+00>, <2 x float> addrspace(4)* %[[MDATA]]
// CHECK-NEXT: ret void
// CHECK-NEXT: }
}

// Array-based simd_mask constructor, no loops exected - check.
SYCL_EXTERNAL auto foomask() SYCL_ESIMD_FUNCTION {
// CHECK: define dso_local spir_func void @_Z7foomaskv({{.*}} %[[RES:[a-zA-Z0-9_\.]+]]){{.*}} {
simd_mask<2> val({ 1, 0 });
return val;
// CHECK: %[[MDATA:[a-zA-Z0-9_\.]+]] = getelementptr inbounds {{.*}} %[[RES]], i64 0, i32 0, i32 0
// CHECK-NEXT: store <2 x i16> <i16 1, i16 0>, <2 x i16> addrspace(4)* %[[MDATA]]
// CHECK-NEXT: ret void
// CHECK-NEXT: }
}

// Broadcast simd_mask constructor, no loops exected - check.
SYCL_EXTERNAL auto geemask() SYCL_ESIMD_FUNCTION {
// CHECK: define dso_local spir_func void @_Z7geemaskv({{.*}} %[[RES:[a-zA-Z0-9_\.]+]]){{.*}} {
simd_mask<2> val(1);
return val;
// CHECK: %[[MDATA:[a-zA-Z0-9_\.]+]] = getelementptr inbounds {{.*}} %[[RES]], i64 0, i32 0, i32 0
// CHECK-NEXT: store <2 x i16> <i16 1, i16 1>, <2 x i16> addrspace(4)* %[[MDATA]]
// CHECK-NEXT: ret void
// CHECK-NEXT: }
}

// The element type is 'half', which requires conversion, so code generation
// is less efficient - has loop over elements. No much reason to check.
SYCL_EXTERNAL auto foohalf(half i) SYCL_ESIMD_FUNCTION {
simd<half, 2> val({ i, i });
return val;
}

// The element type is 'half', which requires conversion, so code generation
// is less efficient - has loop over elements. No much reason to check.
SYCL_EXTERNAL auto barhalf() SYCL_ESIMD_FUNCTION {
simd<half, 2> val(17, 3);
return val;
}

// Here the element is half too, but code generation is efficient because
// no per-element operations are needed - scalar is converted before broadcasting.
SYCL_EXTERNAL auto geehalf() SYCL_ESIMD_FUNCTION {
// CHECK: define dso_local spir_func void @_Z7geehalfv({{.*}} %[[RES:[a-zA-Z0-9_\.]+]]){{.*}} {
simd<half, 2> val(-7);
return val;
// CHECK: %[[MDATA:[a-zA-Z0-9_\.]+]] = getelementptr inbounds {{.*}} %[[RES]], i64 0, i32 0, i32 0
// CHECK-NEXT: store <2 x half> <half 0xHC700, half 0xHC700>, <2 x half> addrspace(4)* %[[MDATA]]
// CHECK-NEXT: ret void
// CHECK-NEXT: }
}