diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp
index ec8f2a0068ef8..d88f7f0ecde12 100644
--- a/sycl/include/sycl/ext/intel/esimd/memory.hpp
+++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp
@@ -3623,6 +3623,92 @@ gather(AccessorT acc, OffsetSimdViewT byte_offsets, simd_mask mask,
   return gather(acc, byte_offsets.read(), mask, pass_thru, props);
 }
 
+/// template <int VS, typename T, int N, typename AccessorT,
+///           typename OffsetSimdViewT,
+///           typename PropertyListT = empty_properties_t>
+/// simd<T, N> gather(AccessorT acc, OffsetSimdViewT byte_offsets,
+///                   simd_mask<N / VS> mask, simd<T, N> pass_thru,
+///                   PropertyListT props = {});
+/// This function is identical to (acc-ga-1) except that the \p byte_offsets
+/// is represented as \c simd_view.
+template <
+    int VS, typename T, int N, typename AccessorT, typename OffsetSimdViewT,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    (detail::is_device_accessor_with_v<AccessorT,
+                                       detail::accessor_mode_cap::can_read> &&
+     detail::is_simd_view_type_v<OffsetSimdViewT> &&
+     ext::oneapi::experimental::is_property_list_v<PropertyListT>),
+    simd<T, N>>
+gather(AccessorT acc, OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
+       simd<T, N> pass_thru, PropertyListT props = {}) {
+  static_assert(N / VS ==
+                    OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
+                "Size of pass_thru parameter must correspond to the size of "
+                "byte_offsets parameter.");
+  return gather<T, N, VS>(acc, byte_offsets.read(), mask, pass_thru, props);
+}
+
+/// template <int VS = 1, typename AccessorT, typename OffsetSimdViewT,
+///           typename PassThruSimdViewT,
+///           typename PropertyListT = empty_properties_t>
+/// simd<T, N> gather(AccessorT acc, OffsetSimdViewT byte_offsets,
+///                   simd_mask<N / VS> mask, PassThruSimdViewT pass_thru,
+///                   PropertyListT props = {});
+/// This function is identical to (acc-ga-1) except that the \p byte_offsets
+/// and \p pass_thru are represented as \c simd_view.
+template <
+    int VS = 1, typename AccessorT, typename OffsetSimdViewT,
+    typename PassThruSimdViewT,
+    int N = PassThruSimdViewT::getSizeX() * PassThruSimdViewT::getSizeY(),
+    typename T = PassThruSimdViewT::value_type::element_type,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    (detail::is_device_accessor_with_v<AccessorT,
+                                       detail::accessor_mode_cap::can_read> &&
+     detail::is_simd_view_type_v<OffsetSimdViewT> &&
+     detail::is_simd_view_type_v<PassThruSimdViewT> &&
+     ext::oneapi::experimental::is_property_list_v<PropertyListT>),
+    simd<T, N>>
+gather(AccessorT acc, OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
+       PassThruSimdViewT pass_thru, PropertyListT props = {}) {
+  static_assert(N / VS ==
+                    OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
+                "Size of pass_thru parameter must correspond to the size of "
+                "byte_offsets parameter.");
+  return gather<T, N, VS>(acc, byte_offsets.read(), mask, pass_thru.read(),
+                          props);
+}
+
+/// template <int VS = 1, typename AccessorT, typename OffsetT,
+///           typename PassThruSimdViewT,
+///           typename PropertyListT = empty_properties_t>
+/// simd<T, N> gather(AccessorT acc, simd<OffsetT, N / VS> byte_offsets,
+///                   simd_mask<N / VS> mask, PassThruSimdViewT pass_thru,
+///                   PropertyListT props = {});
+/// This function is identical to (acc-ga-1) except that the \p pass_thru
+/// is represented as \c simd_view.
+template <
+    int VS = 1, typename AccessorT, typename OffsetT,
+    typename PassThruSimdViewT,
+    int N = PassThruSimdViewT::getSizeX() * PassThruSimdViewT::getSizeY(),
+    typename T = PassThruSimdViewT::value_type::element_type,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    (detail::is_device_accessor_with_v<AccessorT,
+                                       detail::accessor_mode_cap::can_read> &&
+     detail::is_simd_view_type_v<PassThruSimdViewT> &&
+     ext::oneapi::experimental::is_property_list_v<PropertyListT>),
+    simd<T, N>>
+gather(AccessorT acc, simd<OffsetT, N / VS> byte_offsets,
+       simd_mask<N / VS> mask, PassThruSimdViewT pass_thru,
+       PropertyListT props = {}) {
+  return gather<T, N, VS>(acc, byte_offsets, mask, pass_thru.read(), props);
+}
+
 /// template
@@ -4672,6 +4758,140 @@ slm_gather(OffsetSimdViewT byte_offsets, simd_mask mask,
   return slm_gather(byte_offsets.read(), mask, pass_thru, props);
 }
 
+/// template <int VS, typename T, int N, typename OffsetSimdViewT,
+///           typename PropertyListT = empty_properties_t>
+/// simd<T, N> slm_gather(
+///     OffsetSimdViewT byte_offsets,
+///     simd_mask<N / VS> mask, simd<T, N> pass_thru,
+///     PropertyListT props = {});
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters.
+/// Loads ("gathers") elements of the type 'T' from Shared Local Memory
+/// locations addressed by byte offsets \p byte_offsets, and returns the loaded
+/// elements. Access to any element's memory location can be disabled via the
+/// input vector of predicates \p mask. If mask[i] is unset, then the load from
+/// (byte_offsets[i]) is skipped and the corresponding i-th element from
+/// \p pass_thru operand is returned.
+/// @tparam VS Vector size. It can also be read as the number of reads per each
+/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported
+/// only on DG2 and PVC.
+/// @param byte_offsets the vector of 32-bit offsets in bytes.
+/// For each i, (byte_offsets[i]) must be element size aligned.
+/// If the alignment property is not passed, then it is assumed that each
+/// accessed address is aligned by element-size.
+/// @param mask The access mask, defaults to all 1s.
+/// @param pass_thru The vector pass through values.
+/// @param props The optional compile-time properties. Only 'alignment'
+/// property is used.
+/// @return A vector of elements read.
+template <
+    int VS, typename T, int N, typename OffsetSimdViewT,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    (detail::is_simd_view_type_v<OffsetSimdViewT> &&
+     ext::oneapi::experimental::is_property_list_v<PropertyListT>),
+    simd<T, N>>
+slm_gather(OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
+           simd<T, N> pass_thru, PropertyListT props = {}) {
+  static_assert(N / VS ==
+                    OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
+                "Size of pass_thru parameter must correspond to the size of "
+                "byte_offsets parameter.");
+  return slm_gather<T, N, VS>(byte_offsets.read(), mask, pass_thru, props);
+}
+
+/// template <int VS = 1, typename OffsetSimdViewT,
+///           typename PassThruSimdViewT,
+///           typename PropertyListT = empty_properties_t>
+/// simd<T, N> slm_gather(
+///     OffsetSimdViewT byte_offsets,
+///     simd_mask<N / VS> mask, PassThruSimdViewT pass_thru,
+///     PropertyListT props = {});
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters.
+/// Loads ("gathers") elements of the type 'T' from Shared Local Memory
+/// locations addressed by byte offsets \p byte_offsets, and returns the loaded
+/// elements. Access to any element's memory location can be disabled via the
+/// input vector of predicates \p mask. If mask[i] is unset, then the load from
+/// (byte_offsets[i]) is skipped and the corresponding i-th element from
+/// \p pass_thru operand is returned.
+/// @tparam VS Vector size. It can also be read as the number of reads per each
+/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported
+/// only on DG2 and PVC.
+/// @param byte_offsets the vector of 32-bit offsets in bytes.
+/// For each i, (byte_offsets[i]) must be element size aligned.
+/// If the alignment property is not passed, then it is assumed that each
+/// accessed address is aligned by element-size.
+/// @param mask The access mask, defaults to all 1s.
+/// @param pass_thru The vector pass through values.
+/// @param props The optional compile-time properties. Only 'alignment'
+/// property is used.
+/// @return A vector of elements read.
+template <
+    int VS = 1, typename OffsetSimdViewT, typename PassThruSimdViewT,
+    int N = PassThruSimdViewT::getSizeX() * PassThruSimdViewT::getSizeY(),
+    typename T = PassThruSimdViewT::value_type::element_type,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    (detail::is_simd_view_type_v<OffsetSimdViewT> &&
+     detail::is_simd_view_type_v<PassThruSimdViewT> &&
+     ext::oneapi::experimental::is_property_list_v<PropertyListT>),
+    simd<T, N>>
+slm_gather(OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
+           PassThruSimdViewT pass_thru, PropertyListT props = {}) {
+  static_assert(N / VS ==
+                    OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
+                "Size of pass_thru parameter must correspond to the size of "
+                "byte_offsets parameter.");
+  return slm_gather<T, N, VS>(byte_offsets.read(), mask, pass_thru.read(),
+                              props);
+}
+
+/// template <int VS = 1, typename PassThruSimdViewT,
+///           typename PropertyListT = empty_properties_t>
+/// simd<T, N> slm_gather(
+///     simd<uint32_t, N / VS> byte_offsets,
+///     simd_mask<N / VS> mask, PassThruSimdViewT pass_thru,
+///     PropertyListT props = {});
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters.
+/// Loads ("gathers") elements of the type 'T' from Shared Local Memory
+/// locations addressed by byte offsets \p byte_offsets, and returns the loaded
+/// elements. Access to any element's memory location can be disabled via the
+/// input vector of predicates \p mask. If mask[i] is unset, then the load from
+/// (byte_offsets[i]) is skipped and the corresponding i-th element from
+/// \p pass_thru operand is returned.
+/// @tparam VS Vector size. It can also be read as the number of reads per each
+/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported
+/// only on DG2 and PVC.
+/// @param byte_offsets the vector of 32-bit offsets in bytes.
+/// For each i, (byte_offsets[i]) must be element size aligned.
+/// If the alignment property is not passed, then it is assumed that each
+/// accessed address is aligned by element-size.
+/// @param mask The access mask, defaults to all 1s.
+/// @param pass_thru The vector pass through values.
+/// @param props The optional compile-time properties. Only 'alignment'
+/// property is used.
+/// @return A vector of elements read.
+template <
+    int VS = 1, typename PassThruSimdViewT,
+    int N = PassThruSimdViewT::getSizeX() * PassThruSimdViewT::getSizeY(),
+    typename T = PassThruSimdViewT::value_type::element_type,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    (detail::is_simd_view_type_v<PassThruSimdViewT> &&
+     ext::oneapi::experimental::is_property_list_v<PropertyListT>),
+    simd<T, N>>
+slm_gather(simd<uint32_t, N / VS> byte_offsets, simd_mask<N / VS> mask,
+           PassThruSimdViewT pass_thru, PropertyListT props = {}) {
+  return slm_gather<T, N, VS>(byte_offsets, mask, pass_thru.read(), props);
+}
+
 /// simd slm_gather(
 ///     OffsetSimdViewT byte_offsets,
 ///     simd_mask mask, PropertyListT props = {}); // (slm-ga-8)
@@ -8420,6 +8640,91 @@ gather(AccessorT acc, OffsetSimdViewT byte_offsets, simd_mask mask,
   return gather(acc, byte_offsets.read(), mask, pass_thru, props);
 }
 
+/// template <int VS, typename T, int N, typename AccessorT,
+///           typename OffsetSimdViewT,
+///           typename PropertyListT = empty_properties_t>
+/// simd<T, N> gather(AccessorT acc, OffsetSimdViewT byte_offsets,
+///                   simd_mask<N / VS> mask, simd<T, N> pass_thru,
+///                   PropertyListT props = {});
+/// This function is identical to (lacc-ga-1) except that the \p byte_offsets
+/// is represented as \c simd_view.
+template <
+    int VS, typename T, int N, typename AccessorT, typename OffsetSimdViewT,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    (detail::is_local_accessor_with_v<AccessorT,
+                                      detail::accessor_mode_cap::can_read> &&
+     detail::is_simd_view_type_v<OffsetSimdViewT> &&
+     ext::oneapi::experimental::is_property_list_v<PropertyListT>),
+    simd<T, N>>
+gather(AccessorT acc, OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
+       simd<T, N> pass_thru, PropertyListT props = {}) {
+  static_assert(N / VS ==
+                    OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
+                "Size of pass_thru parameter must correspond to the size of "
+                "byte_offsets parameter.");
+  return gather<T, N, VS>(acc, byte_offsets.read(), mask, pass_thru, props);
+}
+
+/// template <int VS = 1, typename AccessorT, typename OffsetSimdViewT,
+///           typename PassThruSimdViewT,
+///           typename PropertyListT = empty_properties_t>
+/// simd<T, N> gather(AccessorT acc, OffsetSimdViewT byte_offsets,
+///                   simd_mask<N / VS> mask, PassThruSimdViewT pass_thru,
+///                   PropertyListT props = {});
+/// This function is identical to (lacc-ga-1) except that the \p byte_offsets
+/// and \p pass_thru are represented as \c simd_view.
+template <
+    int VS = 1, typename AccessorT, typename OffsetSimdViewT,
+    typename PassThruSimdViewT,
+    int N = PassThruSimdViewT::getSizeX() * PassThruSimdViewT::getSizeY(),
+    typename T = PassThruSimdViewT::value_type::element_type,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    (detail::is_local_accessor_with_v<AccessorT,
+                                      detail::accessor_mode_cap::can_read> &&
+     detail::is_simd_view_type_v<OffsetSimdViewT> &&
+     detail::is_simd_view_type_v<PassThruSimdViewT> &&
+     ext::oneapi::experimental::is_property_list_v<PropertyListT>),
+    simd<T, N>>
+gather(AccessorT acc, OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
+       PassThruSimdViewT pass_thru, PropertyListT props = {}) {
+  static_assert(N / VS ==
+                    OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
+                "Size of pass_thru parameter must correspond to the size of "
+                "byte_offsets parameter.");
+  return gather<T, N, VS>(acc, byte_offsets.read(), mask, pass_thru.read(),
+                          props);
+}
+
+/// template <int VS = 1, typename AccessorT, typename PassThruSimdViewT,
+///           typename PropertyListT = empty_properties_t>
+/// simd<T, N> gather(AccessorT acc, simd<uint32_t, N / VS> byte_offsets,
+///                   simd_mask<N / VS> mask, PassThruSimdViewT pass_thru,
+///                   PropertyListT props = {});
+/// This function is identical to (lacc-ga-1) except that the \p pass_thru
+/// is represented as \c simd_view.
+template < + int VS = 1, typename AccessorT, typename PassThruSimdViewT, + int N = PassThruSimdViewT::getSizeX() * PassThruSimdViewT::getSizeY(), + typename T = PassThruSimdViewT::value_type::element_type, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + (detail::is_local_accessor_with_v && + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v), + simd> +gather(AccessorT acc, simd byte_offsets, + simd_mask mask, PassThruSimdViewT pass_thru, + PropertyListT props = {}) { + return gather(acc, byte_offsets, mask, pass_thru.read(), props); +} + /// template diff --git a/sycl/test/esimd/memory_properties_gather.cpp b/sycl/test/esimd/memory_properties_gather.cpp index 6588af96cad47..bec6500a229d5 100644 --- a/sycl/test/esimd/memory_properties_gather.cpp +++ b/sycl/test/esimd/memory_properties_gather.cpp @@ -284,8 +284,8 @@ test_gather(AccType &acc, LocalAccType &local_acc, float *ptrf, // 5) gather(acc, offsets): offsets is simd or simd_view // CHECK-STATEFUL-COUNT-12: call <32 x float> @llvm.genx.gather.masked.scaled2.v32f32.v32i32.v32i1(i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x i1> {{[^)]+}}) - // CHECK-STATEFUL-COUNT-14: call <32 x i32> @llvm.genx.lsc.load.merge.bti.v32i32.v32i1.v32i32(<32 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <32 x i32> {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}) - // CHECK-STATELESS-COUNT-26: call <32 x float> @llvm.masked.gather.v32f32.v32p4(<32 x ptr addrspace(4)> {{[^)]+}}, i32 4, <32 x i1> {{[^)]+}}, <32 x float> {{[^)]+}}) + // CHECK-STATEFUL-COUNT-28: call <32 x i32> @llvm.genx.lsc.load.merge.bti.v32i32.v32i1.v32i32(<32 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <32 x i32> {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}) + // CHECK-STATELESS-COUNT-40: call <32 x float> @llvm.masked.gather.v32f32.v32p4(<32 x ptr addrspace(4)> {{[^)]+}}, i32 4, <32 x i1> {{[^)]+}}, <32 x float> {{[^)]+}}) acc_res = gather(acc, ioffset_n32); acc_res = gather(acc, ioffset_n32_view); acc_res = gather(acc, ioffset_n32_view.select<32, 1>()); @@ -331,9 +331,31 @@ test_gather(AccType &acc, LocalAccType &local_acc, float *ptrf, acc_res = gather(acc, ioffset_n32_view.select<32, 1>(), mask_n32, pass_thru_view.select<32, 1>(), props_align4); + acc_res = gather(acc, ioffset_n32, mask_n32, pass_thru); + acc_res = gather(acc, ioffset_n32_view, mask_n32, pass_thru); + acc_res = gather(acc, ioffset_n32_view.select<32, 1>(), mask_n32, pass_thru); + + acc_res = gather(acc, ioffset_n32, mask_n32, pass_thru, props_align4); + acc_res = gather(acc, ioffset_n32_view, mask_n32, pass_thru, props_align4); + acc_res = gather(acc, ioffset_n32_view.select<32, 1>(), mask_n32, pass_thru, + props_align4); + + acc_res = gather(acc, ioffset_n32, mask_n32, pass_thru_view); + acc_res = gather(acc, ioffset_n32_view, mask_n32, pass_thru_view); + acc_res = gather(acc, ioffset_n32, mask_n32, pass_thru_view, props_align4); + acc_res = + gather(acc, ioffset_n32_view, mask_n32, pass_thru_view, props_align4); + acc_res = gather(acc, ioffset_n32, mask_n32, pass_thru_view.select<32, 1>()); + acc_res = gather(acc, ioffset_n32_view.select<32, 1>(), mask_n32, + pass_thru_view.select<32, 1>()); + acc_res = gather(acc, ioffset_n32, mask_n32, pass_thru_view.select<32, 1>(), + props_align4); + acc_res = gather(acc, ioffset_n32_view.select<32, 1>(), mask_n32, + pass_thru_view.select<32, 1>(), props_align4); + // 8) gather(ac, ...): same as (5), (6), (7) 
above, but with VS > 1. - // CHECK-STATEFUL-COUNT-26: call <32 x i32> @llvm.genx.lsc.load.merge.bti.v32i32.v16i1.v16i32(<16 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i32> {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}) - // CHECK-STATELESS-COUNT-26: call <32 x i32> @llvm.genx.lsc.load.merge.stateless.v32i32.v16i1.v16i64(<16 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i64> {{[^)]+}}, i32 0, <32 x i32> {{[^)]+}}) + // CHECK-STATEFUL-COUNT-38: call <32 x i32> @llvm.genx.lsc.load.merge.bti.v32i32.v16i1.v16i32(<16 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i32> {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}) + // CHECK-STATELESS-COUNT-38: call <32 x i32> @llvm.genx.lsc.load.merge.stateless.v32i32.v16i1.v16i64(<16 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i64> {{[^)]+}}, i32 0, <32 x i32> {{[^)]+}}) acc_res = gather(acc, ioffset_n16); acc_res = gather(acc, ioffset_n16_view); acc_res = gather(acc, ioffset_n16, props_align4); @@ -377,9 +399,30 @@ test_gather(AccType &acc, LocalAccType &local_acc, float *ptrf, acc_res = gather(acc, ioffset_n16_view.select<16, 1>(), mask_n16, pass_thru_view.select<32, 1>(), props_align4); + acc_res = gather<2>(acc, ioffset_n16_view, mask_n16, pass_thru); + + acc_res = gather<2>(acc, ioffset_n16_view, mask_n16, pass_thru, props_align4); + acc_res = gather<2>(acc, ioffset_n16, mask_n16, pass_thru_view); + acc_res = gather<2>(acc, ioffset_n16_view, mask_n16, pass_thru_view); + acc_res = gather<2>(acc, ioffset_n16, mask_n16, pass_thru_view, props_align4); + acc_res = + gather<2>(acc, ioffset_n16_view, mask_n16, pass_thru_view, props_align4); + + acc_res = + gather<2>(acc, ioffset_n16_view.select<16, 1>(), mask_n16, pass_thru); + acc_res = gather<2>(acc, ioffset_n16_view.select<16, 1>(), mask_n16, + pass_thru, props_align4); + acc_res = + gather<2>(acc, ioffset_n16, mask_n16, pass_thru_view.select<32, 1>()); + acc_res = gather<2>(acc, ioffset_n16_view.select<16, 1>(), mask_n16, + pass_thru_view.select<32, 1>()); + acc_res = gather<2>(acc, ioffset_n16, mask_n16, + pass_thru_view.select<32, 1>(), props_align4); + acc_res = gather<2>(acc, ioffset_n16_view.select<16, 1>(), mask_n16, + pass_thru_view.select<32, 1>(), props_align4); // 9) gather(lacc, offsets): offsets is simd or simd_view - // CHECK-COUNT-26: call <32 x float> @llvm.masked.gather.v32f32.v32p3(<32 x ptr addrspace(3)> {{[^)]+}}, i32 4, <32 x i1> {{[^)]+}}, <32 x float> {{[^)]+}}) + // CHECK-COUNT-38: call <32 x float> @llvm.masked.gather.v32f32.v32p3(<32 x ptr addrspace(3)> {{[^)]+}}, i32 4, <32 x i1> {{[^)]+}}, <32 x float> {{[^)]+}}) acc_res = gather(local_acc, ioffset_n32); acc_res = gather(local_acc, ioffset_n32_view); acc_res = gather(local_acc, ioffset_n32_view.select<32, 1>()); @@ -432,8 +475,34 @@ test_gather(AccType &acc, LocalAccType &local_acc, float *ptrf, gather(local_acc, ioffset_n32_view.select<32, 1>(), mask_n32, pass_thru_view.select<32, 1>(), props_align4); + acc_res = gather(local_acc, ioffset_n32_view, mask_n32, pass_thru); + + acc_res = + gather(local_acc, ioffset_n32_view, mask_n32, pass_thru, props_align4); + + acc_res = gather(local_acc, ioffset_n32, mask_n32, pass_thru_view); + acc_res = gather(local_acc, ioffset_n32_view, mask_n32, pass_thru_view); + acc_res = + gather(local_acc, ioffset_n32, mask_n32, pass_thru_view, props_align4); + acc_res = gather(local_acc, ioffset_n32_view, mask_n32, pass_thru_view, + props_align4); + + acc_res = + 
gather(local_acc, ioffset_n32_view.select<32, 1>(), mask_n32, pass_thru); + acc_res = gather(local_acc, ioffset_n32_view.select<32, 1>(), mask_n32, + pass_thru, props_align4); + + acc_res = + gather(local_acc, ioffset_n32, mask_n32, pass_thru_view.select<32, 1>()); + acc_res = gather(local_acc, ioffset_n32_view.select<32, 1>(), mask_n32, + pass_thru_view.select<32, 1>()); + acc_res = gather(local_acc, ioffset_n32, mask_n32, + pass_thru_view.select<32, 1>(), props_align4); + acc_res = gather(local_acc, ioffset_n32_view.select<32, 1>(), mask_n32, + pass_thru_view.select<32, 1>(), props_align4); + // 12) gather(lacc, ...): same as (9), (10), (11) above, but with VS > 1. - // CHECK-COUNT-27: call <32 x i32> @llvm.genx.lsc.load.merge.slm.v32i32.v16i1.v16i32(<16 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i32> {{[^)]+}}, i32 0, <32 x i32> {{[^)]+}}) + // CHECK-COUNT-39: call <32 x i32> @llvm.genx.lsc.load.merge.slm.v32i32.v16i1.v16i32(<16 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i32> {{[^)]+}}, i32 0, <32 x i32> {{[^)]+}}) acc_res = gather(local_acc, ioffset_n16); acc_res = gather(local_acc, ioffset_n16_view); acc_res = gather(local_acc, ioffset_n16, props_align4); @@ -483,6 +552,29 @@ test_gather(AccType &acc, LocalAccType &local_acc, float *ptrf, mask_n16, pass_thru_view.select<32, 1>(), props_align4); + acc_res = gather<2>(local_acc, ioffset_n16_view, mask_n16, pass_thru); + acc_res = + gather<2>(local_acc, ioffset_n16_view, mask_n16, pass_thru, props_align4); + acc_res = gather<2>(local_acc, ioffset_n16, mask_n16, pass_thru_view); + acc_res = gather<2>(local_acc, ioffset_n16_view, mask_n16, pass_thru_view); + acc_res = + gather<2>(local_acc, ioffset_n16, mask_n16, pass_thru_view, props_align4); + acc_res = gather<2>(local_acc, ioffset_n16_view, mask_n16, pass_thru_view, + props_align4); + + acc_res = gather<2>(local_acc, ioffset_n16_view.select<16, 1>(), mask_n16, + pass_thru); + acc_res = gather<2>(local_acc, ioffset_n16_view.select<16, 1>(), mask_n16, + pass_thru, props_align4); + acc_res = gather<2>(local_acc, ioffset_n16, mask_n16, + pass_thru_view.select<32, 1>()); + acc_res = gather<2>(local_acc, ioffset_n16_view.select<16, 1>(), mask_n16, + pass_thru_view.select<32, 1>()); + acc_res = gather<2>(local_acc, ioffset_n16, mask_n16, + pass_thru_view.select<32, 1>(), props_align4); + acc_res = gather<2>(local_acc, ioffset_n16_view.select<16, 1>(), mask_n16, + pass_thru_view.select<32, 1>(), props_align4); + // Validate that a new API doesn't conflict with the old API. 
// CHECK-COUNT-2: call <32 x float> @llvm.masked.gather.v32f32.v32p3(<32 x ptr addrspace(3)> {{[^)]+}}, i32 4, <32 x i1> {{[^)]+}}, <32 x float> {{[^)]+}}) acc_res = gather(local_acc, ioffset_n32, 0); @@ -539,7 +631,7 @@ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void test_slm_gather(int byte_offset32) { props_align8); // 3) slm_gather(offsets, mask, pass_thru) - // CHECK-COUNT-7: call <32 x float> @llvm.masked.gather.v32f32.v32p3(<32 x ptr addrspace(3)> {{[^)]+}}, i32 4, <32 x i1> {{[^)]+}}, <32 x float> {{[^)]+}}) + // CHECK-COUNT-13: call <32 x float> @llvm.masked.gather.v32f32.v32p3(<32 x ptr addrspace(3)> {{[^)]+}}, i32 4, <32 x i1> {{[^)]+}}, <32 x float> {{[^)]+}}) slm = slm_gather(ioffset_n32, mask_n32, pass_thru); slm = slm_gather(ioffset_n32_view, mask_n32, pass_thru); slm = slm_gather(ioffset_n32, mask_n32, pass_thru_view); @@ -551,7 +643,15 @@ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void test_slm_gather(int byte_offset32) { slm = slm_gather(ioffset_n32_view.select<32, 1>(), mask_n32, pass_thru_view.select<32, 1>()); - // CHECK-COUNT-7: call <32 x float> @llvm.masked.gather.v32f32.v32p3(<32 x ptr addrspace(3)> {{[^)]+}}, i32 8, <32 x i1> {{[^)]+}}, <32 x float> {{[^)]+}}) + slm = slm_gather(ioffset_n32_view, mask_n32, pass_thru); + slm = slm_gather(ioffset_n32, mask_n32, pass_thru_view); + slm = slm_gather(ioffset_n32_view, mask_n32, pass_thru_view); + slm = slm_gather(ioffset_n32_view.select<32, 1>(), mask_n32, pass_thru); + slm = slm_gather(ioffset_n32, mask_n32, pass_thru_view.select<32, 1>()); + slm = slm_gather(ioffset_n32_view.select<32, 1>(), mask_n32, + pass_thru_view.select<32, 1>()); + + // CHECK-COUNT-13: call <32 x float> @llvm.masked.gather.v32f32.v32p3(<32 x ptr addrspace(3)> {{[^)]+}}, i32 8, <32 x i1> {{[^)]+}}, <32 x float> {{[^)]+}}) slm = slm_gather(ioffset_n32, mask_n32, pass_thru, props_align8); slm = slm_gather(ioffset_n32_view, mask_n32, pass_thru, props_align8); @@ -566,8 +666,18 @@ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void test_slm_gather(int byte_offset32) { slm = slm_gather(ioffset_n32_view.select<32, 1>(), mask_n32, pass_thru_view.select<32, 1>(), props_align8); + slm = slm_gather(ioffset_n32_view, mask_n32, pass_thru, props_align8); + slm = slm_gather(ioffset_n32, mask_n32, pass_thru_view, props_align8); + slm = slm_gather(ioffset_n32_view, mask_n32, pass_thru_view, props_align8); + slm = slm_gather(ioffset_n32_view.select<32, 1>(), mask_n32, pass_thru, + props_align8); + slm = slm_gather(ioffset_n32, mask_n32, pass_thru_view.select<32, 1>(), + props_align8); + slm = slm_gather(ioffset_n32_view.select<32, 1>(), mask_n32, + pass_thru_view.select<32, 1>(), props_align8); + // 4) slm_gather(...): same as (1), (2), (3) above, but with VS > 1. - // CHECK-COUNT-26: call <32 x i32> @llvm.genx.lsc.load.merge.slm.v32i32.v16i1.v16i32(<16 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i32> {{[^)]+}}, i32 0, <32 x i32> {{[^)]+}}) + // CHECK-COUNT-38: call <32 x i32> @llvm.genx.lsc.load.merge.slm.v32i32.v16i1.v16i32(<16 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i32> {{[^)]+}}, i32 0, <32 x i32> {{[^)]+}}) // 4a) check VS > 1. no 'mask' operand first. 
slm = slm_gather(ioffset_n16); slm = slm_gather(ioffset_n16_view); @@ -613,4 +723,21 @@ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void test_slm_gather(int byte_offset32) { pass_thru_view.select<32, 1>(), props_align4); slm = slm_gather(ioffset_n16_view.select<16, 1>(), mask_n16, pass_thru_view.select<32, 1>(), props_align4); + + slm = slm_gather<2>(ioffset_n16_view, mask_n16, pass_thru); + slm = slm_gather<2>(ioffset_n16, mask_n16, pass_thru_view); + slm = slm_gather<2>(ioffset_n16_view, mask_n16, pass_thru_view); + slm = slm_gather<2>(ioffset_n16_view.select<16, 1>(), mask_n16, pass_thru); + slm = slm_gather<2>(ioffset_n16, mask_n16, pass_thru_view.select<32, 1>()); + slm = slm_gather<2>(ioffset_n16_view.select<16, 1>(), mask_n16, + pass_thru_view.select<32, 1>()); + slm = slm_gather<2>(ioffset_n16_view, mask_n16, pass_thru, props_align4); + slm = slm_gather<2>(ioffset_n16, mask_n16, pass_thru_view, props_align4); + slm = slm_gather<2>(ioffset_n16_view, mask_n16, pass_thru_view, props_align4); + slm = slm_gather<2>(ioffset_n16_view.select<16, 1>(), mask_n16, pass_thru, + props_align4); + slm = slm_gather<2>(ioffset_n16, mask_n16, pass_thru_view.select<32, 1>(), + props_align4); + slm = slm_gather<2>(ioffset_n16_view.select<16, 1>(), mask_n16, + pass_thru_view.select<32, 1>(), props_align4); } \ No newline at end of file
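
Note (not part of the patch): below is a minimal usage sketch of the call shapes enabled by the new simd_view overloads, mirroring the forms exercised in memory_properties_gather.cpp. It assumes the standard ESIMD setup (sycl/ext/intel/esimd.hpp, a read-capable device accessor, SLM allocated via slm_init); the function and variable names are illustrative only.

```cpp
// Illustrative sketch only -- not part of the diff. Assumes a read-capable
// device accessor 'acc'; in real code slm_init is typically called once at
// kernel entry rather than mid-function.
#include <sycl/ext/intel/esimd.hpp>
using namespace sycl::ext::intel::esimd;

template <typename AccT>
SYCL_ESIMD_FUNCTION void gather_view_example(AccT acc) {
  simd<uint32_t, 32> byte_offsets(0, sizeof(float)); // 0, 4, 8, ... bytes
  simd_mask<32> mask(1);                             // all lanes enabled
  simd<float, 32> pass_thru(0.0f);
  auto offsets_view = byte_offsets.select<32, 1>();  // simd_view over offsets
  auto pass_thru_view = pass_thru.select<32, 1>();   // simd_view over pass_thru

  // Accessor-based gather: pass_thru (and optionally byte_offsets) may now be
  // a simd_view; T and N are deduced from the view, VS defaults to 1.
  simd<float, 32> r1 = gather(acc, byte_offsets, mask, pass_thru_view);
  simd<float, 32> r2 = gather(acc, offsets_view, mask, pass_thru_view);

  // SLM gather with the same call shapes.
  slm_init<32 * sizeof(float)>();
  simd<float, 32> r3 = slm_gather(byte_offsets, mask, pass_thru_view);
  simd<float, 32> r4 = slm_gather(offsets_view, mask, pass_thru_view);

  // VS = 2: 16 addresses, 2 elements loaded per address (DG2/PVC only).
  simd<uint32_t, 16> byte_offsets16(0, 2 * sizeof(float));
  simd_mask<16> mask16(1);
  simd<float, 32> r5 = slm_gather<2>(byte_offsets16, mask16, pass_thru_view);
}
```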