From d825ba8bec816039b824fb9c6a2fac041abd18ce Mon Sep 17 00:00:00 2001 From: "Klochkov, Vyacheslav N" Date: Mon, 6 Nov 2023 09:32:25 -0800 Subject: [PATCH] [ESIMD] Fix implementations of block_load(usm, ...) and block_load(acc,...) 1) Fix the big mess in E2E test for block_load(). Test did not really check the mask variant. It also used wrong alignments. 2) Fix the comments for USM and ACC block_load implementations. 3) Minor optimization for ACC block_load functions that do not accept the byte_offset operand. We can assume align16 for them. Signed-off-by: Klochkov, Vyacheslav N --- .../ext/intel/esimd/detail/memory_intrin.hpp | 2 +- sycl/include/sycl/ext/intel/esimd/memory.hpp | 392 +++++++++++------- sycl/test-e2e/ESIMD/esimd_test_utils.hpp | 27 +- .../unified_memory_api/Inputs/block_load.hpp | 194 +++++---- sycl/test/esimd/memory_properties.cpp | 2 +- 5 files changed, 377 insertions(+), 240 deletions(-) diff --git a/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp b/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp index e7eed0c5ecfc1..317a33c56000d 100644 --- a/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp +++ b/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp @@ -212,7 +212,7 @@ __ESIMD_INTRIN __ESIMD_DNS::vector_type_t()> __esimd_lsc_load_merge_bti( __ESIMD_DNS::simd_mask_storage_t pred, __ESIMD_DNS::vector_type_t offsets, SurfIndAliasT surf_ind, - __ESIMD_DNS::vector_type_t()> PassThru = 0) + __ESIMD_DNS::vector_type_t()> PassThru) #ifdef __SYCL_DEVICE_ONLY__ ; #else // __SYCL_DEVICE_ONLY__ diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp index 9f6ca4fe99ba7..16d4bcf2ab225 100644 --- a/sycl/include/sycl/ext/intel/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp @@ -232,8 +232,8 @@ using DeviceAccessorOffsetT = uint64_t; using DeviceAccessorOffsetT = uint32_t; #endif -template +template __ESIMD_API std::enable_if_t, simd> block_load_impl(const 
T *p, simd_mask<1> pred, FlagsT flags) { // Verify input template arguments. @@ -317,8 +317,8 @@ block_load_impl(const T *p, simd_mask<1> pred, FlagsT flags) { /// @param flags is the alignment specifier type tag. /// @return is a vector of type T and size NElts. /// -template +template __ESIMD_API std::enable_if_t, simd> block_load_impl(const T *p, simd_mask<1> pred, simd pass_thru, FlagsT flags) { @@ -406,9 +406,8 @@ block_load_impl(const T *p, simd_mask<1> pred, simd pass_thru, /// @return is a vector of type T and size NElts. The elements of the returned /// vector for which the corresponding element in \p pred is 0 are undefined. /// -template +template __ESIMD_API std::enable_if_t && @@ -506,9 +505,8 @@ __ESIMD_API /// @param flags is the alignment specifier type tag. /// @return is a vector of type T and size NElts /// -template +template __ESIMD_API std::enable_if_t && @@ -574,8 +572,8 @@ __ESIMD_API #endif // !__ESIMD_FORCE_STATELESS_MEM } -template +template __ESIMD_API std::enable_if_t> block_store_impl(T *p, simd vals, simd_mask<1> pred, FlagsT flags) { detail::check_cache_hint(); @@ -669,19 +667,22 @@ block_store(Tx *addr, simd vals, Flags) { /// of the type esimd::properties and may include esimd::cache_hint_L1, /// esimd::cache_hint_L2, esimd::cache_hint_L3, esimd::alignment. 
-/// simd block_load(const T* ptr, props={}); // (1) -/// simd block_load(const T* ptr, size_t offset, props={}); // (2) +/// simd block_load(const T* ptr, props={}); // (usm-bl-1) +/// simd block_load(const T* ptr, size_t byte_offset, +/// props={}); // (usm-bl-2) -/// simd block_load(const T* ptr, simd_mask<1> pred, props={}); // (3) -/// simd block_load(const T* ptr, size_t offset, simd_mask<1> pred, -/// props={}); // (4) +/// simd block_load(const T* ptr, simd_mask<1> pred, +/// props={}); // (usm-bl-3) +/// simd block_load(const T* ptr, size_t byte_offset, +/// simd_mask<1> pred, props={}); // (usm-bl-4) /// simd block_load(const T* ptr, simd_mask<1> pred, -/// simd pass_thru, props={}); // (5) -/// simd block_load(const T* ptr, size_t offset, simd_mask<1> pred, -/// simd pass_thru, props={}); // (6) +/// simd pass_thru, props={}); // (usm-bl-5) +/// simd block_load(const T* ptr, size_t byte_offset, +/// simd_mask<1> pred, simd pass_thru, +/// props={}); // (usm-bl-6) -/// simd block_load(const T* ptr, props={}); // (1) +/// simd block_load(const T* ptr, props={}); // (usm-bl-1) /// This function loads a contiguous memory block from USM pointer \p ptr. /// /// There may be temporary restrictions depending on L1, L2 cache hints, @@ -730,7 +731,7 @@ block_load(const T *ptr, PropertyListT props = {}) { if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none) { detail::check_cache_hint(); - constexpr int DefaultAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T); + constexpr size_t DefaultAlignment = (sizeof(T) <= 4) ? 
4 : sizeof(T); constexpr size_t Alignment = detail::getPropertyValue( DefaultAlignment); @@ -747,9 +748,10 @@ block_load(const T *ptr, PropertyListT props = {}) { } } -/// simd block_load(const T* ptr, size_t byte_offset, props={}); // (2) +/// simd block_load(const T* ptr, size_t byte_offset, +/// props={}); // (usm-bl-2) /// This function loads a contiguous memory block from address referenced -/// by USM pointer \p ptr and byte-offset \p byte_offset. +/// by USM pointer \p ptr and the given \p byte_offset. /// /// There may be temporary restrictions depending on L1, L2 cache hints, /// See details in the 'Restrictions' section below. The restrictions will be @@ -790,7 +792,8 @@ block_load(const T *ptr, size_t byte_offset, PropertyListT props = {}) { return block_load(AdjustedPtr, props); } -/// simd block_load(const T* ptr, simd_mask<1> pred, props={}); // (3) +/// simd block_load(const T* ptr, simd_mask<1> pred, +/// props={}); // (usm-bl-3) /// This function loads a contiguous memory block from USM pointer \p ptr. /// If the predicate \p pred is set to 0, then the load is omitted and the /// returned value is undefined. @@ -846,10 +849,10 @@ block_load(const T *ptr, simd_mask<1> pred, PropertyListT props = {}) { ptr, pred, overaligned_tag{}); } -/// simd block_load(const T* ptr, size_t byte_offset, simd_mask<1> pred, -/// props={}); // (4) +/// simd block_load(const T* ptr, size_t byte_offset, +/// simd_mask<1> pred, props={}); // (usm-bl-4) /// This function loads a contiguous memory block from address referenced -/// by USM pointer \p ptr and byte-offset \p byte_offset. +/// by USM pointer \p ptr and the given \p byte_offset. /// If the predicate \p pred is set to 0, then the load is omitted and the /// returned value is undefined. 
/// @@ -892,7 +895,7 @@ block_load(const T *ptr, size_t byte_offset, simd_mask<1> pred, } /// simd block_load(const T* ptr, simd_mask<1> pred, -/// simd pass_thru, props={}); // (5) +/// simd pass_thru, props={}); // (usm-bl-5) /// This function loads a contiguous memory block from USM pointer \p ptr. /// If the predicate \p pred is set to 0, then the load is omitted and the /// vector \p pass_thru is returned. @@ -949,10 +952,11 @@ block_load(const T *ptr, simd_mask<1> pred, simd pass_thru, ptr, pred, pass_thru, overaligned_tag{}); } -/// simd block_load(const T* ptr, size_t byte_offset, simd_mask<1> pred, -/// simd pass_thru, props={}); // (6) +/// simd block_load(const T* ptr, size_t byte_offset, +/// simd_mask<1> pred, simd pass_thru, +/// props={}); // (usm-bl-6) /// This function loads a contiguous memory block from address referenced -/// by USM pointer \p ptr and byte-offset \p byte_offset. +/// by USM pointer \p ptr and the given \p byte_offset. /// If the predicate \p pred is set to 0, then the load is omitted and the /// vector \p pass_thru is returned. /// @@ -1020,9 +1024,9 @@ block_load(const Tx *addr, Flags) { reinterpret_cast(addr)); } -/// Loads a contiguous block of memory from given accessor and offset and -/// returns the loaded data as a vector. Actual code generated depends on the -/// alignment parameter. +/// Loads a contiguous block of memory from the given accessor \p acc and +/// \p byte_offset and returns the loaded data as a vector. +/// Actual code generated depends on the alignment parameter. /// @tparam Tx Element type. /// @tparam N Number of elements to load, N * sizeof(Tx) must be /// 1, 2, 4 or 8 owords long. @@ -1031,7 +1035,7 @@ block_load(const Tx *addr, Flags) { /// \c Flags parameter. If it is less than \c 16, then slower unaligned /// access is generated, otherwise the access is aligned. /// @param acc The accessor. -/// @param offset The offset to load from in bytes. 
+/// @param byte_offset The offset to load from in bytes. /// @param Flags Specifies the alignment. /// @return A vector of loaded elements. /// @@ -1042,10 +1046,11 @@ template >, class T = detail::__raw_t> -__ESIMD_API simd -block_load(AccessorTy acc, detail::DeviceAccessorOffsetT offset, Flags flags) { +__ESIMD_API simd block_load(AccessorTy acc, + detail::DeviceAccessorOffsetT byte_offset, + Flags flags) { #ifdef __ESIMD_FORCE_STATELESS_MEM - return block_load(__ESIMD_DNS::accessorToPointer(acc, offset), + return block_load(__ESIMD_DNS::accessorToPointer(acc, byte_offset), flags); #else std::ignore = flags; @@ -1064,18 +1069,17 @@ block_load(AccessorTy acc, detail::DeviceAccessorOffsetT offset, Flags flags) { if constexpr (Flags::template alignment> >= detail::OperandSize::OWORD) { - return __esimd_oword_ld(surf_ind, offset >> 4); + return __esimd_oword_ld(surf_ind, byte_offset >> 4); } else { - return __esimd_oword_ld_unaligned(surf_ind, offset); + return __esimd_oword_ld_unaligned(surf_ind, byte_offset); } #endif } /// Each of the following block load functions loads a contiguous memory block -/// from the address referenced by accessor 'acc', or from 'acc + -/// offset', where 'offset' is the offset in bytes (not in elements!). The -/// parameter 'pred' is the one element predicate. If it is set to 1, then all -/// 'N' elements are loaded. Otherwise, the block load operation is a NO-OP. +/// from the address referenced by accessor 'acc', or from 'acc + byte_offset'. +/// The parameter 'pred' is the one element predicate. If it is set to 1, then +/// all 'N' elements are loaded. Otherwise, the block load operation is a NO-OP. /// The parameter 'pass_thru' specifies the values being copied to the returned /// result if 'pred' is set to 0. 
/// The parameter 'props' specifies the optional compile-time properties @@ -1083,26 +1087,26 @@ block_load(AccessorTy acc, detail::DeviceAccessorOffsetT offset, Flags flags) { /// esimd::cache_hint_L2, esimd::cache_hint_L3, esimd::alignment. /// simd -/// block_load(AccessorT acc, OffsetT offset, props = {}); // (acc-1) -/// simd block_load(AccessorT acc, props); // (acc-2) +/// block_load(AccessorT acc, OffsetT byte_offset, props = {}); // (acc-bl-1) +/// simd block_load(AccessorT acc, props = {}); // (acc-bl-2) /// simd -/// block_load(AccessorT acc, OffsetT offset, simd_mask<1> pred, -/// simd pass_thru, flags = {}); // (acc-3) +/// block_load(AccessorT acc, OffsetT byte_offset, simd_mask<1> pred, +/// simd pass_thru, props = {}); // (acc-bl-3) /// simd -/// block_load(AccessorT acc, OffsetT offset, simd_mask<1> pred, -/// flags = {}); // (acc-4) +/// block_load(AccessorT acc, OffsetT byte_offset, +/// simd_mask<1> pred, props = {}); // (acc-bl-4) /// simd /// block_load(AccessorT acc, simd_mask<1> pred, -/// simd pass_thru, flags = {}); // (acc-5) +/// simd pass_thru, props = {}); // (acc-bl-5) /// simd -/// block_load(AccessorT acc, simd_mask<1> pred, flags = {}); // (acc-6) +/// block_load(AccessorT acc, simd_mask<1> pred, props = {}); // (acc-bl-6) /// simd -/// block_load(AccessorT acc, OffsetT offset, props = {}); // (acc-1) +/// block_load(AccessorT acc, OffsetT byte_offset, props = {}); // (acc-bl-1) /// This function loads a contiguous memory block referenced -/// by accessor \p acc and byte-offset \p offset. +/// by accessor \p acc and \p byte_offset. /// /// The parameter \p props specifies the optional compile-time properties /// of the type esimd::properties and may include esimd::cache_hint_L1, @@ -1112,22 +1116,30 @@ block_load(AccessorTy acc, detail::DeviceAccessorOffsetT offset, Flags flags) { /// the cache_hint::none value is assumed by default. 
/// /// Alignment: If \p props does not specify the 'alignment' property, then -/// the \p offset must be at least 4-byte aligned for elements of 4-bytes or -/// smaller and 8-byte aligned for 8-byte elements. -/// -/// Restrictions - cache hint imposed - temporary: -/// If L1 or L2 cache hint is passed, then: -/// R1: The pointer must be at least 4-byte aligned for elements of 4-bytes or -/// smaller and 8-byte aligned for 8-byte elements. -/// R2: The number of elements for 8-byte data: 1, 2, 3, 4, 8, 16, 32, 64; -/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64, +/// the \p byte_offset must be at least 4-byte aligned for elements of 4-bytes +/// or smaller and 8-byte aligned for 8-byte elements. +/// The alignment requirement may be less strict if stateless memory mode is ON, +/// see block_load(usm_ptr, props) (aka usm-bl-1) for details/requirements. +/// +/// Restrictions: there may be some extra restrictions depending on +/// a) stateless memory mode enforcement is ON, +/// b) cache hints are used, +/// c) number of bytes loaded is either 16,32,64, or 128. +/// If (b) || !(c), then the target device must be DG2 or PVC (not Gen12). +/// If (a) && !(b), then there is no restriction on the number of elements +/// to be loaded and \p byte_offset must be only element-aligned. +/// +/// Gen12 requirements: !(b) && (c). +/// It can load 16-, 32-, 64-, or 128-bytes only. +/// DG2/PVC requirements: +/// It can load such number of elements depending on the type 'T': +/// for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2), /// or 128(only if alignment is 8-bytes or more); -/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128, +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2), /// or 256(only if alignment is 8-bytes or more); -/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256, +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2), /// or 512(only if alignment is 8-bytes or more). 
-/// R3: The target device must be DG2, PVC or newer GPU. -/// (R1), (R2) and (R3) are not applied if there are no cache hints. template @@ -1136,10 +1148,11 @@ __ESIMD_API std::enable_if_t< detail::is_device_accessor_with_v, simd> -block_load(AccessorT acc, detail::DeviceAccessorOffsetT offset, +block_load(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset, PropertyListT props = {}) { #ifdef __ESIMD_FORCE_STATELESS_MEM - return block_load(detail::accessorToPointer(acc, offset), props); + return block_load(detail::accessorToPointer(acc, byte_offset), + props); #else // !__ESIMD_FORCE_STATELESS_MEM constexpr auto L1Hint = detail::getPropertyValue( @@ -1166,41 +1179,42 @@ block_load(AccessorT acc, detail::DeviceAccessorOffsetT offset, if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none || !IsLegacySize) { return detail::block_load_impl( - acc, offset, simd_mask<1>(1), overaligned_tag{}); + acc, byte_offset, simd_mask<1>(1), overaligned_tag{}); } else { - return block_load(acc, offset, overaligned_tag{}); + return block_load(acc, byte_offset, overaligned_tag{}); } #endif // !__ESIMD_FORCE_STATELESS_MEM } -/// simd block_load(AccessorT acc, props); // (acc-2) +/// simd block_load(AccessorT acc, props = {}); // (acc-bl-2) /// This function loads a contiguous memory block referenced -/// by accessor \p acc using implied offset=0. +/// by accessor \p acc and implied offset=0. /// /// The parameter \p props specifies the optional compile-time properties /// of the type esimd::properties and may include esimd::cache_hint_L1, -/// esimd::cache_hint_L2, esimd::alignment. Other properties are ignored. +/// esimd::cache_hint_L2. Other properties are ignored. If \p props specifies +/// the alignment property, then it is ignored because this variant implies +/// zero offset, which means the most favourable 16-byte alignment is used. 
/// /// Cache hints: If \p props does not specify any L1 or L2 cache hints, then /// the cache_hint::none value is assumed by default. /// -/// Alignment: If \p props does not specify the 'alignment' property, then -/// the \p offset must be at least 4-byte aligned for elements of 4-bytes or -/// smaller and 8-byte aligned for 8-byte elements. -/// -/// Restrictions - cache hint imposed - temporary: -/// If L1 or L2 cache hint is passed, then: -/// R1: The pointer must be at least 4-byte aligned for elements of 4-bytes or -/// smaller and 8-byte aligned for 8-byte elements. -/// R2: The number of elements for 8-byte data: 1, 2, 3, 4, 8, 16, 32, 64; -/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64, -/// or 128(only if alignment is 8-bytes or more); -/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128, -/// or 256(only if alignment is 8-bytes or more); -/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256, -/// or 512(only if alignment is 8-bytes or more). -/// R3: The target device must be DG2, PVC or newer GPU. -/// (R1), (R2) and (R3) are not applied if there are no cache hints. +/// Restrictions: there may be some extra restrictions depending on +/// a) stateless memory mode enforcement is ON, +/// b) cache hints are used, +/// c) number of bytes loaded is either 16,32,64, or 128. +/// If (b) || !(c), then the target device must be DG2 or PVC (not Gen12). +/// If (a) && !(b), then there is no restriction on the number of elements +/// to be loaded and \p byte_offset must be only element-aligned. +/// +/// Gen12 requirements: !(b) && (c). +/// It can load 16-, 32-, 64-, or 128-bytes only. +/// DG2/PVC requirements: +/// It can load such number of elements depending on the type 'T': +/// for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2), or 128; +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2), or 256; +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2), or 512. 
template @@ -1209,17 +1223,29 @@ __ESIMD_API std::enable_if_t< detail::is_device_accessor_with_v, simd> -block_load(AccessorT acc, PropertyListT props = {}) { - return block_load(acc, 0, props); +block_load(AccessorT acc, PropertyListT /* props */ = {}) { + // Create new properties without the alignment property passed in 'props', + // and add alignment<16> as it is usable and most favourable in this case. + constexpr auto L1Hint = + detail::getPropertyValue( + cache_hint::none); + constexpr auto L2Hint = + detail::getPropertyValue( + cache_hint::none); + static_assert(!PropertyListT::template has_property(), + "L3 cache hint is reserved. The old/experimental L3 LSC cache " + "hint is cache_level::L2 now."); + properties Props{cache_hint_L1, cache_hint_L2, alignment<16>}; + return block_load(acc, 0, Props); } /// simd -/// block_load(AccessorT acc, OffsetT offset, simd_mask<1> pred, -/// simd pass_thru, flags = {}); // (acc-3) +/// block_load(AccessorT acc, OffsetT byte_offset, simd_mask<1> pred, +/// simd pass_thru, props = {}); // (acc-bl-3) /// This function loads a contiguous memory block referenced -/// by accessor \p acc using the byte-offset \p offset. +/// by accessor \p acc and the given \p byte_offset. /// If the predicate \p pred is set to 0, then the load is omitted and the -/// returned \p pass_thru is returned. +/// \p pass_thru value is returned. /// /// The parameter \p props specifies the optional compile-time properties /// of the type esimd::properties and may include esimd::cache_hint_L1, @@ -1229,22 +1255,21 @@ block_load(AccessorT acc, PropertyListT props = {}) { /// the cache_hint::none value is assumed by default. /// /// Alignment: If \p props does not specify the 'alignment' property, then -/// the \p offset must be at least 4-byte aligned for elements of 4-bytes or -/// smaller and 8-byte aligned for 8-byte elements. 
-/// -/// Restrictions - cache hint imposed - temporary: -/// If L1 or L2 cache hint is passed, then: -/// R1: The pointer must be at least 4-byte aligned for elements of 4-bytes or -/// smaller and 8-byte aligned for 8-byte elements. -/// R2: The number of elements for 8-byte data: 1, 2, 3, 4, 8, 16, 32, 64; -/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64, +/// the \p byte_offset must be at least 4-byte aligned for elements of 4-bytes +/// or smaller and 8-byte aligned for 8-byte elements. +/// +/// Restrictions - cache hint and predicate imposed - temporary: +/// R1: \p byte_offset must be at least 4-byte aligned for elements of 4-bytes +/// or smaller and 8-byte aligned for 8-byte elements. +/// R2: The number of elements must be: +/// for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2), /// or 128(only if alignment is 8-bytes or more); -/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128, +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2), /// or 256(only if alignment is 8-bytes or more); -/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256, +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2), /// or 512(only if alignment is 8-bytes or more). /// R3: The target device must be DG2, PVC or newer GPU. -/// (R1), (R2) and (R3) are not applied if there are no cache hints. template @@ -1253,8 +1278,9 @@ __ESIMD_API std::enable_if_t< detail::is_device_accessor_with_v, simd> -block_load(AccessorT acc, detail::DeviceAccessorOffsetT offset, - simd_mask<1> pred, simd pass_thru, PropertyListT props = {}) { +block_load(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset, + simd_mask<1> pred, simd pass_thru, + PropertyListT /* props */ = {}) { constexpr auto L1Hint = detail::getPropertyValue( cache_hint::none); @@ -1265,20 +1291,20 @@ block_load(AccessorT acc, detail::DeviceAccessorOffsetT offset, "L3 cache hint is reserved. 
The old/experimental L3 LSC cache " "hint is cache_level::L2 now."); - // If the alignment property is not passed, then assume the offset + // If the alignment property is not passed, then assume the byte_offset // is element-aligned and is at leat 4-bytes. constexpr size_t DefaultAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T); constexpr size_t Alignment = detail::getPropertyValue(DefaultAlignment); return detail::block_load_impl( - acc, offset, pred, pass_thru, overaligned_tag{}); + acc, byte_offset, pred, pass_thru, overaligned_tag{}); } /// simd -/// block_load(AccessorT acc, OffsetT offset, simd_mask<1> pred, -/// flags = {}); // (acc-4) +/// block_load(AccessorT acc, OffsetT byte_offset, simd_mask<1> pred, +/// props = {}); // (acc-bl-4) /// This function loads a contiguous memory block referenced -/// by accessor \p acc using the byte-offset \p offset. +/// by accessor \p acc and the given \p byte_offset. /// If the predicate \p pred is set to 0, then the load is omitted and the /// returned value is undefined. /// @@ -1293,19 +1319,18 @@ block_load(AccessorT acc, detail::DeviceAccessorOffsetT offset, /// the \p offset must be at least 4-byte aligned for elements of 4-bytes or /// smaller and 8-byte aligned for 8-byte elements. /// -/// Restrictions - cache hint imposed - temporary: -/// If L1 or L2 cache hint is passed, then: -/// R1: The pointer must be at least 4-byte aligned for elements of 4-bytes or -/// smaller and 8-byte aligned for 8-byte elements. -/// R2: The number of elements for 8-byte data: 1, 2, 3, 4, 8, 16, 32, 64; -/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64, +/// Restrictions - cache hint and predicate imposed - temporary: +/// R1: \p byte_offset must be at least 4-byte aligned for elements of 4-bytes +/// or smaller and 8-byte aligned for 8-byte elements. 
+/// R2: The number of elements must be: +/// for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2), /// or 128(only if alignment is 8-bytes or more); -/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128, +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2), /// or 256(only if alignment is 8-bytes or more); -/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256, +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2), /// or 512(only if alignment is 8-bytes or more). /// R3: The target device must be DG2, PVC or newer GPU. -/// (R1), (R2) and (R3) are not applied if there are no cache hints. template @@ -1314,17 +1339,39 @@ __ESIMD_API std::enable_if_t< detail::is_device_accessor_with_v, simd> -block_load(AccessorT acc, detail::DeviceAccessorOffsetT offset, +block_load(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset, simd_mask<1> pred, PropertyListT props = {}) { simd PassThru; // Intentionally uninitialized. - return block_load(acc, offset, pred, PassThru, props); + return block_load(acc, byte_offset, pred, PassThru, props); } /// simd /// block_load(AccessorT acc, simd_mask<1> pred, -/// simd pass_thru, flags = {}); // (acc-5) -/// Same as (acc-3) variant except that the byte-offset is not passed -/// and is implied to be 0. +/// simd pass_thru, props = {}); // (acc-bl-5) +/// This function loads a contiguous memory block referenced +/// by accessor \p acc and implied offset=0. +/// If the predicate \p pred is set to 0, then the load is omitted and the +/// \p pass_thru value is returned. +/// +/// The parameter \p props specifies the optional compile-time properties +/// of the type esimd::properties and may include esimd::cache_hint_L1, +/// esimd::cache_hint_L2. Other properties are ignored. If \p props specifies +/// the alignment property, then it is ignored because this variant implies +/// zero offset, which means the most favourable 16-byte alignment is used. 
+/// +/// Cache hints: If \p props does not specify any L1 or L2 cache hints, then +/// the cache_hint::none value is assumed by default. +/// +/// Restrictions - cache hint and predicate imposed - temporary: +/// R1: The number of elements must be: +/// for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2), +/// or 128(only if alignment is 8-bytes or more); +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2), +/// or 256(only if alignment is 8-bytes or more); +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2), +/// or 512(only if alignment is 8-bytes or more). +/// R2: The target device must be DG2, PVC or newer GPU. template @@ -1334,14 +1381,48 @@ __ESIMD_API std::enable_if_t< detail::accessor_mode_cap::can_read>, simd> block_load(AccessorT acc, simd_mask<1> pred, simd pass_thru, - PropertyListT props = {}) { - return block_load(acc, 0, pred, pass_thru, props); + PropertyListT /* props */ = {}) { + // Create new properties without the alignment property passed in 'props', + // and add alignment<16> as it is usable and most favourable in this case. + constexpr auto L1Hint = + detail::getPropertyValue( + cache_hint::none); + constexpr auto L2Hint = + detail::getPropertyValue( + cache_hint::none); + static_assert(!PropertyListT::template has_property(), + "L3 cache hint is reserved. The old/experimental L3 LSC cache " + "hint is cache_level::L2 now."); + properties Props{cache_hint_L1, cache_hint_L2, alignment<16>}; + return block_load(acc, 0, pred, pass_thru, Props); } /// simd -/// block_load(AccessorT acc, simd_mask<1> pred, flags = {}); // (acc-6) -/// Same as (acc-4) variant except that the byte-offset is not passed -/// and is implied to be 0. +/// block_load(AccessorT acc, simd_mask<1> pred, props = {}); // (acc-bl-6) +/// This function loads a contiguous memory block referenced +/// by accessor \p acc and implied offset=0. 
+/// If the predicate \p pred is set to 0, then the load is omitted and some +/// undefined value is returned. +/// +/// The parameter \p props specifies the optional compile-time properties +/// of the type esimd::properties and may include esimd::cache_hint_L1, +/// esimd::cache_hint_L2. Other properties are ignored. If \p props specifies +/// the alignment property, then it is ignored because this variant implies +/// zero offset, which means the most favourable 16-byte alignment is used. +/// +/// Cache hints: If \p props does not specify any L1 or L2 cache hints, then +/// the cache_hint::none value is assumed by default. +/// +/// Restrictions - cache hint and predicate imposed - temporary: +/// R1: The number of elements must be: +/// for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2), +/// or 128(only if alignment is 8-bytes or more); +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2), +/// or 256(only if alignment is 8-bytes or more); +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2), +/// or 512(only if alignment is 8-bytes or more). +/// R2: The target device must be DG2, PVC or newer GPU. template @@ -1350,9 +1431,22 @@ __ESIMD_API std::enable_if_t< detail::is_device_accessor_with_v, simd> -block_load(AccessorT acc, simd_mask<1> pred, PropertyListT props = {}) { +block_load(AccessorT acc, simd_mask<1> pred, PropertyListT /* props */ = {}) { + // Create new properties without the alignment property passed in 'props', + // and add alignment<16> as it is usable and most favourable in this case. + constexpr auto L1Hint = + detail::getPropertyValue( + cache_hint::none); + constexpr auto L2Hint = + detail::getPropertyValue( + cache_hint::none); + static_assert(!PropertyListT::template has_property(), + "L3 cache hint is reserved. 
The old/experimental L3 LSC cache " + "hint is cache_level::L2 now."); + properties Props{cache_hint_L1, cache_hint_L2, alignment<16>}; + simd PassThru; // Intentionally uninitialized. - return block_load(acc, 0, pred, PassThru, props); + return block_load(acc, 0, pred, PassThru, Props); } /// Each of the following block store functions stores a contiguous memory block @@ -2404,16 +2498,16 @@ slm_scatter_rgba(simd offsets, /// @tparam T Element type. /// @tparam N Number of elements to load. /// @tparam Flags The alignment specifier type tag. -/// @param offset The byte-offset to load from. +/// @param byte_offset The byte-offset to load from. /// @param Flags Specifies the alignment. /// @return A vector of loaded elements. /// template > __ESIMD_API std::enable_if_t, simd> -slm_block_load(uint32_t offset, Flags = {}) { +slm_block_load(uint32_t byte_offset, Flags = {}) { constexpr size_t Align = Flags::template alignment>; - return __esimd_slm_block_ld, N, Align>(offset); + return __esimd_slm_block_ld, N, Align>(byte_offset); } /// Stores elements of the vector \p vals to a contiguous block of SLM memory @@ -3375,7 +3469,7 @@ __ESIMD_API void media_block_store(AccessorTy acc, unsigned x, unsigned y, } /// Loads a contiguous block of SLM memory referenced by the given -/// local-accessor \p acc and byte-offset \p offset, then returns the loaded +/// local-accessor \p acc and \p byte_offset, then returns the loaded /// data as a simd object. /// The generated code depends on the combination {T, N, Flags}. /// Providing flags specifying the alignment of 16-bytes or more produces more @@ -3383,25 +3477,25 @@ __ESIMD_API void media_block_store(AccessorTy acc, unsigned x, unsigned y, /// efficient gather is generated. If the loaded vector is too long /// for 1 flat-load GPU instruction, then a series of flat-loads and/or gathers /// may be generated. -/// @tparam Tx Element type. +/// @tparam T Element type. /// @tparam N Number of elements to load. 
/// @tparam AccessorTy Accessor type (auto-deduced). /// @tparam Flags The alignment specifier type tag. /// @param acc The local accessor. -/// @param offset The offset to load from in bytes. +/// @param byte_offset The offset to load from in bytes. /// @param Flags Specifies the alignment. /// @return A vector of loaded elements. /// -template > __ESIMD_API std::enable_if_t && is_simd_flag_type_v, - simd> - block_load(AccessorTy acc, uint32_t offset, Flags = {}) { - return slm_block_load(offset + - __ESIMD_DNS::localAccessorToOffset(acc)); + simd> + block_load(AccessorTy acc, uint32_t byte_offset, Flags flags = {}) { + return slm_block_load(byte_offset + detail::localAccessorToOffset(acc), + flags); } /// Variant of block_store that uses local accessor as a parameter. diff --git a/sycl/test-e2e/ESIMD/esimd_test_utils.hpp b/sycl/test-e2e/ESIMD/esimd_test_utils.hpp index bccae5d5c2a75..941005fde942d 100644 --- a/sycl/test-e2e/ESIMD/esimd_test_utils.hpp +++ b/sycl/test-e2e/ESIMD/esimd_test_utils.hpp @@ -639,7 +639,8 @@ enum GPUDriverOS { Linux = 1, Windows = 2, LinuxAndWindows = 3 }; /// for win/opencl see the link: /// https://www.intel.com/content/www/us/en/download/726609/intel-arc-iris-xe-graphics-whql-windows.html bool isGPUDriverGE(queue Q, GPUDriverOS OSCheck, std::string RequiredVersion, - std::string WinOpenCLRequiredVersion = "") { + std::string WinOpenCLRequiredVersion = "", + bool VerifyFormat = true) { auto Dev = Q.get_device(); if (!Dev.is_gpu()) return false; @@ -653,17 +654,16 @@ bool isGPUDriverGE(queue Q, GPUDriverOS OSCheck, std::string RequiredVersion, // A and B must have digits at the same positions. // Otherwise, A and B symbols must be equal, e.g. both be equal to '.'. 
- auto verifyDriverVersionFormat = [](const std::string &A, - const std::string &B) { + auto isExpectedDriverVersionFormat = [](const std::string &A, + const std::string &B) { if (A.size() != B.size()) - throw std::runtime_error( - "Inconsistent expected & actual driver versions"); + return false; for (int I = 0; I < A.size(); I++) { if ((A[I] >= '0' && A[I] <= '9' && !(B[I] >= '0' && B[I] <= '9')) && A[I] != B[I]) - throw std::runtime_error( - "Inconsistent expected & actual driver versions"); + return false; } + return true; }; auto BE = Q.get_backend(); @@ -684,8 +684,17 @@ bool isGPUDriverGE(queue Q, GPUDriverOS OSCheck, std::string RequiredVersion, !IsLinux && (OSCheck & GPUDriverOS::Windows)) { auto CurrentVersion = Dev.get_info(); CurrentVersion = CurrentVersion.substr(Start, Length); - verifyDriverVersionFormat(CurrentVersion, RequiredVersion); - IsGE &= CurrentVersion >= RequiredVersion; + if (isExpectedDriverVersionFormat(CurrentVersion, RequiredVersion)) { + IsGE = CurrentVersion >= RequiredVersion; + } else if (VerifyFormat) { + std::string Msg = + std::string("Inconsistent expected & actual driver versions: ") + + CurrentVersion + " vs " + RequiredVersion; + throw std::runtime_error( + "Inconsistent expected & actual driver versions"); + } else { + IsGE = false; + } } return IsGE; } diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/block_load.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/block_load.hpp index be666613a4f72..7b153447ef703 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/block_load.hpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/block_load.hpp @@ -20,12 +20,12 @@ using namespace sycl::ext::intel::esimd; // Returns true iff verification is passed. 
template -bool verify(const T *In, const T *Out, size_t Size, int N, +bool verify(const T *In, const T *Out, size_t Size, int N, bool UseMask, bool UsePassThruOperand) { int NumErrors = 0; using Tuint = sycl::_V1::ext::intel::esimd::detail::uint_type_t; for (int i = 0; i < Size && NumErrors < 32; i++) { - bool IsMaskSet = (i / N + 1) % 1; + bool IsMaskSet = UseMask ? ((i / N + 1) & 0x1) : true; Tuint Expected = sycl::bit_cast(In[i]); Tuint Computed = sycl::bit_cast(Out[i]); @@ -39,7 +39,8 @@ bool verify(const T *In, const T *Out, size_t Size, int N, if (Computed != Expected) { NumErrors++; std::cout << "out[" << i << "] = 0x" << std::hex << Computed - << " vs etalon = 0x" << Expected << std::dec << std::endl; + << " vs etalon = 0x" << Expected << std::dec + << ", IsMaskSet = " << IsMaskSet << std::endl; } } std::cout << (NumErrors == 0 ? " passed\n" : " FAILED\n"); @@ -76,7 +77,7 @@ bool testUSM(queue Q, uint32_t Groups, uint32_t Threads, uint32_t ElemOffset = GlobalID * N; simd Vals; - simd_mask<1> Mask = (GlobalID + 1) % 1; + simd_mask<1> Mask = (GlobalID + 1) & 0x1; if constexpr (!CheckProperties) { if constexpr (UsePassThruOperand) { // TODO: these 2 lines work-around the problem with scalar @@ -144,7 +145,7 @@ bool testUSM(queue Q, uint32_t Groups, uint32_t Threads, return false; } - bool Passed = verify(In, Out, Size, N, UsePassThruOperand); + bool Passed = verify(In, Out, Size, N, UseMask, UsePassThruOperand); sycl::free(In, Q); sycl::free(Out, Q); return Passed; @@ -155,75 +156,100 @@ template bool testUSM(queue Q) { constexpr bool CheckMask = true; constexpr bool CheckProperties = true; - properties AlignOnlyProps{alignment<16>}; + properties Align16Props{alignment<16>}; + properties AlignElemProps{alignment}; bool Passed = true; // Test block_load() that is available on Gen12 and PVC. 
Passed &= testUSM( - Q, 2, 4, AlignOnlyProps); + Q, 2, 4, AlignElemProps); Passed &= testUSM( - Q, 1, 4, AlignOnlyProps); + Q, 1, 4, AlignElemProps); Passed &= testUSM( - Q, 2, 8, AlignOnlyProps); + Q, 2, 8, AlignElemProps); Passed &= testUSM( - Q, 2, 4, AlignOnlyProps); + Q, 2, 4, AlignElemProps); Passed &= testUSM( - Q, 2, 4, AlignOnlyProps); + Q, 2, 4, AlignElemProps); Passed &= testUSM( - Q, 2, 4, AlignOnlyProps); + Q, 2, 4, Align16Props); Passed &= testUSM( - Q, 2, 4, AlignOnlyProps); + Q, 2, 4, Align16Props); + // Intentionally check non-power-of-2 simd size - it must work. - Passed &= testUSM( - Q, 2, 4, AlignOnlyProps); - Passed &= testUSM( - Q, 2, 4, AlignOnlyProps); + // Just pass element-size alignment. + // These test cases compute wrong values for for the few last elements + // if the driver is not new enough. + // TODO: windows version with the fix is not known. Enable it eventually. + if (sizeof(T) > 2 || + esimd_test::isGPUDriverGE(Q, esimd_test::GPUDriverOS::LinuxAndWindows, + "27556", "win.just.skip.test", false)) { + Passed &= testUSM( + Q, 2, 4, AlignElemProps); + Passed &= testUSM( + Q, 2, 4, AlignElemProps); + } + // Intentionally check big simd size - it must work. Passed &= testUSM( - Q, 2, 4, AlignOnlyProps); + Q, 2, 4, AlignElemProps); Passed &= testUSM( - Q, 2, 4, AlignOnlyProps); + Q, 2, 4, Align16Props); // Test block_load() without passing compile-time properties argument. Passed &= testUSM( - Q, 2, 4, AlignOnlyProps); + Q, 2, 4, Align16Props); Passed &= testUSM( - Q, 2, 4, AlignOnlyProps); + Q, 2, 4, Align16Props); if constexpr (TestPVCFeatures) { // Using mask or cache hints adds the requirement to run tests on PVC. - // Also, PVC variant currently requires power-or-two elements and - // the number of bytes loaded per call must not exceed 512. 
+ // Also, PVC variant currently requires a) power-or-two elements, + // b) the number of bytes loaded per call must not exceed 512, + // c) the alignment of USM ptr + offset to be 4 or 8-bytes(for 8-byte + // element vectors). + constexpr size_t RequiredAlignment = sizeof(T) <= 4 ? 4 : 8; properties PVCProps{cache_hint_L1, - cache_hint_L2, alignment<16>}; - - if constexpr (sizeof(T) >= 4) // only d/q words are supported now - Passed &= testUSM( - Q, 2, 4, PVCProps); - if constexpr (sizeof(T) >= 2) // only d/q words are supported now - Passed &= testUSM( - Q, 5, 5, PVCProps); - Passed &= testUSM(Q, 5, 5, - PVCProps); - Passed &= testUSM(Q, 5, 5, - PVCProps); - Passed &= testUSM( - Q, 5, 5, PVCProps); - Passed &= testUSM( - Q, 2, 4, PVCProps); - Passed &= testUSM(Q, 7, 1, - PVCProps); - if constexpr (128 * sizeof(T) <= 512) - Passed &= testUSM( - Q, 1, 4, PVCProps); - if constexpr (256 * sizeof(T) <= 512) - Passed &= testUSM( - Q, 1, 4, PVCProps); - if constexpr (512 * sizeof(T) <= 512) - Passed &= testUSM( - Q, 1, 4, PVCProps); + cache_hint_L2, + alignment}; + + // Only d/q-words are supported now. + // Thus we use this I32Factor for testing purposes and convenience. + constexpr int I32Factor = + std::max(static_cast(sizeof(int) / sizeof(T)), 1); + Passed &= + testUSM( + Q, 2, 4, PVCProps); + Passed &= + testUSM( + Q, 5, 5, PVCProps); + Passed &= + testUSM( + Q, 5, 5, PVCProps); + Passed &= + testUSM( + Q, 5, 5, PVCProps); + Passed &= + testUSM( + Q, 5, 5, PVCProps); + Passed &= + testUSM( + Q, 2, 4, PVCProps); + + // This call (potentially) and the next call (guaranteed) load the biggest + // load-able chunk, which requires loading with 8-byte elements, which + // requires the alignment to be 8-bytes or more. 
+ properties PVCAlign8Props{cache_hint_L1, + cache_hint_L2, alignment<8>}; + Passed &= + testUSM( + Q, 7, 1, PVCAlign8Props); + if constexpr (sizeof(T) <= 4) + Passed &= + testUSM( + Q, 1, 4, PVCAlign8Props); } // TestPVCFeatures return Passed; @@ -258,7 +284,7 @@ bool testACC(queue Q, uint32_t Groups, uint32_t Threads, } try { - buffer InBuf(Size); + buffer InBuf(In); Q.submit([&](handler &CGH) { accessor InAcc{InBuf, CGH}; auto OutPtr = Out.data(); @@ -268,7 +294,7 @@ bool testACC(queue Q, uint32_t Groups, uint32_t Threads, uint32_t ElemOffset = GlobalID * N; simd Vals; - simd_mask<1> Mask = (GlobalID + 1) % 1; + simd_mask<1> Mask = (GlobalID + 1) & 0x1; if constexpr (!CheckProperties) { if constexpr (UsePassThruOperand) { // TODO: these 2 lines work-around the problem with scalar @@ -333,7 +359,8 @@ bool testACC(queue Q, uint32_t Groups, uint32_t Threads, return false; } - bool Passed = verify(In.data(), Out.data(), Size, N, UsePassThruOperand); + bool Passed = + verify(In.data(), Out.data(), Size, N, UseMask, UsePassThruOperand); return Passed; } @@ -342,7 +369,9 @@ template bool testACC(queue Q) { constexpr bool CheckMask = true; constexpr bool CheckProperties = true; - properties AlignOnlyProps{alignment<16>}; + properties Align16Props{alignment<16>}; + constexpr size_t RequiredAlignment = sizeof(T) <= 4 ? 4 : 8; + properties MinReqAlignProps{alignment}; bool Passed = true; @@ -350,21 +379,21 @@ template bool testACC(queue Q) { // 1, 2, 4 or 8 16-byte loads. constexpr int NElemsInOword = 16 / sizeof(T); Passed &= testACC( - Q, 2, 4, AlignOnlyProps); + Q, 2, 4, Align16Props); Passed &= testACC( - Q, 1, 4, AlignOnlyProps); + Q, 1, 4, Align16Props); Passed &= testACC( - Q, 2, 4, AlignOnlyProps); + Q, 2, 4, MinReqAlignProps); Passed &= testACC( - Q, 2, 4, AlignOnlyProps); + Q, 2, 4, Align16Props); // Test block_load() without passing compile-time properties argument. 
Passed &= testACC( - Q, 2, 4, AlignOnlyProps); + Q, 2, 4, Align16Props); if constexpr (TestPVCFeatures) { // Using mask or cache hints adds the requirement to run tests on PVC. @@ -374,39 +403,44 @@ template bool testACC(queue Q) { constexpr int I32Factor = std::max(static_cast(sizeof(int) / sizeof(T)), 1); properties PVCProps{cache_hint_L1, - cache_hint_L2, alignment<16>}; + cache_hint_L2, + alignment}; - // Test block_load() that is available on Gen12 and PVC: - // 1, 2, 4 or 8 16-byte loads + // Test block_load() that is available on PVC: + // 1, 2, 3, 4, 8, ... N elements (up to 512-bytes). Passed &= testACC( - Q, 2, 4, AlignOnlyProps); + Q, 2, 4, MinReqAlignProps); Passed &= - testACC( - Q, 1, 4, AlignOnlyProps); + testACC( + Q, 1, 4, MinReqAlignProps); Passed &= testACC( - Q, 2, 8, AlignOnlyProps); - Passed &= - testACC( - Q, 2, 4, AlignOnlyProps); - Passed &= - testACC( - Q, 2, 4, AlignOnlyProps); + Q, 2, 8, MinReqAlignProps); + Passed &= testACC( + Q, 2, 4, PVCProps); + Passed &= testACC( + Q, 2, 4, MinReqAlignProps); Passed &= - testACC( - Q, 2, 4, AlignOnlyProps); + testACC( + Q, 2, 4, MinReqAlignProps); Passed &= - testACC( - Q, 2, 4, AlignOnlyProps); + testACC( + Q, 2, 4, PVCProps); + + // This call (potentially) and the next call (guaranteed) load the biggest + // load-able chunk, which requires loading with 8-byte elements, which + // requires the alignment to be 8-bytes or more. 
+ properties PVCAlign8Props{cache_hint_L1, + cache_hint_L2, alignment<8>}; Passed &= - testACC( - Q, 2, 4, AlignOnlyProps); + testACC( + Q, 2, 4, PVCAlign8Props); if constexpr (sizeof(T) <= 4) Passed &= - testACC( - Q, 2, 4, AlignOnlyProps); + testACC( + Q, 2, 4, PVCAlign8Props); } // TestPVCFeatures return Passed; diff --git a/sycl/test/esimd/memory_properties.cpp b/sycl/test/esimd/memory_properties.cpp index 5ed1eba76eb2d..a30bd3720d747 100644 --- a/sycl/test/esimd/memory_properties.cpp +++ b/sycl/test/esimd/memory_properties.cpp @@ -135,7 +135,7 @@ foo(AccType &acc, float *ptrf, int byte_offset32, size_t byte_offset64) { // not power-of-two because only svm/legacy block_load supports // non-power-of-two vector lengths now. - // CHECK: call <4 x float> @llvm.genx.oword.ld.unaligned.v4f32(i32 0, i32 {{[^)]+}}, i32 {{[^)]+}}) + // CHECK: call <4 x float> @llvm.genx.oword.ld.v4f32(i32 0, i32 {{[^)]+}}, i32 {{[^)]+}}) auto z1 = block_load(acc, props_c); // CHECK: call <8 x i32> @llvm.genx.oword.ld.unaligned.v8i32(i32 0, i32 {{[^)]+}}, i32 {{[^)]+}})