Skip to content

Commit

Permalink
Unaligned read
Browse files Browse the repository at this point in the history
  • Loading branch information
dancazarin committed Mar 1, 2019
1 parent 9ec5752 commit aa603f7
Show file tree
Hide file tree
Showing 15 changed files with 696 additions and 198 deletions.
8 changes: 5 additions & 3 deletions cmake/target_set_arch.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)")
set(ARCH_FLAGS_GNU_sse3 -msse3)
set(ARCH_FLAGS_GNU_ssse3 -mssse3)
set(ARCH_FLAGS_GNU_sse41 -msse4.1)
set(ARCH_FLAGS_GNU_avx -msse4.1 -mavx)
set(ARCH_FLAGS_GNU_avx2 -msse4.1 -mavx2 -mfma)
set(ARCH_FLAGS_GNU_avx512 -msse4.1 -mavx2 -mfma -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl)
set(ARCH_FLAGS_GNU_sse42 -msse4.2)
set(ARCH_FLAGS_GNU_avx -msse4.2 -mavx)
set(ARCH_FLAGS_GNU_avx2 -msse4.2 -mavx2 -mfma)
set(ARCH_FLAGS_GNU_avx512 -msse4.2 -mavx2 -mfma -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl)

if (CMAKE_SIZEOF_VOID_P EQUAL 8)
# SSE2 is part of x86_64
Expand All @@ -22,6 +23,7 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)")
set(ARCH_FLAGS_MS_sse3 ${ARCH_FLAG_MS_SSE2} -D__SSE3__)
set(ARCH_FLAGS_MS_ssse3 ${ARCH_FLAG_MS_SSE2} -D__SSSE3__)
set(ARCH_FLAGS_MS_sse41 ${ARCH_FLAG_MS_SSE2} -D__SSE3__ -D__SSSE3__ -D__SSE4_1__)
set(ARCH_FLAGS_MS_sse42 ${ARCH_FLAG_MS_SSE2} -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__)
set(ARCH_FLAGS_MS_avx -arch:AVX)
set(ARCH_FLAGS_MS_avx2 -arch:AVX2)
set(ARCH_FLAGS_MS_avx512 -arch:AVX512)
Expand Down
20 changes: 17 additions & 3 deletions include/kfr/cometa.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <random>
#include <type_traits>
Expand Down Expand Up @@ -714,7 +715,7 @@ template <typename... List>
using indicesfor_t = cvalseq_t<size_t, sizeof...(List), 0>;

template <size_t group, size_t... indices, size_t N = group * sizeof...(indices)>
constexpr inline auto scale(csizes_t<indices...> i) CMT_NOEXCEPT
constexpr inline auto scale(csizes_t<indices...>) CMT_NOEXCEPT
{
return cconcat(csizeseq_t<group, group * indices>()...);
// return i[csizeseq_t<N>() / csize_t<group>()] * csize_t<group>() + csizeseq_t<N>() %
Expand Down Expand Up @@ -1941,10 +1942,10 @@ using overload_generic = overload_priority<0>;
#define CMT_GEN_LIST(c, m, ...) CMT_GEN_LIST##c(m, __VA_ARGS__)

template <typename Tout, typename Tin>
constexpr CMT_INLINE Tout bitcast_anything(const Tin& in)
CMT_INLINE Tout bitcast_anything(const Tin& in)
{
static_assert(sizeof(Tin) == sizeof(Tout), "Invalid arguments for bitcast_anything");
#ifdef CMT_COMPILER_INTEL
#if defined CMT_COMPILER_INTEL
const union {
const Tin in;
Tout out;
Expand All @@ -1971,6 +1972,19 @@ constexpr T just_value(T value)
return value;
}

/// Terminal overload of pack_elements: with no elements left to pack,
/// the contribution to the packed value is zero.
template <typename Tout, typename>
CMT_INTRINSIC constexpr Tout pack_elements()
{
    return static_cast<Tout>(0);
}

/// Packs the given elements into a single value of type Tout.
/// The first element `x` is converted to the unsigned counterpart of Arg and
/// occupies the lowest bits; the remaining elements are packed recursively
/// (each converted to Arg) and shifted left by the bit width of Arg.
template <typename Tout, typename Arg, typename... Args>
CMT_INTRINSIC constexpr Tout pack_elements(Arg x, Args... args)
{
    // Converting through the unsigned type keeps a negative element from
    // sign-extending into the bits that belong to the following elements.
    using unsigned_arg = typename std::make_unsigned<Arg>::type;
    return static_cast<unsigned_arg>(x) | (pack_elements<Tout, Arg>(args...) << (sizeof(Arg) * 8));
}

enum class special_constant
{
undefined,
Expand Down
6 changes: 3 additions & 3 deletions include/kfr/dft/impl/ft.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ KFR_INTRINSIC void cwrite(complex<T>* dest, const cvec<T, N>& value)
template <size_t count, size_t N, size_t stride, bool A, typename T, size_t... indices>
KFR_INTRINSIC cvec<T, count * N> cread_group_impl(const complex<T>* src, csizes_t<indices...>)
{
return concat(read<N * 2, A>(ptr_cast<T>(src + stride * indices))...);
return concat(read(cbool<A>, csize<N * 2>, ptr_cast<T>(src + stride * indices))...);
}
template <size_t count, size_t N, size_t stride, bool A, typename T, size_t... indices>
KFR_INTRINSIC void cwrite_group_impl(complex<T>* dest, const cvec<T, count * N>& value, csizes_t<indices...>)
Expand All @@ -138,7 +138,7 @@ KFR_INTRINSIC void cwrite_group_impl(complex<T>* dest, const cvec<T, count * N>&
template <size_t count, size_t N, bool A, typename T, size_t... indices>
KFR_INTRINSIC cvec<T, count * N> cread_group_impl(const complex<T>* src, size_t stride, csizes_t<indices...>)
{
return concat(read<N * 2, A>(ptr_cast<T>(src + stride * indices))...);
return concat(read(cbool<A>, csize<N * 2>, ptr_cast<T>(src + stride * indices))...);
}
template <size_t count, size_t N, bool A, typename T, size_t... indices>
KFR_INTRINSIC void cwrite_group_impl(complex<T>* dest, size_t stride, const cvec<T, count * N>& value,
Expand Down Expand Up @@ -1459,7 +1459,7 @@ KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T
template <bool transposed, typename T, size_t... N, size_t Nout = csum<size_t, N...>()>
KFR_INTRINSIC void cread_transposed(cbool_t<transposed>, const complex<T>* ptr, vec<T, N>&... w)
{
vec<T, Nout> temp = read<Nout>(ptr_cast<T>(ptr));
vec<T, Nout> temp = read(cunaligned, csize<Nout>, ptr_cast<T>(ptr));
if (transposed)
temp = ctranspose<sizeof...(N)>(temp);
split(temp, w...);
Expand Down
35 changes: 0 additions & 35 deletions include/kfr/simd/impl/backend_clang.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -173,41 +173,6 @@ KFR_INTRINSIC simd<T, N> simd_convert(simd_cvt_t<T, T, N>, const simd<T, N>& x)
// Storage wrapper for a simd<T, N> value, expressed through struct_with_alignment.
// NOTE(review): A presumably selects aligned (true) vs. unaligned (false) storage — confirm
// against the definition of struct_with_alignment.
template <typename T, size_t N, bool A>
using simd_storage = struct_with_alignment<simd<T, N>, A>;

// Reads a simd<T, N> from src for power-of-two N.
// src is reinterpreted (via ptr_cast) as a pointer to simd_storage<T, N, A> and the
// `value` member of that storage is returned. A defaults to false.
template <size_t N, bool A = false, typename T, KFR_ENABLE_IF(is_poweroftwo(N))>
KFR_INTRINSIC simd<T, N> simd_read(const T* src)
{
return ptr_cast<simd_storage<T, N, A>>(src)->value;
}

// Reads a simd<T, N> from src when N is not a power of two.
// The read is split in two parts: the leading prev_poweroftwo(N) elements are read with
// the caller's A, the remaining N - first elements are read from src + first with A = false.
// The tail is widened to `first` elements and both parts are joined with simd_shuffle.
// The extra `typename = void` parameter gives this overload a template signature distinct
// from the power-of-two overload.
template <size_t N, bool A = false, typename T, KFR_ENABLE_IF(!is_poweroftwo(N)), typename = void>
KFR_INTRINSIC simd<T, N> simd_read(const T* src)
{
constexpr size_t first = prev_poweroftwo(N);
constexpr size_t rest = N - first;
// Indices that widen the `rest`-element tail to `first` elements.
// NOTE(review): the padding entries are presumably index_undefined (don't-care lanes) — confirm csizeseq semantics.
constexpr auto extend_indices = cconcat(csizeseq<rest>, csizeseq<first - rest, index_undefined, 0>);
// NOTE(review): presumably the sequence 0..N-1, selecting the first N lanes of the joined pair — confirm.
constexpr auto concat_indices = cvalseq_t<size_t, N>();
return simd_shuffle(
simd2_t<T, first, first>{}, simd_read<first, A>(src),
simd_shuffle(simd_t<T, rest>{}, simd_read<rest, false>(src + first), extend_indices, overload_auto),
concat_indices, overload_auto);
}

// Writes a simd<T, N> to dest for power-of-two N.
// dest is reinterpreted (via ptr_cast) as a pointer to simd_storage<T, N, A> and `value`
// is assigned to the `value` member of that storage. A defaults to false.
template <bool A = false, size_t N, typename T, KFR_ENABLE_IF(is_poweroftwo(N))>
KFR_INTRINSIC void simd_write(T* dest, const simd<T, N>& value)
{
ptr_cast<simd_storage<T, N, A>>(dest)->value = value;
}

// Writes a simd<T, N> to dest when N is not a power of two.
// The vector is split with simd_shuffle into a leading part of prev_poweroftwo(N) elements,
// written to dest with the caller's A, and a tail of N - first elements, written to
// dest + first with A = false.
// The extra `typename = void` parameter gives this overload a template signature distinct
// from the power-of-two overload.
template <bool A = false, size_t N, typename T, KFR_ENABLE_IF(!is_poweroftwo(N)), typename = void>
KFR_INTRINSIC void simd_write(T* dest, const simd<T, N>& value)
{
constexpr size_t first = prev_poweroftwo(N);
constexpr size_t rest = N - first;
simd_write<A, first>(dest, simd_shuffle(simd_t<T, N>{}, value, csizeseq<first>, overload_auto));
// NOTE(review): csizeseq<rest, first> presumably yields `rest` indices starting at `first` — confirm.
simd_write<false, rest>(dest + first,
simd_shuffle(simd_t<T, N>{}, value, csizeseq<rest, first>, overload_auto));
}

template <typename T, size_t N>
KFR_INTRINSIC T simd_get_element(const simd<T, N>& value, size_t index)
{
Expand Down
138 changes: 85 additions & 53 deletions include/kfr/simd/impl/backend_generic.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,17 +54,32 @@ using simd = typename simd_type<T, N>::type;
template <typename T, size_t N, typename U>
union simd_small_array {
static_assert(sizeof(T) * N == sizeof(U), "");
T arr[N];
U whole;

KFR_INTRINSIC static constexpr simd_small_array from(U whole)
using value_type = T;
constexpr static size_t size = N;
using packed_type = U;

KFR_INTRINSIC constexpr simd_small_array() CMT_NOEXCEPT = default;

KFR_INTRINSIC constexpr simd_small_array(U whole) CMT_NOEXCEPT : whole(whole) {}

template <typename... Args>
KFR_INTRINSIC constexpr simd_small_array(T a, T b, Args... args) CMT_NOEXCEPT
: whole(pack_elements<U, T>(a, b, args...))
{
union {
const U w;
simd_small_array r;
} u{ whole };
return u.r;
}

KFR_INTRINSIC static constexpr simd_small_array from(U whole) CMT_NOEXCEPT { return { whole }; }
};

// Type trait: is_simd_small_array<X> derives from ctrue_t when X is a specialization of
// simd_small_array<T, N, U>, and from cfalse_t for every other type.
template <typename T>
struct is_simd_small_array : cfalse_t
{
};
template <typename T, size_t N, typename U>
struct is_simd_small_array<simd_small_array<T, N, U>> : ctrue_t
{
};

#define KFR_SIMD_TYPE(T, N, ...) \
Expand Down Expand Up @@ -108,8 +123,6 @@ KFR_SIMD_SMALL_TYPE(i8, 8, u64)
KFR_SIMD_SMALL_TYPE(i16, 4, u64)
KFR_SIMD_SMALL_TYPE(i32, 2, u64)

KFR_SIMD_SMALL_TYPE(f32, 2, f64)

#ifdef CMT_ARCH_SSE
KFR_SIMD_TYPE(f32, 4, __m128)
KFR_SIMD_TYPE(f64, 2, __m128d)
Expand Down Expand Up @@ -207,11 +220,15 @@ KFR_SIMD_TYPE(f64, 2, float64x2_t);

#ifdef CMT_ARCH_SSE2
// Builds a __m128i from two 64-bit integers by forwarding them to _mm_set_epi64x in
// reversed order (q1, q0), giving a "setr"-style (reversed argument order) helper.
inline __m128i KFR_mm_setr_epi64x(int64_t q0, int64_t q1) CMT_NOEXCEPT { return _mm_set_epi64x(q1, q0); }
// Builds a __m128i from four 32-bit integers by forwarding them to _mm_set_epi32 in
// reversed order (q3, q2, q1, q0), giving a "setr"-style (reversed argument order) helper.
inline __m128i KFR_mm_setr_epi32(int32_t q0, int32_t q1, int32_t q2, int32_t q3) CMT_NOEXCEPT
{
return _mm_set_epi32(q3, q2, q1, q0);
}
KFR_INTRIN_MAKE(2, i64, KFR_mm_setr_epi64x)
KFR_INTRIN_MAKE(2, u64, KFR_mm_setr_epi64x)
KFR_INTRIN_MAKE(2, f64, _mm_setr_pd)
KFR_INTRIN_MAKE(4, i32, _mm_setr_epi32)
KFR_INTRIN_MAKE(4, u32, _mm_setr_epi32)
KFR_INTRIN_MAKE(4, i32, KFR_mm_setr_epi32)
KFR_INTRIN_MAKE(4, u32, KFR_mm_setr_epi32)
KFR_INTRIN_MAKE(4, f32, _mm_setr_ps)
KFR_INTRIN_MAKE(8, i16, _mm_setr_epi16)
KFR_INTRIN_MAKE(8, u16, _mm_setr_epi16)
Expand Down Expand Up @@ -301,7 +318,7 @@ KFR_INTRIN_SHUFFLE_LINEAR(i64, 1, 2, _mm_cvtsi128_si64(x))
KFR_INTRIN_SHUFFLE_LINEAR(u64, 1, 2, _mm_cvtsi128_si64(x))
KFR_INTRIN_SHUFFLE_LINEAR(f32, 1, 4, _mm_cvtss_f32(x))
KFR_INTRIN_SHUFFLE_LINEAR(f32, 2, 4, bitcast_anything<simd<float, 2>>(_mm_cvtsd_f64(_mm_castps_pd(x))))
KFR_INTRIN_SHUFFLE_LINEAR(f32, 4, 2, _mm_castpd_ps(_mm_set_sd(x.whole)))
KFR_INTRIN_SHUFFLE_LINEAR(f32, 4, 2, _mm_unpacklo_ps(_mm_set_ss(x.low), _mm_set_ss(x.high)))
KFR_INTRIN_SHUFFLE_LINEAR(f64, 1, 2, _mm_cvtsd_f64(x))

KFR_INTRIN_SHUFFLE_LINEAR(i8, 2, 16, simd<i8, 2>::from(u16(_mm_cvtsi128_si32(x))))
Expand Down Expand Up @@ -333,11 +350,24 @@ KFR_INTRIN_SHUFFLE_LINEAR_START(i32, 2, 4, 2, simd<i32, 2>::from(KFR_u64sse_INDE
return __VA_ARGS__; \
}

// Defines a simd_convert overload for Tin -> Tout at width N that performs no work:
// the argument is returned unchanged, by const reference.
// NOTE(review): returning `x` as const simd<Tout, N>& relies on simd<Tout, N> and
// simd<Tin, N> being the same type; otherwise the reference would bind to a temporary —
// confirm for every instantiation.
#define KFR_INTRIN_CONVERT_NOOP_REF(Tout, Tin, N) \
KFR_INTRINSIC const simd<Tout, N>& simd_convert(simd_cvt_t<Tout, Tin, N>, const simd<Tin, N>& x) \
CMT_NOEXCEPT \
{ \
return x; \
}
// Defines a simd_convert overload for Tin -> Tout at width N that contains no explicit
// conversion code: the argument is returned by value, so any change of type happens
// through the implicit conversion from simd<Tin, N> to simd<Tout, N> in the return.
#define KFR_INTRIN_CONVERT_NOOP(Tout, Tin, N) \
KFR_INTRINSIC simd<Tout, N> simd_convert(simd_cvt_t<Tout, Tin, N>, const simd<Tin, N>& x) CMT_NOEXCEPT \
{ \
return x; \
}

KFR_INTRIN_CONVERT(f32, i32, 4, _mm_cvtepi32_ps(x))
KFR_INTRIN_CONVERT(i32, f32, 4, _mm_cvttps_epi32(x))
KFR_INTRIN_CONVERT(i32, f64, 2, simd<i32, 2>::from(_mm_cvtsi128_si64(_mm_cvttpd_epi32(x))))
KFR_INTRIN_CONVERT(f64, i32, 2, _mm_cvtepi32_pd(KFR_mm_setr_epi64x(x.whole, 0)))
KFR_INTRIN_CONVERT(i64, f64, 2, _mm_set_epi64x(_mm_cvttsd_si64(_mm_unpackhi_pd(x, x)), _mm_cvttsd_si64(x)))
KFR_INTRIN_CONVERT(i64, f64, 2,
KFR_mm_setr_epi64x(_mm_cvttsd_si64(x), _mm_cvttsd_si64(_mm_unpackhi_pd(x, x))))
KFR_INTRIN_CONVERT(f64, i64, 2,
_mm_unpacklo_pd(_mm_cvtsi64_sd(_mm_setzero_pd(), _mm_cvtsi128_si64(x)),
_mm_cvtsi64_sd(_mm_setzero_pd(), KFR_i64sse_INDEX(x, 1))))
Expand All @@ -355,6 +385,25 @@ KFR_INTRIN_CONVERT(f32, f64, 4,
simd<f32, 4>{ _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(_mm_cvtpd_ps(x.low)),
_mm_castps_pd(_mm_cvtpd_ps(x.high)))) })
#endif

// Conversions between signed and unsigned integers of the same width, single element (N = 1).
// Generated with KFR_INTRIN_CONVERT_NOOP, i.e. the value is returned by value.
KFR_INTRIN_CONVERT_NOOP(u8, i8, 1)
KFR_INTRIN_CONVERT_NOOP(i8, u8, 1)
KFR_INTRIN_CONVERT_NOOP(u16, i16, 1)
KFR_INTRIN_CONVERT_NOOP(i16, u16, 1)
KFR_INTRIN_CONVERT_NOOP(u32, i32, 1)
KFR_INTRIN_CONVERT_NOOP(i32, u32, 1)
KFR_INTRIN_CONVERT_NOOP(u64, i64, 1)
KFR_INTRIN_CONVERT_NOOP(i64, u64, 1)

// Conversions between signed and unsigned integers of the same width for vectors whose
// total size is 128 bits (16 x 8, 8 x 16, 4 x 32, 2 x 64).
// Generated with KFR_INTRIN_CONVERT_NOOP_REF, i.e. the argument is returned by const reference.
KFR_INTRIN_CONVERT_NOOP_REF(u8, i8, 16)
KFR_INTRIN_CONVERT_NOOP_REF(i8, u8, 16)
KFR_INTRIN_CONVERT_NOOP_REF(u16, i16, 8)
KFR_INTRIN_CONVERT_NOOP_REF(i16, u16, 8)
KFR_INTRIN_CONVERT_NOOP_REF(u32, i32, 4)
KFR_INTRIN_CONVERT_NOOP_REF(i32, u32, 4)
KFR_INTRIN_CONVERT_NOOP_REF(u64, i64, 2)
KFR_INTRIN_CONVERT_NOOP_REF(i64, u64, 2)

#endif // CMT_ARCH_SSE2

#ifdef CMT_ARCH_SSE41
Expand Down Expand Up @@ -707,12 +756,34 @@ KFR_INTRINSIC simd_array<T, N> to_simd_array(const simd<T, N>& x) CMT_NOEXCEPT
return bitcast_anything<simd_array<T, N>>(x);
}

// from_simd_array converts a simd_array<T, N> back into simd<T, N>.
// Two implementations are selected by the compiler macro CMT_COMPILER_MSVC.
// NOTE(review): the reason MSVC needs the element-wise path for small arrays is not
// visible here — confirm before changing.
#if defined CMT_COMPILER_MSVC

// MSVC, simd<T, N> is not a simd_small_array: reinterpret the bits with bitcast_anything.
template <typename T, size_t N, KFR_ENABLE_IF(!is_simd_small_array<simd<T, N>>::value)>
KFR_INTRINSIC simd<T, N> from_simd_array(const simd_array<T, N>& x) CMT_NOEXCEPT
{
return bitcast_anything<simd<T, N>>(x);
}

// Helper: builds simd<T, N> by brace-initialization from the individual elements
// x.val[indices]..., one per index in the supplied index pack.
template <typename T, size_t N, size_t... indices>
KFR_INTRINSIC simd<T, N> from_simd_array_impl(const simd_array<T, N>& x, csizes_t<indices...>) CMT_NOEXCEPT
{
return { x.val[indices]... };
}

// MSVC, simd<T, N> is a simd_small_array: construct it element by element through
// from_simd_array_impl, passing csizeseq<N> as the index pack.
template <typename T, size_t N, KFR_ENABLE_IF(is_simd_small_array<simd<T, N>>::value)>
KFR_INTRINSIC simd<T, N> from_simd_array(const simd_array<T, N>& x) CMT_NOEXCEPT
{
return from_simd_array_impl(x, csizeseq<N>);
}
#else
// Other compilers: always reinterpret the bits with bitcast_anything.
template <typename T, size_t N>
KFR_INTRINSIC simd<T, N> from_simd_array(const simd_array<T, N>& x) CMT_NOEXCEPT
{
return bitcast_anything<simd<T, N>>(x);
}

#endif

#define KFR_COMPONENTWISE_RET(code) \
vec<T, N> result; \
for (size_t i = 0; i < N; i++) \
Expand Down Expand Up @@ -815,8 +886,7 @@ KFR_INTRINSIC const simd<T, N>& simd_bitcast(simd_cvt_t<T, T, N>, const simd<T,
template <typename T, size_t N, size_t index>
KFR_INTRINSIC T simd_get_element(const simd<T, N>& value, csize_t<index>) CMT_NOEXCEPT
{
not_optimized(CMT_FUNC_SIGNATURE);
return to_simd_array<T, N>(value).val[index];
return simd_shuffle(simd_t<T, N>{}, value, csizes<index>, overload_auto);
}

template <typename T, size_t N, size_t index>
Expand Down Expand Up @@ -1022,53 +1092,15 @@ using simd_storage = struct_with_alignment<simd<T, N>, A>;

CMT_PRAGMA_GNU(GCC diagnostic pop)

// Reads a simd<T, N> from src for power-of-two N.
// src is reinterpret_cast to simd_storage<T, N, A>::const_pointer and the `value` member
// of the pointed-to storage is returned. A defaults to false.
template <size_t N, bool A = false, typename T, KFR_ENABLE_IF(is_poweroftwo(N))>
KFR_INTRINSIC simd<T, N> simd_read(const T* src) CMT_NOEXCEPT
{
return reinterpret_cast<typename simd_storage<T, N, A>::const_pointer>(src)->value;
}

// Reads a simd<T, N> from src when N is not a power of two.
// The read is split in two parts: the leading prev_poweroftwo(N) elements are read with
// the caller's A, the remaining N - first elements are read from src + first with A = false.
// The tail is widened to `first` elements and both parts are joined with simd_shuffle.
// The extra `typename = void` parameter gives this overload a template signature distinct
// from the power-of-two overload.
template <size_t N, bool A = false, typename T, KFR_ENABLE_IF(!is_poweroftwo(N)), typename = void>
KFR_INTRINSIC simd<T, N> simd_read(const T* src) CMT_NOEXCEPT
{
constexpr size_t first = prev_poweroftwo(N);
constexpr size_t rest = N - first;
// Indices that widen the `rest`-element tail to `first` elements.
// NOTE(review): the padding entries are presumably index_undefined (don't-care lanes) — confirm csizeseq_t semantics.
constexpr auto extend_indices =
cconcat(csizeseq_t<rest>(), csizeseq_t<first - rest, index_undefined, 0>());
// NOTE(review): presumably the sequence 0..N-1, selecting the first N lanes of the joined pair — confirm.
constexpr auto concat_indices = cvalseq_t<size_t, N>();
return simd_shuffle(
simd2_t<T, first, first>{}, simd_read<first, A>(src),
simd_shuffle(simd_t<T, rest>{}, simd_read<rest, false>(src + first), extend_indices, overload_auto),
concat_indices, overload_auto);
}

// Writes a simd<T, N> to dest for power-of-two N.
// dest is reinterpret_cast to simd_storage<T, N, A>::pointer and `value` is assigned to
// the `value` member of the pointed-to storage. A defaults to false.
template <bool A = false, size_t N, typename T, KFR_ENABLE_IF(is_poweroftwo(N))>
KFR_INTRINSIC void simd_write(T* dest, const simd<T, N>& value) CMT_NOEXCEPT
{
reinterpret_cast<typename simd_storage<T, N, A>::pointer>(dest)->value = value;
}

// Writes a simd<T, N> to dest when N is not a power of two.
// The vector is split with simd_shuffle into a leading part of prev_poweroftwo(N) elements,
// written to dest with the caller's A, and a tail of N - first elements, written to
// dest + first with A = false.
// The extra `typename = void` parameter gives this overload a template signature distinct
// from the power-of-two overload.
template <bool A = false, size_t N, typename T, KFR_ENABLE_IF(!is_poweroftwo(N)), typename = void>
KFR_INTRINSIC void simd_write(T* dest, const simd<T, N>& value) CMT_NOEXCEPT
{
constexpr size_t first = prev_poweroftwo(N);
constexpr size_t rest = N - first;
simd_write<A, first>(dest, simd_shuffle(simd_t<T, N>{}, value, csizeseq_t<first>(), overload_auto));
// NOTE(review): csizeseq_t<rest, first> presumably yields `rest` indices starting at `first` — confirm.
simd_write<false, rest>(dest + first,
simd_shuffle(simd_t<T, N>{}, value, csizeseq_t<rest, first>(), overload_auto));
}

// Returns the element of `value` at a runtime `index`.
// The vector is converted to a simd_array with to_simd_array and its `val` member is
// indexed directly; no bounds check on `index` is performed here.
template <typename T, size_t N>
KFR_INTRINSIC T simd_get_element(const simd<T, N>& value, size_t index) CMT_NOEXCEPT
{
// NOTE(review): not_optimized presumably flags this as a slow fallback path — confirm.
not_optimized(CMT_FUNC_SIGNATURE);
return to_simd_array<T, N>(value).val[index];
}

template <typename T, size_t N>
KFR_INTRINSIC simd<T, N> simd_set_element(const simd<T, N>& value, size_t index, T x) CMT_NOEXCEPT
{
not_optimized(CMT_FUNC_SIGNATURE);
simd_array<T, N> arr = to_simd_array<T, N>(value);
arr.val[index] = x;
return from_simd_array(arr);
Expand Down
Loading

0 comments on commit aa603f7

Please sign in to comment.