From de512f2e91032adec73c32e0a329337fe73d82f4 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Fri, 24 Sep 2021 09:44:02 +0200 Subject: [PATCH] aligned-load sum aggregation --- src/compute/aggregate/sum.rs | 30 +++++++++++++++++++++++++----- src/types/simd/mod.rs | 7 +++++++ src/types/simd/native.rs | 5 +++++ src/types/simd/packed.rs | 5 +++++ 4 files changed, 42 insertions(+), 5 deletions(-) diff --git a/src/compute/aggregate/sum.rs b/src/compute/aggregate/sum.rs index d0609d1e69a..2314aef95c2 100644 --- a/src/compute/aggregate/sum.rs +++ b/src/compute/aggregate/sum.rs @@ -19,23 +19,43 @@ pub trait Sum { fn simd_sum(self) -> T; } +fn split_by_alignment(values: &[T]) -> (&[T], &[T]) { + let alignment = std::mem::align_of::(); + + let vals_ptr = values.as_ptr(); + let bytes_offset = vals_ptr.align_offset(alignment); + let type_offset = if bytes_offset > 0 { + std::mem::align_of::() / bytes_offset + } else { + 0 + }; + + let head = &values[..type_offset]; + let aligned_values = &values[type_offset..]; + (head, aligned_values) +} + #[multiversion] #[clone(target = "x86_64+avx")] fn nonnull_sum(values: &[T]) -> T where - T: NativeType + Simd, + T: NativeType + Simd + Add + std::iter::Sum, T::Simd: Add + Sum, { - let mut chunks = values.chunks_exact(T::Simd::LANES); + let (head, aligned_values) = split_by_alignment::(values); + + let mut chunks = aligned_values.chunks_exact(T::Simd::LANES); + // Safety: + // we just made sure that we work on a slice of data aligned to T::Simd let sum = chunks.by_ref().fold(T::Simd::default(), |acc, chunk| { - acc + T::Simd::from_chunk(chunk) + acc + unsafe { T::Simd::from_chunk_aligned_unchecked(chunk) } }); let remainder = T::Simd::from_incomplete_chunk(chunks.remainder(), T::default()); let reduced = sum + remainder; - reduced.simd_sum() + reduced.simd_sum() + head.iter().copied().sum() } /// # Panics @@ -90,7 +110,7 @@ where /// Returns `None` if the array is empty or only contains null values. 
pub fn sum_primitive(array: &PrimitiveArray) -> Option where - T: NativeType + Simd, + T: NativeType + Simd + Add + std::iter::Sum, T::Simd: Add + Sum, { let null_count = array.null_count(); diff --git a/src/types/simd/mod.rs b/src/types/simd/mod.rs index a52c257d76a..41745dd81ba 100644 --- a/src/types/simd/mod.rs +++ b/src/types/simd/mod.rs @@ -28,6 +28,13 @@ pub trait NativeSimd: Default { /// * iff `v.len()` != `T::LANES` fn from_chunk(v: &[Self::Native]) -> Self; + /// Convert itself from a slice. + /// # Safety: + /// Caller must ensure: + /// * `v.len() == T::LANES` + /// * slice is aligned to `Self` + unsafe fn from_chunk_aligned_unchecked(v: &[Self::Native]) -> Self; + /// creates a new Self from `v` by populating items from `v` up to its length. /// Items from `v` at positions larger than the number of lanes are ignored; /// remaining items are populated with `remaining`. diff --git a/src/types/simd/native.rs b/src/types/simd/native.rs index 7dd007bed74..f2b55117d4a 100644 --- a/src/types/simd/native.rs +++ b/src/types/simd/native.rs @@ -29,6 +29,11 @@ macro_rules! simd { ($name)(v.try_into().unwrap()) } + #[inline] + unsafe fn from_chunk_aligned_unchecked(v: &[$type]) -> Self { + ($name)(v.try_into().unwrap()) + } + #[inline] fn from_incomplete_chunk(v: &[$type], remaining: $type) -> Self { let mut a = [remaining; $lanes]; diff --git a/src/types/simd/packed.rs b/src/types/simd/packed.rs index b8bb35d9806..cb5096ca826 100644 --- a/src/types/simd/packed.rs +++ b/src/types/simd/packed.rs @@ -23,6 +23,11 @@ macro_rules! simd { <$name>::from_slice_unaligned(v) } + #[inline] + unsafe fn from_chunk_aligned_unchecked(v: &[$type]) -> Self { + <$name>::from_slice_aligned_unchecked(v) + } + #[inline] fn from_incomplete_chunk(v: &[$type], remaining: $type) -> Self { let mut a = [remaining; $lanes];