diff --git a/ci/run.sh b/ci/run.sh
index a27eaa463bd0f..cdcd9b3bbee29 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -7,7 +7,7 @@ set -ex
 # Tests are all super fast anyway, and they fault often enough on travis that
 # having only one thread increases debuggability to be worth it.
 export RUST_TEST_THREADS=1
-#export RUST_BACKTRACE=1
+#export RUST_BACKTRACE=full
 #export RUST_TEST_NOCAPTURE=1
 
 FEATURES="strict,$FEATURES"
diff --git a/coresimd/ppsv/api/arithmetic_reductions.rs b/coresimd/ppsv/api/arithmetic_reductions.rs
index 44d032a1b3797..61494284b5784 100644
--- a/coresimd/ppsv/api/arithmetic_reductions.rs
+++ b/coresimd/ppsv/api/arithmetic_reductions.rs
@@ -4,58 +4,104 @@ macro_rules! impl_arithmetic_reductions {
     ($id:ident, $elem_ty:ident) => {
         impl $id {
-            /// Lane-wise addition of the vector elements.
+            /// Horizontal sum of the vector elements.
             ///
-            /// FIXME: document guarantees with respect to:
-            /// * integers: overflow behavior
-            /// * floats: order and NaNs
+            /// The intrinsic performs a tree-reduction of the vector elements.
+            /// That is, for an 8-element vector:
+            ///
+            /// > ((x0 + x1) + (x2 + x3)) + ((x4 + x5) + (x6 + x7))
+            ///
+            /// # Integer vectors
+            ///
+            /// If an operation overflows, it returns the mathematical result
+            /// modulo `2^n`, where `n` is the number of bits of the element
+            /// type.
+            ///
+            /// # Floating-point vectors
+            ///
+            /// If one of the vector elements is `NaN`, the reduction returns
+            /// `NaN`.
             #[cfg(not(target_arch = "aarch64"))]
             #[inline]
-            pub fn sum(self) -> $elem_ty {
+            pub fn wrapping_sum(self) -> $elem_ty {
                 use coresimd::simd_llvm::simd_reduce_add_ordered;
                 unsafe { simd_reduce_add_ordered(self, 0 as $elem_ty) }
             }
-            /// Lane-wise addition of the vector elements.
+            /// Horizontal sum of the vector elements.
+            ///
+            /// The intrinsic performs a tree-reduction of the vector elements.
+            /// That is, for an 8-element vector:
+            ///
+            /// > ((x0 + x1) + (x2 + x3)) + ((x4 + x5) + (x6 + x7))
+            ///
+            /// # Integer vectors
+            ///
+            /// If an operation overflows, it returns the mathematical result
+            /// modulo `2^n`, where `n` is the number of bits of the element
+            /// type.
+            ///
+            /// # Floating-point vectors
             ///
-            /// FIXME: document guarantees with respect to:
-            /// * integers: overflow behavior
-            /// * floats: order and NaNs
+            /// If one of the vector elements is `NaN`, the reduction returns
+            /// `NaN`.
             #[cfg(target_arch = "aarch64")]
             #[inline]
-            pub fn sum(self) -> $elem_ty {
+            pub fn wrapping_sum(self) -> $elem_ty {
                 // FIXME: broken on AArch64
                 // https://bugs.llvm.org/show_bug.cgi?id=36796
+                use super::codegen::wrapping::Wrapping;
                 let mut x = self.extract(0) as $elem_ty;
                 for i in 1..$id::lanes() {
-                    x += self.extract(i) as $elem_ty;
+                    x = Wrapping::add(x, self.extract(i) as $elem_ty);
                 }
                 x
             }
-            /// Lane-wise multiplication of the vector elements.
+            /// Horizontal product of the vector elements.
             ///
-            /// FIXME: document guarantees with respect to:
-            /// * integers: overflow behavior
-            /// * floats: order and NaNs
+            /// The intrinsic performs a tree-reduction of the vector elements.
+            /// That is, for an 8-element vector:
+            ///
+            /// > ((x0 * x1) * (x2 * x3)) * ((x4 * x5) * (x6 * x7))
+            ///
+            /// # Integer vectors
+            ///
+            /// If an operation overflows, it returns the mathematical result
+            /// modulo `2^n`, where `n` is the number of bits of the element
+            /// type.
+            ///
+            /// # Floating-point vectors
+            ///
+            /// If one of the vector elements is `NaN`, the reduction returns
+            /// `NaN`.
             #[cfg(not(target_arch = "aarch64"))]
             #[inline]
-            pub fn product(self) -> $elem_ty {
+            pub fn wrapping_product(self) -> $elem_ty {
                 use coresimd::simd_llvm::simd_reduce_mul_ordered;
                 unsafe { simd_reduce_mul_ordered(self, 1 as $elem_ty) }
             }
-            /// Lane-wise multiplication of the vector elements.
+            /// Horizontal product of the vector elements.
+            ///
+            /// The intrinsic performs a tree-reduction of the vector elements.
+            /// That is, for an 8-element vector:
+            ///
+            /// > ((x0 * x1) * (x2 * x3)) * ((x4 * x5) * (x6 * x7))
+            ///
+            /// # Integer vectors
+            ///
+            /// If an operation overflows, it returns the mathematical result
+            /// modulo `2^n`, where `n` is the number of bits of the element
+            /// type.
+            ///
+            /// # Floating-point vectors
             ///
-            /// FIXME: document guarantees with respect to:
-            /// * integers: overflow behavior
-            /// * floats: order and NaNs
+            /// If one of the vector elements is `NaN`, the reduction returns
+            /// `NaN`.
             #[cfg(target_arch = "aarch64")]
             #[inline]
-            pub fn product(self) -> $elem_ty {
+            pub fn wrapping_product(self) -> $elem_ty {
                 // FIXME: broken on AArch64
                 // https://bugs.llvm.org/show_bug.cgi?id=36796
+                use super::codegen::wrapping::Wrapping;
                 let mut x = self.extract(0) as $elem_ty;
                 for i in 1..$id::lanes() {
-                    x *= self.extract(i) as $elem_ty;
+                    x = Wrapping::mul(x, self.extract(i) as $elem_ty);
                 }
                 x
             }
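For reference, the documented tree-reduction order and wrapping behavior, written out for a plain slice. A standalone sketch, not part of the patch (`tree_sum` is a hypothetical helper in plain `std` Rust, assuming a power-of-two-length slice like the vector types here):

```rust
// Hypothetical helper: the documented tree-reduction order.
fn tree_sum(xs: &[i32]) -> i32 {
    match xs.len() {
        1 => xs[0],
        n => {
            let (left, right) = xs.split_at(n / 2);
            tree_sum(left).wrapping_add(tree_sum(right))
        }
    }
}

fn main() {
    // ((1 + 2) + (3 + 4)) + ((5 + 6) + (7 + 8)) = 36
    assert_eq!(tree_sum(&[1, 2, 3, 4, 5, 6, 7, 8]), 36);
    // Integer overflow wraps modulo 2^32 instead of panicking:
    assert_eq!(tree_sum(&[i32::max_value(), 1, 0, 0]), i32::min_value());
}
```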
@@ -78,25 +124,25 @@ macro_rules! test_arithmetic_reductions {
         }
 
         #[test]
-        fn sum() {
+        fn wrapping_sum() {
             use coresimd::simd::$id;
             let v = $id::splat(0 as $elem_ty);
-            assert_eq!(v.sum(), 0 as $elem_ty);
+            assert_eq!(v.wrapping_sum(), 0 as $elem_ty);
             let v = $id::splat(1 as $elem_ty);
-            assert_eq!(v.sum(), $id::lanes() as $elem_ty);
+            assert_eq!(v.wrapping_sum(), $id::lanes() as $elem_ty);
             let v = alternating(2);
             assert_eq!(
-                v.sum(),
+                v.wrapping_sum(),
                 ($id::lanes() / 2 + $id::lanes()) as $elem_ty
             );
         }
 
         #[test]
-        fn product() {
+        fn wrapping_product() {
             use coresimd::simd::$id;
             let v = $id::splat(0 as $elem_ty);
-            assert_eq!(v.product(), 0 as $elem_ty);
+            assert_eq!(v.wrapping_product(), 0 as $elem_ty);
             let v = $id::splat(1 as $elem_ty);
-            assert_eq!(v.product(), 1 as $elem_ty);
+            assert_eq!(v.wrapping_product(), 1 as $elem_ty);
             let f = match $id::lanes() {
                 64 => 16,
                 32 => 8,
@@ -105,7 +151,7 @@ macro_rules! test_arithmetic_reductions {
             };
             let v = alternating(f);
             assert_eq!(
-                v.product(),
+                v.wrapping_product(),
                 (2_usize.pow(($id::lanes() / f) as u32) as $elem_ty)
             );
         }
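A concrete instance of the overflow wording in the docs above — a standalone sketch, not part of the patch, of a `u8` product wrapping modulo `2^8`:

```rust
// Four u8 "lanes" of 17: 17^4 = 83521 = 0x14641; only the low
// 8 bits (0x41 = 65) survive the wrapping product.
fn main() {
    let lanes = [17u8, 17, 17, 17];
    let p = lanes.iter().fold(1u8, |acc, &x| acc.wrapping_mul(x));
    assert_eq!(p, 65);
    assert_eq!(83521 % 256, 65);
}
```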
diff --git a/coresimd/ppsv/api/minmax_reductions.rs b/coresimd/ppsv/api/minmax_reductions.rs
index 7b380c615b62d..6a791d2df2a73 100644
--- a/coresimd/ppsv/api/minmax_reductions.rs
+++ b/coresimd/ppsv/api/minmax_reductions.rs
@@ -4,22 +4,19 @@ macro_rules! impl_minmax_reductions {
     ($id:ident, $elem_ty:ident) => {
         impl $id {
-            /// Largest vector value.
-            ///
-            /// FIXME: document behavior for float vectors with NaNs.
-            #[cfg(not(target_arch = "aarch64"))]
+            /// Largest vector element value.
+            #[cfg(not(any(target_arch = "aarch64", target_arch = "arm")))]
             #[inline]
-            pub fn max(self) -> $elem_ty {
+            pub fn max_element(self) -> $elem_ty {
                 use coresimd::simd_llvm::simd_reduce_max;
                 unsafe { simd_reduce_max(self) }
             }
-            /// Largest vector value.
-            ///
-            /// FIXME: document behavior for float vectors with NaNs.
-            #[cfg(target_arch = "aarch64")]
+
+            /// Largest vector element value.
+            #[cfg(any(target_arch = "aarch64", target_arch = "arm"))]
             #[allow(unused_imports)]
             #[inline]
-            pub fn max(self) -> $elem_ty {
+            pub fn max_element(self) -> $elem_ty {
                 // FIXME: broken on AArch64
                 // https://bugs.llvm.org/show_bug.cgi?id=36796
                 use cmp::Ord;
@@ -31,22 +28,19 @@ macro_rules! impl_minmax_reductions {
                 x
             }
-            /// Smallest vector value.
-            ///
-            /// FIXME: document behavior for float vectors with NaNs.
-            #[cfg(not(target_arch = "aarch64"))]
+            /// Smallest vector element value.
+            #[cfg(not(any(target_arch = "aarch64", target_arch = "arm")))]
             #[inline]
-            pub fn min(self) -> $elem_ty {
+            pub fn min_element(self) -> $elem_ty {
                 use coresimd::simd_llvm::simd_reduce_min;
                 unsafe { simd_reduce_min(self) }
             }
-            /// Smallest vector value.
-            ///
-            /// FIXME: document behavior for float vectors with NaNs.
-            #[cfg(target_arch = "aarch64")]
+
+            /// Smallest vector element value.
+            #[cfg(any(target_arch = "aarch64", target_arch = "arm"))]
             #[allow(unused_imports)]
             #[inline]
-            pub fn min(self) -> $elem_ty {
+            pub fn min_element(self) -> $elem_ty {
                 // FIXME: broken on AArch64
                 // https://bugs.llvm.org/show_bug.cgi?id=36796
                 use cmp::Ord;
@@ -65,29 +59,29 @@ macro_rules! test_minmax_reductions {
     ($id:ident, $elem_ty:ident) => {
         #[test]
-        fn max() {
+        fn max_element() {
             use coresimd::simd::$id;
             let v = $id::splat(0 as $elem_ty);
-            assert_eq!(v.max(), 0 as $elem_ty);
+            assert_eq!(v.max_element(), 0 as $elem_ty);
             let v = v.replace(1, 1 as $elem_ty);
-            assert_eq!(v.max(), 1 as $elem_ty);
+            assert_eq!(v.max_element(), 1 as $elem_ty);
             let v = v.replace(0, 2 as $elem_ty);
-            assert_eq!(v.max(), 2 as $elem_ty);
+            assert_eq!(v.max_element(), 2 as $elem_ty);
         }
 
         #[test]
-        fn min() {
+        fn min_element() {
             use coresimd::simd::$id;
             let v = $id::splat(0 as $elem_ty);
-            assert_eq!(v.min(), 0 as $elem_ty);
+            assert_eq!(v.min_element(), 0 as $elem_ty);
             let v = v.replace(1, 1 as $elem_ty);
-            assert_eq!(v.min(), 0 as $elem_ty);
+            assert_eq!(v.min_element(), 0 as $elem_ty);
             let v = $id::splat(1 as $elem_ty);
             let v = v.replace(0, 2 as $elem_ty);
-            assert_eq!(v.min(), 1 as $elem_ty);
+            assert_eq!(v.min_element(), 1 as $elem_ty);
             let v = $id::splat(2 as $elem_ty);
             let v = v.replace(1, 1 as $elem_ty);
-            assert_eq!(v.min(), 1 as $elem_ty);
+            assert_eq!(v.min_element(), 1 as $elem_ty);
         }
     };
 }
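The aarch64/arm fallback presumably folds the lanes with scalar `min`/`max` (hence the `use cmp::Ord`), whose NaN behavior is what the NaN tests added later in this patch rely on. A standalone scalar sketch, not part of the patch:

```rust
// Scalar f32::min/f32::max return the non-NaN operand when exactly
// one operand is NaN, so a fold over the lanes only yields NaN when
// every lane is NaN.
fn main() {
    let nan = ::std::f32::NAN;
    assert_eq!(nan.min(-3.0), -3.0);
    assert_eq!((-3.0f32).max(nan), -3.0);

    // One non-NaN lane is enough to recover a non-NaN result:
    let lanes = [nan, -3.0, nan, nan];
    let m = lanes.iter().fold(nan, |a, &b| a.min(b));
    assert_eq!(m, -3.0);

    // All-NaN input stays NaN:
    let all_nan = [nan; 4].iter().fold(nan, |a, &b| a.min(b));
    assert!(all_nan.is_nan());
}
```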
diff --git a/coresimd/ppsv/mod.rs b/coresimd/ppsv/mod.rs
index 6e793fc88e276..3067905a640ab 100644
--- a/coresimd/ppsv/mod.rs
+++ b/coresimd/ppsv/mod.rs
@@ -78,3 +78,52 @@ impl<T> FromBits<T> for T {
         t
     }
 }
+
+/// Workarounds for code generation issues.
+#[cfg(target_arch = "aarch64")]
+mod codegen {
+    #[cfg(target_arch = "aarch64")]
+    pub mod wrapping {
+        pub trait Wrapping {
+            fn add(self, other: Self) -> Self;
+            fn mul(self, other: Self) -> Self;
+        }
+
+        macro_rules! int_impl {
+            ($id:ident) => {
+                impl Wrapping for $id {
+                    fn add(self, other: Self) -> Self {
+                        self.wrapping_add(other)
+                    }
+                    fn mul(self, other: Self) -> Self {
+                        self.wrapping_mul(other)
+                    }
+                }
+            };
+        }
+        int_impl!(i8);
+        int_impl!(i16);
+        int_impl!(i32);
+        int_impl!(i64);
+        int_impl!(u8);
+        int_impl!(u16);
+        int_impl!(u32);
+        int_impl!(u64);
+
+        macro_rules! float_impl {
+            ($id:ident) => {
+                impl Wrapping for $id {
+                    fn add(self, other: Self) -> Self {
+                        self + other
+                    }
+                    fn mul(self, other: Self) -> Self {
+                        self * other
+                    }
+                }
+            };
+        }
+        float_impl!(f32);
+        float_impl!(f64);
+    }
+}
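The point of the `Wrapping` trait: one generic fallback loop can reduce both integer and float vectors without specializing on the element type. A standalone sketch, not part of the patch (`reduce` is a hypothetical stand-in for the aarch64 fallback loop, with one impl of each kind):

```rust
trait Wrapping {
    fn add(self, other: Self) -> Self;
}

impl Wrapping for u8 {
    fn add(self, other: Self) -> Self {
        self.wrapping_add(other) // integers wrap instead of panicking
    }
}

impl Wrapping for f32 {
    fn add(self, other: Self) -> Self {
        self + other // floats just use plain addition
    }
}

// Hypothetical stand-in for the fallback reduction loop.
fn reduce<T: Wrapping + Copy>(xs: &[T]) -> T {
    let mut x = xs[0];
    for &e in &xs[1..] {
        x = Wrapping::add(x, e);
    }
    x
}

fn main() {
    assert_eq!(reduce(&[250u8, 10, 10]), 14); // 270 mod 2^8
    assert_eq!(reduce(&[1.5f32, 2.5]), 4.0);
}
```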
diff --git a/crates/coresimd/tests/reductions.rs b/crates/coresimd/tests/reductions.rs
new file mode 100644
index 0000000000000..1ce2df28732cc
--- /dev/null
+++ b/crates/coresimd/tests/reductions.rs
@@ -0,0 +1,542 @@
+#![feature(cfg_target_feature, stdsimd, target_feature)]
+
+#[macro_use]
+extern crate stdsimd;
+
+use stdsimd::simd::*;
+
+#[cfg(target_arch = "powerpc")]
+macro_rules! is_powerpc_feature_detected {
+    ($t:tt) => {
+        false
+    };
+}
+
+macro_rules! invoke_arch {
+    ($macro:ident, $feature_macro:ident, $id:ident, $elem_ty:ident,
+     [$($feature:tt),*]) => {
+        $($macro!($feature, $feature_macro, $id, $elem_ty);)*
+    }
+}
+
+macro_rules! invoke_vectors {
+    ($macro:ident, [$(($id:ident, $elem_ty:ident)),*]) => {
+        $(
+            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+            invoke_arch!($macro, is_x86_feature_detected, $id, $elem_ty,
+                         ["sse", "sse2", "sse3", "ssse3", "sse4.1",
+                          "sse4.2", "sse4a", "avx", "avx2", "avx512f"]);
+            #[cfg(target_arch = "aarch64")]
+            invoke_arch!($macro, is_aarch64_feature_detected, $id, $elem_ty,
+                         ["neon"]);
+            #[cfg(all(target_arch = "arm", target_feature = "v7",
+                      target_feature = "neon"))]
+            invoke_arch!($macro, is_arm_feature_detected, $id, $elem_ty,
+                         ["neon"]);
+            #[cfg(target_arch = "powerpc")]
+            invoke_arch!($macro, is_powerpc_feature_detected, $id, $elem_ty,
+                         ["altivec"]);
+            #[cfg(target_arch = "powerpc64")]
+            invoke_arch!($macro, is_powerpc64_feature_detected, $id, $elem_ty,
+                         ["altivec"]);
+        )*
+    }
+}
+
+macro_rules! finvoke {
+    ($macro:ident) => {
+        invoke_vectors!(
+            $macro,
+            [
+                (f32x2, f32),
+                (f32x4, f32),
+                (f32x8, f32),
+                (f32x16, f32),
+                (f64x2, f64),
+                (f64x4, f64),
+                (f64x8, f64)
+            ]
+        );
+    };
+}
+
+macro_rules! iinvoke {
+    ($macro:ident) => {
+        invoke_vectors!(
+            $macro,
+            [
+                (i8x2, i8),
+                (i8x4, i8),
+                (i8x8, i8),
+                (i8x16, i8),
+                (i8x32, i8),
+                (i8x64, i8),
+                (i16x2, i16),
+                (i16x4, i16),
+                (i16x8, i16),
+                (i16x16, i16),
+                (i16x32, i16),
+                (i32x2, i32),
+                (i32x4, i32),
+                (i32x8, i32),
+                (i32x16, i32),
+                (i64x2, i64),
+                (i64x4, i64),
+                (i64x8, i64),
+                (u8x2, u8),
+                (u8x4, u8),
+                (u8x8, u8),
+                (u8x16, u8),
+                (u8x32, u8),
+                (u8x64, u8),
+                (u16x2, u16),
+                (u16x4, u16),
+                (u16x8, u16),
+                (u16x16, u16),
+                (u16x32, u16),
+                (u32x2, u32),
+                (u32x4, u32),
+                (u32x8, u32),
+                (u32x16, u32),
+                (u64x2, u64),
+                (u64x4, u64),
+                (u64x8, u64)
+            ]
+        );
+    };
+}
+
+macro_rules! min_nan_test {
+    ($feature:tt, $feature_macro:ident, $id:ident, $elem_ty:ident) => {
+        if $feature_macro!($feature) {
+            #[target_feature(enable = $feature)]
+            unsafe fn test_fn() {
+                let n0 = ::std::$elem_ty::NAN;
+
+                assert_eq!(n0.min(-3.0), -3.0);
+                assert_eq!((-3.0 as $elem_ty).min(n0), -3.0);
+
+                let v0 = $id::splat(-3.0);
+
+                // FIXME (https://github.com/rust-lang-nursery/stdsimd/issues/408):
+                // When the last element is NaN the current implementation
+                // produces incorrect results.
+                let bugbug = 1;
+                for i in 0..$id::lanes() - bugbug {
+                    let mut v = v0.replace(i, n0);
+                    // If there is a NaN, the result is always the smallest
+                    // element:
+                    assert_eq!(
+                        v.min_element(),
+                        -3.0,
+                        "nan at {} => {} | {:?} | {:X}",
+                        i,
+                        v.min_element(),
+                        v,
+                        v.as_int()
+                    );
+                    for j in 0..i {
+                        v = v.replace(j, n0);
+                        assert_eq!(
+                            v.min_element(),
+                            -3.0,
+                            "nan at {} => {} | {:?} | {:X}",
+                            i,
+                            v.min_element(),
+                            v,
+                            v.as_int()
+                        );
+                    }
+                }
+                // If the vector contains all NaNs the result is NaN:
+                let vn = $id::splat(n0);
+                assert!(
+                    vn.min_element().is_nan(),
+                    "all nans | v={:?} | min={} | is_nan: {}",
+                    vn,
+                    vn.min_element(),
+                    vn.min_element().is_nan()
+                );
+            }
+            unsafe { test_fn() };
+        }
+    }
+}
+
+#[test]
+fn min_nan() {
+    finvoke!(min_nan_test);
+}
+
+macro_rules! max_nan_test {
+    ($feature:tt, $feature_macro:ident, $id:ident, $elem_ty:ident) => {
+        if $feature_macro!($feature) {
+            #[target_feature(enable = $feature)]
+            unsafe fn test_fn() {
+                let n0 = ::std::$elem_ty::NAN;
+
+                assert_eq!(n0.max(-3.0), -3.0);
+                assert_eq!((-3.0 as $elem_ty).max(n0), -3.0);
+
+                let v0 = $id::splat(-3.0);
+
+                // FIXME (https://github.com/rust-lang-nursery/stdsimd/issues/408):
+                // When the last element is NaN the current implementation
+                // produces incorrect results.
+                let bugbug = 1;
+                for i in 0..$id::lanes() - bugbug {
+                    let mut v = v0.replace(i, n0);
+                    // If there is a NaN, the result is always the largest
+                    // element:
+                    assert_eq!(
+                        v.max_element(),
+                        -3.0,
+                        "nan at {} => {} | {:?} | {:X}",
+                        i,
+                        v.max_element(),
+                        v,
+                        v.as_int()
+                    );
+                    for j in 0..i {
+                        v = v.replace(j, n0);
+                        assert_eq!(
+                            v.max_element(),
+                            -3.0,
+                            "nan at {} => {} | {:?} | {:X}",
+                            i,
+                            v.max_element(),
+                            v,
+                            v.as_int()
+                        );
+                    }
+                }
+
+                // If the vector contains all NaNs the result is NaN:
+                let vn = $id::splat(n0);
+                assert!(
+                    vn.max_element().is_nan(),
+                    "all nans | v={:?} | max={} | is_nan: {}",
+                    vn,
+                    vn.max_element(),
+                    vn.max_element().is_nan()
+                );
+            }
+            unsafe { test_fn() };
+        }
+    }
+}
+
+#[test]
+fn max_nan() {
+    finvoke!(max_nan_test);
+}
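Each `*_test` macro above expands to the same gate-then-call pattern: detect the feature at runtime, then invoke a `#[target_feature]`-annotated function through `unsafe`. A minimal standalone sketch, not part of the patch (assumes an x86 host and the `is_x86_feature_detected!` macro being available):

```rust
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn main() {
    // Only run the body if the CPU actually supports the feature.
    if is_x86_feature_detected!("sse2") {
        #[target_feature(enable = "sse2")]
        unsafe fn test_fn() {
            // the body may rely on sse2 being available
        }
        // Calling a #[target_feature] fn requires `unsafe`.
        unsafe { test_fn() };
    }
}

#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
fn main() {}
```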
+macro_rules! wrapping_sum_nan_test {
+    ($feature:tt, $feature_macro:ident, $id:ident, $elem_ty:ident) => {
+        if $feature_macro!($feature) {
+            #[target_feature(enable = $feature)]
+            #[allow(unreachable_code)]
+            unsafe fn test_fn() {
+                // FIXME: https://bugs.llvm.org/show_bug.cgi?id=36732
+                // https://github.com/rust-lang-nursery/stdsimd/issues/409
+                return;
+
+                let n0 = ::std::$elem_ty::NAN;
+                let v0 = $id::splat(-3.0);
+                for i in 0..$id::lanes() {
+                    let mut v = v0.replace(i, n0);
+                    // If the vector contains a NaN the result is NaN:
+                    assert!(
+                        v.wrapping_sum().is_nan(),
+                        "nan at {} => {} | {:?}",
+                        i,
+                        v.wrapping_sum(),
+                        v
+                    );
+                    for j in 0..i {
+                        v = v.replace(j, n0);
+                        assert!(v.wrapping_sum().is_nan());
+                    }
+                }
+                let v = $id::splat(n0);
+                assert!(v.wrapping_sum().is_nan(), "all nans | {:?}", v);
+            }
+            unsafe { test_fn() };
+        }
+    };
+}
+
+#[test]
+fn wrapping_sum_nan() {
+    finvoke!(wrapping_sum_nan_test);
+}
+
+macro_rules! wrapping_product_nan_test {
+    ($feature:tt, $feature_macro:ident, $id:ident, $elem_ty:ident) => {
+        if $feature_macro!($feature) {
+            #[target_feature(enable = $feature)]
+            #[allow(unreachable_code)]
+            unsafe fn test_fn() {
+                // FIXME: https://bugs.llvm.org/show_bug.cgi?id=36732
+                // https://github.com/rust-lang-nursery/stdsimd/issues/409
+                return;
+
+                let n0 = ::std::$elem_ty::NAN;
+                let v0 = $id::splat(-3.0);
+                for i in 0..$id::lanes() {
+                    let mut v = v0.replace(i, n0);
+                    // If the vector contains a NaN the result is NaN:
+                    assert!(
+                        v.wrapping_product().is_nan(),
+                        "nan at {} | {:?}",
+                        i,
+                        v
+                    );
+                    for j in 0..i {
+                        v = v.replace(j, n0);
+                        assert!(v.wrapping_product().is_nan());
+                    }
+                }
+                let v = $id::splat(n0);
+                assert!(
+                    v.wrapping_product().is_nan(),
+                    "all nans | {:?}",
+                    v
+                );
+            }
+            unsafe { test_fn() };
+        }
+    };
+}
+
+#[test]
+fn wrapping_product_nan() {
+    finvoke!(wrapping_product_nan_test);
+}
+
+trait AsInt {
+    type Int;
+    fn as_int(self) -> Self::Int;
+    fn from_int(Self::Int) -> Self;
+}
+
+macro_rules! as_int {
+    ($float:ident, $int:ident) => {
+        impl AsInt for $float {
+            type Int = $int;
+            fn as_int(self) -> $int {
+                unsafe { ::std::mem::transmute(self) }
+            }
+            fn from_int(x: $int) -> $float {
+                unsafe { ::std::mem::transmute(x) }
+            }
+        }
+    };
+}
+
+as_int!(f32, u32);
+as_int!(f64, u64);
+as_int!(f32x2, i32x2);
+as_int!(f32x4, i32x4);
+as_int!(f32x8, i32x8);
+as_int!(f32x16, i32x16);
+as_int!(f64x2, i64x2);
+as_int!(f64x4, i64x4);
+as_int!(f64x8, i64x8);
+
+// FIXME: these fail on i586 for some reason
+#[cfg(not(all(target_arch = "x86", not(target_feature = "sse2"))))]
+mod offset {
+    use super::*;
+
+    trait TreeReduceAdd {
+        type R;
+        fn tree_reduce_add(self) -> Self::R;
+    }
+
+    macro_rules! tree_reduce_add_f {
+        ($elem_ty:ident) => {
+            impl<'a> TreeReduceAdd for &'a [$elem_ty] {
+                type R = $elem_ty;
+                fn tree_reduce_add(self) -> $elem_ty {
+                    if self.len() == 2 {
+                        println!(
+                            " lv: {}, rv: {} => {}",
+                            self[0],
+                            self[1],
+                            self[0] + self[1]
+                        );
+                        self[0] + self[1]
+                    } else {
+                        let mid = self.len() / 2;
+                        let (left, right) = self.split_at(mid);
+                        println!(
+                            " splitting self: {:?} at mid {} into left: {:?}, right: {:?}",
+                            self, mid, left, right
+                        );
+                        Self::tree_reduce_add(left)
+                            + Self::tree_reduce_add(right)
+                    }
+                }
+            }
+        };
+    }
+
+    tree_reduce_add_f!(f32);
+    tree_reduce_add_f!(f64);
+
+    macro_rules! wrapping_sum_roundoff_test {
+        ($feature:tt, $feature_macro:ident, $id:ident, $elem_ty:ident) => {
+            if $feature_macro!($feature) {
+                #[target_feature(enable = $feature)]
+                unsafe fn test_fn() {
+                    let mut start = std::$elem_ty::EPSILON;
+                    let mut sum = 0. as $elem_ty;
+
+                    let mut v = $id::splat(0. as $elem_ty);
+                    for i in 0..$id::lanes() {
+                        let c = if i % 2 == 0 { 1e3 } else { -1. };
+                        start *= 3.14 * c;
+                        sum += start;
+                        // println!("{} | start: {}", stringify!($id), start);
+                        v = v.replace(i, start);
+                    }
+                    let vsum = v.wrapping_sum();
+                    println!("{} | lsum: {}", stringify!($id), sum);
+                    println!("{} | vsum: {}", stringify!($id), vsum);
+                    let r = vsum.as_int() == sum.as_int();
+                    // This is false in general; the intrinsic performs a
+                    // tree-reduce:
+                    println!("{} | equal: {}", stringify!($id), r);
+
+                    let mut a = [0. as $elem_ty; $id::lanes()];
+                    v.store_unaligned(&mut a);
+
+                    let tsum = a.tree_reduce_add();
+                    println!("{} | tsum: {}", stringify!($id), tsum);
+
+                    // tolerate a 1 ULP difference:
+                    if vsum.as_int() > tsum.as_int() {
+                        assert!(
+                            vsum.as_int() - tsum.as_int() < 2,
+                            "v: {:?} | vsum: {} | tsum: {}",
+                            v,
+                            vsum,
+                            tsum
+                        );
+                    } else {
+                        assert!(
+                            tsum.as_int() - vsum.as_int() < 2,
+                            "v: {:?} | vsum: {} | tsum: {}",
+                            v,
+                            vsum,
+                            tsum
+                        );
+                    }
+                }
+                unsafe { test_fn() };
+            }
+        };
+    }
+
+    #[test]
+    fn wrapping_sum_roundoff_test() {
+        finvoke!(wrapping_sum_roundoff_test);
+    }
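Why the roundoff tests compare against a tree reduction rather than a left-to-right fold: the two association orders round differently, so the results may differ in the final ULP. A standalone sketch, not part of the patch (illustrative values only):

```rust
// The same four values summed in two association orders. The results
// may differ by an ULP or so, which is why the test above compares
// bit patterns with a 1-ULP tolerance instead of exact equality.
fn main() {
    let xs = [1.0e3_f32, -1.0, 3.3e-4, 7.1];
    let linear = ((xs[0] + xs[1]) + xs[2]) + xs[3];
    let tree = (xs[0] + xs[1]) + (xs[2] + xs[3]);
    let ulps = (linear.to_bits() as i64 - tree.to_bits() as i64).abs();
    println!(
        "linear: {:.7}, tree: {:.7}, ulp distance: {}",
        linear, tree, ulps
    );
}
```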
+    trait TreeReduceMul {
+        type R;
+        fn tree_reduce_mul(self) -> Self::R;
+    }
+
+    macro_rules! tree_reduce_mul_f {
+        ($elem_ty:ident) => {
+            impl<'a> TreeReduceMul for &'a [$elem_ty] {
+                type R = $elem_ty;
+                fn tree_reduce_mul(self) -> $elem_ty {
+                    if self.len() == 2 {
+                        println!(
+                            " lv: {}, rv: {} => {}",
+                            self[0],
+                            self[1],
+                            self[0] * self[1]
+                        );
+                        self[0] * self[1]
+                    } else {
+                        let mid = self.len() / 2;
+                        let (left, right) = self.split_at(mid);
+                        println!(
+                            " splitting self: {:?} at mid {} into left: {:?}, right: {:?}",
+                            self, mid, left, right
+                        );
+                        Self::tree_reduce_mul(left)
+                            * Self::tree_reduce_mul(right)
+                    }
+                }
+            }
+        };
+    }
+
+    tree_reduce_mul_f!(f32);
+    tree_reduce_mul_f!(f64);
+
+    macro_rules! wrapping_product_roundoff_test {
+        ($feature:tt, $feature_macro:ident, $id:ident, $elem_ty:ident) => {
+            if $feature_macro!($feature) {
+                #[target_feature(enable = $feature)]
+                unsafe fn test_fn() {
+                    let mut start = std::$elem_ty::EPSILON;
+                    let mut mul = 1. as $elem_ty;
+
+                    let mut v = $id::splat(1. as $elem_ty);
+                    for i in 0..$id::lanes() {
+                        let c = if i % 2 == 0 { 1e3 } else { -1. };
+                        start *= 3.14 * c;
+                        mul *= start;
+                        println!("{} | start: {}", stringify!($id), start);
+                        v = v.replace(i, start);
+                    }
+                    let vmul = v.wrapping_product();
+                    println!("{} | lmul: {}", stringify!($id), mul);
+                    println!("{} | vmul: {}", stringify!($id), vmul);
+                    let r = vmul.as_int() == mul.as_int();
+                    // This is false in general; the intrinsic performs a
+                    // tree-reduce:
+                    println!("{} | equal: {}", stringify!($id), r);
+
+                    let mut a = [0. as $elem_ty; $id::lanes()];
+                    v.store_unaligned(&mut a);
+
+                    let tmul = a.tree_reduce_mul();
+                    println!("{} | tmul: {}", stringify!($id), tmul);
+
+                    // tolerate a 1 ULP difference:
+                    if vmul.as_int() > tmul.as_int() {
+                        assert!(
+                            vmul.as_int() - tmul.as_int() < 2,
+                            "v: {:?} | vmul: {} | tmul: {}",
+                            v,
+                            vmul,
+                            tmul
+                        );
+                    } else {
+                        assert!(
+                            tmul.as_int() - vmul.as_int() < 2,
+                            "v: {:?} | vmul: {} | tmul: {}",
+                            v,
+                            vmul,
+                            tmul
+                        );
+                    }
+                }
+                unsafe { test_fn() };
+            }
+        };
+    }
+
+    #[test]
+    fn wrapping_product_roundoff_test() {
+        finvoke!(wrapping_product_roundoff_test);
+    }
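Both roundoff tests inline the same 1-ULP comparison on transmuted bit patterns. Factored out, it looks like this — a standalone sketch, not part of the patch, valid for finite floats of the same sign:

```rust
// For finite floats of the same sign, IEEE-754 bit patterns are
// ordered like the values, so the difference between bit patterns
// counts the representable floats between them.
fn ulp_distance(a: f32, b: f32) -> u32 {
    let (a, b) = (a.to_bits(), b.to_bits());
    if a > b {
        a - b
    } else {
        b - a
    }
}

fn main() {
    let x = 1.0_f32;
    let next = f32::from_bits(x.to_bits() + 1); // smallest float above 1.0
    assert_eq!(ulp_distance(x, next), 1);
    assert!(ulp_distance(x, next) < 2); // within the tests' tolerance
}
```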
+    macro_rules! wrapping_sum_overflow_test {
+        ($feature:tt, $feature_macro:ident, $id:ident, $elem_ty:ident) => {
+            if $feature_macro!($feature) {
+                #[target_feature(enable = $feature)]
+                unsafe fn test_fn() {
+                    let start = $elem_ty::max_value()
+                        - ($id::lanes() as $elem_ty / 2);
+
+                    let v = $id::splat(start as $elem_ty);
+                    let vsum = v.wrapping_sum();
+
+                    let mut sum = start;
+                    for _ in 1..$id::lanes() {
+                        sum = sum.wrapping_add(start);
+                    }
+                    assert_eq!(sum, vsum, "v = {:?}", v);
+                }
+                unsafe { test_fn() };
+            }
+        };
+    }
+
+    #[test]
+    fn wrapping_sum_overflow_test() {
+        iinvoke!(wrapping_sum_overflow_test);
+    }
+
+    macro_rules! wrapping_product_overflow_test {
+        ($feature:tt, $feature_macro:ident, $id:ident, $elem_ty:ident) => {
+            if $feature_macro!($feature) {
+                #[target_feature(enable = $feature)]
+                unsafe fn test_fn() {
+                    let start = $elem_ty::max_value()
+                        - ($id::lanes() as $elem_ty / 2);
+
+                    let v = $id::splat(start as $elem_ty);
+                    let vmul = v.wrapping_product();
+
+                    let mut mul = start;
+                    for _ in 1..$id::lanes() {
+                        mul = mul.wrapping_mul(start);
+                    }
+                    assert_eq!(mul, vmul, "v = {:?}", v);
+                }
+                unsafe { test_fn() };
+            }
+        };
+    }
+
+    #[test]
+    fn wrapping_product_overflow_test() {
+        iinvoke!(wrapping_product_overflow_test);
+    }
+}
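Worked through for a small case, the arithmetic the overflow tests check — a standalone sketch, not part of the patch, using `i8` and 4 hypothetical lanes:

```rust
// start = i8::MAX - 4 / 2 = 125; adding it to itself 4 times gives
// 500 mathematically, and 500 mod 2^8 = 244, i.e. -12 as i8.
fn main() {
    let lanes: i8 = 4;
    let start = i8::max_value() - lanes / 2; // 125
    let mut sum = start;
    for _ in 1..lanes {
        sum = sum.wrapping_add(start);
    }
    assert_eq!(sum, -12);
    assert_eq!(500 % 256, 244);
    assert_eq!(244u64 as i8, -12);
}
```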