Skip to content

Commit

Permalink
Completes SSE and adds some MMX intrinsics (rust-lang#247)
Browse files Browse the repository at this point in the history
* Completes SSE and adds some MMX intrinsics

MMX:

- `_mm_cmpgt_pi{8,16,32}`
- `_mm_unpack{hi,lo}_pi{8,16,32}`

SSE (is now complete):

- `_mm_cvtp{i,u}{8,16}_ps`
- add test for `_m_pmulhuw`

* fmt and clippy

* add an exception for intrinsics using cvtpi2ps
  • Loading branch information
gnzlbg authored and alexcrichton committed Jan 4, 2018
1 parent 43039ef commit bf6d801
Show file tree
Hide file tree
Showing 14 changed files with 408 additions and 268 deletions.
4 changes: 4 additions & 0 deletions coresimd/src/x86/i586/bswap.rs
@@ -1,3 +1,7 @@
//! Byte swap intrinsics.

#![cfg_attr(feature = "cargo-clippy", allow(stutter))]

#[cfg(test)]
use stdsimd_test::assert_instr;

Expand Down
3 changes: 2 additions & 1 deletion coresimd/src/x86/i586/sse.rs
Expand Up @@ -3299,7 +3299,8 @@ mod tests {
use v64::*;

let a = mem::transmute(i8x8::new(0, 0, 0, 0, 0, 0, 0, 7));
let mut mem = ::std::boxed::Box::<__m64>::new(mem::transmute(i8x8::splat(1)));
let mut mem =
::std::boxed::Box::<__m64>::new(mem::transmute(i8x8::splat(1)));
sse::_mm_stream_pi(&mut *mem as *mut _ as *mut _, a);
assert_eq!(a, *mem);
}
Expand Down
134 changes: 105 additions & 29 deletions coresimd/src/x86/i686/mmx.rs
Expand Up @@ -16,7 +16,7 @@ use stdsimd_test::assert_instr;

/// Constructs a 64-bit integer vector initialized to zero.
#[inline(always)]
#[target_feature = "+mmx,+sse"]
#[target_feature = "+mmx"]
// FIXME: this produces a movl instead of xorps on x86
// FIXME: this produces a xor intrinsic instead of xorps on x86_64
#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(xor))]
Expand All @@ -30,7 +30,7 @@ pub unsafe fn _mm_setzero_si64() -> __m64 {
/// Positive values greater than 0x7F are saturated to 0x7F. Negative values
/// less than 0x80 are saturated to 0x80.
#[inline(always)]
#[target_feature = "+mmx,+sse"]
#[target_feature = "+mmx"]
#[cfg_attr(test, assert_instr(packsswb))]
pub unsafe fn _mm_packs_pi16(a: __m64, b: __m64) -> __m64 {
packsswb(a, b)
Expand All @@ -42,63 +42,93 @@ pub unsafe fn _mm_packs_pi16(a: __m64, b: __m64) -> __m64 {
/// Positive values greater than 0x7F are saturated to 0x7F. Negative values
/// less than 0x80 are saturated to 0x80.
#[inline(always)]
#[target_feature = "+mmx,+sse"]
#[target_feature = "+mmx"]
#[cfg_attr(test, assert_instr(packssdw))]
pub unsafe fn _mm_packs_pi32(a: __m64, b: __m64) -> __m64 {
packssdw(a, b)
}

/// Compares the 8-bit integer elements of two 64-bit integer vectors of
/// [8 x i8] to determine if the element of the first vector is greater than
/// the corresponding element of the second vector.
///
/// The comparison yields 0 for false, 0xFF for true.
/// Compares whether each element of `a` is greater than the corresponding
/// element of `b` returning `0` for `false` and `-1` for `true`.
///
/// Lane-wise greater-than on the eight 8-bit lanes of the `__m64`
/// operands (signed compare, per Intel's `pcmpgtb` semantics).
#[inline(always)]
#[target_feature = "+mmx"]
#[cfg_attr(test, assert_instr(pcmpgtb))]
pub unsafe fn _mm_cmpgt_pi8(a: __m64, b: __m64) -> __m64 {
// Thin wrapper over the `llvm.x86.mmx.pcmpgt.b` intrinsic declared in
// the extern block below.
pcmpgtb(a, b)
}

/// Compares the 16-bit integer elements of two 64-bit integer vectors of
/// [4 x i16] to determine if the element of the first vector is greater than
/// the corresponding element of the second vector.
///
/// The comparison yields 0 for false, 0xFFFF for true.
/// Compares whether each element of `a` is greater than the corresponding
/// element of `b` returning `0` for `false` and `-1` for `true`.
///
/// Lane-wise greater-than on the four 16-bit lanes of the `__m64`
/// operands (signed compare, per Intel's `pcmpgtw` semantics).
#[inline(always)]
#[target_feature = "+mmx"]
#[cfg_attr(test, assert_instr(pcmpgtw))]
pub unsafe fn _mm_cmpgt_pi16(a: __m64, b: __m64) -> __m64 {
// Thin wrapper over the `llvm.x86.mmx.pcmpgt.w` intrinsic.
pcmpgtw(a, b)
}

/// Unpacks the upper 32 bits from two 64-bit integer vectors of
/// [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16].
/// Compares whether each element of `a` is greater than the corresponding
/// element of `b` returning `0` for `false` and `-1` for `true`.
///
/// Lane-wise greater-than on the two 32-bit lanes of the `__m64`
/// operands (signed compare, per Intel's `pcmpgtd` semantics).
#[inline(always)]
#[target_feature = "+mmx"]
#[cfg_attr(test, assert_instr(pcmpgtd))]
pub unsafe fn _mm_cmpgt_pi32(a: __m64, b: __m64) -> __m64 {
// Thin wrapper over the `llvm.x86.mmx.pcmpgt.d` intrinsic.
pcmpgtd(a, b)
}

/// Unpacks the upper two elements from two `i16x4` vectors and interleaves
/// them into the result: `[a.2, b.2, a.3, b.3]`.
#[inline(always)]
#[target_feature = "+mmx"]
// NOTE(review): a previous FIXME here said "punpcklbw expected", but
// `punpcklbw` is the 8-bit *lo*-unpack (see `_mm_unpacklo_pi8`); the
// asserted `punpckhwd` matches this 16-bit hi-unpack intrinsic — the old
// comment looks like a stale copy-paste. Confirm against codegen.
#[cfg_attr(test, assert_instr(punpckhwd))]
pub unsafe fn _mm_unpackhi_pi16(a: __m64, b: __m64) -> __m64 {
// Thin wrapper over the `llvm.x86.mmx.punpckhwd` intrinsic.
punpckhwd(a, b)
}

/// Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8]
/// and interleaves them into a 64-bit integer vector of [8 x i8].
/// Unpacks the upper four elements from two `i8x8` vectors and interleaves
/// them into the result: `[a.4, b.4, a.5, b.5, a.6, b.6, a.7, b.7]`.
#[inline(always)]
#[target_feature = "+mmx"]
#[cfg_attr(test, assert_instr(punpckhbw))]
pub unsafe fn _mm_unpackhi_pi8(a: __m64, b: __m64) -> __m64 {
// Thin wrapper over the `llvm.x86.mmx.punpckhbw` intrinsic.
punpckhbw(a, b)
}

/// Unpacks the lower four elements from two `i8x8` vectors and interleaves
/// them into the result: `[a.0, b.0, a.1, b.1, a.2, b.2, a.3, b.3]`.
#[inline(always)]
#[target_feature = "+mmx"]
#[cfg_attr(test, assert_instr(punpcklbw))]
pub unsafe fn _mm_unpacklo_pi8(a: __m64, b: __m64) -> __m64 {
// Thin wrapper over the `llvm.x86.mmx.punpcklbw` intrinsic.
punpcklbw(a, b)
}

/// Unpacks the lower 32 bits from two 64-bit integer vectors of
/// [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16].
/// Unpacks the lower two elements from two `i16x4` vectors and interleaves
/// them into the result: `[a.0, b.0, a.1, b.1]`.
#[inline(always)]
#[target_feature = "+mmx"]
#[cfg_attr(test, assert_instr(punpcklwd))]
pub unsafe fn _mm_unpacklo_pi16(a: __m64, b: __m64) -> __m64 {
// Thin wrapper over the `llvm.x86.mmx.punpcklwd` intrinsic.
punpcklwd(a, b)
}

/// Unpacks the upper element from two `i32x2` vectors and interleaves them
/// into the result: `[a.1, b.1]`.
#[inline(always)]
#[target_feature = "+mmx"]
#[cfg_attr(test, assert_instr(punpckhdq))]
pub unsafe fn _mm_unpackhi_pi32(a: __m64, b: __m64) -> __m64 {
// Thin wrapper over the `llvm.x86.mmx.punpckhdq` intrinsic.
punpckhdq(a, b)
}

/// Unpacks the lower element from two `i32x2` vectors and interleaves them
/// into the result: `[a.0, b.0]`.
#[inline(always)]
#[target_feature = "+mmx"]
#[cfg_attr(test, assert_instr(punpckldq))]
pub unsafe fn _mm_unpacklo_pi32(a: __m64, b: __m64) -> __m64 {
// Thin wrapper over the `llvm.x86.mmx.punpckldq` intrinsic.
punpckldq(a, b)
}

#[allow(improper_ctypes)]
extern "C" {
#[link_name = "llvm.x86.mmx.packsswb"]
Expand All @@ -109,12 +139,20 @@ extern "C" {
fn pcmpgtb(a: __m64, b: __m64) -> __m64;
#[link_name = "llvm.x86.mmx.pcmpgt.w"]
fn pcmpgtw(a: __m64, b: __m64) -> __m64;
#[link_name = "llvm.x86.mmx.pcmpgt.d"]
fn pcmpgtd(a: __m64, b: __m64) -> __m64;
#[link_name = "llvm.x86.mmx.punpckhwd"]
fn punpckhwd(a: __m64, b: __m64) -> __m64;
#[link_name = "llvm.x86.mmx.punpcklbw"]
fn punpcklbw(a: __m64, b: __m64) -> __m64;
#[link_name = "llvm.x86.mmx.punpcklwd"]
fn punpcklwd(a: __m64, b: __m64) -> __m64;
#[link_name = "llvm.x86.mmx.punpckhbw"]
fn punpckhbw(a: __m64, b: __m64) -> __m64;
#[link_name = "llvm.x86.mmx.punpcklbw"]
fn punpcklbw(a: __m64, b: __m64) -> __m64;
#[link_name = "llvm.x86.mmx.punpckhdq"]
fn punpckhdq(a: __m64, b: __m64) -> __m64;
#[link_name = "llvm.x86.mmx.punpckldq"]
fn punpckldq(a: __m64, b: __m64) -> __m64;
}

#[cfg(test)]
Expand All @@ -123,21 +161,21 @@ mod tests {
use x86::i686::mmx;
use stdsimd_test::simd_test;

#[simd_test = "sse"] // FIXME: should be mmx
#[simd_test = "mmx"]
unsafe fn _mm_setzero_si64() {
let r: __m64 = ::std::mem::transmute(0_i64);
assert_eq!(r, mmx::_mm_setzero_si64());
}

#[simd_test = "sse"] // FIXME: should be mmx
#[simd_test = "mmx"]
unsafe fn _mm_packs_pi16() {
let a = i16x4::new(-1, 2, -3, 4);
let b = i16x4::new(-5, 6, -7, 8);
let r = i8x8::new(-1, 2, -3, 4, -5, 6, -7, 8);
assert_eq!(r, i8x8::from(mmx::_mm_packs_pi16(a.into(), b.into())));
}

#[simd_test = "sse"] // FIXME: should be mmx
#[simd_test = "mmx"]
unsafe fn _mm_packs_pi32() {
let a = i32x2::new(-1, 2);
let b = i32x2::new(-5, 6);
Expand All @@ -162,11 +200,23 @@ mod tests {
}

#[simd_test = "mmx"]
unsafe fn _mm_unpackhi_pi16() {
let a = i16x4::new(0, 1, 2, 3);
let b = i16x4::new(4, 5, 6, 7);
let r = i16x4::new(2, 6, 3, 7);
assert_eq!(r, i16x4::from(mmx::_mm_unpackhi_pi16(a.into(), b.into())));
unsafe fn _mm_cmpgt_pi32() {
let a = i32x2::new(0, 3);
let b = i32x2::new(1, 2);
let r0 = i32x2::new(0, -1);
let r1 = i32x2::new(-1, 0);

assert_eq!(r0, mmx::_mm_cmpgt_pi32(a.into(), b.into()).into());
assert_eq!(r1, mmx::_mm_cmpgt_pi32(b.into(), a.into()).into());
}

#[simd_test = "mmx"]
unsafe fn _mm_unpackhi_pi8() {
let a = i8x8::new(0, 3, 4, 7, 8, 11, 12, 15);
let b = i8x8::new(1, 2, 5, 6, 9, 10, 13, 14);
// Expected: upper halves interleaved,
// [a.4, b.4, a.5, b.5, a.6, b.6, a.7, b.7].
let r = i8x8::new(8, 9, 11, 10, 12, 13, 15, 14);

assert_eq!(r, mmx::_mm_unpackhi_pi8(a.into(), b.into()).into());
}

#[simd_test = "mmx"]
Expand All @@ -177,11 +227,37 @@ mod tests {
assert_eq!(r, i8x8::from(mmx::_mm_unpacklo_pi8(a.into(), b.into())));
}

#[simd_test = "mmx"]
unsafe fn _mm_unpackhi_pi16() {
let a = i16x4::new(0, 1, 2, 3);
let b = i16x4::new(4, 5, 6, 7);
// Expected: upper halves interleaved, [a.2, b.2, a.3, b.3].
let r = i16x4::new(2, 6, 3, 7);
assert_eq!(r, i16x4::from(mmx::_mm_unpackhi_pi16(a.into(), b.into())));
}

#[simd_test = "mmx"]
unsafe fn _mm_unpacklo_pi16() {
let a = i16x4::new(0, 1, 2, 3);
let b = i16x4::new(4, 5, 6, 7);
// Expected: lower halves interleaved, [a.0, b.0, a.1, b.1].
let r = i16x4::new(0, 4, 1, 5);
assert_eq!(r, i16x4::from(mmx::_mm_unpacklo_pi16(a.into(), b.into())));
}

#[simd_test = "mmx"]
unsafe fn _mm_unpackhi_pi32() {
let a = i32x2::new(0, 3);
let b = i32x2::new(1, 2);
// Expected: upper elements interleaved, [a.1, b.1].
let r = i32x2::new(3, 2);

assert_eq!(r, mmx::_mm_unpackhi_pi32(a.into(), b.into()).into());
}

#[simd_test = "mmx"]
unsafe fn _mm_unpacklo_pi32() {
let a = i32x2::new(0, 3);
let b = i32x2::new(1, 2);
// Expected: lower elements interleaved, [a.0, b.0].
let r = i32x2::new(0, 1);

assert_eq!(r, mmx::_mm_unpacklo_pi32(a.into(), b.into()).into());
}
}
69 changes: 40 additions & 29 deletions coresimd/src/x86/i686/sse.rs
Expand Up @@ -221,25 +221,46 @@ pub unsafe fn _mm_cvt_pi2ps(a: f32x4, b: i32x2) -> f32x4 {
_mm_cvtpi32_ps(a, b)
}

/// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
/// float].
/// Converts the lower 4 signed 8-bit values of `a` into a 128-bit vector
/// of 4 `f32`s.
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(cvtpi2ps))]
pub unsafe fn _mm_cvtpi8_ps(a: __m64) -> f32x4 {
// Sign-extend the low four i8 lanes to i16 by hand:
// `0 > a` yields an all-ones lane exactly where `a` is negative, ...
let b = mmx::_mm_setzero_si64();
let b = mmx::_mm_cmpgt_pi8(b, a);
// ... and interleaving that mask as the high bytes produces i16 lanes.
let b = mmx::_mm_unpacklo_pi8(a, b);
// Then reuse the i16 -> f32 conversion path.
_mm_cvtpi16_ps(b)
}

/// Converts the lower 4 unsigned 8-bit values of `a` into a 128-bit vector
/// of 4 `f32`s.
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(cvtpi2ps))]
pub unsafe fn _mm_cvtpu8_ps(a: __m64) -> f32x4 {
// Zero-extend the low four u8 lanes to i16 by interleaving zeros as the
// high bytes (unlike `_mm_cvtpi8_ps`, no sign mask is needed).
let b = mmx::_mm_setzero_si64();
let b = mmx::_mm_unpacklo_pi8(a, b);
// Then reuse the i16 -> f32 conversion path.
_mm_cvtpi16_ps(b)
}

/// Converts a 64-bit vector of `i16`s into a 128-bit vector of 4 `f32`s.
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(cvtpi2ps))]
pub unsafe fn _mm_cvtpi16_ps(a: __m64) -> f32x4 {
let b = mmx::_mm_setzero_si64();
let b = mmx::_mm_cmpgt_pi16(mem::transmute(b), a);
let b = mmx::_mm_cmpgt_pi16(b, a);
let c = mmx::_mm_unpackhi_pi16(a, b);
let r = i586::_mm_setzero_ps();
let r = cvtpi2ps(r, mem::transmute(c));
let r = cvtpi2ps(r, c);
let r = i586::_mm_movelh_ps(r, r);
let c = mmx::_mm_unpacklo_pi16(a, b);
cvtpi2ps(r, mem::transmute(c))
cvtpi2ps(r, c)
}

/// Converts a 64-bit vector of 16-bit unsigned integer values into a
/// 128-bit vector of [4 x float].
/// Converts a 64-bit vector of `i16`s into a 128-bit vector of 4 `f32`s.
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(cvtpi2ps))]
pub unsafe fn _mm_cvtpu16_ps(a: __m64) -> f32x4 {
let b = mmx::_mm_setzero_si64();
let c = mmx::_mm_unpackhi_pi16(a, b);
Expand All @@ -250,27 +271,6 @@ pub unsafe fn _mm_cvtpu16_ps(a: __m64) -> f32x4 {
cvtpi2ps(r, c)
}

/// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
/// into a 128-bit vector of [4 x float].
#[inline(always)]
#[target_feature = "+sse"]
pub unsafe fn _mm_cvtpi8_ps(a: __m64) -> f32x4 {
let b = mmx::_mm_setzero_si64();
let b = mmx::_mm_cmpgt_pi8(b, a);
let b = mmx::_mm_unpacklo_pi8(a, b);
_mm_cvtpi16_ps(b)
}

/// Converts the lower four unsigned 8-bit integer values from a 64-bit
/// vector of [8 x u8] into a 128-bit vector of [4 x float].
#[inline(always)]
#[target_feature = "+sse"]
pub unsafe fn _mm_cvtpu8_ps(a: __m64) -> f32x4 {
let b = mmx::_mm_setzero_si64();
let b = mmx::_mm_unpacklo_pi8(a, b);
_mm_cvtpi16_ps(b)
}

/// Converts the two 32-bit signed integer values from each 64-bit vector
/// operand of [2 x i32] into a 128-bit vector of [4 x float].
#[inline(always)]
Expand Down Expand Up @@ -512,6 +512,13 @@ mod tests {
assert_eq!(r, u16x4::splat(15));
}

#[simd_test = "sse"]
unsafe fn _m_pmulhuw() {
let (a, b) = (u16x4::splat(1000), u16x4::splat(1001));
let r = sse::_m_pmulhuw(a.into(), b.into());
// High 16 bits of each unsigned product: 1000 * 1001 = 1_001_000,
// and 1_001_000 >> 16 == 15.
assert_eq!(r, u16x4::splat(15).into());
}

#[simd_test = "sse"]
unsafe fn _mm_avg_pu8() {
let (a, b) = (u8x8::splat(3), u8x8::splat(9));
Expand Down Expand Up @@ -601,7 +608,11 @@ mod tests {
let a = i8x8::splat(9);
let mask = i8x8::splat(0).replace(2, 0x80u8 as i8);
let mut r = i8x8::splat(0);
sse::_mm_maskmove_si64(a.into(), mask.into(), &mut r as *mut _ as *mut i8);
sse::_mm_maskmove_si64(
a.into(),
mask.into(),
&mut r as *mut _ as *mut i8,
);
assert_eq!(r, i8x8::splat(0).replace(2, 9));

let mut r = i8x8::splat(0);
Expand Down

0 comments on commit bf6d801

Please sign in to comment.