Permalink
Browse files

Initial commit.

  • Loading branch information...
huonw committed Aug 10, 2015
0 parents commit ed8b778cc97b321cb716cebe8dd4998863db8dd0
Showing with 2,024 additions and 0 deletions.
  1. +3 −0 .gitignore
  2. +10 −0 Cargo.toml
  3. +299 −0 benches/matrix.rs
  4. +38 −0 examples/convert.rs
  5. +25 −0 examples/dot-product.rs
  6. +178 −0 examples/fannkuch-redux.rs
  7. +133 −0 examples/mandelbrot.rs
  8. +280 −0 examples/matrix-inverse.rs
  9. +165 −0 examples/nbody.rs
  10. +9 −0 examples/ops.rs
  11. +67 −0 examples/spectral-norm.rs
  12. +57 −0 src/aarch64.rs
  13. +589 −0 src/lib.rs
  14. +81 −0 src/neon.rs
  15. +63 −0 src/sse2.rs
  16. +27 −0 src/ssse3.rs
@@ -0,0 +1,3 @@
perf.data*
/target
Cargo.lock
@@ -0,0 +1,10 @@
[package]
name = "simd"
version = "0.1.0"
authors = ["Huon Wilson <dbau.pp+github@gmail.com>"]

[dependencies]
rustc-serialize = "0.3.2"

serde = { version = "0.4", optional = true }
serde_macros = { version = "0.4", optional = true }
@@ -0,0 +1,299 @@
#![feature(test)]
extern crate test;
extern crate simd;

use test::black_box as bb;
use test::Bencher as B;
use simd::f32x4;


#[bench]
fn multiply_naive(b: &mut B) {
let x = [[1.0_f32; 4]; 4];
let y = [[2.0; 4]; 4];
b.iter(|| {
for _ in 0..100 {
let (x, y) = bb((&x, &y));

bb(&[[x[0][0] * y[0][0] + x[1][0] * y[0][1] + x[2][0] * y[0][2] + x[3][0] * y[0][3],
x[0][1] * y[0][0] + x[1][1] * y[0][1] + x[2][1] * y[0][2] + x[3][1] * y[0][3],
x[0][2] * y[0][0] + x[1][2] * y[0][1] + x[2][2] * y[0][2] + x[3][2] * y[0][3],
x[0][3] * y[0][0] + x[1][3] * y[0][1] + x[2][3] * y[0][2] + x[3][3] * y[0][3]],
[x[0][0] * y[1][0] + x[1][0] * y[1][1] + x[2][0] * y[1][2] + x[3][0] * y[1][3],
x[0][1] * y[1][0] + x[1][1] * y[1][1] + x[2][1] * y[1][2] + x[3][1] * y[1][3],
x[0][2] * y[1][0] + x[1][2] * y[1][1] + x[2][2] * y[1][2] + x[3][2] * y[1][3],
x[0][3] * y[1][0] + x[1][3] * y[1][1] + x[2][3] * y[1][2] + x[3][3] * y[1][3]],
[x[0][0] * y[2][0] + x[1][0] * y[2][1] + x[2][0] * y[2][2] + x[3][0] * y[2][3],
x[0][1] * y[2][0] + x[1][1] * y[2][1] + x[2][1] * y[2][2] + x[3][1] * y[2][3],
x[0][2] * y[2][0] + x[1][2] * y[2][1] + x[2][2] * y[2][2] + x[3][2] * y[2][3],
x[0][3] * y[2][0] + x[1][3] * y[2][1] + x[2][3] * y[2][2] + x[3][3] * y[2][3]],
[x[0][0] * y[3][0] + x[1][0] * y[3][1] + x[2][0] * y[3][2] + x[3][0] * y[3][3],
x[0][1] * y[3][0] + x[1][1] * y[3][1] + x[2][1] * y[3][2] + x[3][1] * y[3][3],
x[0][2] * y[3][0] + x[1][2] * y[3][1] + x[2][2] * y[3][2] + x[3][2] * y[3][3],
x[0][3] * y[3][0] + x[1][3] * y[3][1] + x[2][3] * y[3][2] + x[3][3] * y[3][3]],
]);
}
})
}

#[bench]
fn multiply_simd4(b: &mut B) {
let x = [f32x4::splat(1.0_f32); 4];
let y = [f32x4::splat(2.0); 4];
b.iter(|| {
for _ in 0..100 {
let (x, y) = bb((&x, &y));

let y0 = y[0];
let y1 = y[1];
let y2 = y[2];
let y3 = y[3];
bb(&[f32x4::splat(y0.extract(0)) * x[0] +
f32x4::splat(y0.extract(1)) * x[1] +
f32x4::splat(y0.extract(2)) * x[2] +
f32x4::splat(y0.extract(3)) * x[3],
f32x4::splat(y1.extract(0)) * x[0] +
f32x4::splat(y1.extract(1)) * x[1] +
f32x4::splat(y1.extract(2)) * x[2] +
f32x4::splat(y1.extract(3)) * x[3],
f32x4::splat(y2.extract(0)) * x[0] +
f32x4::splat(y2.extract(1)) * x[1] +
f32x4::splat(y2.extract(2)) * x[2] +
f32x4::splat(y2.extract(3)) * x[3],
f32x4::splat(y3.extract(0)) * x[0] +
f32x4::splat(y3.extract(1)) * x[1] +
f32x4::splat(y3.extract(2)) * x[2] +
f32x4::splat(y3.extract(3)) * x[3],
]);
}
})
}

#[bench]
fn inverse_naive(b: &mut B) {
let mut x = [[0_f32; 4]; 4];
for i in 0..4 { x[i][i] = 1.0 }

b.iter(|| {
for _ in 0..100 {
let x = bb(&x);

let mut t = [[0_f32; 4]; 4];
for i in 0..4 {
t[0][i] = x[i][0];
t[1][i] = x[i][1];
t[2][i] = x[i][2];
t[3][i] = x[i][3];
}

let _0 = t[2][2] * t[3][3];
let _1 = t[2][3] * t[3][2];
let _2 = t[2][1] * t[3][3];
let _3 = t[2][3] * t[3][1];
let _4 = t[2][1] * t[3][2];
let _5 = t[2][2] * t[3][1];
let _6 = t[2][0] * t[3][3];
let _7 = t[2][3] * t[3][0];
let _8 = t[2][0] * t[3][2];
let _9 = t[2][2] * t[3][0];
let _10 = t[2][0] * t[3][1];
let _11 = t[2][1] * t[3][0];

let d00 = _0 * t[1][1] + _3 * t[1][2] + _4 * t[1][3] -
(_1 * t[1][1] + _2 * t[1][2] + _5 * t[1][3]);
let d01 = _1 * t[1][0] + _6 * t[1][2] + _9 * t[1][3] -
(_0 * t[1][0] + _7 * t[1][2] + _8 * t[1][3]);
let d02 = _2 * t[1][0] + _7 * t[1][1] + _10 * t[1][3] -
(_3 * t[1][0] + _6 * t[1][1] + _11 * t[1][3]);
let d03 = _5 * t[1][0] + _8 * t[1][1] + _11 * t[1][2] -
(_4 * t[1][0] + _9 * t[1][1] + _10 * t[1][2]);
let d10 = _1 * t[0][1] + _2 * t[0][2] + _5 * t[0][3] -
(_0 * t[0][1] + _3 * t[0][2] + _4 * t[0][3]);
let d11 = _0 * t[0][0] + _7 * t[0][2] + _8 * t[0][3] -
(_1 * t[0][0] + _6 * t[0][2] + _9 * t[0][3]);
let d12 = _3 * t[0][0] + _6 * t[0][1] + _11 * t[0][3] -
(_2 * t[0][0] + _7 * t[0][1] + _10 * t[0][3]);
let d13 = _4 * t[0][0] + _9 * t[0][1] + _10 * t[0][2] -
(_5 * t[0][0] + _8 * t[0][1] + _11 * t[0][2]);

let _0 = t[0][2] * t[1][3];
let _1 = t[0][3] * t[1][2];
let _2 = t[0][1] * t[1][3];
let _3 = t[0][3] * t[1][1];
let _4 = t[0][1] * t[1][2];
let _5 = t[0][2] * t[1][1];
let _6 = t[0][0] * t[1][3];
let _7 = t[0][3] * t[1][0];
let _8 = t[0][0] * t[1][2];
let _9 = t[0][2] * t[1][0];
let _10 = t[0][0] * t[1][1];
let _11 = t[0][1] * t[1][0];

let d20 = _0*t[3][1] + _3*t[3][2] + _4*t[3][3]-
(_1*t[3][1] + _2*t[3][2] + _5*t[3][3]);
let d21 = _1*t[3][0] + _6*t[3][2] + _9*t[3][3]-
(_0*t[3][0] + _7*t[3][2] + _8*t[3][3]);
let d22 = _2*t[3][0] + _7*t[3][1] + _10*t[3][3]-
(_3*t[3][0] + _6*t[3][1] + _11*t[3][3]);
let d23 = _5*t[3][0] + _8*t[3][1] + _11*t[3][2]-
(_4*t[3][0] + _9*t[3][1] + _10*t[3][2]);
let d30 = _2*t[2][2] + _5*t[2][3] + _1*t[2][1]-
(_4*t[2][3] + _0*t[2][1] + _3*t[2][2]);
let d31 = _8*t[2][3] + _0*t[2][0] + _7*t[2][2]-
(_6*t[2][2] + _9*t[2][3] + _1*t[2][0]);
let d32 = _6*t[2][1] + _11*t[2][3] + _3*t[2][0]-
(_10*t[2][3] + _2*t[2][0] + _7*t[2][1]);
let d33 = _10*t[2][2] + _4*t[2][0] + _9*t[2][1]-
(_8*t[2][1] + _11*t[2][2] + _5*t[2][0]);

let det = t[0][0] * d00 + t[0][1] * d01 + t[0][2] * d02 + t[0][3] * d03;

let det = 1.0 / det;
let mut ret = [[d00, d01, d02, d03],
[d10, d11, d12, d13],
[d20, d21, d22, d23],
[d30, d31, d32, d33]];
for i in 0..4 {
for j in 0..4 {
ret[i][j] *= det;
}
}
bb(&ret);
}
})
}

#[bench]
fn inverse_simd4(b: &mut B) {
let mut x = [f32x4::splat(0_f32); 4];
for i in 0..4 { x[i] = x[i].replace(i as u32, 1.0); }

fn shuf0145(v: f32x4, w: f32x4) -> f32x4 {
f32x4::new(v.extract(0), v.extract(1),
w.extract(4 - 4), w.extract(5 - 4))
}
fn shuf0246(v: f32x4, w: f32x4) -> f32x4 {
f32x4::new(v.extract(0), v.extract(2),
w.extract(4 - 4), w.extract(6 - 4))
}
fn shuf1357(v: f32x4, w: f32x4) -> f32x4 {
f32x4::new(v.extract(1), v.extract(3),
w.extract(5 - 4), w.extract(7 - 4))
}
fn shuf2367(v: f32x4, w: f32x4) -> f32x4 {
f32x4::new(v.extract(2), v.extract(3),
w.extract(6 - 4), w.extract(7 - 4))
}

fn swiz1032(v: f32x4) -> f32x4 {
f32x4::new(v.extract(1), v.extract(0),
v.extract(3), v.extract(2))
}
fn swiz2301(v: f32x4) -> f32x4 {
f32x4::new(v.extract(2), v.extract(3),
v.extract(0), v.extract(1))
}

b.iter(|| {
for _ in 0..100 {
let src0;
let src1;
let src2;
let src3;
let mut tmp1;
let row0;
let mut row1;
let mut row2;
let mut row3;
let mut minor0;
let mut minor1;
let mut minor2;
let mut minor3;
let mut det;

let x = bb(&x);
src0 = x[0];
src1 = x[1];
src2 = x[2];
src3 = x[3];

tmp1 = shuf0145(src0, src1);
row1 = shuf0145(src2, src3);
row0 = shuf0246(tmp1, row1);
row1 = shuf1357(row1, tmp1);

tmp1 = shuf2367(src0, src1);
row3 = shuf2367(src2, src3);
row2 = shuf0246(tmp1, row3);
row3 = shuf0246(row3, tmp1);


tmp1 = row2 * row3;
tmp1 = swiz1032(tmp1);
minor0 = row1 * tmp1;
minor1 = row0 * tmp1;
tmp1 = swiz2301(tmp1);
minor0 = (row1 * tmp1) - minor0;
minor1 = (row0 * tmp1) - minor1;
minor1 = swiz2301(minor1);


tmp1 = row1 * row2;
tmp1 = swiz1032(tmp1);
minor0 = (row3 * tmp1) + minor0;
minor3 = row0 * tmp1;
tmp1 = swiz2301(tmp1);

minor0 = minor0 - row3 * tmp1;
minor3 = row0 * tmp1 - minor3;
minor3 = swiz2301(minor3);


tmp1 = row3 * swiz2301(row1);
tmp1 = swiz1032(tmp1);
row2 = swiz2301(row2);
minor0 = row2 * tmp1 + minor0;
minor2 = row0 * tmp1;
tmp1 = swiz2301(tmp1);
minor0 = minor0 - row2 * tmp1;
minor2 = row0 * tmp1 - minor2;
minor2 = swiz2301(minor2);


tmp1 = row0 * row1;
tmp1 = swiz1032(tmp1);
minor2 = minor2 + row3 * tmp1;
minor3 = row2 * tmp1 - minor3;
tmp1 = swiz2301(tmp1);
minor2 = row3 * tmp1 - minor2;
minor3 = minor3 - row2 * tmp1;



tmp1 = row0 * row3;
tmp1 = swiz1032(tmp1);
minor1 = minor1 - row2 * tmp1;
minor2 = row1 * tmp1 + minor2;
tmp1 = swiz2301(tmp1);
minor1 = row2 * tmp1 + minor1;
minor2 = minor2 - row1 * tmp1;

tmp1 = row0 * row2;
tmp1 = swiz1032(tmp1);
minor1 = row3 * tmp1 + minor1;
minor3 = minor3 - row1 * tmp1;
tmp1 = swiz2301(tmp1);
minor1 = minor1 - row3 * tmp1;
minor3 = row1 * tmp1 + minor3;

det = row0 * minor0;
det = swiz2301(det) + det;
det = swiz1032(det) + det;
//tmp1 = det.approx_reciprocal(); det = tmp1 * (f32x4::splat(2.0) - det * tmp1);
det = f32x4::splat(1.0) / det;

bb(&[minor0 * det, minor1 * det, minor2 * det, minor3 * det]);
}
})

}
@@ -0,0 +1,38 @@
extern crate simd;
use simd::f32x4;

#[inline(never)]
pub fn convert_scalar(x: &mut [i32], y: &[f32]) {
assert_eq!(x.len(), y.len());

let mut i = 0;
while i < x.len() & !3 {
x[i] = y[i] as i32;
i += 1;
}
}

#[inline(never)]
pub fn convert(x: &mut [i32], y: &[f32]) {
assert_eq!(x.len(), y.len());

let mut i = 0;
while i < x.len() & !3 {
let v = f32x4::load(y, i);
v.to_i32().store(x, i);
i += 4
}
}

fn main() {
let x = &mut [0; 12];
let y = [1.0; 12];
convert(x, &y);
convert_scalar(x, &y);
println!("{:?}", x);
let x = &mut [0; 16];
let y = [1.0; 16];
convert(x, &y);
convert_scalar(x, &y);
println!("{:?}", x);
}
@@ -0,0 +1,25 @@
extern crate simd;
use simd::f32x4;

#[inline(never)]
pub fn dot(x: &[f32], y: &[f32]) -> f32 {
assert_eq!(x.len(), y.len());

let len = std::cmp::min(x.len(), y.len());

let mut sum = f32x4::splat(0.0);
let mut i = 0;
while i < len & !3 {
let x = f32x4::load(x, i);
let y = f32x4::load(y, i);
sum = sum + x * y;
i += 4
}
sum.extract(0) + sum.extract(1) + sum.extract(2) + sum.extract(3)
}

fn main() {
println!("{}", dot(&[1.0, 3.0, 5.0, 7.0], &[2.0, 4.0, 6.0, 8.0]));
println!("{}", dot(&[1.0, 3.0, 6.0, 7.0, 10.0, 6.0, 3.0, 2.0],
&[2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0]));
}
Oops, something went wrong.

0 comments on commit ed8b778

Please sign in to comment.