Permalink
Browse files

Added fast_init option.

  • Loading branch information...
1 parent 11d6485 commit 7312572cb6fa6fbf03dec7667bb5cc02e1531499 @jkrempus committed Feb 13, 2012
Showing with 174 additions and 20 deletions.
  1. +2 −0 .gitignore
  2. +1 −1 bench.d
  3. +107 −5 misc_tests.d
  4. +9 −7 pfft/bitreverse.d
  5. +53 −6 pfft/fft_impl.d
  6. +2 −1 pfft/sse.d
View
@@ -11,3 +11,5 @@ test_nophobos
*.old
*.deps
*.asm
+*.gcda
+*.ignore
View
@@ -63,7 +63,7 @@ void bench(int log2n)
auto tables = fft_table(log2n);
ulong flopsPerIter = 5UL * log2n * (1UL << log2n);
- ulong niter = 1_000_000_000L / flopsPerIter;
+ ulong niter = 10_000_000_000L / flopsPerIter;
niter = niter ? niter : 1;
double t = get_time();
View
@@ -1,7 +1,9 @@
-import std.stdio, std.array, std.range;
-import pfft.bitreverse;
+import std.stdio, std.array, std.range, std.algorithm, std.datetime,
+ std.conv;
+import pfft.bitreverse, pfft.sse;
+import core.simd;
-void test_bit_reverse_simple_small()
+void test_bit_reverse_simple_small(string[] args)
{
auto a = array(iota(32));
@@ -24,7 +26,107 @@ void test_bit_reverse_simple_small()
writeln("");
}
-void main()
+alias BitReverse!(SSE, Options) BR;
+
+// In-place application of the permutation j -> (j & mask)*2 + (j >> shift)
+// to an array of 2^log2n contiguous chunks of chunk_size elements each
+// (a perfect shuffle of the two array halves at chunk granularity),
+// done by rotating each permutation cycle through one stack-allocated
+// chunk-sized buffer.
+void interleave_chunks(size_t chunk_size, T)(T* p, int log2n)
+{
+ int shift = log2n - 1;
+ size_t mask = (1UL << shift) - 1;
+ // next() gives the chunk index the content of chunk j moves to;
+ // previous() is its inverse (moves the low bit up to the top bit).
+ size_t next(size_t j)
+ {
+ return (j & mask)*2 + (j >> shift);
+ };
+ size_t previous(size_t j)
+ {
+ return (j >> 1) + ((j & 1UL) << shift);
+ };
+ // Indices 0 and 2^log2n - 1 are fixed points of next(), so only
+ // 1 .. mask are tried as cycle starts. NOTE(review): the
+ // next(i) > i && previous(i) > i test presumably selects exactly one
+ // leader per cycle for this particular permutation -- verify for all
+ // log2n, since it is weaker than "i is the cycle minimum" in general.
+ foreach(i; 1UL..mask + 1)
+ if(next(i) > i && previous(i) > i)
+ {
+ auto j = i;
+ // Save chunk i, then walk the cycle: each swap deposits the
+ // carried chunk at its destination and picks up the chunk that
+ // lives there; the final copy writes the last carried chunk
+ // back into slot i once the walk returns.
+ T[chunk_size] a = void;
+ BR.copy_array!T(a.ptr, p + j * chunk_size, chunk_size);
+ j = next(j);
+ do
+ {
+ BR.swap_array!T(a.ptr, p + j * chunk_size, chunk_size);
+ j = next(j);
+ }
+ while (j != i);
+ BR.copy_array!T(p + j * chunk_size, a.ptr, chunk_size);
+ }
+}
+
+// Compile-time recursive struct laying out n consecutive fields of type T
+// (field a followed by an NTimes!(T, n - 1) tail); NTimes!(T, 0) is empty.
+// Not referenced anywhere else in this change.
+struct NTimes(T, int n)
+{
+ static if(n)
+ {
+ T a;
+ NTimes!(T, n - 1) b;
+ }
+}
+
+// Interleaves the two halves of an n-vector array in place: input vectors
+// i and i + n/2 are combined by SSE.interleave!4 into output positions
+// 2*i and 2*i + 1. The input is first copied into a stack buffer so the
+// results can be written directly over p.
+// NOTE(review): buff holds buffer_size float4s but only n are ever used,
+// and the guard is n < buffer_size rather than n <= buffer_size -- confirm
+// whether buffer_size is meant in floats or in float4s.
+void interleave_small(size_t buffer_size)(float4* p, size_t n)
+{
+ assert(n < buffer_size);
+
+ float4[buffer_size] buff = void;
+
+ BR.copy_array!float4(buff.ptr, p, cast(int)n);
+
+ auto a = buff.ptr, b = a + n / 2;
+
+ foreach(i; 0 .. n / 2)
+ SSE.interleave!4(a[i], b[i], p[2*i], p[2*i + 1]);
+}
+
+// Interleaves the two halves of 2^log2n floats in place. Inputs that fit
+// the buffer are handled in one interleave_small pass; larger inputs first
+// permute half-buffer chunks globally (interleave_chunks) and then
+// interleave each buffer_size-float stretch locally, so each pass works on
+// a bounded (buffer-sized) working set.
+void interleave(int log2_buffer_size)(float* p, int log2n)
+{
+ enum buffer_size = 1UL << log2_buffer_size;
+
+ if(log2n <= log2_buffer_size)
+ interleave_small!buffer_size(cast(float4*)p, (1 << log2n) / 4);
+ else
+ {
+ interleave_chunks!(buffer_size / 2)(p, log2n - log2_buffer_size + 1);
+
+ foreach(i; iota(0UL, 1UL << log2n, buffer_size))
+ interleave_small!buffer_size(cast(float4*)(p + i), buffer_size / 4);
+ }
+}
+
+// Benchmarks interleave!12 on a zeroed 2^log2n float array (log2n read
+// from args[1]) and prints the achieved rate as "flops" per nanosecond.
+// The cost model 5 * (log2n - 1) * 2^(log2n - 1) per iteration mirrors the
+// FFT flop formula used in bench.d; niter is chosen to total roughly 10^10
+// modeled flops (at least one iteration).
+void bench_interleave(string[] args)
+{
+ auto log2n = to!int(args[1]);
+ auto arr = new float[1 << log2n];
+ arr[] = 0f;
+
+ ulong flopsPerIter = 5UL * (log2n - 1) * (1UL << (log2n - 1));
+ ulong niter = 10_000_000_000L / flopsPerIter;
+ niter = niter ? niter : 1;
+
+ StopWatch sw;
+ sw.start();
+
+ foreach(i; 0 .. niter)
+ interleave!12(arr.ptr, log2n);
+
+ sw.stop();
+
+ writeln( to!double(niter) * flopsPerIter / sw.peek().nsecs());
+}
+
+// Applies interleave!6 to the sequence 0 .. 2^log2n - 1 (log2n from
+// args[1]) and prints the result, for manual inspection of the output
+// ordering.
+void test_interleave(string[] args)
+{
+ auto log2n = to!int(args[1]);
+ auto arr = array(map!"cast(float)a"(iota(1<<log2n)));
+ interleave!6(arr.ptr, log2n);
+ writeln(arr);
+}
+
+// Entry point: currently runs only the interleave benchmark; the other
+// drivers are kept commented out for quick manual switching.
+void main(string[] args)
{
- test_bit_reverse_simple_small();
+ //test_bit_reverse_simple_small();
+ bench_interleave(args);
+ //test_interleave(args);
}
View
@@ -157,9 +157,10 @@ struct BitReverse(alias V, Options)
src[3] = a7;
}
- static void swap_some(int len, TT)(TT * a, TT * b)
- if(len*TT.sizeof % (vec.sizeof * 4) == 0)
+ static void swap_array(TT)(TT * a, TT * b, int len)
{
+ assert(len*TT.sizeof % (vec.sizeof * 4) == 0);
+
foreach(i; 0 .. len * TT.sizeof / 4 / vec.sizeof)
swap4once((cast(vec*)a) + 4 * i, (cast(vec*)b) + 4 * i);
}
@@ -184,9 +185,10 @@ struct BitReverse(alias V, Options)
dst[7] = a7;
}
- static void copy_some(int len, TT)(TT * a, TT * b)
- if(len*TT.sizeof % (vec.sizeof*8) == 0)
+ static void copy_array(TT)(TT * a, TT * b, int len)
{
+ assert((len*TT.sizeof % (vec.sizeof*8) == 0));
+
foreach(i; 0 .. len * TT.sizeof / 8 / vec.sizeof)
copy8once((cast(vec*)a) + 8 * i, (cast(vec*)b) + 8 * i);
}
@@ -209,20 +211,20 @@ struct BitReverse(alias V, Options)
{
for(T* pp = p + i0 * l, pb = buffer; pp < pend; pb += l, pp += m)
- copy_some!l(pb, pp);//memcpy(pb, pp, l * T.sizeof);
+ copy_array(pb, pp, l);
bit_reverse_small(buffer,log2l+log2l, table);
if(i1 != i0)
{
for(T* pp = p + i1 * l, pb = buffer; pp < pend; pb += l, pp += m)
- swap_some!l(pp,pb);
+ swap_array(pp, pb, l);
bit_reverse_small(buffer,log2l+log2l, table);
}
for(T* pp = p + i0*l, pb = buffer; pp < pend; pp += m, pb += l)
- copy_some!l(pp, pb);//memcpy(pp, pb, l*T.sizeof);
+ copy_array(pp, pb, l);
}
}
}
View
@@ -125,7 +125,39 @@ template FFT(alias V, Options)
return r - 1;
}
- void fft_table_impl(int log2n, int n_reversed_loops, Pair * r)
+ // Fills r[0 .. 2^log2n - 1] with (cos, sin) pairs by repeated angle
+ // halving: each pass evaluates cos/sin of the current step dphi once,
+ // then expands every already-written entry (c, s) into two entries --
+ // (c, s) itself and (c, s) rotated by dphi via a complex multiply.
+ // One cos/sin evaluation per pass instead of one per entry is what
+ // makes this the "fast" variant (cf. fft_table_sines_cosines).
+ // NOTE(review): the chained products accumulate rounding error
+ // relative to directly evaluated cos/sin -- confirm the precision is
+ // acceptable for T before enabling Options.fast_init by default.
+ void fft_table_sines_cosines_fast(int log2n, Pair * r)
+ {
+ auto p0 = r;
+ auto p1 = p0 + 1;
+ auto p1end = p0 + (1<<log2n) - 1;
+
+ // First rotation step is -pi/2 (== -asin(1)); it is halved after
+ // each pass's cdphi/sdphi are taken, for use in the next pass.
+ real dphi = - asin(1.0);
+
+ (*p0)[0] = 1;
+ (*p0)[1] = 0;
+ while(p1 < p1end)
+ {
+ T cdphi = cos(dphi);
+ T sdphi = sin(dphi);
+ dphi *= cast(T)0.5;
+
+ auto p0end = p1;
+ while(p0 < p0end)
+ {
+ auto c = (*p0)[0];
+ auto s = (*p0)[1];
+ p0++;
+ (*p1)[0] = c;
+ (*p1)[1] = s;
+ p1++;
+ // (c + i*s) * (cdphi + i*sdphi): angle-addition rotation.
+ (*p1)[0] = c * cdphi - s * sdphi;
+ (*p1)[1] = c * sdphi + s * cdphi;
+ p1++;
+ }
+ }
+ }
+
+ void fft_table_sines_cosines(int log2n, Pair * r)
{
auto p = r;
for (int s = 1; s <= log2n; ++s)
@@ -137,6 +169,21 @@ template FFT(alias V, Options)
p[i][0] = cos(dphi*i);
p[i][1] = -sin(dphi*i);
}
+ p += m/2;
+ }
+ }
+
+ void fft_table_impl(int log2n, int n_reversed_loops, Pair * r)
+ {
+ static if(is(typeof(Options.fast_init)))
+ fft_table_sines_cosines_fast(log2n, r);
+ else
+ fft_table_sines_cosines(log2n, r);
+
+ auto p = r;
+ for (int s = 1; s <= log2n; ++s)
+ {
+ size_t m = 1 << s;
if(s <= log2n - n_reversed_loops)
bit_reverse_simple(p, s - 1);
else
@@ -236,7 +283,7 @@ template FFT(alias V, Options)
pi[i0] = ai0 + ai1;
pi[i1] = ai0 - ai1;
- pr[i2] = ar2 + ai3; // needed to swap add and sub in these four lines
+ pr[i2] = ar2 + ai3;
pr[i3] = ar2 - ai3;
pi[i2] = ai2 - ar3;
pi[i3] = ai2 + ar3;
@@ -352,9 +399,9 @@ template FFT(alias V, Options)
auto ibuffer = aligned_ptr!vec(imem.ptr, 64);
for(vec* pp = pr, pb = rbuffer; pp < pr + N; pb += chunk_size, pp += stride)
- BR.copy_some!(chunk_size)(pb, pp);
+ BR.copy_array(pb, pp, chunk_size);
for(vec* pp = pi, pb = ibuffer; pp < pi + N; pb += chunk_size, pp += stride)
- BR.copy_some!(chunk_size)(pb, pp);
+ BR.copy_array(pb, pp, chunk_size);
size_t m2 = l*chunk_size/2;
size_t m2_limit = m2>>nPasses;
@@ -375,9 +422,9 @@ template FFT(alias V, Options)
}
for(vec* pp = pr, pb = rbuffer; pp < pr + N; pb += chunk_size, pp += stride)
- BR.copy_some!(chunk_size)(pp, pb);
+ BR.copy_array(pp, pb, chunk_size);
for(vec* pp = pi, pb = ibuffer; pp < pi + N; pb += chunk_size, pp += stride)
- BR.copy_some!(chunk_size)(pp, pb);
+ BR.copy_array(pp, pb, chunk_size);
}
void fft_passes_recursive(
View
@@ -161,9 +161,10 @@ struct Options
{
enum log2_bitreverse_large_chunk_size = 5;
enum large_limit = 14;
- enum log2_optimal_n = 9;
+ enum log2_optimal_n = 10;
enum passes_per_recursive_call = 5;
enum log2_recursive_passes_chunk_size = 5;
+ enum { fast_init };
}
version(CPP_IMPL)

0 comments on commit 7312572

Please sign in to comment.