Skip to content

Commit

Permalink
converted some more reductions to use preprocessor defs
Browse files Browse the repository at this point in the history
  • Loading branch information
michael boulton committed Sep 14, 2015
1 parent 84df834 commit 68a0272
Show file tree
Hide file tree
Showing 2 changed files with 85 additions and 128 deletions.
67 changes: 22 additions & 45 deletions state_kern.cl
Expand Up @@ -2008,55 +2008,32 @@ void reduction_epsum_within_tile(__local real2_t *tile)
const unsigned int ntX = get_local_size(0);
real_t corrected_next_term, new_sum;

for (int offset=ntX>>1; offset > 32; offset >>= 1){
if (tiX < offset){
// Kahan sum
corrected_next_term = tile[tiX+offset].s0 + (tile[tiX+offset].s1 +tile[tiX].s1);
for (int offset = ntX >> 1; offset > MIN_REDUCE_SYNC_SIZE; offset >>= 1)
{
if (tiX < offset)
{
corrected_next_term = tile[tiX+offset].s0 + (tile[tiX+offset].s1 +tile[tiX].s1);
new_sum = tile[tiX].s0 + corrected_next_term;
tile[tiX].s1 = corrected_next_term - (new_sum - tile[tiX].s0);
tile[tiX].s0 = new_sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (tiX < MIN_REDUCE_SYNC_SIZE)
{
for (int offset = MIN_REDUCE_SYNC_SIZE; offset > 1; offset >>= 1)
{
corrected_next_term = tile[tiX+offset].s0 + (tile[tiX+offset].s1 +tile[tiX].s1);
new_sum = tile[tiX].s0 + corrected_next_term;
tile[tiX].s1 = corrected_next_term - (new_sum - tile[tiX].s0);
tile[tiX].s0 = new_sum;
barrier(CLK_LOCAL_MEM_FENCE);
}
corrected_next_term = tile[tiX+1].s0 + (tile[tiX+1].s1 +tile[tiX].s1);
new_sum = tile[tiX].s0 + corrected_next_term;
tile[tiX].s1 = corrected_next_term - (new_sum - tile[tiX].s0);
tile[tiX].s0 = new_sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
}

if (tiX < 32){
// Kahan sum -- unrolled
corrected_next_term = tile[tiX+32].s0 + (tile[tiX+32].s1 +tile[tiX].s1);
new_sum = tile[tiX].s0 + corrected_next_term;
tile[tiX].s1 = corrected_next_term - (new_sum - tile[tiX].s0);
tile[tiX].s0 = new_sum;
barrier(CLK_LOCAL_MEM_FENCE); /* Fix for Cuda 4.1 */

corrected_next_term = tile[tiX+16].s0 + (tile[tiX+16].s1 +tile[tiX].s1);
new_sum = tile[tiX].s0 + corrected_next_term;
tile[tiX].s1 = corrected_next_term - (new_sum - tile[tiX].s0);
tile[tiX].s0 = new_sum;
barrier(CLK_LOCAL_MEM_FENCE); /* Fix for Cuda 4.1 */

corrected_next_term = tile[tiX+8].s0 + (tile[tiX+8].s1 +tile[tiX].s1);
new_sum = tile[tiX].s0 + corrected_next_term;
tile[tiX].s1 = corrected_next_term - (new_sum - tile[tiX].s0);
tile[tiX].s0 = new_sum;
barrier(CLK_LOCAL_MEM_FENCE); /* Fix for Cuda 4.1 */

corrected_next_term = tile[tiX+4].s0 + (tile[tiX+4].s1 +tile[tiX].s1);
new_sum = tile[tiX].s0 + corrected_next_term;
tile[tiX].s1 = corrected_next_term - (new_sum - tile[tiX].s0);
tile[tiX].s0 = new_sum;
barrier(CLK_LOCAL_MEM_FENCE); /* Fix for Cuda 4.1 */

corrected_next_term = tile[tiX+2].s0 + (tile[tiX+2].s1 +tile[tiX].s1);
new_sum = tile[tiX].s0 + corrected_next_term;
tile[tiX].s1 = corrected_next_term - (new_sum - tile[tiX].s0);
tile[tiX].s0 = new_sum;
barrier(CLK_LOCAL_MEM_FENCE); /* Fix for Cuda 4.1 */

corrected_next_term = tile[tiX+1].s0 + (tile[tiX+1].s1 +tile[tiX].s1);
new_sum = tile[tiX].s0 + corrected_next_term;
tile[tiX].s1 = corrected_next_term - (new_sum - tile[tiX].s0);
tile[tiX].s0 = new_sum;
}

}

__kernel void reduce_epsum_mass_stage1of2_cl(
Expand Down
146 changes: 63 additions & 83 deletions wave_kern.cl
Expand Up @@ -79,10 +79,10 @@ void reduction_minmax_within_tile4(__local int4 *tile);

void reduction_minmax_within_tile4(__local int4 *tile)
{
const unsigned int tiX = get_local_id(0);
const unsigned int ntX = get_local_size(0);
const unsigned int tiX = get_local_id(0);
const unsigned int ntX = get_local_size(0);

for (int offset=ntX>>1; offset > 32; offset >>= 1){
for (int offset=ntX>>1; offset > MIN_REDUCE_SYNC_SIZE; offset >>= 1){
if (tiX < offset){
if (tile[tiX+offset].s0 < tile[tiX].s0) tile[tiX].s0 = tile[tiX+offset].s0;
if (tile[tiX+offset].s1 > tile[tiX].s1) tile[tiX].s1 = tile[tiX+offset].s1;
Expand All @@ -92,101 +92,81 @@ void reduction_minmax_within_tile4(__local int4 *tile)
barrier(CLK_LOCAL_MEM_FENCE);
}

if (tiX < 32){
if (tile[tiX+32].s0 < tile[tiX].s0) tile[tiX].s0 = tile[tiX+32].s0;
if (tile[tiX+32].s1 > tile[tiX].s1) tile[tiX].s1 = tile[tiX+32].s1;
if (tile[tiX+32].s2 < tile[tiX].s2) tile[tiX].s2 = tile[tiX+32].s2;
if (tile[tiX+32].s3 > tile[tiX].s3) tile[tiX].s3 = tile[tiX+32].s3;
barrier(CLK_LOCAL_MEM_FENCE);

if (tile[tiX+16].s0 < tile[tiX].s0) tile[tiX].s0 = tile[tiX+16].s0;
if (tile[tiX+16].s1 > tile[tiX].s1) tile[tiX].s1 = tile[tiX+16].s1;
if (tile[tiX+16].s2 < tile[tiX].s2) tile[tiX].s2 = tile[tiX+16].s2;
if (tile[tiX+16].s3 > tile[tiX].s3) tile[tiX].s3 = tile[tiX+16].s3;
barrier(CLK_LOCAL_MEM_FENCE);

if (tile[tiX+8].s0 < tile[tiX].s0) tile[tiX].s0 = tile[tiX+8].s0;
if (tile[tiX+8].s1 > tile[tiX].s1) tile[tiX].s1 = tile[tiX+8].s1;
if (tile[tiX+8].s2 < tile[tiX].s2) tile[tiX].s2 = tile[tiX+8].s2;
if (tile[tiX+8].s3 > tile[tiX].s3) tile[tiX].s3 = tile[tiX+8].s3;
barrier(CLK_LOCAL_MEM_FENCE);

if (tile[tiX+4].s0 < tile[tiX].s0) tile[tiX].s0 = tile[tiX+4].s0;
if (tile[tiX+4].s1 > tile[tiX].s1) tile[tiX].s1 = tile[tiX+4].s1;
if (tile[tiX+4].s2 < tile[tiX].s2) tile[tiX].s2 = tile[tiX+4].s2;
if (tile[tiX+4].s3 > tile[tiX].s3) tile[tiX].s3 = tile[tiX+4].s3;
barrier(CLK_LOCAL_MEM_FENCE);
if (tiX < MIN_REDUCE_SYNC_SIZE)
{
for (int offset = MIN_REDUCE_SYNC_SIZE; offset > 1; offset >>= 1)
{
if (tile[tiX + offset].s0 < tile[tiX].s0) tile[tiX].s0 = tile[tiX + offset].s0;
if (tile[tiX + offset].s1 > tile[tiX].s1) tile[tiX].s1 = tile[tiX + offset].s1;
if (tile[tiX + offset].s2 < tile[tiX].s2) tile[tiX].s2 = tile[tiX + offset].s2;
if (tile[tiX + offset].s3 > tile[tiX].s3) tile[tiX].s3 = tile[tiX + offset].s3;
barrier(CLK_LOCAL_MEM_FENCE);
}

if (tile[tiX + 1].s0 < tile[tiX].s0) tile[tiX].s0 = tile[tiX + 1].s0;
if (tile[tiX + 1].s1 > tile[tiX].s1) tile[tiX].s1 = tile[tiX + 1].s1;
if (tile[tiX + 1].s2 < tile[tiX].s2) tile[tiX].s2 = tile[tiX + 1].s2;
if (tile[tiX + 1].s3 > tile[tiX].s3) tile[tiX].s3 = tile[tiX + 1].s3;
}
}

if (tile[tiX+2].s0 < tile[tiX].s0) tile[tiX].s0 = tile[tiX+2].s0;
if (tile[tiX+2].s1 > tile[tiX].s1) tile[tiX].s1 = tile[tiX+2].s1;
if (tile[tiX+2].s2 < tile[tiX].s2) tile[tiX].s2 = tile[tiX+2].s2;
if (tile[tiX+2].s3 > tile[tiX].s3) tile[tiX].s3 = tile[tiX+2].s3;
barrier(CLK_LOCAL_MEM_FENCE);
real MAX(real a, real b)
{
return max(a, b);
}

if (tile[tiX+1].s0 < tile[tiX].s0) tile[tiX].s0 = tile[tiX+1].s0;
if (tile[tiX+1].s1 > tile[tiX].s1) tile[tiX].s1 = tile[tiX+1].s1;
if (tile[tiX+1].s2 < tile[tiX].s2) tile[tiX].s2 = tile[tiX+1].s2;
if (tile[tiX+1].s3 > tile[tiX].s3) tile[tiX].s3 = tile[tiX+1].s3;
#define REDUCE_IN_TILE(operation, _tile_arr) \
for (int offset = ntX >> 1; offset > MIN_REDUCE_SYNC_SIZE; offset >>= 1) \
{ \
if (tiX < offset) \
{ \
_tile_arr[tiX] = operation(_tile_arr[tiX], _tile_arr[tiX+offset]); \
} \
barrier(CLK_LOCAL_MEM_FENCE); \
} \
if (tiX < MIN_REDUCE_SYNC_SIZE) \
{ \
for (int offset = MIN_REDUCE_SYNC_SIZE; offset > 1; offset >>= 1) \
{ \
_tile_arr[tiX] = operation(_tile_arr[tiX], _tile_arr[tiX+offset]); \
barrier(CLK_LOCAL_MEM_FENCE); \
} \
_tile_arr[tiX] = operation(_tile_arr[tiX], _tile_arr[tiX+1]); \
}
}

void reduction_max_within_tile1(__local real *tile)
{
const unsigned int tiX = get_local_id(0);
const unsigned int ntX = get_local_size(0);

for (int offset=ntX>>1; offset > 32; offset >>= 1){
if (tiX < offset){
if (tile[tiX+offset] > tile[tiX]) tile[tiX] = tile[tiX+offset];
}
barrier(CLK_LOCAL_MEM_FENCE);
}

if (tiX < 32){
if (tile[tiX+32] > tile[tiX]) tile[tiX] = tile[tiX+32];
barrier(CLK_LOCAL_MEM_FENCE);
if (tile[tiX+16] > tile[tiX]) tile[tiX] = tile[tiX+16];
barrier(CLK_LOCAL_MEM_FENCE);
if (tile[tiX+8] > tile[tiX]) tile[tiX] = tile[tiX+8];
barrier(CLK_LOCAL_MEM_FENCE);
if (tile[tiX+4] > tile[tiX]) tile[tiX] = tile[tiX+4];
barrier(CLK_LOCAL_MEM_FENCE);
if (tile[tiX+2] > tile[tiX]) tile[tiX] = tile[tiX+2];
barrier(CLK_LOCAL_MEM_FENCE);
if (tile[tiX+1] > tile[tiX]) tile[tiX] = tile[tiX+1];
}

REDUCE_IN_TILE(MAX, tile);
}

void reduction_max_within_tile2(__local real2 *tile)
{
const unsigned int tiX = get_local_id(0);
const unsigned int ntX = get_local_size(0);

for (int offset=ntX>>1; offset > 32; offset >>= 1){
if (tiX < offset){
if (tile[tiX+offset].s0 > tile[tiX].s0) tile[tiX].s0 = tile[tiX+offset].s0;
if (tile[tiX+offset].s1 > tile[tiX].s1) tile[tiX].s1 = tile[tiX+offset].s1;
}
barrier(CLK_LOCAL_MEM_FENCE);
const unsigned int tiX = get_local_id(0);
const unsigned int ntX = get_local_size(0);

for (int offset = ntX >> 1; offset > MIN_REDUCE_SYNC_SIZE; offset >>= 1)
{
if (tiX < offset)
{
if (tile[tiX+offset].s0 > tile[tiX].s0) tile[tiX].s0 = tile[tiX+offset].s0;
if (tile[tiX+offset].s1 > tile[tiX].s1) tile[tiX].s1 = tile[tiX+offset].s1;
}
barrier(CLK_LOCAL_MEM_FENCE);
}

if (tiX < 32){
if (tile[tiX+32].s0 > tile[tiX].s0) tile[tiX].s0 = tile[tiX+32].s0;
if (tile[tiX+16].s0 > tile[tiX].s0) tile[tiX].s0 = tile[tiX+16].s0;
if (tile[tiX+8].s0 > tile[tiX].s0) tile[tiX].s0 = tile[tiX+8].s0;
if (tile[tiX+4].s0 > tile[tiX].s0) tile[tiX].s0 = tile[tiX+4].s0;
if (tile[tiX+2].s0 > tile[tiX].s0) tile[tiX].s0 = tile[tiX+2].s0;
if (tile[tiX+1].s0 > tile[tiX].s0) tile[tiX].s0 = tile[tiX+1].s0;

if (tile[tiX+32].s1 > tile[tiX].s1) tile[tiX].s1 = tile[tiX+32].s1;
if (tile[tiX+16].s1 > tile[tiX].s1) tile[tiX].s1 = tile[tiX+16].s1;
if (tile[tiX+8].s1 > tile[tiX].s1) tile[tiX].s1 = tile[tiX+8].s1;
if (tile[tiX+4].s1 > tile[tiX].s1) tile[tiX].s1 = tile[tiX+4].s1;
if (tile[tiX+2].s1 > tile[tiX].s1) tile[tiX].s1 = tile[tiX+2].s1;
if (tile[tiX+1].s1 > tile[tiX].s1) tile[tiX].s1 = tile[tiX+1].s1;
if (tiX < MIN_REDUCE_SYNC_SIZE)
{
for (int offset = MIN_REDUCE_SYNC_SIZE; offset > 1; offset >>= 1)
{
if (tile[tiX + offset].s0 > tile[tiX].s0) tile[tiX].s0 = tile[tiX + offset].s0;
if (tile[tiX + offset].s1 > tile[tiX].s1) tile[tiX].s1 = tile[tiX + offset].s1;
barrier(CLK_LOCAL_MEM_FENCE);
}
if (tile[tiX + 1].s0 > tile[tiX].s0) tile[tiX].s0 = tile[tiX + 1].s0;
if (tile[tiX + 1].s1 > tile[tiX].s1) tile[tiX].s1 = tile[tiX + 1].s1;
}

}


0 comments on commit 68a0272

Please sign in to comment.