Skip to content

Commit

Permalink
use extra global memory to help reduce roundoff error, fix fangq#41
Browse files Browse the repository at this point in the history
  • Loading branch information
fangq committed Jul 20, 2018
1 parent 4fe099f commit 93188f1
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 10 deletions.
1 change: 1 addition & 0 deletions src/br2cu.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ cudaMemcpyDeviceToHost);}
#define int3(a,b,c) make_int3(a,b,c) /**< int3 constructor */
#define uint2(a,b) make_uint2(a,b) /**< uint2 constructor */
#define uint3(a,b,c) make_uint3(a,b,c) /**< uint3 constructor */
#define uint4(a,b,c,d) make_uint4(a,b,c,d) /**< uint4 constructor */
#define float1(a) make_float1(a) /**< float1 constructor */
#define float2(a,b) make_float2(a,b) /**< float2 constructor */
#define float3(a,b,c) make_float3(a,b,c) /**< float3 constructor */
Expand Down
29 changes: 20 additions & 9 deletions src/mcx_core.cu
Original file line number Diff line number Diff line change
Expand Up @@ -102,19 +102,19 @@ extern __shared__ float sharedmem[];
* @brief Floating-point atomic addition
*/

__device__ inline void atomicadd(float* address, float value){
__device__ inline float atomicadd(float* address, float value){

#if __CUDA_ARCH__ >= 200 ///< for Fermi, atomicAdd supports floats

atomicAdd(address,value);
return atomicAdd(address,value);

#elif __CUDA_ARCH__ >= 110

// float-atomic-add from
// http://forums.nvidia.com/index.php?showtopic=158039&view=findpost&p=991561
float old = value;
while ((old = atomicExch(address, atomicExch(address, 0.0f)+old))!=0.0f);

return old;
#endif

}
Expand Down Expand Up @@ -1247,7 +1247,13 @@ kernel void mcx_main_loop(uint media[],float field[],float genergy[],uint n_seed
}else{
/** accummulate the quality to the volume using atomic operations */
// ifndef CUDA_NO_SM_11_ATOMIC_INTRINSICS
atomicadd(& field[idx1dold+tshift*gcfg->dimlen.z], weight);
float oldval=atomicadd(& field[idx1dold+tshift*gcfg->dimlen.z], weight);
if(oldval>MAX_ACCUM){
if(atomicadd(& field[idx1dold+tshift*gcfg->dimlen.z], -oldval)<0.f)
atomicadd(& field[idx1dold+tshift*gcfg->dimlen.z], oldval);
else
atomicadd(& field[idx1dold+tshift*gcfg->dimlen.z+gcfg->dimlen.w], oldval);
}
GPUDEBUG(("atomic write to [%d] %e, w=%f\n",idx1dold,weight,p.w));
}
#endif
Expand Down Expand Up @@ -1529,7 +1535,7 @@ void mcx_run_simulation(Config *cfg,GPUInfo *gpu){
size_t fieldlen;
uint3 cp0=cfg->crop0,cp1=cfg->crop1;
uint2 cachebox;
uint3 dimlen;
uint4 dimlen;
float Vvox,fullload=0.f;

dim3 mcgrid, mcblock;
Expand Down Expand Up @@ -1560,7 +1566,7 @@ void mcx_run_simulation(Config *cfg,GPUInfo *gpu){
unsigned int is2d=(cfg->dim.x==1 ? 1 : (cfg->dim.y==1 ? 2 : (cfg->dim.z==1 ? 3 : 0)));
MCXParam param={cfg->steps,minstep,0,0,cfg->tend,R_C0*cfg->unitinmm,
(uint)cfg->issave2pt,(uint)cfg->isreflect,(uint)cfg->isrefint,(uint)cfg->issavedet,1.f/cfg->tstep,
p0,c0,maxidx,uint3(0,0,0),cp0,cp1,uint2(0,0),cfg->minenergy,
p0,c0,maxidx,uint4(0,0,0,0),cp0,cp1,uint2(0,0),cfg->minenergy,
cfg->sradius*cfg->sradius,minstep*R_C0*cfg->unitinmm,cfg->srctype,
cfg->srcparam1,cfg->srcparam2,cfg->voidtime,cfg->maxdetphoton,
cfg->medianum-1,cfg->detnum,cfg->maxgate,0,0,cfg->reseedlimit,ABS(cfg->sradius+2.f)<EPS /*isatomic*/,
Expand Down Expand Up @@ -1748,7 +1754,7 @@ void mcx_run_simulation(Config *cfg,GPUInfo *gpu){

CUDA_ASSERT(cudaMalloc((void **) &gmedia, sizeof(uint)*(dimxyz)));
//CUDA_ASSERT(cudaBindTexture(0, texmedia, gmedia));
CUDA_ASSERT(cudaMalloc((void **) &gfield, sizeof(float)*fieldlen));
CUDA_ASSERT(cudaMalloc((void **) &gfield, sizeof(float)*fieldlen*2));
CUDA_ASSERT(cudaMalloc((void **) &gPpos, sizeof(float4)*gpu[gpuid].autothread));
CUDA_ASSERT(cudaMalloc((void **) &gPdir, sizeof(float4)*gpu[gpuid].autothread));
CUDA_ASSERT(cudaMalloc((void **) &gPlen, sizeof(float4)*gpu[gpuid].autothread));
Expand Down Expand Up @@ -1806,6 +1812,7 @@ void mcx_run_simulation(Config *cfg,GPUInfo *gpu){
dimlen.y=cfg->dim.y*cfg->dim.x;

dimlen.z=cfg->dim.x*cfg->dim.y*cfg->dim.z;
dimlen.w=fieldlen;

param.dimlen=dimlen;
param.cachebox=cachebox;
Expand Down Expand Up @@ -1911,7 +1918,7 @@ void mcx_run_simulation(Config *cfg,GPUInfo *gpu){

//total number of repetition for the simulations, results will be accumulated to field
for(iter=0;iter<ABS(cfg->respin);iter++){
CUDA_ASSERT(cudaMemset(gfield,0,sizeof(float)*fieldlen)); // cost about 1 ms
CUDA_ASSERT(cudaMemset(gfield,0,sizeof(float)*fieldlen*2)); // cost about 1 ms
CUDA_ASSERT(cudaMemset(gPdet,0,sizeof(float)*cfg->maxdetphoton*(detreclen)));
if(cfg->issaveseed)
CUDA_ASSERT(cudaMemset(gseeddata,0,sizeof(RandType)*cfg->maxdetphoton*RAND_BUF_LEN));
Expand Down Expand Up @@ -2041,8 +2048,12 @@ is more than what your have specified (%d), please use the -H option to specify

//handling the 2pt distributions
if(cfg->issave2pt){
CUDA_ASSERT(cudaMemcpy(field, gfield,sizeof(float)*fieldlen,cudaMemcpyDeviceToHost));
float *rawfield=(float*)malloc(sizeof(float)*fieldlen*2);
CUDA_ASSERT(cudaMemcpy(rawfield, gfield,sizeof(float)*fieldlen*2,cudaMemcpyDeviceToHost));
MCX_FPRINTF(cfg->flog,"transfer complete:\t%d ms\n",GetTimeMillis()-tic); fflush(cfg->flog);
for(i=0;i<(int)fieldlen;i++) //accumulate field, can be done in the GPU
field[i]=rawfield[i]+rawfield[i+fieldlen];
free(rawfield);

if(ABS(cfg->respin)>1){
for(i=0;i<(int)fieldlen;i++) //accumulate field, can be done in the GPU
Expand Down
3 changes: 2 additions & 1 deletion src/mcx_core.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ extern "C" {
#define MCX_DEBUG_RNG 1 /**< MCX debug flags */
#define MCX_DEBUG_MOVE 2
#define MCX_DEBUG_PROGRESS 4
#define MAX_ACCUM 1000.f

#define ROULETTE_SIZE 10.f /**< Russian Roulette size */

Expand Down Expand Up @@ -110,7 +111,7 @@ typedef struct __align__(16) KernelParams {
float4 ps; /**< initial position vector, for pencil beam */
float4 c0; /**< initial directon vector, for pencil beam */
float3 maxidx; /**< maximum index in x/y/z directions for out-of-bound tests */
uint3 dimlen; /**< maximum index used to convert x/y/z to 1D array index */
uint4 dimlen; /**< maximum index used to convert x/y/z to 1D array index */
uint3 cp0; /**< 3D coordinates of one diagonal of the cached region (obsolete) */
uint3 cp1; /**< 3D coordinates of the other diagonal of the cached region (obsolete) */
uint2 cachebox; /**< stride for cachebox data acess (obsolete) */
Expand Down

0 comments on commit 93188f1

Please sign in to comment.