use extra global memory to help reduce roundoff error, fix fangq#41

jdtatz · Jul 20, 2018 · 93188f1 · 93188f1
1 parent 4fe099f
commit 93188f1
Show file tree

Hide file tree

Showing 3 changed files with 23 additions and 10 deletions.
diff --git a/src/br2cu.h b/src/br2cu.h
@@ -44,6 +44,7 @@ cudaMemcpyDeviceToHost);}
 #define int3(a,b,c) make_int3(a,b,c)               /**< int3 constructor */
 #define uint2(a,b) make_uint2(a,b)                 /**< uint2 constructor */
 #define uint3(a,b,c) make_uint3(a,b,c)             /**< uint3 constructor */
+#define uint4(a,b,c,d) make_uint4(a,b,c,d)         /**< uint4 constructor */
 #define float1(a) make_float1(a)                   /**< float1 constructor */
 #define float2(a,b) make_float2(a,b)               /**< float2 constructor */
 #define float3(a,b,c) make_float3(a,b,c)           /**< float3 constructor */

diff --git a/src/mcx_core.cu b/src/mcx_core.cu
@@ -102,19 +102,19 @@ extern __shared__ float sharedmem[];
  * @brief Floating-point atomic addition
  */
 
-__device__ inline void atomicadd(float* address, float value){
+__device__ inline float atomicadd(float* address, float value){
 
 #if __CUDA_ARCH__ >= 200 ///< for Fermi, atomicAdd supports floats
 
-  atomicAdd(address,value);
+  return atomicAdd(address,value);
 
 #elif __CUDA_ARCH__ >= 110
 
 // float-atomic-add from 
 // http://forums.nvidia.com/index.php?showtopic=158039&view=findpost&p=991561
   float old = value;  
   while ((old = atomicExch(address, atomicExch(address, 0.0f)+old))!=0.0f);
-
+  return old;
 #endif
 
 }
@@ -1247,7 +1247,13 @@ kernel void mcx_main_loop(uint media[],float field[],float genergy[],uint n_seed
                }else{
 	          /** accummulate the quality to the volume using atomic operations  */
                   // ifndef CUDA_NO_SM_11_ATOMIC_INTRINSICS
-		  atomicadd(& field[idx1dold+tshift*gcfg->dimlen.z], weight);
+		  float oldval=atomicadd(& field[idx1dold+tshift*gcfg->dimlen.z], weight);
+		  if(oldval>MAX_ACCUM){
+			if(atomicadd(& field[idx1dold+tshift*gcfg->dimlen.z], -oldval)<0.f)
+			    atomicadd(& field[idx1dold+tshift*gcfg->dimlen.z], oldval);
+			else
+			    atomicadd(& field[idx1dold+tshift*gcfg->dimlen.z+gcfg->dimlen.w], oldval);
+		  }
                   GPUDEBUG(("atomic write to [%d] %e, w=%f\n",idx1dold,weight,p.w));
                }
   #endif
@@ -1529,7 +1535,7 @@ void mcx_run_simulation(Config *cfg,GPUInfo *gpu){
      size_t fieldlen;
      uint3 cp0=cfg->crop0,cp1=cfg->crop1;
      uint2 cachebox;
-     uint3 dimlen;
+     uint4 dimlen;
      float Vvox,fullload=0.f;
 
      dim3 mcgrid, mcblock;
@@ -1560,7 +1566,7 @@ void mcx_run_simulation(Config *cfg,GPUInfo *gpu){
      unsigned int is2d=(cfg->dim.x==1 ? 1 : (cfg->dim.y==1 ? 2 : (cfg->dim.z==1 ? 3 : 0)));
      MCXParam param={cfg->steps,minstep,0,0,cfg->tend,R_C0*cfg->unitinmm,
                      (uint)cfg->issave2pt,(uint)cfg->isreflect,(uint)cfg->isrefint,(uint)cfg->issavedet,1.f/cfg->tstep,
-		     p0,c0,maxidx,uint3(0,0,0),cp0,cp1,uint2(0,0),cfg->minenergy,
+		     p0,c0,maxidx,uint4(0,0,0,0),cp0,cp1,uint2(0,0),cfg->minenergy,
                      cfg->sradius*cfg->sradius,minstep*R_C0*cfg->unitinmm,cfg->srctype,
 		     cfg->srcparam1,cfg->srcparam2,cfg->voidtime,cfg->maxdetphoton,
 		     cfg->medianum-1,cfg->detnum,cfg->maxgate,0,0,cfg->reseedlimit,ABS(cfg->sradius+2.f)<EPS /*isatomic*/,
@@ -1748,7 +1754,7 @@ void mcx_run_simulation(Config *cfg,GPUInfo *gpu){
 
      CUDA_ASSERT(cudaMalloc((void **) &gmedia, sizeof(uint)*(dimxyz)));
      //CUDA_ASSERT(cudaBindTexture(0, texmedia, gmedia));
-     CUDA_ASSERT(cudaMalloc((void **) &gfield, sizeof(float)*fieldlen));
+     CUDA_ASSERT(cudaMalloc((void **) &gfield, sizeof(float)*fieldlen*2));
      CUDA_ASSERT(cudaMalloc((void **) &gPpos, sizeof(float4)*gpu[gpuid].autothread));
      CUDA_ASSERT(cudaMalloc((void **) &gPdir, sizeof(float4)*gpu[gpuid].autothread));
      CUDA_ASSERT(cudaMalloc((void **) &gPlen, sizeof(float4)*gpu[gpuid].autothread));
@@ -1806,6 +1812,7 @@ void mcx_run_simulation(Config *cfg,GPUInfo *gpu){
      dimlen.y=cfg->dim.y*cfg->dim.x;
 
      dimlen.z=cfg->dim.x*cfg->dim.y*cfg->dim.z;
+     dimlen.w=fieldlen;
 
      param.dimlen=dimlen;
      param.cachebox=cachebox;
@@ -1911,7 +1918,7 @@ void mcx_run_simulation(Config *cfg,GPUInfo *gpu){
 
        //total number of repetition for the simulations, results will be accumulated to field
        for(iter=0;iter<ABS(cfg->respin);iter++){
-           CUDA_ASSERT(cudaMemset(gfield,0,sizeof(float)*fieldlen)); // cost about 1 ms
+           CUDA_ASSERT(cudaMemset(gfield,0,sizeof(float)*fieldlen*2)); // cost about 1 ms
            CUDA_ASSERT(cudaMemset(gPdet,0,sizeof(float)*cfg->maxdetphoton*(detreclen)));
            if(cfg->issaveseed)
 	       CUDA_ASSERT(cudaMemset(gseeddata,0,sizeof(RandType)*cfg->maxdetphoton*RAND_BUF_LEN));
@@ -2041,8 +2048,12 @@ is more than what your have specified (%d), please use the -H option to specify
 
 	   //handling the 2pt distributions
            if(cfg->issave2pt){
-               CUDA_ASSERT(cudaMemcpy(field, gfield,sizeof(float)*fieldlen,cudaMemcpyDeviceToHost));
+	       float *rawfield=(float*)malloc(sizeof(float)*fieldlen*2);
+               CUDA_ASSERT(cudaMemcpy(rawfield, gfield,sizeof(float)*fieldlen*2,cudaMemcpyDeviceToHost));
                MCX_FPRINTF(cfg->flog,"transfer complete:\t%d ms\n",GetTimeMillis()-tic);  fflush(cfg->flog);
+	       for(i=0;i<(int)fieldlen;i++)  //accumulate field, can be done in the GPU
+	           field[i]=rawfield[i]+rawfield[i+fieldlen];
+	       free(rawfield);
 
                if(ABS(cfg->respin)>1){
                    for(i=0;i<(int)fieldlen;i++)  //accumulate field, can be done in the GPU

diff --git a/src/mcx_core.h b/src/mcx_core.h
@@ -38,6 +38,7 @@ extern "C" {
 #define MCX_DEBUG_RNG       1                   /**< MCX debug flags */
 #define MCX_DEBUG_MOVE      2
 #define MCX_DEBUG_PROGRESS  4
+#define MAX_ACCUM           1000.f
 
 #define ROULETTE_SIZE       10.f                /**< Russian Roulette size */
 
@@ -110,7 +111,7 @@ typedef struct  __align__(16) KernelParams {
   float4 ps;                         /**< initial position vector, for pencil beam */
   float4 c0;                         /**< initial directon vector, for pencil beam */
   float3 maxidx;                     /**< maximum index in x/y/z directions for out-of-bound tests */
-  uint3  dimlen;                     /**< maximum index used to convert x/y/z to 1D array index */
+  uint4  dimlen;                     /**< maximum index used to convert x/y/z to 1D array index */
   uint3  cp0;                        /**< 3D coordinates of one diagonal of the cached region  (obsolete) */
   uint3  cp1;                        /**< 3D coordinates of the other diagonal of the cached region  (obsolete) */
   uint2  cachebox;                   /**< stride for cachebox data acess  (obsolete) */