Add new evaluatros based on Ellpack Ilp kernel, starts kernels refact…

…oring and move duplicate code, especially SpMV operation to separate functions. Test new mechanism of choosing texture reference.
ksopyla · Sep 21, 2013 · 6b7c6b4 · 6b7c6b4
1 parent 238cdb7
commit 6b7c6b4
Show file tree

Hide file tree

Showing 18 changed files with 1,524 additions and 57 deletions.
diff --git a/KMLib.CUDA/CUDAmodules/Config.h b/KMLib.CUDA/CUDAmodules/Config.h
@@ -8,16 +8,22 @@ web page: http://wmii.uwm.edu.pl/~ksopyla/projects/svm-net-with-cuda-kmlib/
 #ifndef KERNELS_CONFIG
 #define KERNELS_CONFIG
 
+#include <float.h>
 
 
 //texture fo labels assiociated with vectors
 texture<float,1,cudaReadModeElementType> labelsTexRef;
 
 texture<float,1,cudaReadModeElementType> mainVecTexRef;
-//texture<float,1,cudaReadModeElementType> mainVectorTexRef;
+//needed for prediction and acynchronous transfers
+texture<float,1,cudaReadModeElementType> mainVec2TexRef;
+
+
 
 #define BLOCK_SIZE 256
 
+#define BLOCK_SIZE_RED 128
+
 #define WARP_SIZE 32
 
 #define PREFETCH_SIZE 2
@@ -60,4 +66,75 @@ extern "C" __global__ void setSignForPrediction(float * inputArray,const int siz
 
 
 
+
+
+extern "C" __global__ void reduce(const float* input, float* output, const int N)
+{
+
+	__shared__ float shVals[BLOCK_SIZE_RED];     
+
+	// perform first level of reduction,
+	// reading from global memory, writing to shared memory
+	unsigned int tid = threadIdx.x;
+	unsigned int i = blockIdx.x*blockDim.x*2 + threadIdx.x;
+	unsigned int gridSize = blockDim.x*2*gridDim.x;
+
+	shVals[tid]=-FLT_MAX;   
+	float sum=0;
+
+	while (i < N)
+	{   
+		sum+=input[i];
+
+		// ensure we don't read out of bounds 
+		if (i + BLOCK_SIZE_RED < N) {
+			sum+=input[i+BLOCK_SIZE_RED];
+		}
+		i += gridSize;
+	} 
+
+	// each thread puts its local sum into shared memory 
+	shVals[tid] = sum;
+	__syncthreads();
+
+	if (BLOCK_SIZE_RED >= 512) { 
+		if (tid < 256) { 
+			shVals[tid]=sum=sum+ shVals[tid+256]; 
+		} 
+		__syncthreads(); 
+	}
+	if (BLOCK_SIZE_RED >= 256) { 
+		if (tid < 128) { 
+			shVals[tid]=sum=sum+ shVals[tid+128]; 
+		} 
+		__syncthreads(); 
+	}
+	if (BLOCK_SIZE_RED >= 128) { 
+		if (tid < 64) { 
+			shVals[tid]=sum=sum+ shVals[tid+64]; 
+		} 
+		__syncthreads(); 
+	}
+
+
+	if (tid < 32)
+	{
+		 volatile float *smem = shVals;
+
+		 smem[tid] = sum = sum + smem[tid + 32];
+		 smem[tid] = sum = sum + smem[tid + 16];
+		 smem[tid] = sum = sum + smem[tid +  8];
+		 smem[tid] = sum = sum + smem[tid +  4];
+		 smem[tid] = sum = sum + smem[tid +  2];
+		 smem[tid] = sum = sum + smem[tid +  1];
+	}
+
+	// write result for this block to global mem 
+	if (tid == 0) {
+		output[blockIdx.x] = shVals[0];
+	}
+
+}
+
+
 #endif /* KERNELS_CONFIG */
diff --git a/KMLib.CUDA/CUDAmodules/KernelsEllpack.cu b/KMLib.CUDA/CUDAmodules/KernelsEllpack.cu
@@ -9,6 +9,15 @@ web page: http://wmii.uwm.edu.pl/~ksopyla/projects/svm-net-with-cuda-kmlib/
 
 #include <Config.h>
 
+
+template<int TexSel> __device__ float SpMV_Ellpack_ILP(const float * vals,
+									   const int * colIdx, 
+									   const int * rowLength,
+									   const int row,
+									   const int num_rows);
+
+
+
 //cuda kernel funtion for computing SVM RBF kernel, uses 
 // Ellpack-R fromat for storing sparse matrix, labels are in texture cache,  uses ILP - prefetch vector elements in registers
 // arrays vals and colIdx should be aligned to PREFETCH_SIZE
@@ -21,6 +30,11 @@ web page: http://wmii.uwm.edu.pl/~ksopyla/projects/svm-net-with-cuda-kmlib/
 //num_rows -number of vectors
 //mainVecIndex - main vector index, needed for retriving its label
 //gamma - gamma parameter for RBF 
+
+
+
+
+
 extern "C" __global__ void rbfEllpackFormatKernel_ILP(const float * vals,
 									   const int * colIdx, 
 									   const int * rowLength, 
@@ -43,6 +57,7 @@ extern "C" __global__ void rbfEllpackFormatKernel_ILP(const float * vals,
 	//	shDot[threadIdx.x*PREFETCH_SIZE+j]=0.0;
 	//}
 
+	//myTex1Dfetch<1>(5);
 
 	if(threadIdx.x==0)
 	{
@@ -108,6 +123,43 @@ extern "C" __global__ void rbfEllpackFormatKernel_ILP(const float * vals,
 
 }
 
+extern "C" __global__ void rbfEllpackFormatKernel_ILP_func(const float * vals,
+									   const int * colIdx, 
+									   const int * rowLength, 
+									   const float* selfDot,
+									   float * results,
+									   const int num_rows,
+									   const int mainVecIndex,
+									   const float gamma)
+{
+
+
+	__shared__ float shGamma;
+	__shared__ int shMainVecIdx;
+	__shared__ float shMainSelfDot;
+	__shared__ float shLabel;
+	__shared__ int shRows;
+
+	if(threadIdx.x==0)
+	{
+		shMainVecIdx=mainVecIndex;
+		shGamma = gamma;
+		shMainSelfDot = selfDot[shMainVecIdx];
+		shLabel = tex1Dfetch(labelsTexRef,shMainVecIdx);
+		shRows= num_rows;
+	}
+	__syncthreads();
+
+	const int row   = blockDim.x * blockIdx.x + threadIdx.x;  // global thread index
+
+	if(row<shRows)
+	{
+		float dot = SpMV_Ellpack_ILP<1>(vals,colIdx,rowLength,row,num_rows);
+		results[row]=tex1Dfetch(labelsTexRef,row)*shLabel*expf(-shGamma*(selfDot[row]+shMainSelfDot-2*dot));
+
+	}	
+
+}
 
 
 //cuda kernel funtion for computing SVM RBF kernel, uses 
@@ -1026,6 +1078,115 @@ extern "C" __global__ void chiSquaredEllpackKernel(const float * vals,
 
 }
 
+/********************** Evaluators **********************************/
+
+
+extern "C" __global__ void rbfEllpackILPEvaluator(const float * vals,
+									   const int * colIdx, 
+									   const int * rowLength, 
+									   const float* svSelfDot,
+										const float* svAlpha,
+										const float* svY,
+									   float * results,
+									   const int num_rows,
+									   const float vecSelfDot,
+									   const float gamma,
+										const int texSel)
+{
+
+	__shared__ float shGamma;
+	__shared__ float shVecSelfDot;
+	__shared__ int shRows;
+
+	if(threadIdx.x==0)
+	{
+		shGamma = gamma;
+		shVecSelfDot = vecSelfDot,
+		shRows= num_rows;
+	}
+	__syncthreads();
+
+	const int row   = blockDim.x * blockIdx.x + threadIdx.x;  // global thread index
+
+	if(row<shRows)
+	{
+		//hack for choosing different texture reference when launch in defferent streams
+		float dot = texSel==1 ? SpMV_Ellpack_ILP<1>(vals,colIdx,rowLength,row,num_rows): SpMV_Ellpack_ILP<2>(vals,colIdx,rowLength,row,num_rows) ;
+		results[row]=svY[row]*svAlpha[row]*expf(-shGamma*(svSelfDot[row]+shVecSelfDot-2*dot));
+	}	
+
+}
+
+
+
+
+
+/*********************** Sparse matrix dense vector multiplication helpers **********/
+
+
+
+template<int TexSel> __device__ float fetchTex(int idx);
+
+template<> __device__ float fetchTex<1>(int idx) { return tex1Dfetch(mainVecTexRef,idx); }
+template<> __device__ float fetchTex<2>(int idx) { return tex1Dfetch(mainVec2TexRef,idx); }
+
+//cuda kernel funtion for computing SpMV
+// Ellpack-R fromat for storing sparse matrix,  uses ILP - prefetch vector elements in registers
+// arrays vals and colIdx should be aligned to PREFETCH_SIZE
+//Params:
+//vals - array of vectors values
+//colIdx  - array of column indexes in ellpack-r fromat
+//rowLength -array, contains number of nonzero elements in each row
+//num_rows -number of vectors
+template<int TexSel> __device__ float SpMV_Ellpack_ILP(const float * vals,
+									   const int * colIdx, 
+									   const int * rowLength,
+									   const int row,
+									   const int num_rows)
+{
+
+	__shared__ int shRows;
+	if(threadIdx.x==0)
+	{
+		shRows = num_rows;
+	}
+	__syncthreads();
+
+	float preVals[PREFETCH_SIZE];
+	int preColls[PREFETCH_SIZE];
+
+	float dot[PREFETCH_SIZE]={0};
+
+	int maxEl = rowLength[row];
+
+
+	for(int i=0; i<maxEl;i++)
+	{
+		#pragma unroll
+		for(int j=0; j<PREFETCH_SIZE;j++)			
+		{
+			preColls[j]=colIdx[ (i*PREFETCH_SIZE+j)*shRows+row];
+			preVals[j]=vals[ (i*PREFETCH_SIZE+j)*shRows+row];
+		}
+
+		#pragma unroll
+		for(int j=0; j<PREFETCH_SIZE;j++){
+			//dot[j]+=preVals[j]*tex1Dfetch(mainVecTexRef,preColls[j]);
+			dot[j]+=preVals[j]* fetchTex<TexSel>(preColls[j]);
+		}
+
+	}
+
+	#pragma unroll
+	for(int j=1; j<PREFETCH_SIZE;j++){
+		dot[0]+=dot[j];
+	}
+
+	return dot[0];		
+}
+
+
+
 
 
 

diff --git a/KMLib.CUDA/CUDAmodules/KernelsEllpackCol2.cu b/KMLib.CUDA/CUDAmodules/KernelsEllpackCol2.cu
@@ -15,6 +15,7 @@ texture<float,1,cudaReadModeElementType>  VecI_TexRef;
 texture<float,1,cudaReadModeElementType>  VecJ_TexRef;
 
 
+
 //cuda kernel funtion for computing SVM RBF kernel, uses 
 // Ellpack-R fromat for storing sparse matrix, labels are in texture cache,  uses ILP - prefetch vector elements in registers
 // arrays vals and colIdx should be aligned to PREFETCH_SIZE

diff --git a/KMLib.CUDA/CUDAmodules/gpuFOSmoSolver.cu b/KMLib.CUDA/CUDAmodules/gpuFOSmoSolver.cu
@@ -127,8 +127,6 @@ extern "C" __global__ void FindMaxI_MinJ(const float* y,
 	while (i < N)
 	{   
 		yi=(int)y[i];
-
-
 		alpha_i=alpha[i];
 		yib = y[i+blockDim.x];
 		alpha_ib=alpha[i+blockDim.x];
@@ -145,13 +143,13 @@ extern "C" __global__ void FindMaxI_MinJ(const float* y,
 		maxGi<temp1Max ? shIdxI[tid]=i : 0;
 		maxGi = fmaxf(maxGi, temp1Max );
 
-		maxGi<temp2Max ? shIdxI[tid]=i+blockDim.x : 0;
-		maxGi = fmaxf(maxGi, temp2Max );
-
-
 		maxGj<temp1Min ? shIdxJ[tid]=i : 0;
 		maxGj = fmaxf(maxGj, temp1Min );
 
+
+		maxGi<temp2Max ? shIdxI[tid]=i+blockDim.x : 0;
+		maxGi = fmaxf(maxGi, temp2Max );
+
 		maxGj<temp2Min ? shIdxJ[tid]=i+blockDim.x : 0;
 		maxGj = fmaxf(maxGj,temp2Min);
 

diff --git a/KMLib.CUDA/CudaRBFCSREvaluator.cs b/KMLib.CUDA/CudaRBFCSREvaluator.cs
@@ -161,6 +161,7 @@ public override float[] Predict(SparseVec[] elements)
                 cuda.SynchronizeStream(stream);
                 //copy asynchronously from buffer to devece
                 cuda.CopyHostToDeviceAsync(mainVecPtr, svVecIntPtrs[k % 2], memSvSize, stream);
+
                 //set the last parameter in kernel (column index)   
                 // colIndexParamOffset
                 cuda.SetParameter(cuFunc, lastParameterOffset, (uint)k);