Skip to content

Commit

Permalink
Add new evaluatros based on Ellpack Ilp kernel, starts kernels refact…
Browse files Browse the repository at this point in the history
…oring and move duplicate code, especially SpMV operation to separate functions. Test new mechanism of choosing texture reference.
  • Loading branch information
ksopyla committed Sep 21, 2013
1 parent 238cdb7 commit 6b7c6b4
Show file tree
Hide file tree
Showing 18 changed files with 1,524 additions and 57 deletions.
79 changes: 78 additions & 1 deletion KMLib.CUDA/CUDAmodules/Config.h
Expand Up @@ -8,16 +8,22 @@ web page: http://wmii.uwm.edu.pl/~ksopyla/projects/svm-net-with-cuda-kmlib/
#ifndef KERNELS_CONFIG
#define KERNELS_CONFIG

#include <float.h>


//texture fo labels assiociated with vectors
texture<float,1,cudaReadModeElementType> labelsTexRef;

texture<float,1,cudaReadModeElementType> mainVecTexRef;
//texture<float,1,cudaReadModeElementType> mainVectorTexRef;
//needed for prediction and acynchronous transfers
texture<float,1,cudaReadModeElementType> mainVec2TexRef;



#define BLOCK_SIZE 256

#define BLOCK_SIZE_RED 128

#define WARP_SIZE 32

#define PREFETCH_SIZE 2
Expand Down Expand Up @@ -60,4 +66,75 @@ extern "C" __global__ void setSignForPrediction(float * inputArray,const int siz





extern "C" __global__ void reduce(const float* input, float* output, const int N)
{

__shared__ float shVals[BLOCK_SIZE_RED];

// perform first level of reduction,
// reading from global memory, writing to shared memory
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x*2 + threadIdx.x;
unsigned int gridSize = blockDim.x*2*gridDim.x;

shVals[tid]=-FLT_MAX;
float sum=0;

while (i < N)
{
sum+=input[i];

// ensure we don't read out of bounds
if (i + BLOCK_SIZE_RED < N) {
sum+=input[i+BLOCK_SIZE_RED];
}
i += gridSize;
}

// each thread puts its local sum into shared memory
shVals[tid] = sum;
__syncthreads();

if (BLOCK_SIZE_RED >= 512) {
if (tid < 256) {
shVals[tid]=sum=sum+ shVals[tid+256];
}
__syncthreads();
}
if (BLOCK_SIZE_RED >= 256) {
if (tid < 128) {
shVals[tid]=sum=sum+ shVals[tid+128];
}
__syncthreads();
}
if (BLOCK_SIZE_RED >= 128) {
if (tid < 64) {
shVals[tid]=sum=sum+ shVals[tid+64];
}
__syncthreads();
}


if (tid < 32)
{
volatile float *smem = shVals;

smem[tid] = sum = sum + smem[tid + 32];
smem[tid] = sum = sum + smem[tid + 16];
smem[tid] = sum = sum + smem[tid + 8];
smem[tid] = sum = sum + smem[tid + 4];
smem[tid] = sum = sum + smem[tid + 2];
smem[tid] = sum = sum + smem[tid + 1];
}

// write result for this block to global mem
if (tid == 0) {
output[blockIdx.x] = shVals[0];
}

}


#endif /* KERNELS_CONFIG */
161 changes: 161 additions & 0 deletions KMLib.CUDA/CUDAmodules/KernelsEllpack.cu
Expand Up @@ -9,6 +9,15 @@ web page: http://wmii.uwm.edu.pl/~ksopyla/projects/svm-net-with-cuda-kmlib/

#include <Config.h>


template<int TexSel> __device__ float SpMV_Ellpack_ILP(const float * vals,
const int * colIdx,
const int * rowLength,
const int row,
const int num_rows);



//cuda kernel funtion for computing SVM RBF kernel, uses
// Ellpack-R fromat for storing sparse matrix, labels are in texture cache, uses ILP - prefetch vector elements in registers
// arrays vals and colIdx should be aligned to PREFETCH_SIZE
Expand All @@ -21,6 +30,11 @@ web page: http://wmii.uwm.edu.pl/~ksopyla/projects/svm-net-with-cuda-kmlib/
//num_rows -number of vectors
//mainVecIndex - main vector index, needed for retriving its label
//gamma - gamma parameter for RBF





extern "C" __global__ void rbfEllpackFormatKernel_ILP(const float * vals,
const int * colIdx,
const int * rowLength,
Expand All @@ -43,6 +57,7 @@ extern "C" __global__ void rbfEllpackFormatKernel_ILP(const float * vals,
// shDot[threadIdx.x*PREFETCH_SIZE+j]=0.0;
//}

//myTex1Dfetch<1>(5);

if(threadIdx.x==0)
{
Expand Down Expand Up @@ -108,6 +123,43 @@ extern "C" __global__ void rbfEllpackFormatKernel_ILP(const float * vals,

}

extern "C" __global__ void rbfEllpackFormatKernel_ILP_func(const float * vals,
const int * colIdx,
const int * rowLength,
const float* selfDot,
float * results,
const int num_rows,
const int mainVecIndex,
const float gamma)
{


__shared__ float shGamma;
__shared__ int shMainVecIdx;
__shared__ float shMainSelfDot;
__shared__ float shLabel;
__shared__ int shRows;

if(threadIdx.x==0)
{
shMainVecIdx=mainVecIndex;
shGamma = gamma;
shMainSelfDot = selfDot[shMainVecIdx];
shLabel = tex1Dfetch(labelsTexRef,shMainVecIdx);
shRows= num_rows;
}
__syncthreads();

const int row = blockDim.x * blockIdx.x + threadIdx.x; // global thread index

if(row<shRows)
{
float dot = SpMV_Ellpack_ILP<1>(vals,colIdx,rowLength,row,num_rows);
results[row]=tex1Dfetch(labelsTexRef,row)*shLabel*expf(-shGamma*(selfDot[row]+shMainSelfDot-2*dot));

}

}


//cuda kernel funtion for computing SVM RBF kernel, uses
Expand Down Expand Up @@ -1026,6 +1078,115 @@ extern "C" __global__ void chiSquaredEllpackKernel(const float * vals,

}

/********************** Evaluators **********************************/


extern "C" __global__ void rbfEllpackILPEvaluator(const float * vals,
const int * colIdx,
const int * rowLength,
const float* svSelfDot,
const float* svAlpha,
const float* svY,
float * results,
const int num_rows,
const float vecSelfDot,
const float gamma,
const int texSel)
{

__shared__ float shGamma;
__shared__ float shVecSelfDot;
__shared__ int shRows;

if(threadIdx.x==0)
{
shGamma = gamma;
shVecSelfDot = vecSelfDot,
shRows= num_rows;
}
__syncthreads();

const int row = blockDim.x * blockIdx.x + threadIdx.x; // global thread index

if(row<shRows)
{
//hack for choosing different texture reference when launch in defferent streams
float dot = texSel==1 ? SpMV_Ellpack_ILP<1>(vals,colIdx,rowLength,row,num_rows): SpMV_Ellpack_ILP<2>(vals,colIdx,rowLength,row,num_rows) ;
results[row]=svY[row]*svAlpha[row]*expf(-shGamma*(svSelfDot[row]+shVecSelfDot-2*dot));
}

}





/*********************** Sparse matrix dense vector multiplication helpers **********/



template<int TexSel> __device__ float fetchTex(int idx);

template<> __device__ float fetchTex<1>(int idx) { return tex1Dfetch(mainVecTexRef,idx); }
template<> __device__ float fetchTex<2>(int idx) { return tex1Dfetch(mainVec2TexRef,idx); }

//cuda kernel funtion for computing SpMV
// Ellpack-R fromat for storing sparse matrix, uses ILP - prefetch vector elements in registers
// arrays vals and colIdx should be aligned to PREFETCH_SIZE
//Params:
//vals - array of vectors values
//colIdx - array of column indexes in ellpack-r fromat
//rowLength -array, contains number of nonzero elements in each row
//num_rows -number of vectors
template<int TexSel> __device__ float SpMV_Ellpack_ILP(const float * vals,
const int * colIdx,
const int * rowLength,
const int row,
const int num_rows)
{

__shared__ int shRows;
if(threadIdx.x==0)
{
shRows = num_rows;
}
__syncthreads();

float preVals[PREFETCH_SIZE];
int preColls[PREFETCH_SIZE];

float dot[PREFETCH_SIZE]={0};

int maxEl = rowLength[row];


for(int i=0; i<maxEl;i++)
{
#pragma unroll
for(int j=0; j<PREFETCH_SIZE;j++)
{
preColls[j]=colIdx[ (i*PREFETCH_SIZE+j)*shRows+row];
preVals[j]=vals[ (i*PREFETCH_SIZE+j)*shRows+row];
}

#pragma unroll
for(int j=0; j<PREFETCH_SIZE;j++){
//dot[j]+=preVals[j]*tex1Dfetch(mainVecTexRef,preColls[j]);
dot[j]+=preVals[j]* fetchTex<TexSel>(preColls[j]);
}

}

#pragma unroll
for(int j=1; j<PREFETCH_SIZE;j++){
dot[0]+=dot[j];
}

return dot[0];
}






Expand Down
1 change: 1 addition & 0 deletions KMLib.CUDA/CUDAmodules/KernelsEllpackCol2.cu
Expand Up @@ -15,6 +15,7 @@ texture<float,1,cudaReadModeElementType> VecI_TexRef;
texture<float,1,cudaReadModeElementType> VecJ_TexRef;



//cuda kernel funtion for computing SVM RBF kernel, uses
// Ellpack-R fromat for storing sparse matrix, labels are in texture cache, uses ILP - prefetch vector elements in registers
// arrays vals and colIdx should be aligned to PREFETCH_SIZE
Expand Down
10 changes: 4 additions & 6 deletions KMLib.CUDA/CUDAmodules/gpuFOSmoSolver.cu
Expand Up @@ -127,8 +127,6 @@ extern "C" __global__ void FindMaxI_MinJ(const float* y,
while (i < N)
{
yi=(int)y[i];


alpha_i=alpha[i];
yib = y[i+blockDim.x];
alpha_ib=alpha[i+blockDim.x];
Expand All @@ -145,13 +143,13 @@ extern "C" __global__ void FindMaxI_MinJ(const float* y,
maxGi<temp1Max ? shIdxI[tid]=i : 0;
maxGi = fmaxf(maxGi, temp1Max );

maxGi<temp2Max ? shIdxI[tid]=i+blockDim.x : 0;
maxGi = fmaxf(maxGi, temp2Max );


maxGj<temp1Min ? shIdxJ[tid]=i : 0;
maxGj = fmaxf(maxGj, temp1Min );


maxGi<temp2Max ? shIdxI[tid]=i+blockDim.x : 0;
maxGi = fmaxf(maxGi, temp2Max );

maxGj<temp2Min ? shIdxJ[tid]=i+blockDim.x : 0;
maxGj = fmaxf(maxGj,temp2Min);

Expand Down
1 change: 1 addition & 0 deletions KMLib.CUDA/CudaRBFCSREvaluator.cs
Expand Up @@ -161,6 +161,7 @@ public override float[] Predict(SparseVec[] elements)
cuda.SynchronizeStream(stream);
//copy asynchronously from buffer to devece
cuda.CopyHostToDeviceAsync(mainVecPtr, svVecIntPtrs[k % 2], memSvSize, stream);

//set the last parameter in kernel (column index)
// colIndexParamOffset
cuda.SetParameter(cuFunc, lastParameterOffset, (uint)k);
Expand Down

0 comments on commit 6b7c6b4

Please sign in to comment.