Skip to content

Commit

Permalink
first working version of CuRBFEllILPEvaluator, which does prediction.…
Browse files Browse the repository at this point in the history
… So far it doesn't use async memory transfers and streams.
  • Loading branch information
ksopyla committed Sep 22, 2013
1 parent 6b7c6b4 commit b59cca7
Show file tree
Hide file tree
Showing 10 changed files with 143 additions and 102 deletions.
4 changes: 2 additions & 2 deletions KMLib.CUDA/CUDAmodules/Config.h
Expand Up @@ -35,7 +35,7 @@ texture<float,1,cudaReadModeElementType> mainVec2TexRef;
#define maxNNZ 100


//rho for computing
//rho for computingwse
//__constant__ float RHO=-2;
//sets -1 for negative values and 1 for gather or equal than 0
//params:
Expand Down Expand Up @@ -79,7 +79,7 @@ extern "C" __global__ void reduce(const float* input, float* output, const int N
unsigned int i = blockIdx.x*blockDim.x*2 + threadIdx.x;
unsigned int gridSize = blockDim.x*2*gridDim.x;

shVals[tid]=-FLT_MAX;
shVals[tid]=0;
float sum=0;

while (i < N)
Expand Down
1 change: 1 addition & 0 deletions KMLib.CUDA/CUDAmodules/KernelsEllpack.cu
Expand Up @@ -1113,6 +1113,7 @@ extern "C" __global__ void rbfEllpackILPEvaluator(const float * vals,
//hack for choosing different texture reference when launch in defferent streams
float dot = texSel==1 ? SpMV_Ellpack_ILP<1>(vals,colIdx,rowLength,row,num_rows): SpMV_Ellpack_ILP<2>(vals,colIdx,rowLength,row,num_rows) ;
results[row]=svY[row]*svAlpha[row]*expf(-shGamma*(svSelfDot[row]+shVecSelfDot-2*dot));
//results[row]=dot;
}

}
Expand Down
16 changes: 10 additions & 6 deletions KMLib.CUDA/CudaRBFCSREvaluator.cs
Expand Up @@ -48,19 +48,23 @@ public override float[] Predict(SparseVec[] elements)
throw new ApplicationException("Evaluator is not initialized. Call init method");


//tranfsorm elements to matrix in CSR format
//transform elements to matrix in CSR format
// elements values
float[] vecVals;
//elements indexes
int[] vecIdx;
//elements lenght
//elements length
int[] vecLenght;
CudaHelpers.TransformToCSRFormat(out vecVals, out vecIdx, out vecLenght, elements);


float[] elSelfDot = new float[elements.Length];



Stopwatch t = Stopwatch.StartNew();

//elSelfDot = elements.Select(e => e.DotProduct()).ToArray();
for (int j = 0; j < elements.Length; j++)
{
float res = 0;
Expand Down Expand Up @@ -99,7 +103,7 @@ public override float[] Predict(SparseVec[] elements)
outputIntPtr = cuda.HostAllocate(memElementsSize, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
outputPtr = cuda.GetHostDevicePointer(outputIntPtr, 0);

// Set the cuda kernel paramerters
// Set the cuda kernel parameters
#region set cuda parameters
uint Rows = (uint)elements.Length;
uint Cols = (uint)TrainedModel.SupportElements.Length;
Expand Down Expand Up @@ -128,7 +132,7 @@ public override float[] Predict(SparseVec[] elements)
offset += IntPtr.Size;
cuda.SetParameter(cuFunc, offset, elSelf.Pointer);
offset += IntPtr.Size;
//set output (reslut) param
//set output (result) param
cuda.SetParameter(cuFunc, offset, outputPtr.Pointer);
offset += IntPtr.Size;
//set number of elements param
Expand Down Expand Up @@ -159,13 +163,13 @@ public override float[] Predict(SparseVec[] elements)
CudaHelpers.InitBuffer(TrainedModel.SupportElements[k], svVecIntPtrs[k % 2]);

cuda.SynchronizeStream(stream);
//copy asynchronously from buffer to devece
//copy asynchronously from buffer to device
cuda.CopyHostToDeviceAsync(mainVecPtr, svVecIntPtrs[k % 2], memSvSize, stream);

//set the last parameter in kernel (column index)
// colIndexParamOffset
cuda.SetParameter(cuFunc, lastParameterOffset, (uint)k);
//launch kernl
//launch kernel
cuda.LaunchAsync(cuFunc, gridDimX, 1, stream);

if (k > 0)
Expand Down
122 changes: 80 additions & 42 deletions KMLib.CUDA/GPUEvaluators/CuEvaluator.cs
Expand Up @@ -22,7 +22,7 @@ namespace KMLib.GPU
/// <summary>
/// base class for all cuda enabled evaluators
/// </summary>
/// <remarks>It sores nesessary data for cuda initialization</remarks>
/// <remarks>It sores necessary data for cuda initialization</remarks>
public abstract class CuEvaluator : Evaluator<SparseVec>
{

Expand Down Expand Up @@ -63,43 +63,45 @@ public abstract class CuEvaluator : Evaluator<SparseVec>
/// </summary>
protected int reductionBlocks;
/// <summary>
/// numer of theread used for reduction
/// number of threads used for reduction
/// </summary>
protected int reductionThreads;

/// <summary>
/// threads per block, def=128,
/// </summary>
protected int maxReductionThreads = 256;
protected int maxReductionThreads = 128;


/// <summary>
/// threads per blokc for eval function
/// threads per block for evaluation function
/// </summary>
protected int evalThreads = CUDAConfig.XBlockSize;
/// <summary>
/// blocks per grid for eval function
/// blocks per grid for evaluation function
/// </summary>
protected int evalBlocks=-1;


/// <summary>
/// array of 2 buffers for concurent data transfer
/// array of 2 buffers for concurrent data transfer
/// </summary>
protected IntPtr[] vecIntPtrs = new IntPtr[NUM_STREAMS];
protected IntPtr[] mainVecIntPtrs = new IntPtr[NUM_STREAMS];

/// <summary>
/// dense support vector float buffer size
/// </summary>
protected uint memSize=0;
protected uint vectorsDimMemSize=0;


protected int vectorSelfDotParamOffset;
protected int texSelParamOffset;
protected int kernelResultParamOffset;


#region cuda types

/// <summary>
/// Cuda .net class for cuda opeation
/// Cuda .net class for cuda operation
/// </summary>
protected CUDA cuda;

Expand All @@ -112,7 +114,7 @@ public abstract class CuEvaluator : Evaluator<SparseVec>
/// <summary>
/// cuda kernel function for computing evaluation values
/// </summary>
protected CUfunction[] cuFuncEval= new CUfunction[NUM_STREAMS];
protected CUfunction cuFuncEval;

/// <summary>
/// cuda kernel function for computing evaluation values
Expand All @@ -130,19 +132,19 @@ public abstract class CuEvaluator : Evaluator<SparseVec>
/// <summary>
/// cuda mainVector pointer to device memory (it stores dense vector for prediction)
/// </summary>
protected CUdeviceptr[] mainVecPtr= new CUdeviceptr[NUM_STREAMS];
protected CUdeviceptr[] mainVecCuPtr= new CUdeviceptr[NUM_STREAMS];


protected CUdeviceptr[] outputCuPtr = new CUdeviceptr[NUM_STREAMS];
protected CUdeviceptr[] evalOutputCuPtr = new CUdeviceptr[NUM_STREAMS];


/// <summary>
/// cuda pointer to labels, neded for coping to texture
/// cuda pointer to labels, needed for coping to texture
/// </summary>
protected CUdeviceptr labelsPtr;

/// <summary>
/// cuda pointer to support vector non zero alpha coeficients
/// cuda pointer to support vector non zero alpha coefficients
/// </summary>
protected CUdeviceptr alphasPtr;

Expand All @@ -151,14 +153,14 @@ public abstract class CuEvaluator : Evaluator<SparseVec>
/// </summary>
protected CUstream[] stream = new CUstream[NUM_STREAMS];

private CUdeviceptr[] cuReducePtr = new CUdeviceptr[NUM_STREAMS];
private CUdeviceptr[] reduceCuPtr = new CUdeviceptr[NUM_STREAMS];


private IntPtr[] redIntPtrs=new IntPtr[NUM_STREAMS];
private IntPtr[] reduceIntPtrs=new IntPtr[NUM_STREAMS];


/// <summary>
/// Offset in cuda setparameter function for pointer to memory to reduce
/// Offset in cuda 'setparameter' function for pointer to memory to reduce
/// </summary>
private int offsetMemToReduce;

Expand All @@ -184,7 +186,7 @@ public override float[] Predict(SparseVec[] elements)

uint reduceSize = (uint)reductionBlocks * sizeof(float);

int loop = (elements.Length +NUM_STREAMS)/ NUM_STREAMS;
int loop = (elements.Length +NUM_STREAMS-1)/ NUM_STREAMS;
for (int i = 0; i < loop; i++)
{

Expand All @@ -195,22 +197,46 @@ public override float[] Predict(SparseVec[] elements)
{
var vec = elements[idx];

//remove
//float[] svDots = TrainedModel.SupportElements.Select(sv => sv.DotProduct(vec)).ToArray();

//set nonzero values to dense vector accessible through vecIntPtr
CudaHelpers.InitBuffer(vec, mainVecIntPtrs[s]);

//set nonzero values to dense vector accesible throught vecIntPtr
CudaHelpers.InitBuffer(vec, vecIntPtrs[s]);

//cuda.CopyHostToDeviceAsync(mainVecPtr[s], vecIntPtrs[s], memSize, stream[s]);
cuda.CopyHostToDevice(mainVecCuPtr[s], mainVecIntPtrs[s], vectorsDimMemSize);

//remove
//float[] v = new float[vec.Dim + 1];
//cuda.CopyDeviceToHost(mainVecCuPtr[s],v);

cuda.CopyHostToDeviceAsync(mainVecPtr[s], vecIntPtrs[s], memSize, stream[s]);

//cuFunc user different textures
cuda.LaunchAsync(cuFuncEval[s], evalBlocks, 1, stream[s]);
cuda.SetParameter(cuFuncEval, kernelResultParamOffset, evalOutputCuPtr[s]);
cuda.SetParameter(cuFuncEval, vectorSelfDotParamOffset, vec.DotProduct());
cuda.SetParameter(cuFuncEval, texSelParamOffset, s+1);

//cuda.LaunchAsync(cuFuncEval[s], evalBlocks, 1, stream[s]);
cuda.Launch(cuFuncEval, evalBlocks, 1);

//remove
//float[] t = new float[sizeSV];
//cuda.CopyDeviceToHost(evalOutputCuPtr[s], t);

cuda.SetParameter(cuFuncReduce, offsetMemToReduce, outputCuPtr[s]);
cuda.LaunchAsync(cuFuncReduce, reductionBlocks, 1, stream[s]);

cuda.CopyDeviceToHostAsync(cuReducePtr[s], redIntPtrs[s], reduceSize, stream[s]);

cuda.SetParameter(cuFuncReduce, offsetMemToReduce, evalOutputCuPtr[s]);
cuda.SetParameter(cuFuncReduce, offsetOutMemReduce, reduceCuPtr[s]);
//cuda.LaunchAsync(cuFuncReduce, reductionBlocks, 1, stream[s]);
cuda.Launch(cuFuncReduce, reductionBlocks, 1);

//cuda.CopyDeviceToHostAsync(cuReducePtr[s], redIntPtrs[s], reduceSize, stream[s]);
cuda.CopyDeviceToHost(reduceCuPtr[s], reduceIntPtrs[s], reduceSize);

//remove
//float[] r = new float[maxReductionBlocks];
//cuda.CopyDeviceToHost(reduceCuPtr[s], r);

}
}
Expand All @@ -225,11 +251,11 @@ public override float[] Predict(SparseVec[] elements)
{
var vec = elements[idx];
//clear the buffer
//set nonzero values to dense vector accesible throught vecIntPtr
CudaHelpers.SetBufferIdx(vec, vecIntPtrs[s], 0.0f);
float evalValue = ReduceOnHost(redIntPtrs[s], reduceSize);
//set nonzero values to dense vector accessible thought vecIntPtr
CudaHelpers.SetBufferIdx(vec, mainVecIntPtrs[s], 0.0f);
float evalValue = ReduceOnHost(reduceIntPtrs[s], reductionBlocks);

prediction[i] = evalValue;
prediction[idx] = evalValue;
}
}

Expand All @@ -244,22 +270,27 @@ public override float[] Predict(SparseVec[] elements)



private float ReduceOnHost(IntPtr reduceIntPtr, uint reduceSize)
private float ReduceOnHost(IntPtr reduceIntPtr, int reduceSize)
{

double sum = 0;
unsafe
{

float* vecPtr = (float*)reduceIntPtr.ToPointer();
for (uint j = 0; j < reduceSize; j++)
for (int j = 0; j < reduceSize; j++)
{

sum+=vecPtr[j];
}

}


//float[] t = new float[maxReductionBlocks];
//System.Runtime.InteropServices.Marshal.Copy(reduceIntPtr, t, 0, t.Length);
//float s = t.Sum();

sum -= TrainedModel.Bias;

float ret = sum < 0 ? -1 : 1;
Expand All @@ -281,7 +312,7 @@ public override void Init()

SetCudaRedFunctionParams();

SetCudaEvalFunctionParams();


IsInitialized = true;

Expand All @@ -301,11 +332,11 @@ protected void SetCudaRedFunctionParams()

int offset = 0;
offsetMemToReduce = offset;
cuda.SetParameter(cuFuncReduce, offset, outputCuPtr[0].Pointer);
cuda.SetParameter(cuFuncReduce, offset, evalOutputCuPtr[0].Pointer);
offset += IntPtr.Size;

offsetOutMemReduce = offset;
cuda.SetParameter(cuFuncReduce, offset, cuReducePtr[0].Pointer);
cuda.SetParameter(cuFuncReduce, offset, reduceCuPtr[0].Pointer);
offset += IntPtr.Size;

cuda.SetParameter(cuFuncReduce, offset, (uint)sizeSV);
Expand All @@ -314,6 +345,7 @@ protected void SetCudaRedFunctionParams()
cuda.SetParameterSize(cuFuncReduce, (uint)offset);
}


protected void SetCudaData()
{

Expand All @@ -334,21 +366,27 @@ protected void SetCudaData()
alphasPtr = cuda.CopyHostToDevice(svAlphas);


memSize = (uint)(TrainedModel.SupportElements[0].Dim * sizeof(float));
vectorsDimMemSize = (uint)((TrainedModel.SupportElements[0].Dim+1) * sizeof(float));
for (int i = 0; i < NUM_STREAMS; i++)
{
stream[i] = cuda.CreateStream();

//allocates memory for one vector, size = vector dim
vecIntPtrs[i] = cuda.AllocateHost(memSize);
mainVecPtr[i] = cuda.Allocate(memSize);
mainVecIntPtrs[i] = cuda.AllocateHost(vectorsDimMemSize);
mainVecCuPtr[i] = cuda.CopyHostToDevice(mainVecIntPtrs[i], vectorsDimMemSize);

//allocate memory for output, size == #SV
outputCuPtr[i] = cuda.Allocate(svAlphas);
evalOutputCuPtr[i] = cuda.Allocate(svAlphas);

cuVecTexRef[i] = cuda.GetModuleTexture(cuModule, cudaVecTexRefName[i]);
//cuda.SetTextureFlags(cuVecTexRef[i], 0);
cuda.SetTextureAddress(cuVecTexRef[i], mainVecPtr[i], memSize);
cuda.SetTextureAddress(cuVecTexRef[i], mainVecCuPtr[i], vectorsDimMemSize);

uint reduceMemSize = (uint)maxReductionBlocks * sizeof(float);
reduceIntPtrs[i] = cuda.AllocateHost(reduceMemSize);
reduceCuPtr[i] = cuda.CopyHostToDevice(reduceIntPtrs[i], reduceMemSize);
//reduceCuPtr[i] = cuda.Allocate((uint)maxReductionBlocks * sizeof(float));

}


Expand All @@ -366,8 +404,8 @@ private void InitCuda()

cuModule = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, cudaModuleName));

cuFuncEval[0] = cuda.GetModuleFunction(cudaEvaluatorKernelName );
cuFuncEval[1] = cuda.GetModuleFunction(cudaEvaluatorKernelName);
cuFuncEval = cuda.GetModuleFunction(cudaEvaluatorKernelName );


cuFuncReduce = cuda.GetModuleFunction(cudaReduceKernelName);
}
Expand Down

0 comments on commit b59cca7

Please sign in to comment.