Skip to content

Commit

Permalink
Initial commit of vector addition sample
Browse files Browse the repository at this point in the history
  • Loading branch information
jholewinski committed Aug 10, 2011
0 parents commit a2e309d
Show file tree
Hide file tree
Showing 2 changed files with 302 additions and 0 deletions.
265 changes: 265 additions & 0 deletions kernels/vector-add/vector-add.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,265 @@
/*
* Copyright (C) 2011 by Justin Holewinski
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/

#include <iostream>
#include <fstream>
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <iterator>
#include <string>
#include <vector>

#include <sys/time.h>

#include "cuda.h"


typedef float Real;


//==--- Utility Functions --------------------------------------------------== //
// Translates a CUDA driver status code into a short human-readable string.
//
// Codes that are not listed in the table below map to "Unknown error ID".
// The returned pointer refers to a string literal and must not be freed.
const char * statusToString(CUresult error)
{
  // Pairs each recognised status code with its description.
  static const struct {
    CUresult    code;
    const char *text;
  } kDescriptions[] = {
    { CUDA_SUCCESS,                              "No errors" },
    { CUDA_ERROR_INVALID_VALUE,                  "Invalid value" },
    { CUDA_ERROR_OUT_OF_MEMORY,                  "Out of memory" },
    { CUDA_ERROR_NOT_INITIALIZED,                "Driver not initialized" },
    { CUDA_ERROR_DEINITIALIZED,                  "Driver deinitialized" },

    { CUDA_ERROR_NO_DEVICE,                      "No CUDA-capable device available" },
    { CUDA_ERROR_INVALID_DEVICE,                 "Invalid device" },

    { CUDA_ERROR_INVALID_IMAGE,                  "Invalid kernel image" },
    { CUDA_ERROR_INVALID_CONTEXT,                "Invalid context" },
    { CUDA_ERROR_CONTEXT_ALREADY_CURRENT,        "Context already current" },
    { CUDA_ERROR_MAP_FAILED,                     "Map failed" },
    { CUDA_ERROR_UNMAP_FAILED,                   "Unmap failed" },
    { CUDA_ERROR_ARRAY_IS_MAPPED,                "Array is mapped" },
    { CUDA_ERROR_ALREADY_MAPPED,                 "Already mapped" },
    { CUDA_ERROR_NO_BINARY_FOR_GPU,              "No binary for GPU" },
    { CUDA_ERROR_ALREADY_ACQUIRED,               "Already acquired" },
    { CUDA_ERROR_NOT_MAPPED,                     "Not mapped" },

    { CUDA_ERROR_INVALID_SOURCE,                 "Invalid source" },
    { CUDA_ERROR_FILE_NOT_FOUND,                 "File not found" },

    { CUDA_ERROR_INVALID_HANDLE,                 "Invalid handle" },

    { CUDA_ERROR_NOT_FOUND,                      "Not found" },

    { CUDA_ERROR_NOT_READY,                      "CUDA not ready" },

    { CUDA_ERROR_LAUNCH_FAILED,                  "Launch failed" },
    { CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,        "Launch exceeded resources" },
    { CUDA_ERROR_LAUNCH_TIMEOUT,                 "Launch exceeded timeout" },
    { CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,  "Launch with incompatible texturing" },

    { CUDA_ERROR_UNKNOWN,                        "Unknown error" }
  };

  const unsigned int entryCount = sizeof(kDescriptions) / sizeof(kDescriptions[0]);
  for (unsigned int idx = 0; idx < entryCount; ++idx) {
    if (kDescriptions[idx].code == error) {
      return kDescriptions[idx].text;
    }
  }

  // Anything the table does not know about.
  return "Unknown error ID";
}

// Aborts the program if a CUDA driver call did not succeed.
//
// status      - result code returned by the driver call.
// func        - name of the call, used only in the diagnostic message.
// errorBuffer - optional NUL-terminated log text (e.g. a JIT error log) that
//               is printed before the diagnostic; pass 0 to omit it.
//
// On CUDA_SUCCESS this returns without side effects.  Otherwise it writes the
// diagnostic to std::cerr and terminates the process with exit code 1.
void checkSuccess(CUresult status,
                  const char *func,
                  const char *errorBuffer = 0)
{
  // Nothing to report on success.
  if (status == CUDA_SUCCESS) {
    return;
  }

  if (errorBuffer) {
    std::cerr << "ERROR LOG:" << std::endl;
    std::cerr << errorBuffer << std::endl;
  }

  std::cerr << "ERROR: Could not execute '" << func << "', error (";
  std::cerr << status << ") " << statusToString(status) << std::endl;
  exit(1);
}

// Returns the current wall-clock time, in seconds, as reported by
// gettimeofday() (tv_sec plus tv_usec scaled to seconds).
//
// If gettimeofday() reports an error, a message is written to std::cerr and
// 0.0 is returned: the timeval is zero-initialised up front so that a failed
// call never causes uninitialised memory to be read.
double getTimeStamp()
{
  struct timeval now;
  now.tv_sec  = 0;
  now.tv_usec = 0;

  // The timezone argument is obsolete; a null pointer is the portable choice.
  const int stat = gettimeofday(&now, 0);
  if (stat != 0)
    std::cerr << "Error return from gettimeofday: " << stat << "\n";

  return static_cast<double>(now.tv_sec)
       + static_cast<double>(now.tv_usec) * 1.0e-6;
}



//==--- Entry Point --------------------------------------------------------== //

int main(int argc,
char** argv) {

CUcontext context;
CUdevice device;
CUmodule module;
CUresult status;
CUfunction function;

const int kLogSize = 1024;
char logBuffer[kLogSize];

int blockSizeX = 512;
int blockSizeMultiple = 5000;
int problemSize = blockSizeX * blockSizeMultiple;


// Initialize CUDA
std::cout << "Initializing CUDA\n";
checkSuccess(cuInit(0), "cuInit");
std::cout << "Selecting first compute device\n";
checkSuccess(cuDeviceGet(&device, 0), "cuDeviceGet");
std::cout << "Creating CUDA context\n";
checkSuccess(cuCtxCreate(&context, 0, device), "cuCtxCreate");

// Read the PTX kernel from disk
std::ifstream kernelFile("vector-add.kernel.ptx");
if (!kernelFile.is_open()) {
std::cerr << "Failed to open vector-add.kernel.ptx\n";
return 1;
}

// Load entire kernel into a string
std::string source(std::istreambuf_iterator<char>(kernelFile),
(std::istreambuf_iterator<char>()));

// Configure JIT options
CUjit_option jitOptions[] = { CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
CU_JIT_ERROR_LOG_BUFFER };
void* jitOptionValues[] = { reinterpret_cast<void*>(kLogSize), logBuffer };

// Load the kernel onto the device
status = cuModuleLoadDataEx(&module, source.c_str(),
sizeof(jitOptions)/sizeof(jitOptions[0]),
jitOptions, jitOptionValues);
checkSuccess(status, "cuModuleLoadDataEx", logBuffer);

status = cuModuleGetFunction(&function, module, "vector_add");
checkSuccess(status, "cuModuleGetFunction");


// Print some diagnostics about the kernel compilation
int numRegisters;
cuFuncGetAttribute(&numRegisters, CU_FUNC_ATTRIBUTE_NUM_REGS, function);
std::cout << "Register Usage: " << numRegisters << "\n";


// Setup buffers
Real* hostA = new Real[problemSize];
Real* hostB = new Real[problemSize];
Real* refC = new Real[problemSize];
Real* cmpC = new Real[problemSize];

std::cout << "Problem Size: " << problemSize << "\n";

CUdeviceptr deviceA;
CUdeviceptr deviceB;
CUdeviceptr deviceC;

status = cuMemAlloc(&deviceA, problemSize * sizeof(Real));
checkSuccess(status, "cuMemAlloc");
status = cuMemAlloc(&deviceB, problemSize * sizeof(Real));
checkSuccess(status, "cuMemAlloc");
status = cuMemAlloc(&deviceC, problemSize * sizeof(Real));
checkSuccess(status, "cuMemAlloc");


// Populate arrays with test data
for (int i = 0; i < problemSize; ++i) {
hostA[i] = hostB[i] = (Real)i;
refC[i] = cmpC[i] = (Real)0.0;
}


// Copy buffers to device
status = cuMemcpyHtoD(deviceA, hostA, problemSize * sizeof(Real));
checkSuccess(status, "cuMemcpyHtoD");
status = cuMemcpyHtoD(deviceB, hostB, problemSize * sizeof(Real));
checkSuccess(status, "cuMemcpyHtoD");


// Setup block shape
status = cuFuncSetBlockShape(function, blockSizeX, 1, 1);
checkSuccess(status, "cuFuncSetBlockShape");


// Bind kernel paramters
status = cuParamSetv(function, 0, &deviceA, sizeof(CUdeviceptr));
checkSuccess(status, "cuParamSetv");
status = cuParamSetv(function, sizeof(CUdeviceptr), &deviceB, sizeof(CUdeviceptr));
checkSuccess(status, "cuParamSetv");
status = cuParamSetv(function, 2*sizeof(CUdeviceptr), &deviceC, sizeof(CUdeviceptr));
checkSuccess(status, "cuParamSetv");
status = cuParamSeti(function, 3*sizeof(CUdeviceptr), problemSize);
checkSuccess(status, "cuParamSeti");

status = cuParamSetSize(function, 3*sizeof(CUdeviceptr) + sizeof(int));
checkSuccess(status, "cuParamSetSize");


// Launch the kernel
double deviceStart = getTimeStamp();

status = cuLaunchGrid(function, blockSizeMultiple, 1);
checkSuccess(status, "cuLaunchGrid");
cuCtxSynchronize();

double deviceEnd = getTimeStamp();


// Copy results back to the host
status = cuMemcpyDtoH(cmpC, deviceC, problemSize * sizeof(Real));
checkSuccess(status, "cuMemoryDtoH");


// Compute the reference solution
double hostStart = getTimeStamp();

for (int i = 0; i < problemSize; ++i) {
refC[i] = hostA[i] + hostB[i];
}

double hostEnd = getTimeStamp();


// Compare the results
int numWrong = 0;

for (int i = 0; i < problemSize; ++i) {
if (std::abs(refC[i] - cmpC[i]) > (Real)1e-5) {
numWrong++;
}
}

if (numWrong == 0) {
std::cout << "Host reference comparison test PASSED\n";
}
else {
std::cout << "Host reference comparison test FAILED\n";
}

std::cout << "Device Time: " << (deviceEnd - deviceStart) << "s\n";
std::cout << "Host Time: " << (hostEnd - hostStart) << "s\n";

return 0;
}

37 changes: 37 additions & 0 deletions kernels/vector-add/vector-add.kernel.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/*
* Copyright (C) 2011 by Justin Holewinski
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/

// Device kernel: each thread adds one pair of elements, C[i] = A[i] + B[i].
//
// A, B - input vectors.
// C    - output vector.
// N    - number of elements; threads whose index is N or greater do nothing.
extern "C"
void vector_add(float* A,
                float* B,
                float* C,
                int N) {

  // Global element index: (block id * threads per block) + thread id.
  const int blockBase = __builtin_ptx_read_ctaid_x() * __builtin_ptx_read_ntid_x();
  const int element   = blockBase + __builtin_ptx_read_tid_x();

  // Threads past the end of the vector have no work.
  if (element >= N) {
    return;
  }

  // Perform one component of the vector addition
  C[element] = A[element] + B[element];
}

0 comments on commit a2e309d

Please sign in to comment.