Initial commit of vector addition sample

jholewinski · Aug 10, 2011 · a2e309d · a2e309d
commit a2e309d
Show file tree

Hide file tree

Showing 2 changed files with 302 additions and 0 deletions.
diff --git a/kernels/vector-add/vector-add.cpp b/kernels/vector-add/vector-add.cpp
@@ -0,0 +1,265 @@
+/*
+ * Copyright (C) 2011 by Justin Holewinski
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <cmath>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <iterator>
+#include <string>
+#include <vector>
+
+#include <sys/time.h>
+
+#include "cuda.h"
+
+
+// Floating-point element type of the host vectors; main() also uses
+// sizeof(Real) to size the matching device buffers.
+typedef float Real;
+
+
+//==--- Utility Functions --------------------------------------------------== //
+// Translate a CUDA driver result code into a short human-readable message.
+//
+//   error  Result code to describe.
+//
+// Returns a pointer to a string literal.  Codes that are not listed in the
+// table below yield "Unknown error ID".
+const char * statusToString(CUresult error)
+{
+  // One entry per recognised result code.
+  struct StatusText {
+    CUresult    code;
+    const char *text;
+  };
+
+  static const StatusText kStatusTexts[] = {
+    { CUDA_SUCCESS,                              "No errors" },
+    { CUDA_ERROR_INVALID_VALUE,                  "Invalid value" },
+    { CUDA_ERROR_OUT_OF_MEMORY,                  "Out of memory" },
+    { CUDA_ERROR_NOT_INITIALIZED,                "Driver not initialized" },
+    { CUDA_ERROR_DEINITIALIZED,                  "Driver deinitialized" },
+
+    { CUDA_ERROR_NO_DEVICE,                      "No CUDA-capable device available" },
+    { CUDA_ERROR_INVALID_DEVICE,                 "Invalid device" },
+
+    { CUDA_ERROR_INVALID_IMAGE,                  "Invalid kernel image" },
+    { CUDA_ERROR_INVALID_CONTEXT,                "Invalid context" },
+    { CUDA_ERROR_CONTEXT_ALREADY_CURRENT,        "Context already current" },
+    { CUDA_ERROR_MAP_FAILED,                     "Map failed" },
+    { CUDA_ERROR_UNMAP_FAILED,                   "Unmap failed" },
+    { CUDA_ERROR_ARRAY_IS_MAPPED,                "Array is mapped" },
+    { CUDA_ERROR_ALREADY_MAPPED,                 "Already mapped" },
+    { CUDA_ERROR_NO_BINARY_FOR_GPU,              "No binary for GPU" },
+    { CUDA_ERROR_ALREADY_ACQUIRED,               "Already acquired" },
+    { CUDA_ERROR_NOT_MAPPED,                     "Not mapped" },
+
+    { CUDA_ERROR_INVALID_SOURCE,                 "Invalid source" },
+    { CUDA_ERROR_FILE_NOT_FOUND,                 "File not found" },
+
+    { CUDA_ERROR_INVALID_HANDLE,                 "Invalid handle" },
+
+    { CUDA_ERROR_NOT_FOUND,                      "Not found" },
+
+    { CUDA_ERROR_NOT_READY,                      "CUDA not ready" },
+
+    { CUDA_ERROR_LAUNCH_FAILED,                  "Launch failed" },
+    { CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,        "Launch exceeded resources" },
+    { CUDA_ERROR_LAUNCH_TIMEOUT,                 "Launch exceeded timeout" },
+    { CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,  "Launch with incompatible texturing" },
+
+    { CUDA_ERROR_UNKNOWN,                        "Unknown error" }
+  };
+
+  const unsigned entryCount = sizeof(kStatusTexts) / sizeof(kStatusTexts[0]);
+
+  for (unsigned idx = 0; idx < entryCount; ++idx) {
+    if (kStatusTexts[idx].code == error) {
+      return kStatusTexts[idx].text;
+    }
+  }
+
+  return "Unknown error ID";
+}
+
+// Abort the program when a CUDA driver call did not succeed.
+//
+//   status       Result code returned by the driver call.
+//   func         Name of the driver call, used in the diagnostic.
+//   errorBuffer  Optional NUL-terminated log text (for example a JIT error
+//                log) printed ahead of the diagnostic; pass 0 for none.
+//
+// Returns normally only for CUDA_SUCCESS.  Otherwise the diagnostic is written
+// to std::cerr and the process terminates with exit status 1.
+void checkSuccess(CUresult    status,
+                  const char *func,
+                  const char *errorBuffer = 0)
+{
+  if (status == CUDA_SUCCESS) {
+    return;
+  }
+
+  // An empty log carries no information, so its header is skipped as well.
+  if (errorBuffer != 0 && errorBuffer[0] != '\0') {
+    std::cerr << "ERROR LOG:" << std::endl
+              << errorBuffer << std::endl;
+  }
+
+  std::cerr << "ERROR: Could not execute '" << func << "', error ("
+            << status << ") " << statusToString(status) << std::endl;
+
+  // std::exit is declared by <cstdlib>; the unqualified call previously
+  // compiled only through a transitive include.
+  std::exit(1);
+}
+
+// Read the wall-clock time with microsecond resolution.
+//
+// Returns the seconds elapsed since the Epoch as a double, or 0.0 (after
+// printing a message to std::cerr) if gettimeofday() reports a failure.
+double getTimeStamp()
+{
+  // Zero-initialised so a failed call can never expose indeterminate values.
+  struct timeval now = { 0, 0 };
+
+  // The timezone argument is obsolete; POSIX leaves a non-null value
+  // unspecified, so a null pointer is passed.
+  const int stat = gettimeofday(&now, 0);
+  if (stat != 0) {
+    std::cerr << "Error return from gettimeofday: " << stat << "\n";
+    return 0.0;
+  }
+
+  return (now.tv_sec + now.tv_usec * 1.0e-6);
+}
+
+
+
+//==--- Entry Point --------------------------------------------------------== //
+
+// Host driver for the vector addition sample.
+//
+// Creates a context on CUDA device 0, JIT-loads "vector-add.kernel.ptx" (a
+// path relative to the working directory), launches the "vector_add" kernel
+// over problemSize elements and checks the device output against a reference
+// computed on the host.  Failing driver calls are reported through
+// checkSuccess(), which terminates the process.
+//
+// Returns 0 when every element matches the reference, and 1 when the PTX file
+// cannot be opened or at least one element differs.
+int main(int argc,
+         char** argv) {
+
+  CUcontext  context;
+  CUdevice   device;
+  CUmodule   module;
+  CUresult   status;
+  CUfunction function;
+
+  const int kLogSize = 1024;
+  // Zero-filled so that a JIT failure which writes no log never causes
+  // indeterminate bytes to be printed.
+  char      logBuffer[kLogSize] = { 0 };
+
+  int blockSizeX        = 512;
+  int blockSizeMultiple = 5000;
+  int problemSize       = blockSizeX * blockSizeMultiple;
+
+  const size_t bufferBytes = problemSize * sizeof(Real);
+
+
+  // Initialize CUDA
+  std::cout << "Initializing CUDA\n";
+  checkSuccess(cuInit(0), "cuInit");
+  std::cout << "Selecting first compute device\n";
+  checkSuccess(cuDeviceGet(&device, 0), "cuDeviceGet");
+  std::cout << "Creating CUDA context\n";
+  checkSuccess(cuCtxCreate(&context, 0, device), "cuCtxCreate");
+
+  // Read the PTX kernel from disk
+  std::ifstream kernelFile("vector-add.kernel.ptx");
+  if (!kernelFile.is_open()) {
+    std::cerr << "Failed to open vector-add.kernel.ptx\n";
+    // Best-effort release; the process is about to report failure anyway.
+    cuCtxDestroy(context);
+    return 1;
+  }
+
+  // Load entire kernel into a string
+  std::string source(std::istreambuf_iterator<char>(kernelFile),
+                     (std::istreambuf_iterator<char>()));
+
+  // Configure JIT options.  The buffer size is passed by value inside the
+  // void* slot, so widen it to pointer size before the cast.
+  CUjit_option jitOptions[] = { CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
+                                CU_JIT_ERROR_LOG_BUFFER };
+  void* jitOptionValues[]   = {
+    reinterpret_cast<void*>(static_cast<size_t>(kLogSize)), logBuffer };
+
+  // Load the kernel onto the device
+  status = cuModuleLoadDataEx(&module, source.c_str(),
+                              sizeof(jitOptions)/sizeof(jitOptions[0]),
+                              jitOptions, jitOptionValues);
+  checkSuccess(status, "cuModuleLoadDataEx", logBuffer);
+
+  status = cuModuleGetFunction(&function, module, "vector_add");
+  checkSuccess(status, "cuModuleGetFunction");
+
+
+  // Print some diagnostics about the kernel compilation
+  int numRegisters = 0;
+  status = cuFuncGetAttribute(&numRegisters, CU_FUNC_ATTRIBUTE_NUM_REGS,
+                              function);
+  checkSuccess(status, "cuFuncGetAttribute");
+  std::cout << "Register Usage:  " << numRegisters << "\n";
+
+
+  // Setup buffers.  The vectors own the host memory and release it on return;
+  // their elements start out value-initialised to zero.
+  std::vector<Real> hostA(problemSize);
+  std::vector<Real> hostB(problemSize);
+  std::vector<Real> refC(problemSize);
+  std::vector<Real> cmpC(problemSize);
+
+  std::cout << "Problem Size:  " << problemSize << "\n";
+
+  CUdeviceptr deviceA;
+  CUdeviceptr deviceB;
+  CUdeviceptr deviceC;
+
+  status = cuMemAlloc(&deviceA, bufferBytes);
+  checkSuccess(status, "cuMemAlloc");
+  status = cuMemAlloc(&deviceB, bufferBytes);
+  checkSuccess(status, "cuMemAlloc");
+  status = cuMemAlloc(&deviceC, bufferBytes);
+  checkSuccess(status, "cuMemAlloc");
+
+
+  // Populate arrays with test data
+  for (int i = 0; i < problemSize; ++i) {
+    hostA[i] = hostB[i] = static_cast<Real>(i);
+  }
+
+
+  // Copy buffers to device
+  status = cuMemcpyHtoD(deviceA, &hostA[0], bufferBytes);
+  checkSuccess(status, "cuMemcpyHtoD");
+  status = cuMemcpyHtoD(deviceB, &hostB[0], bufferBytes);
+  checkSuccess(status, "cuMemcpyHtoD");
+
+
+  // Setup block shape
+  status = cuFuncSetBlockShape(function, blockSizeX, 1, 1);
+  checkSuccess(status, "cuFuncSetBlockShape");
+
+
+  // Bind kernel parameters: three device pointers followed by the element
+  // count, packed back to back.
+  status = cuParamSetv(function, 0, &deviceA, sizeof(CUdeviceptr));
+  checkSuccess(status, "cuParamSetv");
+  status = cuParamSetv(function, sizeof(CUdeviceptr), &deviceB, sizeof(CUdeviceptr));
+  checkSuccess(status, "cuParamSetv");
+  status = cuParamSetv(function, 2*sizeof(CUdeviceptr), &deviceC, sizeof(CUdeviceptr));
+  checkSuccess(status, "cuParamSetv");
+  status = cuParamSeti(function, 3*sizeof(CUdeviceptr), problemSize);
+  checkSuccess(status, "cuParamSeti");
+
+  status = cuParamSetSize(function, 3*sizeof(CUdeviceptr) + sizeof(int));
+  checkSuccess(status, "cuParamSetSize");
+
+
+  // Launch the kernel
+  double deviceStart = getTimeStamp();
+
+  status = cuLaunchGrid(function, blockSizeMultiple, 1);
+  checkSuccess(status, "cuLaunchGrid");
+  // Check the synchronization result too, so an error returned here is
+  // reported instead of being silently dropped.
+  status = cuCtxSynchronize();
+  checkSuccess(status, "cuCtxSynchronize");
+
+  double deviceEnd = getTimeStamp();
+
+
+  // Copy results back to the host
+  status = cuMemcpyDtoH(&cmpC[0], deviceC, bufferBytes);
+  checkSuccess(status, "cuMemcpyDtoH");
+
+
+  // Compute the reference solution
+  double hostStart = getTimeStamp();
+
+  for (int i = 0; i < problemSize; ++i) {
+    refC[i] = hostA[i] + hostB[i];
+  }
+
+  double hostEnd = getTimeStamp();
+
+
+  // Compare the results.  The test is phrased as "not within tolerance" so
+  // that a NaN coming back from the device counts as a mismatch.
+  const Real tolerance = static_cast<Real>(1e-5);
+  int numWrong = 0;
+
+  for (int i = 0; i < problemSize; ++i) {
+    if (!(std::abs(refC[i] - cmpC[i]) <= tolerance)) {
+      numWrong++;
+    }
+  }
+
+  if (numWrong == 0) {
+    std::cout << "Host reference comparison test PASSED\n";
+  }
+  else {
+    std::cout << "Host reference comparison test FAILED\n";
+  }
+
+  std::cout << "Device Time:  " << (deviceEnd - deviceStart) << "s\n";
+  std::cout << "Host Time:    " << (hostEnd - hostStart) << "s\n";
+
+
+  // Release device resources
+  checkSuccess(cuMemFree(deviceA), "cuMemFree");
+  checkSuccess(cuMemFree(deviceB), "cuMemFree");
+  checkSuccess(cuMemFree(deviceC), "cuMemFree");
+  checkSuccess(cuModuleUnload(module), "cuModuleUnload");
+  checkSuccess(cuCtxDestroy(context), "cuCtxDestroy");
+
+  // Let callers (scripts, test harnesses) see a failed comparison.
+  return (numWrong == 0) ? 0 : 1;
+}
+
diff --git a/kernels/vector-add/vector-add.kernel.cpp b/kernels/vector-add/vector-add.kernel.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (C) 2011 by Justin Holewinski
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+// Element-wise vector addition kernel: C[i] = A[i] + B[i].
+//
+// Each thread handles at most one element; a thread whose computed index is
+// N or greater does nothing.
+extern "C"
+void vector_add(float* A,
+                float* B,
+                float* C,
+                int    N) {
+
+  // Offset contributed by this block: ctaid.x times ntid.x.
+  const int blockBase =
+    __builtin_ptx_read_ctaid_x() * __builtin_ptx_read_ntid_x();
+
+  // Global index of the element handled by this thread.
+  const int element = blockBase + __builtin_ptx_read_tid_x();
+
+  // Nothing to do beyond the end of the vectors.
+  if (element >= N) {
+    return;
+  }
+
+  C[element] = A[element] + B[element];
+}