## Numerical integration (Riemann sum): calculating $\Phi(1) = \frac 1 {\sqrt{2\pi}} \int_{0}^1 e^{-x^2/2} \, dx$
(see, e.g.: https://mathworld.wolfram.com/NormalDistributionFunction.html).

#### CUDA version with two kernels (trapezoid median + sum reducer)

In [1]:
%%sh
cat > riemann_cuda_double_reduce.cu << EOF
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

#define N 1000000000

/* CUDA error wraper */
static void CUDA_ERROR( cudaError_t err) 
{
    if (err != cudaSuccess) {
        printf("CUDA ERROR: %s, exiting\n", cudaGetErrorString(err));
        exit(-1);
    }
}

__global__ void medianTrapezoid(double *a, int n)
{
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  double x = (double)idx / (double)n;
 
  if(idx < n)
    a[idx] = (exp(-x * x / 2.0) + exp(-(x + 1 / (double)n) * (x + 1 / (double)n) / 2.0)) / 2.0;
}

__global__ void reducerSum(double *a, double *out, int n, int block_size) {
    int idx = threadIdx.x;
    double sum = 0;
    for (int i = idx; i < n; i += block_size)
        sum += a[i];
    extern __shared__ double r[];
    r[idx] = sum;
    __syncthreads();
    for (int size = block_size/2; size>0; size/=2) {
        if (idx<size)
            r[idx] += r[idx+size];
        __syncthreads();
    }
    if (idx == 0)
        *out = r[0];
}

double riemannCUDA(int n)
{
  ///size of the arrays in bytes
  size_t size = n * sizeof(double);

  int block_size = 1024;

  // allocate array on host and device
  double* a_h = (double *)malloc(size);
  double* out_h = (double *)malloc(sizeof(double));
  double* r = (double *)malloc(block_size * sizeof(double));
  double* a_d; cudaMalloc((double **) &a_d, size);
  double* out; cudaMalloc((double **) &out, sizeof(double));

  // do calculation on device
  
  int n_blocks = n/block_size + (n % block_size == 0 ? 0:1);
  printf("CUDA kernel 'medianTrapezoid' launch with %d blocks of %d threads\n", n_blocks, block_size);
  medianTrapezoid <<< n_blocks, block_size >>> (a_d, n);
  int n_blocks2 = 1;
  printf("CUDA kernel 'reducerSum' launch with %d blocks of %d threads\n\n", n_blocks2, block_size);
  reducerSum <<< n_blocks2, block_size, block_size*sizeof(double) >>> (a_d, out, n, block_size);
  
  // copy results from device to host
  cudaMemcpy(out_h, out, sizeof(double), cudaMemcpyDeviceToHost);

  // add up results
  double sum;
  sum = *out_h;
  sum *= (1.0 / sqrt(2.0 * M_PI)) / (double)n;
  
  // clean up
  free(a_h); cudaFree(a_d);
  free(out_h); cudaFree(out);
  cudaFree(r);
  
  return sum;
}


int main(int argc, char** argv){

  /*get info on our GPU, defaulting to first one*/
  cudaDeviceProp prop;
  CUDA_ERROR(cudaGetDeviceProperties(&prop,0));
  printf("Found GPU '%s' with %g GB of global memory, max %d threads per block, and %d multiprocessors\n", 
         prop.name, prop.totalGlobalMem/(1024.0*1024.0*1024.0),
         prop.maxThreadsPerBlock,prop.multiProcessorCount);
 
  /*init CUDA*/
  CUDA_ERROR(cudaSetDevice(0));

  clock_t t1; 
  t1 = clock();

  double sum = riemannCUDA(N);

  t1 = clock() - t1;

  double time_taken1 = ((double)t1)/CLOCKS_PER_SEC; // in seconds

  printf("Riemann sum CUDA (double precision) for N = %d    : %.17g \n", N, sum);
  printf("Total time (measured by CPU)                              : %f s\n", time_taken1);
} 
EOF

In [2]:
!nvcc -o riemann_cuda_double_reduce riemann_cuda_double_reduce.cu && ./riemann_cuda_double_reduce

Found GPU 'Tesla T4' with 14.7556 GB of global memory, max 1024 threads per block, and 40 multiprocessors
tcmalloc: large alloc 8000004096 bytes == 0x55a94fd6e000 @  0x7ff332b8e1e7 0x55a94ee1ee96 0x55a94ee1f1b6 0x7ff331bbfbf7 0x55a94ee1ed0a
CUDA kernel 'medianTrapezoid' launch with 976563 blocks of 1024 threads
CUDA kernel 'reducerSum' launch with 1 blocks of 1024 threads

Riemann sum CUDA (double precision) for N = 1000000000    : 0.34134474606854243 
Total time (measured by CPU)                              : 1.122841 s


#### CUDA profiling (trapezoid median + sum reducer)

In [3]:
!nvprof ./riemann_cuda_double_reduce

==233== NVPROF is profiling process 233, command: ./riemann_cuda_double_reduce
Found GPU 'Tesla T4' with 14.7556 GB of global memory, max 1024 threads per block, and 40 multiprocessors
tcmalloc: large alloc 8000004096 bytes == 0x5616089a0000 @  0x7f388b58d1e7 0x561604424e96 0x5616044251b6 0x7f388a5bebf7 0x561604424d0a
CUDA kernel 'medianTrapezoid' launch with 976563 blocks of 1024 threads
CUDA kernel 'reducerSum' launch with 1 blocks of 1024 threads

Riemann sum CUDA (double precision) for N = 1000000000    : 0.34134474606854243 
Total time (measured by CPU)                              : 1.040373 s
==233== Profiling application: ./riemann_cuda_double_reduce
==233== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   55.73%  471.54ms         1  471.54ms  471.54ms  471.54ms  medianTrapezoid(double*, int)
                   44.27%  374.63ms         1  374.63ms  374.63ms  374.63ms  reducerSum(double*, double*, int, int)
  