# OpenMP 4.5 GPU example

In [1]:
# Search the apt package index for packages whose name or description mentions "nvptx".
!apt-cache search nvptx

gcc-7-offload-nvptx - GCC offloading compiler to NVPTX
gcc-8-offload-nvptx - GCC offloading compiler to NVPTX
gcc-offload-nvptx - GCC offloading compiler to NVPTX
libgomp-plugin-nvptx1 - GCC OpenMP v4.5 plugin for offloading to NVPTX
nvptx-tools - collection of tools for use with nvptx-none GCC toolchains


In [2]:
# Install GCC 8's NVPTX offloading compiler and the libgomp NVPTX plugin.
# -y answers apt's confirmation prompt automatically: a notebook "!" command has no
# interactive stdin, so without it a fresh install can abort or hang at the prompt.
!apt-get install -y gcc-8-offload-nvptx libgomp-plugin-nvptx1

Reading package lists... Done
Building dependency tree       
Reading state information... Done
gcc-8-offload-nvptx is already the newest version (8.4.0-1ubuntu1~18.04).
libgomp-plugin-nvptx1 is already the newest version (8.4.0-1ubuntu1~18.04).
0 upgraded, 0 newly installed, 0 to remove and 37 not upgraded.


In [3]:
%%file riemann_openmp_gpu.c
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>

#define N 1000000000

/*
 * Approximate the integral of the standard normal density
 *   f(x) = exp(-x^2 / 2) / sqrt(2 * pi)
 * over [0, 1] using n equal sub-intervals. Each sub-interval contributes the
 * average of f at its two end points (trapezoidal rule).
 *
 * The loop is offloaded with an OpenMP `target` construct; `sum` is mapped
 * to and from the device and combined across iterations with a `+` reduction.
 * Because the reduction order is unspecified, the last digits of the result
 * may differ between runs.
 *
 * n: number of sub-intervals. For n <= 0 there is nothing to sum and the
 *    function returns 0.0 (previously n == 0 produced NaN via 0 * inf).
 *
 * Returns the approximated integral.
 */
double riemann(int n)
{
  /* Guard the 1/n divisions below: with n == 0 they would yield inf and NaN. */
  if (n <= 0)
  {
    return 0.0;
  }

  double sum = 0;

  #pragma omp target teams distribute parallel for simd map(tofrom: sum) map(to: n) reduction(+:sum)
  for(int i = 0; i < n; ++i)
  {
    /* Left and right end points of the i-th sub-interval. */
    double x = (double) i / (double) n;
    double x_next = x + 1 / (double) n;

    /* Average of the (unnormalised) integrand at both end points. */
    sum += (exp(-x * x / 2.0) + exp(-x_next * x_next / 2.0)) / 2.0;
  }

  /* Apply the 1/sqrt(2*pi) normalisation and the sub-interval width 1/n once. */
  sum *= (1.0 / sqrt(2.0 * M_PI)) / (double) n;

  return sum;
}

/*
 * Program entry point: evaluates riemann(N) once, prints the result with 17
 * significant digits, then prints the wall-clock time elapsed since start-up
 * as measured with omp_get_wtime().
 */
int main(int argc, char** argv){
  const double t_begin = omp_get_wtime();
  const double integral = riemann(N);

  printf("Riemann sum OpenMP GPU (double precision) for N = %d     : %.17g \n", N, integral);

  /* Sampled after the result line is printed, so that printf is part of the total. */
  const double elapsed = omp_get_wtime() - t_begin;
  printf("Total time: \t %f s\n", elapsed);

  return 0;
}

Overwriting riemann_openmp_gpu.c


In [4]:
# Search the whole filesystem for entries named "nvcc"; the paths found are listed below.
!find / -name nvcc

/usr/local/cuda-11.1/bin/nvcc
/usr/local/cuda-10.1/bin/nvcc
/usr/local/cuda-10.0/bin/nvcc
/usr/local/cuda-11.0/bin/nvcc


In [5]:
# Build the offloading binary with gcc-8.
#   PATH=/usr/local/cuda-10.1/bin:...  puts the CUDA 10.1 tools first on PATH for this command only
#   -fopenmp                           enables OpenMP (including the `target` offload pragma)
#   -foffload=-lm                      passes -lm to the offload compiler so exp() can be linked in device code
#   -lm                                links the host math library
#   -fno-stack-protector               NOTE(review): presumably needed because the NVPTX offload target
#                                      lacks stack-protector support — confirm
!PATH=/usr/local/cuda-10.1/bin:${PATH} gcc-8 -O3 -Wall riemann_openmp_gpu.c -o riemann_openmp_gpu -fopenmp -foffload=-lm -fno-stack-protector -lm

In [6]:
# Run the compiled program; it prints the computed sum and the total wall-clock time.
!./riemann_openmp_gpu

Riemann sum OpenMP GPU (double precision) for N = 1000000000     : 0.34134474606854309 
Total time: 	 0.921027 s


In [7]:
# Run the same binary under nvprof to get a breakdown of GPU kernel, memcpy and CUDA API time.
!nvprof ./riemann_openmp_gpu

==570== NVPROF is profiling process 570, command: ./riemann_openmp_gpu
Riemann sum OpenMP GPU (double precision) for N = 1000000000     : 0.3413447460685432 
Total time: 	 1.094383 s
==570== Profiling application: ./riemann_openmp_gpu
==570== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  719.70ms         1  719.70ms  719.70ms  719.70ms  riemann$_omp_fn$0
                    0.00%  3.6480us         1  3.6480us  3.6480us  3.6480us  [CUDA memcpy DtoH]
                    0.00%  1.9520us         1  1.9520us  1.9520us  1.9520us  [CUDA memcpy HtoD]
      API calls:   72.41%  719.72ms         1  719.72ms  719.72ms  719.72ms  cuCtxSynchronize
                   19.37%  192.51ms         1  192.51ms  192.51ms  192.51ms  cuCtxCreate
                    5.65%  56.183ms         1  56.183ms  56.183ms  56.183ms  cuCtxDestroy
                    0.92%  9.1318ms         2  4.5659ms  296.02us  8.8358ms  cuMemFree
          