# OpenMP 4.5 GPU example

In [1]:
# List the apt packages whose name or description mentions "nvptx".
!apt-cache search nvptx

gcc-7-offload-nvptx - GCC offloading compiler to NVPTX
gcc-8-offload-nvptx - GCC offloading compiler to NVPTX
gcc-offload-nvptx - GCC offloading compiler to NVPTX
libgomp-plugin-nvptx1 - GCC OpenMP v4.5 plugin for offloading to NVPTX
nvptx-tools - collection of tools for use with nvptx-none GCC toolchains


In [2]:
# Install the GCC 8 NVPTX offloading compiler and the libgomp NVPTX plugin.
# -y answers apt's confirmation prompt, so the cell also runs unattended
# (e.g. under Restart & Run All) on systems where assume-yes is not preconfigured.
!apt-get install -y gcc-8-offload-nvptx libgomp-plugin-nvptx1

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  cpp-8 gcc-8 libasan5 libgcc-8-dev libubsan1 nvptx-tools
Suggested packages:
  gcc-8-locales gcc-8-multilib gcc-8-doc libgcc1-dbg libgomp1-dbg libitm1-dbg
  libatomic1-dbg libasan5-dbg liblsan0-dbg libtsan0-dbg libubsan1-dbg
  libmpx2-dbg libquadmath0-dbg nvidia-cuda-toolkit
The following NEW packages will be installed:
  cpp-8 gcc-8 gcc-8-offload-nvptx libasan5 libgcc-8-dev libgomp-plugin-nvptx1
  libubsan1 nvptx-tools
0 upgraded, 8 newly installed, 0 to remove and 37 not upgraded.
Need to get 25.8 MB of archives.
After this operation, 122 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 cpp-8 amd64 8.4.0-1ubuntu1~18.04 [7,225 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 libasan5 amd64 8.4.0-1ubuntu1~18.04 [366 kB]
Get:3 http://archive.ubuntu.com/ubuntu

In [3]:
%%file riemann_openmp_gpu.c
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>

#define N 1000000000

/*
 * Approximate the integral over [0, 1] of exp(-x^2 / 2) / sqrt(2 * pi)
 * by summing, for each of n equal sub-intervals, the average of the
 * integrand at its two end points (trapezoidal rule).
 *
 * The loop carries an OpenMP target directive; the per-iteration
 * contributions are combined through the reduction(+:sum) clause, so the
 * summation order (and therefore the last digits of the result) may vary
 * between runs.
 *
 * Parameters:
 *   n  number of sub-intervals.
 *
 * Returns:
 *   the approximation of the integral; 0.0 when n <= 0 (empty sum).
 */
double riemann(int n)
{
  /* With no sub-intervals the final scaling would evaluate
     0.0 * (c / 0.0), which is NaN. Return the empty sum instead. */
  if (n <= 0)
    return 0.0;

  double sum = 0;

  #pragma omp target teams distribute parallel for simd map(tofrom: sum) map(to: n) reduction(+:sum)
  for(int i = 0; i < n; ++i)
  {
    /* Width of one sub-interval. Computed inside the loop body so the set
       of variables referenced by the target region stays sum, n and i. */
    double step = 1 / (double) n;
    double x = (double) i / (double) n;   /* left end point  */
    double x_next = x + step;             /* right end point */

    sum += (exp(-x * x / 2.0) + exp(-x_next * x_next / 2.0)) / 2.0;
  }

  /* Apply the 1 / sqrt(2 * pi) normalisation and the sub-interval width once. */
  sum *= (1.0 / sqrt(2.0 * M_PI)) / (double) n;

  return sum;
}

/*
 * Program entry point: evaluate riemann(N), print the result with 17
 * significant digits, then print the wall-clock time elapsed since the
 * start of main (this interval also covers the first printf).
 * The command-line arguments are not used.
 */
int main(int argc, char** argv){
  const double t_begin = omp_get_wtime();
  const double result = riemann(N);

  printf("Riemann sum OpenMP GPU (double precision) for N = %d     : %.17g \n", N, result);

  /* Read the clock only after the result line has been printed. */
  const double elapsed = omp_get_wtime() - t_begin;
  printf("Total time: \t %f s\n", elapsed);

  return 0;
}

Writing riemann_openmp_gpu.c


In [4]:
# Search the whole filesystem for files named nvcc to see which CUDA toolkits are installed.
# find may print "Invalid argument" for some /proc entries; those messages can be ignored.
!find / -name nvcc

find: ‘/proc/30/task/30/net’: Invalid argument
find: ‘/proc/30/net’: Invalid argument
/usr/local/cuda-10.0/bin/nvcc
/usr/local/cuda-10.1/bin/nvcc
/usr/local/cuda-11.1/bin/nvcc
/usr/local/cuda-11.0/bin/nvcc


In [5]:
# Compile riemann_openmp_gpu.c with gcc-8 and OpenMP enabled (-fopenmp).
# The CUDA 10.1 bin directory is prepended to PATH for this command only
# (presumably so the offload toolchain can find the CUDA tools it needs; confirm).
# -foffload=-lm passes -lm to the offload compiler so the device code is linked
# against the math library; the trailing -lm does the same for the host binary.
# NOTE(review): -fno-stack-protector looks like a workaround for stack-protector
# symbols not being available on the NVPTX target; confirm it is still required.
!PATH=/usr/local/cuda-10.1/bin:${PATH} gcc-8 -O3 -Wall riemann_openmp_gpu.c -o riemann_openmp_gpu -fopenmp -foffload=-lm -fno-stack-protector -lm

In [8]:
# Run the compiled program; it prints the computed sum and the total wall-clock time.
!./riemann_openmp_gpu

Riemann sum OpenMP GPU (double precision) for N = 1000000000     : 0.34134474606854326 
Total time: 	 0.928782 s


In [9]:
# Run the program under nvprof to get a per-kernel / per-memcpy GPU timing
# breakdown and the CUDA driver API call summary.
!nvprof ./riemann_openmp_gpu

==494== NVPROF is profiling process 494, command: ./riemann_openmp_gpu
Riemann sum OpenMP GPU (double precision) for N = 1000000000     : 0.3413447460685432 
Total time: 	 1.141155 s
==494== Profiling application: ./riemann_openmp_gpu
==494== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  779.14ms         1  779.14ms  779.14ms  779.14ms  riemann$_omp_fn$0
                    0.00%  4.2560us         1  4.2560us  4.2560us  4.2560us  [CUDA memcpy DtoH]
                    0.00%  1.9200us         1  1.9200us  1.9200us  1.9200us  [CUDA memcpy HtoD]
      API calls:   74.99%  779.16ms         1  779.16ms  779.16ms  779.16ms  cuCtxSynchronize
                   16.95%  176.10ms         1  176.10ms  176.10ms  176.10ms  cuCtxCreate
                    5.08%  52.745ms         1  52.745ms  52.745ms  52.745ms  cuCtxDestroy
                    1.24%  12.882ms         2  6.4409ms  345.64us  12.536ms  cuMemFree
          