In [1]:
!pip -q install pycuda

[?25l[K     |▏                               | 10 kB 15.4 MB/s eta 0:00:01[K     |▍                               | 20 kB 16.2 MB/s eta 0:00:01[K     |▋                               | 30 kB 18.7 MB/s eta 0:00:01[K     |▉                               | 40 kB 13.7 MB/s eta 0:00:01[K     |█                               | 51 kB 6.2 MB/s eta 0:00:01[K     |█▏                              | 61 kB 6.5 MB/s eta 0:00:01[K     |█▍                              | 71 kB 5.9 MB/s eta 0:00:01[K     |█▋                              | 81 kB 6.5 MB/s eta 0:00:01[K     |█▊                              | 92 kB 6.7 MB/s eta 0:00:01[K     |██                              | 102 kB 5.6 MB/s eta 0:00:01[K     |██▏                             | 112 kB 5.6 MB/s eta 0:00:01[K     |██▍                             | 122 kB 5.6 MB/s eta 0:00:01[K     |██▌                             | 133 kB 5.6 MB/s eta 0:00:01[K     |██▊                             | 143 kB 5.6 MB/s eta 0:00:01[K 

In [2]:
%%file riemann_pycuda.py
from __future__ import print_function
from __future__ import absolute_import
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

import time

import numpy

def iDivUp(a, b):
    """Return the ceiling of ``a / b`` using integer arithmetic.

    Used to compute how many blocks of ``b`` threads are needed to cover
    ``a`` work items.

    Args:
        a: Non-negative number of items to cover.
        b: Positive block size.

    Returns:
        The smallest integer ``k`` such that ``k * b >= a``.
    """
    # `a // b + 1` over-counts by one whenever `a` is an exact multiple of `b`;
    # adding `b - 1` before the floor division rounds up only when needed.
    return (a + b - 1) // b

# Number of subintervals (and GPU threads doing useful work) for the sum.
N = 10 ** 9

def riemannCUDA(n):
    """Approximate the integral of the standard normal density over [0, 1].

    A CUDA kernel evaluates, for each of the ``n`` subintervals of width
    ``1 / n``, the mean of ``exp(-x^2 / 2)`` at the two interval endpoints
    (trapezoid rule). The per-interval values are copied back to the host,
    summed with NumPy, and scaled by ``1 / (n * sqrt(2 * pi))``.

    Args:
        n: Number of subintervals; must be positive. It is passed to the
            kernel as a 32-bit ``int``, so it has to fit in that range.

    Returns:
        The approximated integral as a NumPy float64 scalar.

    Raises:
        ValueError: If ``n`` is not positive.
    """
    if n <= 0:
        raise ValueError("n must be a positive integer, got %r" % (n,))

    # Allocate the host buffer once, directly as float64. The previous
    # `numpy.empty([n])` followed by `.astype(numpy.float64)` created a second
    # full-size copy of the array.
    a = numpy.empty(n, dtype=numpy.float64)

    a_d = cuda.mem_alloc(a.nbytes)
    try:
        mod = SourceModule("""
        __global__ void medianTrapezoid(double *a, int n)
        {
          int idx = blockIdx.x * blockDim.x + threadIdx.x;
          double x = (double)idx / (double)n;
 
          if(idx < n)
            a[idx] = (exp(-x * x / 2.0) + exp(-(x + 1 / (double)n) * (x + 1 / (double)n) / 2.0)) / 2.0;
        }
        """)

        func = mod.get_function("medianTrapezoid")
        block_size = 1024
        n_blocks = iDivUp(n, block_size)
        blockDim  = (block_size, 1, 1)
        gridDim   = (n_blocks, 1, 1)
        print("CUDA kernel 'medianTrapezoid' launch with %i blocks of %i threads\n" % (n_blocks, block_size))
        # Threads with idx >= n in the last block are masked out by the
        # `idx < n` check inside the kernel.
        func(a_d, numpy.int32(n), block=blockDim, grid=gridDim)

        cuda.memcpy_dtoh(a, a_d)
    finally:
        # Release the device allocation explicitly, also on failure, rather
        # than waiting for garbage collection.
        a_d.free()

    Sum = numpy.sum(a) / numpy.sqrt(2 * numpy.pi) / numpy.float64(n)

    return Sum

# --- Report the device selected by pycuda.autoinit ---------------------------
dev = pycuda.autoinit.device

dev_name = dev.name()
# Bytes -> GiB.
total_memory = dev.total_memory() / (1024.0 * 1024.0 * 1024.0)
threads_per_block = dev.get_attribute(cuda.device_attribute.MAX_THREADS_PER_BLOCK)
sm_count = dev.get_attribute(cuda.device_attribute.MULTIPROCESSOR_COUNT)

device_report = "Found GPU '%s' with %.3f GB of global memory, max %i threads per block, and %i multiprocessors\n"
print(device_report % (dev_name, total_memory, threads_per_block, sm_count))

# --- Run the computation and time it with the host wall clock ----------------
start = time.time()
Sum = riemannCUDA(N)
end = time.time()

# Elapsed wall-clock time in seconds; covers everything riemannCUDA does.
time_taken = end - start

print("Riemann sum pyCUDA (double precision) for N = %i  : %.17f" % (N, Sum))
print("Total time (measured by CPU)                              : %f s" % time_taken)

Writing riemann_pycuda.py


In [3]:
!PATH=/usr/local/cuda-10.1/bin:${PATH} python riemann_pycuda.py

Found GPU 'Tesla K80' with 11.173 GB of global memory, max 1024 threads per block, and 13 multiprocessors

tcmalloc: large alloc 8000004096 bytes == 0x56070c4d6000 @  0x7f86804651e7 0x7f867e02546e 0x7f867e075c7b 0x7f867e07635f 0x7f867e118103 0x5607093464b0 0x560709346240 0x5607093ba0f3 0x560709347afa 0x5607093b5915 0x5607093b49ee 0x5607093b46f3 0x56070947e4c2 0x56070947e83d 0x56070947e6e6 0x560709456163 0x560709455e0c 0x7f867f24fbf7 0x560709455cea
tcmalloc: large alloc 8000004096 bytes == 0x5608e9b50000 @  0x7f86804651e7 0x7f867e02546e 0x7f867e075c7b 0x7f867e075d97 0x7f867e10e887 0x5607093464b0 0x560709437e1d 0x5607093b9e99 0x560709347afa 0x5607093b5915 0x5607093b49ee 0x5607093b46f3 0x56070947e4c2 0x56070947e83d 0x56070947e6e6 0x560709456163 0x560709455e0c 0x7f867f24fbf7 0x560709455cea
CUDA kernel 'medianTrapezoid' launch with 976563 blocks of 1024 threads

Riemann sum pyCUDA (double precision) for N = 1000000000  : 0.34134474606853665
Total time (measured by CPU)                      

In [4]:
!pip -q install pyopencl

[?25l[K     |▍                               | 10 kB 22.3 MB/s eta 0:00:01[K     |▊                               | 20 kB 13.0 MB/s eta 0:00:01[K     |█▏                              | 30 kB 10.2 MB/s eta 0:00:01[K     |█▌                              | 40 kB 8.9 MB/s eta 0:00:01[K     |█▉                              | 51 kB 5.2 MB/s eta 0:00:01[K     |██▎                             | 61 kB 5.6 MB/s eta 0:00:01[K     |██▋                             | 71 kB 5.6 MB/s eta 0:00:01[K     |███                             | 81 kB 6.3 MB/s eta 0:00:01[K     |███▍                            | 92 kB 4.8 MB/s eta 0:00:01[K     |███▊                            | 102 kB 5.3 MB/s eta 0:00:01[K     |████                            | 112 kB 5.3 MB/s eta 0:00:01[K     |████▌                           | 122 kB 5.3 MB/s eta 0:00:01[K     |████▉                           | 133 kB 5.3 MB/s eta 0:00:01[K     |█████▏                          | 143 kB 5.3 MB/s eta 0:00:01[K  

In [5]:
%%file riemann_pyopencl.py
from __future__ import absolute_import, print_function
import numpy as np
import pyopencl as cl

import time

def iDivUp(a, b):
    """Return the ceiling of ``a / b`` using integer arithmetic.

    Used to compute how many work-groups of ``b`` items are needed to cover
    ``a`` work items.

    Args:
        a: Non-negative number of items to cover.
        b: Positive work-group size.

    Returns:
        The smallest integer ``k`` such that ``k * b >= a``.
    """
    # `a // b + 1` over-counts by one whenever `a` is an exact multiple of `b`;
    # adding `b - 1` before the floor division rounds up only when needed.
    return (a + b - 1) // b

# Number of subintervals (and work items doing useful work) for the sum.
N = 10 ** 9

def riemannOpenCL(n):
    """Approximate the integral of the standard normal density over [0, 1].

    An OpenCL kernel evaluates, for each of the ``n`` subintervals of width
    ``1 / n``, the mean of ``exp(-x^2 / 2)`` at the two interval endpoints
    (trapezoid rule). The per-interval values are copied back to the host,
    summed with NumPy, and scaled by ``1 / (n * sqrt(2 * pi))``.

    Relies on the module-level ``ctx`` OpenCL context, which must exist
    before this function is called.

    Args:
        n: Number of subintervals; must be positive. It is passed to the
            kernel as a 32-bit ``int``, so it has to fit in that range.

    Returns:
        The approximated integral as a NumPy float64 scalar.

    Raises:
        ValueError: If ``n`` is not positive.
    """
    if n <= 0:
        raise ValueError("n must be a positive integer, got %r" % (n,))

    # Allocate the host buffer once, directly as float64. The previous
    # `np.empty([n])` followed by `.astype(np.float64)` created a second
    # full-size copy of the array.
    a = np.empty(n, dtype=np.float64)

    queue = cl.CommandQueue(ctx)

    mf = cl.mem_flags
    a_d = cl.Buffer(ctx, mf.WRITE_ONLY, a.nbytes)
    try:
        prg = cl.Program(ctx, """
    __kernel void medianTrapezoid(__global double *a, int n) {
    
        int idx = get_global_id(0);
        double x = (double)idx / (double)n;
 
        if(idx < n)
           a[idx] = (exp(-x * x / 2.0) + exp(-(x + 1 / (double)n) * (x + 1 / (double)n) / 2.0)) / 2.0;
    }
    """).build()

        local_item_size = 1024
        n_blocks = iDivUp(n, local_item_size)
        # The global size is rounded up to a whole number of work-groups;
        # surplus work items are masked out by the `idx < n` check in the kernel.
        global_item_size = n_blocks * local_item_size
        print("OpenCL kernel 'medianTrapezoid' launch with %i blocks of %i threads\n" % (n_blocks, local_item_size))
        prg.medianTrapezoid(queue, (global_item_size, 1, 1), (local_item_size, 1, 1), a_d, np.int32(n))

        cl.enqueue_copy(queue, a, a_d)
    finally:
        # Release the device buffer explicitly, also on failure, rather than
        # waiting for garbage collection.
        a_d.release()

    Sum = np.sum(a) / np.sqrt(2 * np.pi) / np.float64(n)

    return Sum

# --- Pick the first device of the first platform and build a context ---------
platform = cl.get_platforms()[0]
device = platform.get_devices()[0]
ctx = cl.Context([device])

dev_name = device.name
# Bytes -> GiB.
total_memory = device.global_mem_size / (1024.0 * 1024.0 * 1024.0)
threads_per_block = device.max_work_group_size
sm_count = device.max_compute_units

device_report = "Found GPU '%s' with %.3f GB of global memory, max %i threads per block, and %i multiprocessors\n"
print(device_report % (dev_name, total_memory, threads_per_block, sm_count))

# --- Run the computation and time it with the host wall clock ----------------
start = time.time()
Sum = riemannOpenCL(N)
end = time.time()

# Elapsed wall-clock time in seconds; covers everything riemannOpenCL does.
time_taken = end - start

print("Riemann sum pyOpenCL (double precision) for N = %i  : %.17f" % (N, Sum))
print("Total time (measured by CPU)                                : %f s" % time_taken)

Writing riemann_pyopencl.py


In [6]:
!PATH=/usr/local/cuda-10.1/bin:${PATH} python riemann_pyopencl.py

Found GPU 'Tesla K80' with 11.173 GB of global memory, max 1024 threads per block, and 13 multiprocessors

tcmalloc: large alloc 8000004096 bytes == 0x56503d174000 @  0x7f7a6ad631e7 0x7f7a6892346e 0x7f7a68973c7b 0x7f7a6897435f 0x7f7a68a16103 0x5650391074b0 0x565039107240 0x56503917b0f3 0x565039108afa 0x565039176915 0x5650391759ee 0x5650391756f3 0x56503923f4c2 0x56503923f83d 0x56503923f6e6 0x565039217163 0x565039216e0c 0x7f7a69b4dbf7 0x565039216cea
tcmalloc: large alloc 8000004096 bytes == 0x56521a7da000 @  0x7f7a6ad631e7 0x7f7a6892346e 0x7f7a68973c7b 0x7f7a68973d97 0x7f7a68a0c887 0x5650391074b0 0x5650391f8e1d 0x56503917ae99 0x565039108afa 0x565039176915 0x5650391759ee 0x5650391756f3 0x56503923f4c2 0x56503923f83d 0x56503923f6e6 0x565039217163 0x565039216e0c 0x7f7a69b4dbf7 0x565039216cea
OpenCL kernel 'medianTrapezoid' launch with 976563 blocks of 1024 threads

Riemann sum pyOpenCL (double precision) for N = 1000000000  : 0.34134474606853665
Total time (measured by CPU)                  