In [1]:
pip -q install pycuda

In [2]:
%%file riemann_pycuda.py
from __future__ import print_function
from __future__ import absolute_import
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

import time

import numpy

def iDivUp(a, b):
    """Return the ceiling of ``a / b`` for non-negative ``a`` and positive ``b``.

    Used to compute how many blocks of ``b`` threads are needed to cover
    ``a`` work items. The earlier ``a // b + 1`` form returned one block too
    many whenever ``a`` was an exact multiple of ``b``.
    """
    return (a + b - 1) // b

# Number of sub-intervals handed to riemannCUDA. One float64 is stored per
# sub-interval, so this needs roughly 8 GB on the host and on the device.
N = 10 ** 9

def riemannCUDA(n):
    """Approximate the standard normal integral over [0, 1] on the GPU.

    The CUDA kernel ``medianTrapezoid`` lets thread ``idx`` store the mean of
    ``exp(-x*x/2)`` evaluated at ``x = idx/n`` and at ``x + 1/n``. The host
    then sums the ``n`` stored values and divides by ``sqrt(2*pi) * n``.

    Parameters
    ----------
    n : int
        Number of sub-intervals. It is passed to the kernel as a 32-bit int,
        and one float64 per sub-interval is allocated on host and device.

    Returns
    -------
    numpy.float64
        The approximated value of the integral.
    """
    # Compile the kernel before any large buffer exists, so a build failure
    # does not happen while gigabytes of memory are being held.
    mod = SourceModule("""
        __global__ void medianTrapezoid(double *a, int n)
        {
          int idx = blockIdx.x * blockDim.x + threadIdx.x;
          double x = (double)idx / (double)n;
 
          if(idx < n)
            a[idx] = (exp(-x * x / 2.0) + exp(-(x + 1 / (double)n) * (x + 1 / (double)n) / 2.0)) / 2.0;
        }
        """)
    func = mod.get_function("medianTrapezoid")

    # Allocate the host buffer once, directly as float64. Calling empty() and
    # then astype() created a second full-size copy of the array.
    a = numpy.empty(n, dtype=numpy.float64)

    a_d = cuda.mem_alloc(a.nbytes)
    try:
        block_size = 1024
        # Threads with idx >= n are masked out by the kernel's own guard, so
        # a partially filled last block is harmless.
        n_blocks = iDivUp(n, block_size)
        blockDim  = (block_size, 1, 1)
        gridDim   = (n_blocks, 1, 1)
        print("CUDA kernel 'medianTrapezoid' launch with %i blocks of %i threads\n" % (n_blocks, block_size))
        func(a_d, numpy.int32(n), block=blockDim, grid=gridDim)

        cuda.memcpy_dtoh(a, a_d)
    finally:
        # Release the device allocation now rather than waiting for garbage
        # collection of the handle.
        a_d.free()

    Sum = numpy.sum(a) / numpy.sqrt(2 * numpy.pi) / numpy.float64(n)

    return Sum

# Report the properties of the GPU selected by pycuda.autoinit, then time one
# full riemannCUDA(N) run with the host wall clock.
dev = pycuda.autoinit.device
device_attribute = pycuda.driver.device_attribute

dev_name = dev.name()
# Three divisions by 1024 produce the figure labelled "GB" in the banner.
total_memory = dev.total_memory() / 1024.0 / 1024.0 / 1024.0
threads_per_block = dev.get_attribute(device_attribute.MAX_THREADS_PER_BLOCK)
sm_count = dev.get_attribute(device_attribute.MULTIPROCESSOR_COUNT)

gpu_summary = (dev_name, total_memory, threads_per_block, sm_count)
print("Found GPU '%s' with %.3f GB of global memory, max %i threads per block, and %i multiprocessors\n" % gpu_summary)

# The measured interval covers everything riemannCUDA does, not only the
# kernel launch.
start = time.time()
Sum = riemannCUDA(N)
end = time.time()

time_taken = end - start  # in seconds

print("Riemann sum pyCUDA (double precision) for N = %i  : %.17f" % (N, Sum))
print("Total time (measured by CPU)                              : %f s" % time_taken)

Overwriting riemann_pycuda.py


In [3]:
!PATH=/usr/local/cuda-10.1/bin:${PATH} python riemann_pycuda.py

Found GPU 'Tesla K80' with 11.173 GB of global memory, max 1024 threads per block, and 13 multiprocessors

tcmalloc: large alloc 8000004096 bytes == 0x55753eaf0000 @  0x7fcc6d7df1e7 0x7fcc6b39f46e 0x7fcc6b3efc7b 0x7fcc6b3f035f 0x7fcc6b492103 0x55753bb304b0 0x55753bb30240 0x55753bba40f3 0x55753bb31afa 0x55753bb9f915 0x55753bb9e9ee 0x55753bb9e6f3 0x55753bc684c2 0x55753bc6883d 0x55753bc686e6 0x55753bc40163 0x55753bc3fe0c 0x7fcc6c5c9bf7 0x55753bc3fcea
tcmalloc: large alloc 8000004096 bytes == 0x55771c16a000 @  0x7fcc6d7df1e7 0x7fcc6b39f46e 0x7fcc6b3efc7b 0x7fcc6b3efd97 0x7fcc6b488887 0x55753bb304b0 0x55753bc21e1d 0x55753bba3e99 0x55753bb31afa 0x55753bb9f915 0x55753bb9e9ee 0x55753bb9e6f3 0x55753bc684c2 0x55753bc6883d 0x55753bc686e6 0x55753bc40163 0x55753bc3fe0c 0x7fcc6c5c9bf7 0x55753bc3fcea
CUDA kernel 'medianTrapezoid' launch with 976563 blocks of 1024 threads

Riemann sum pyCUDA (double precision) for N = 1000000000  : 0.34134474606853665
Total time (measured by CPU)                      

In [4]:
pip -q install pyopencl

In [5]:
%%file riemann_pyopencl.py
from __future__ import absolute_import, print_function
import numpy as np
import pyopencl as cl

import time

def iDivUp(a, b):
    """Return the ceiling of ``a / b`` for non-negative ``a`` and positive ``b``.

    Used to compute how many work-groups of ``b`` items are needed to cover
    ``a`` work items. The earlier ``a // b + 1`` form returned one group too
    many whenever ``a`` was an exact multiple of ``b``.
    """
    return (a + b - 1) // b

# Number of sub-intervals handed to riemannOpenCL. One float64 is stored per
# sub-interval, so this needs roughly 8 GB on the host and on the device.
N = 10 ** 9

def riemannOpenCL(n):
    """Approximate the standard normal integral over [0, 1] with OpenCL.

    The OpenCL kernel ``medianTrapezoid`` lets work-item ``idx`` store the
    mean of ``exp(-x*x/2)`` evaluated at ``x = idx/n`` and at ``x + 1/n``.
    The host then sums the ``n`` stored values and divides by
    ``sqrt(2*pi) * n``.

    Relies on the module-level OpenCL context ``ctx``.

    Parameters
    ----------
    n : int
        Number of sub-intervals. It is passed to the kernel as a 32-bit int,
        and one float64 per sub-interval is allocated on host and device.

    Returns
    -------
    numpy.float64
        The approximated value of the integral.
    """
    # Build the program before any large buffer exists, so a build failure
    # does not happen while gigabytes of memory are being held.
    prg = cl.Program(ctx, """
    __kernel void medianTrapezoid(__global double *a, int n) {
    
        int idx = get_global_id(0);
        double x = (double)idx / (double)n;
 
        if(idx < n)
           a[idx] = (exp(-x * x / 2.0) + exp(-(x + 1 / (double)n) * (x + 1 / (double)n) / 2.0)) / 2.0;
    }
    """).build()

    queue = cl.CommandQueue(ctx)

    # Allocate the host buffer once, directly as float64. Calling empty() and
    # then astype() created a second full-size copy of the array.
    a = np.empty(n, dtype=np.float64)

    mf = cl.mem_flags
    a_d = cl.Buffer(ctx, mf.WRITE_ONLY, a.nbytes)
    try:
        local_item_size = 1024
        n_blocks = iDivUp(n, local_item_size)
        # The global size is rounded up to a whole number of work-groups;
        # surplus work-items are masked out by the kernel's idx < n guard.
        global_item_size = n_blocks * local_item_size
        print("OpenCL kernel 'medianTrapezoid' launch with %i blocks of %i threads\n" % (n_blocks, local_item_size))
        prg.medianTrapezoid(queue, (global_item_size, 1, 1), (local_item_size, 1, 1), a_d, np.int32(n))

        cl.enqueue_copy(queue, a, a_d)
    finally:
        # Release the device buffer now rather than waiting for garbage
        # collection of the handle.
        a_d.release()

    Sum = np.sum(a) / np.sqrt(2 * np.pi) / np.float64(n)

    return Sum

# Take the first device of the first OpenCL platform. riemannOpenCL reads the
# module-level `ctx` created here, so it must exist before the call below.
platform = cl.get_platforms()[0]
device = platform.get_devices()[0]
ctx = cl.Context([device])

dev_name = device.name
# Three divisions by 1024 produce the figure labelled "GB" in the banner.
total_memory = device.global_mem_size / 1024.0 / 1024.0 / 1024.0
threads_per_block = device.max_work_group_size
sm_count = device.max_compute_units

gpu_summary = (dev_name, total_memory, threads_per_block, sm_count)
print("Found GPU '%s' with %.3f GB of global memory, max %i threads per block, and %i multiprocessors\n" % gpu_summary)

# The measured interval covers everything riemannOpenCL does, not only the
# kernel execution.
start = time.time()
Sum = riemannOpenCL(N)
end = time.time()

time_taken = end - start  # in seconds

print("Riemann sum pyOpenCL (double precision) for N = %i  : %.17f" % (N, Sum))
print("Total time (measured by CPU)                                : %f s" % time_taken)

Overwriting riemann_pyopencl.py


In [6]:
!PATH=/usr/local/cuda-10.1/bin:${PATH} python riemann_pyopencl.py

Found GPU 'Tesla K80' with 11.173 GB of global memory, max 1024 threads per block, and 13 multiprocessors

tcmalloc: large alloc 8000004096 bytes == 0x561c88ad4000 @  0x7f3022a501e7 0x7f302061046e 0x7f3020660c7b 0x7f302066135f 0x7f3020703103 0x561c861f14b0 0x561c861f1240 0x561c862650f3 0x561c861f2afa 0x561c86260915 0x561c8625f9ee 0x561c8625f6f3 0x561c863294c2 0x561c8632983d 0x561c863296e6 0x561c86301163 0x561c86300e0c 0x7f302183abf7 0x561c86300cea
tcmalloc: large alloc 8000004096 bytes == 0x561e661cc000 @  0x7f3022a501e7 0x7f302061046e 0x7f3020660c7b 0x7f3020660d97 0x7f30206f9887 0x561c861f14b0 0x561c862e2e1d 0x561c86264e99 0x561c861f2afa 0x561c86260915 0x561c8625f9ee 0x561c8625f6f3 0x561c863294c2 0x561c8632983d 0x561c863296e6 0x561c86301163 0x561c86300e0c 0x7f302183abf7 0x561c86300cea
OpenCL kernel 'medianTrapezoid' launch with 976563 blocks of 1024 threads

Riemann sum pyOpenCL (double precision) for N = 1000000000  : 0.34134474606853665
Total time (measured by CPU)                  