# Timing GPU Work

We would like to time GPU work, in this case a vector addition.

In [1]:
from time import perf_counter, time

import numpy as np
import pyopencl as cl
import pyopencl.array
import pyopencl.clrandom

In [2]:
# Pick the OpenCL platform/device interactively (prompts appear below the cell).
# Setting the PYOPENCL_CTX environment variable skips the prompt on later runs.
ctx = cl.create_some_context(interactive=True)
# Command queue on that context; all device work below is submitted through it.
queue = cl.CommandQueue(ctx)

Choose platform:
[0] <pyopencl.Platform 'NVIDIA CUDA' at 0x28bcdf0>
[1] <pyopencl.Platform 'Intel(R) OpenCL' at 0x28b9218>
[2] <pyopencl.Platform 'Portable Computing Language' at 0x7fd9a796d5c0>
Choice [0]:
Choose device(s):
[0] <pyopencl.Device 'GeForce GTX TITAN X' on 'NVIDIA CUDA' at 0x28c74d0>
[1] <pyopencl.Device 'GeForce GTX TITAN X' on 'NVIDIA CUDA' at 0x28cbf00>
Choice, comma-separated [0]:1
Set the environment variable PYOPENCL_CTX=':1' to avoid being asked again.


In [57]:
# Number of float32 entries in each vector.
n = 5_000_000

# Two input vectors filled with device-generated random numbers, plus an
# uninitialized output vector with the same shape and dtype as `a`.
a, b = (cl.clrandom.rand(queue, n, np.float32) for _operand in range(2))
c = cl.array.empty_like(a)

In [58]:
# OpenCL C source: one work-item per output entry, computing c[i] = a[i] + b[i].
# The `i < n` test lets the launch use more work-items than there are entries.
addvec_src = """
    __kernel void addvec(__global float *a, __global float *b, __global float *c, int n)
    {
      int i = get_global_id(0);
      if (i < n)
        c[i] = a[i] + b[i];
    }
    """

# Compile for the chosen context and keep a handle to the kernel for reuse.
prg = cl.Program(ctx, addvec_src).build()
addvec = prg.addvec

**Before every experiment:** What do you expect will happen?


Now write the timing code:

In [84]:
wgsize = 256
# Round the global size up to a whole number of work groups; the kernel's
# `i < n` guard makes the surplus work-items do nothing.
gridsize = (((n + wgsize - 1)//wgsize)*wgsize,)
local_size = (wgsize,)

# Build the kernel arguments once, so the timed loop below does not also
# measure repeated host-side work (buffer lookups, np.int32 construction).
kernel_args = (a.data, b.data, c.data, np.int32(n))

# Untimed warm-up launches, so one-time startup costs stay out of the measurement.
nwarmup = 2
for i in range(nwarmup):
    addvec(queue, gridsize, local_size, *kernel_args)

nruns = 10
# Drain everything already in the queue before starting the clock ...
queue.finish()
# perf_counter() is monotonic and high-resolution; time() is wall-clock time
# that can jump when the system clock is adjusted and may be coarse.
start_time = perf_counter()
for i in range(nruns):
    addvec(queue, gridsize, local_size, *kernel_args)
# ... and wait for all timed launches to complete before stopping it.
queue.finish()
elapsed = (perf_counter() - start_time)/nruns
print(f"{elapsed} s elapsed")

# Memory traffic per launch: two vectors read (a, b) and one written (c),
# each a.nbytes in size.
bandwidth = 3*a.nbytes/elapsed/1e9
print(f"bandwidth: {bandwidth} GB/s")

0.0002655744552612305 s elapsed
bandwidth: 225.92534338809585 GB/s


How can we check our answers? (edit this cell to see suggestions)

<!--
- Change the size of the array. How does the measurement respond?
- Repeat the measurement
-->