# Using Memory Performance Characteristics

In [1]:
from time import perf_counter, time

import loopy as lp
import numpy as np
import pyopencl as cl
import pyopencl.array
import pyopencl.clrandom

In [2]:
# Create the OpenCL context. interactive=True prompts for the platform and
# device to use (the prompt and the answers given appear in this cell's output).
ctx = cl.create_some_context(interactive=True)
# Command queue on that context; later cells pass it to array creation and to
# every kernel launch.
queue = cl.CommandQueue(ctx)

Choose platform:
[0] <pyopencl.Platform 'NVIDIA CUDA' at 0x2988060>
[1] <pyopencl.Platform 'Intel(R) OpenCL' at 0x27b28f8>
[2] <pyopencl.Platform 'Portable Computing Language' at 0x7fd42476d5c0>
Choice [0]:
Choose device(s):
[0] <pyopencl.Device 'GeForce GTX TITAN X' on 'NVIDIA CUDA' at 0x2997da0>
[1] <pyopencl.Device 'GeForce GTX TITAN X' on 'NVIDIA CUDA' at 0x2999650>
Choice, comma-separated [0]:1
Set the environment variable PYOPENCL_CTX=':1' to avoid being asked again.


# Transposing a Matrix

The code below realizes a simple matrix transpose:

In [125]:
# Describe the transpose as a loopy kernel: for every (i, j) in the n-by-n
# index domain, result[i,j] receives a[j,i].
index_domain = "{[i,j]: 0<=i,j<n}"
transpose_insn = "result[i,j] = a[j,i]"
knl = lp.make_kernel(
    index_domain, transpose_insn,
    name="transp", lang_version=(2018, 2))

# Tell loopy that n is a multiple of 16 and that `a` holds float32 values.
knl = lp.assume(knl, "n mod 16 = 0")
knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})

# Split both inames into blocks of 16. The outer part is tagged with a group
# axis and the inner part with a local axis; note that i and j use swapped
# axis numbers (see gid/lid in the generated code printed below).
for iname, outer_tag, inner_tag in (("i", "g.0", "l.1"), ("j", "g.1", "l.0")):
    knl = lp.split_iname(
        knl, iname, 16, outer_tag=outer_tag, inner_tag=inner_tag)

# Fix the argument order of the generated kernel: (result, a, n).
knl = lp.set_argument_order(knl, "result,a,n")

code = lp.generate_code_v2(knl).device_code()

# Show the type of the generated code, a blank line, then the code itself.
print(type(code), "", code, sep="\n")

<class 'str'>

#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))

__kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) transp(__global float *__restrict__ result, __global float const *__restrict__ a, int const n)
{
  result[n * (16 * gid(0) + lid(1)) + 16 * gid(1) + lid(0)] = a[n * (16 * gid(1) + lid(0)) + 16 * gid(0) + lid(1)];
}


Now compile this code and get `clknl`:

In [126]:
# Wrap the generated source in a program object, build it for `ctx`, and pick
# out the kernel named "transp" (the name given to the loopy kernel).
program = cl.Program(ctx, code)
prg = program.build()
clknl = prg.transp

Now set up the data:

In [127]:
# Matrix edge length. The kernel was generated under the assumption
# "n mod 16 = 0" and is launched with 16x16 work-groups, so n has to stay a
# multiple of 16 if this value is changed.
n = 1024*19
assert n % 16 == 0, "n must be a multiple of 16"

# Input: n-by-n float32 matrix of random values; output: uninitialized n-by-n
# float32 matrix that the kernel fills in.
a = cl.clrandom.rand(queue, (n, n), np.float32)
result = cl.array.empty(queue, (n, n), np.float32)

And time the execution:

In [130]:
def run():
    """Enqueue one launch of `clknl` over the n-by-n grid in 16x16 work-groups."""
    clknl(queue, (n, n), (16, 16), result.data, a.data, np.int32(n))


# Untimed warm-up launches before the measurement.
nwarmup = 3
for _warmup in range(nwarmup):
    run()

nruns = 10
# Drain everything already enqueued so it does not count toward the timing.
queue.finish()
# perf_counter() is the clock intended for measuring short durations; unlike
# time() it is not affected by system clock adjustments.
start_time = perf_counter()
for _rep in range(nruns):
    run()
# Wait for all enqueued launches to complete before reading the clock again.
queue.finish()
elapsed = (perf_counter() - start_time)/nruns
print(f"{elapsed} s elapsed")

0.02455599308013916 s elapsed


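Now do bandwidth accounting: each launch reads `a` once and writes `result` once, so the number of bytes moved is the combined size of the two arrays.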

In [131]:
# Bytes moved per launch: one read of `a` plus one write of `result`.
bytes_moved = a.nbytes + result.nbytes
# Divide by the per-launch time and scale to GB/s (1 GB = 1e9 bytes).
bandwidth = bytes_moved / elapsed / 1e9
print(f"bandwidth: {bandwidth} GB/s")

bandwidth: 123.32172753580359 GB/s


## With Scratchpad Use

In [132]:
# Derive a variant of `knl` that prefetches `a`, sweeping over the two inner
# inames. In the generated code below this shows up as the `__local` buffer
# `a_fetch`.
prefetch_sweep = ["i_inner", "j_inner"]
knl_pf = lp.add_prefetch(knl, "a", prefetch_sweep, default_tag=None)

# Tag the inames created for the fetch with local axes.
fetch_tags = {"a_dim_0": "l.1", "a_dim_1": "l.0"}
knl_pf = lp.tag_inames(knl_pf, fetch_tags)

code = lp.generate_code_v2(knl_pf).device_code()
print(code)

# Build the new source. NOTE: this rebinds `code`, `prg` and `clknl`, so from
# here on `clknl` refers to the prefetching kernel.
prg = cl.Program(ctx, code).build()
clknl = prg.transp

#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))

__kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) transp(__global float *__restrict__ result, __global float const *__restrict__ a, int const n)
{
  __local float a_fetch[16 * 16];

  a_fetch[16 * lid(1) + lid(0)] = a[n * (16 * gid(1) + lid(1)) + 16 * gid(0) + lid(0)];
  barrier(CLK_LOCAL_MEM_FENCE) /* for a_fetch (insn depends on a_fetch_rule) */;
  result[n * (16 * gid(0) + lid(1)) + 16 * gid(1) + lid(0)] = a_fetch[16 * lid(0) + lid(1)];
}


In [133]:
def run():
    """Enqueue one launch of the current `clknl` over the n-by-n grid in 16x16 work-groups."""
    clknl(queue, (n, n), (16, 16), result.data, a.data, np.int32(n))


# Untimed warm-up launches before the measurement.
nwarmup = 3
for _warmup in range(nwarmup):
    run()

nruns = 10
# Drain everything already enqueued so it does not count toward the timing.
queue.finish()
# perf_counter() is the clock intended for measuring short durations; unlike
# time() it is not affected by system clock adjustments.
start_time = perf_counter()
for _rep in range(nruns):
    run()
# Wait for all enqueued launches to complete before reading the clock again.
queue.finish()
elapsed = (perf_counter() - start_time)/nruns
print(f"{elapsed} s elapsed")

0.020562100410461425 s elapsed


In [134]:
# Bytes moved per launch: one read of `a` plus one write of `result`.
bytes_moved = a.nbytes + result.nbytes
# Divide by the per-launch time and scale to GB/s (1 GB = 1e9 bytes).
bandwidth = bytes_moved / elapsed / 1e9
print(f"bandwidth: {bandwidth} GB/s")

bandwidth: 147.2752018300276 GB/s
