# Using Memory Performance Characteristics

In [1]:
from time import perf_counter
from time import time

import pyopencl as cl
import pyopencl.clrandom
import pyopencl.array
import numpy as np
import loopy as lp

In [2]:
# Obtain an OpenCL context. interactive=True asks pyopencl to let the user
# choose the platform/device.
# NOTE(review): an interactive prompt will stall an unattended "Run All";
# confirm whether a preset device choice (e.g. PYOPENCL_CTX) is wanted.
ctx = cl.create_some_context(interactive=True)
# Command queue on that context; the arrays and kernel launches below use it.
queue = cl.CommandQueue(ctx)

# Transposing a Matrix

The code below realizes a simple matrix transpose:

In [125]:
# Loopy description of the transpose: a single assignment swept over an
# n-by-n index space. The kernel is named "transp".
knl = lp.make_kernel(
    "{[i,j]: 0<=i,j<n}",
    "result[i,j] = a[j,i]",
    name="transp",
    lang_version=(2018, 2),
)

# Tell loopy that n is a multiple of 16 (the tile size used below) and fix
# the element type of `a`; presumably the dtype of `result` is inferred.
knl = lp.add_and_infer_dtypes(
    lp.assume(knl, "n mod 16 = 0"),
    {"a": np.float32},
)

# Tile both loops by 16. Outer inames are tagged as group axes, inner inames
# as local axes. Note the swap: i_inner goes to l.1 while j_inner goes to l.0.
knl = lp.split_iname(knl, "i", 16, inner_tag="l.1", outer_tag="g.0")
knl = lp.split_iname(knl, "j", 16, inner_tag="l.0", outer_tag="g.1")

# Pin the argument order so the kernel can be called positionally.
knl = lp.set_argument_order(knl, "result,a,n")

code = lp.generate_code_v2(knl).device_code()

# Show the type of the generated code, a blank line, then the code itself.
print(type(code), end="\n\n")
print(code)

Now compile this code and get `clknl`:

Now set up the data:

In [127]:
# Compile the generated OpenCL source and fetch the kernel entry point.
# The kernel was named "transp" in lp.make_kernel, hence the attribute name.
# Without this step `clknl` is undefined when the timing cell calls it.
prg = cl.Program(ctx, code).build()
clknl = prg.transp

# Problem size: a multiple of 16, as the kernel assumes.
# The two n-by-n float32 arrays take 2 * n**2 * 4 bytes of device memory
# (about 3 GB for this n).
n = 1024*19
# Random input matrix and an uninitialized output of the same shape.
a = cl.clrandom.rand(queue, (n, n), np.float32)
result = cl.array.empty(queue, (n, n), np.float32)

And time the execution:

In [130]:
def run():
    """Launch the compiled transpose kernel once on `queue`.

    Uses an (n, n) global size with (16, 16) work-groups and passes the
    arguments in the order result, a, n.
    """
    clknl(queue, (n, n), (16, 16), result.data, a.data, np.int32(n))


# Warm-up launches so one-time costs stay out of the measurement.
for _iteration in range(3):
    run()

nruns = 10
# Drain the queue first so the timer brackets only the timed launches.
queue.finish()
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short durations; time() is an adjustable wall clock and may be coarse.
start_time = perf_counter()
for _iteration in range(nruns):
    run()
# Wait for all launches to complete before reading the clock again.
queue.finish()
elapsed = (perf_counter() - start_time)/nruns
print(f"{elapsed} s elapsed")

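Now do bandwidth accounting: every entry of `a` is read once and every entry of `result` is written once, so each run moves $2 n^2 \cdot 4$ bytes (two `float32` matrices). Divide that by the elapsed time per run to get the achieved memory bandwidth.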

## With Scratchpad Use

In [132]:
# Prefetch the part of `a` touched while sweeping i_inner/j_inner. The fetch
# inames are left untagged (default_tag=None) and then mapped explicitly:
# a_dim_0 to l.1 and a_dim_1 to l.0.
# NOTE(review): per the section heading the fetched tile is presumably held
# in local (scratchpad) memory; confirm in the printed code.
knl_pf = lp.tag_inames(
    lp.add_prefetch(knl, "a", ["i_inner", "j_inner"], default_tag=None),
    dict(a_dim_0="l.1", a_dim_1="l.0"),
)

code = lp.generate_code_v2(knl_pf).device_code()
print(code)

# Build the generated source and rebind `clknl` to the prefetching variant;
# the kernel keeps the name "transp".
prg = cl.Program(ctx, code).build()
clknl = prg.transp

In [133]:
def run():
    """Launch the currently bound `clknl` once on `queue`.

    Uses an (n, n) global size with (16, 16) work-groups and passes the
    arguments in the order result, a, n.
    """
    clknl(queue, (n, n), (16, 16), result.data, a.data, np.int32(n))


# Warm-up launches so one-time costs stay out of the measurement.
for _iteration in range(3):
    run()

nruns = 10
# Drain the queue first so the timer brackets only the timed launches.
queue.finish()
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short durations; time() is an adjustable wall clock and may be coarse.
start_time = perf_counter()
for _iteration in range(nruns):
    run()
# Wait for all launches to complete before reading the clock again.
queue.finish()
elapsed = (perf_counter() - start_time)/nruns
print(f"{elapsed} s elapsed")