# Loopy: Counting Operations

## Setup code

In [1]:
import numpy as np
import pyopencl as cl
import pyopencl.array
import pyopencl.clrandom
import loopy as lp

from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2

In [3]:
# Select an OpenCL platform/device and build the command queue that the
# following cells hand to pyopencl and loopy. With interactive=True the call
# prompts for a choice (see the output below); per that prompt's own hint,
# setting the PYOPENCL_CTX environment variable avoids being asked again.
ctx = cl.create_some_context(interactive=True)
queue = cl.CommandQueue(ctx)

Choose platform:
[0] <pyopencl.Platform 'Portable Computing Language' at 0x7f53c17c66e8>
[1] <pyopencl.Platform 'Intel(R) OpenCL' at 0x289ca58>


Choice [0]: 0


Set the environment variable PYOPENCL_CTX='0' to avoid being asked again.


In [4]:
# Two square (n x n) single-precision operands filled with random values on
# the device; `a` is generated first, then `b`.
n = 1024
a, b = (
    cl.clrandom.rand(queue, (n, n), dtype=np.float32)
    for _ in range(2)
)

## Operation-counting matrix multiplication

Here is the simple matrix-matrix multiplication kernel again:

In [8]:
# Naive matrix-matrix product: each c[i, j] is a reduction over k.
# Dtypes are supplied for the inputs a and b only; the call used is
# `add_and_infer_dtypes`, so the remaining dtypes are presumably inferred.
knl = lp.add_and_infer_dtypes(
    lp.make_kernel(
        "{[i,j,k]: 0<=i,j,k<n}",
        "c[i, j] = sum(k, a[i, k]*b[k, j])",
        target=lp.PyOpenCLTarget(queue.device)),
    dict(a=np.float32, b=np.float32))

### Counting flops

Let us determine the number of arithmetic operations being carried out:

In [11]:
# Display the operation-count map for the untransformed kernel; the counts
# are shown as polynomials in the size parameter n.
lp.get_op_map(knl)

{Op(np:dtype('float32'), add, subgroup): PwQPolynomial("[n] -> { n^3 : n > 0 }"), Op(np:dtype('float32'), mul, subgroup): PwQPolynomial("[n] -> { n^3 : n > 0 }")}

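The result is easy to evaluate for a given set of parameters: each count in the map has an `.eval_with_dict` method, and the map as a whole can be narrowed with `.filter_by` and totalled with `.eval_and_sum`, as shown here: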

In [24]:
# Restrict the op map to float32 additions, then total the count for n = 15.
omap = (
    lp.get_op_map(knl)
    .filter_by(name=["add"], dtype=[np.float32])
)

omap.eval_and_sum(dict(n=15))

3375

### Counting memory access

In [26]:
# Display the memory-access count map for the untransformed kernel: one entry
# per access (loads of a and b, store to c), each a polynomial in n.
lp.get_mem_access_map(knl)

  inames_domain = knl.get_inames_domain(insn_inames)
  warn_with_kernel(knl, "insn_count_subgroups_upper_bound",


{MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup): PwQPolynomial("[n] -> { n^3 : n > 0 }"), MemAccess(global, np:dtype('float32'), {}, {}, load, b, None, subgroup): PwQPolynomial("[n] -> { n^3 : n > 0 }"), MemAccess(global, np:dtype('float32'), {}, {}, store, c, None, subgroup): PwQPolynomial("[n] -> { n^2 : n > 0 }")}

## Operation-counting a transformed kernel

In [31]:
# Tile the i, j and k loops in chunks of 16, assuming n is a multiple of 16.
# The i and j splits carry "g.*" / "l.*" tags on their outer / inner parts
# (presumably group and local axes -- cf. the gid/lid macros in the generated
# code); the k split is left untagged.
opt_knl = lp.assume(knl, "n mod 16 = 0")
for iname, tags in [
        ("i", dict(outer_tag="g.0", inner_tag="l.1")),
        ("j", dict(outer_tag="g.1", inner_tag="l.0")),
        ("k", dict()),
        ]:
    opt_knl = lp.split_iname(opt_knl, iname, 16, **tags)
#opt_knl = lp.add_prefetch(opt_knl, "a", "i_inner,k_inner")
#opt_knl = lp.add_prefetch(opt_knl, "b", "j_inner,k_inner")

# With write_cl=True the generated OpenCL code is printed when the kernel runs.
opt_knl = lp.set_options(opt_knl, write_cl=True)
_ = opt_knl(queue, a=a, b=b)

#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
#define LOOPY_CALL_WITH_INTEGER_TYPES(MACRO_NAME) \
    MACRO_NAME(int8, char) \
    MACRO_NAME(int16, short) \
    MACRO_NAME(int32, int) \
    MACRO_NAME(int64, long)
#define LOOPY_DEFINE_FLOOR_DIV_POS_B(SUFFIX, TYPE) \
    inline TYPE loopy_floor_div_pos_b_##SUFFIX(TYPE a, TYPE b) \
    { \
        if (a<0) \
            a = a - (b-1); \
        return a/b; \
    }
LOOPY_C

Now count the memory accesses in the transformed version:

In [32]:
#clear
# Same memory-access query as for `knl`, now on the tiled kernel `opt_knl`,
# so the effect of the transformations on the access counts can be compared.
lp.get_mem_access_map(opt_knl)

{MemAccess(global, np:dtype('float32'), {1: Variable('n')}, {0: Product((Variable('n'), 16))}, load, a, None, subgroup): PwQPolynomial("[n] -> { 256 * n * floor((15 + n)/16)^3 : 0 < n <= 16; 4096 * floor((15 + n)/16)^3 : n >= 17 }"), MemAccess(global, np:dtype('float32'), {0: 1}, {1: 16}, load, b, None, workitem): PwQPolynomial("[n] -> { n^3 * floor((15 + n)/16)^3 : 0 < n <= 16; 4096 * floor((15 + n)/16)^3 : n >= 17 }"), MemAccess(global, np:dtype('float32'), {0: 1, 1: Variable('n')}, {0: Product((Variable('n'), 16)), 1: 16}, store, c, None, workitem): PwQPolynomial("[n] -> { n^2 * floor((15 + n)/16)^2 : 0 < n <= 16; 256 * floor((15 + n)/16)^2 : n >= 17 }")}

Now enable the prefetch transformation above.