# A mini-`einsum` using loopy

In this problem, we will design a function that carries out an `einsum`-type operation using `loopy`. It should be usable as shown in the tests towards the end of the worksheet. Also try to perform a simple parallelization so that your code will run on a GPU.

In [1]:
import numpy as np
import numpy.linalg as la

import pyopencl as cl
import pyopencl.array
import pyopencl.clmath
import pyopencl.clrandom

import loopy as lp

from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2

Some hints:

* `loopy.Reduction("sum", ("i", "j", "k"), expr)` expresses a sum.
* Build the loop domain `{[i,j]: 0<=i<Ni and 0<=j<Nj}` as a string and pass it to loopy.
* To build strings, use
    * `str.join()`: `",".join(names)` and 
    * `str.format`: `"Hi {name}".format(name="Andreas")`

In [2]:
def loopy_einsum(queue, spec, *args):
    """Exercise skeleton for an ``einsum``-style operation built with loopy.

    Only the parsing of *spec* is provided: the part before ``->`` is split
    on commas into one index string per operand, the part after ``->`` is
    the output index string. The rest of the body is left to be filled in.

    :arg queue: not used by the skeleton yet.
    :arg spec: subscript string such as ``"ik,kj->ij"``.
    :arg args: the operands; not used by the skeleton yet.
    """
    arg_spec, out_spec = spec.split("->")
    arg_specs = arg_spec.split(",")
    # ...

In [3]:
#clear
def loopy_einsum(queue, spec, *args):
    """Evaluate an ``einsum``-style contraction with a loopy-generated kernel.

    Every index that appears in an operand but not in the output is summed
    over; the product of all operands is assigned to ``out``.

    :arg queue: passed as the first argument when the generated kernel is
        invoked.
    :arg spec: subscript string of the form ``"ik,kj->ij"``: comma-separated
        index strings (one character per axis, one string per operand),
        ``->``, then the output index string. Spaces are ignored.
    :arg args: the operands, one per comma-separated entry of *spec*. They
        are handed to the kernel as ``arg0``, ``arg1``, ...
    :returns: the single output ``out`` returned by the kernel invocation.
    :raises ValueError: if *spec* does not contain exactly one ``->``, or if
        the number of operands differs from the number of index strings.
    """
    # Validate before touching loopy so that malformed calls fail early with
    # a clear message instead of an opaque kernel-invocation error.
    spec = spec.replace(" ", "")
    if spec.count("->") != 1:
        raise ValueError(
            "spec must contain exactly one '->', got %r" % (spec,))

    arg_spec, out_spec = spec.split("->")
    arg_specs = arg_spec.split(",")

    if len(arg_specs) != len(args):
        raise ValueError(
            "spec %r describes %d operand(s), but %d were given"
            % (spec, len(arg_specs), len(args)))

    out_indices = set(out_spec)

    # Sorted so that the domain string and the reduction inames come out in
    # the same order on every run (set iteration order of strings is not
    # stable across interpreter invocations).
    all_indices = sorted(out_indices.union(*arg_specs))
    sum_indices = [idx for idx in all_indices if idx not in out_indices]

    from pymbolic import var

    lhs = var("out")[tuple(var(idx) for idx in out_spec)]

    # Product of all subscripted operands: arg0[...] * arg1[...] * ...
    rhs = 1
    for argnum, argsp in enumerate(arg_specs):
        rhs = rhs * var("arg%d" % argnum)[tuple(var(idx) for idx in argsp)]

    if sum_indices:
        rhs = lp.Reduction(
            "sum", tuple(var(idx) for idx in sum_indices), rhs)

    # Each index idx ranges over 0 <= idx < N<idx>; the N<idx> extents are
    # symbolic parameters of the domain.
    constraints = " and ".join(
        "0 <= %s < N%s" % (idx, idx)
        for idx in all_indices
        )

    domain = "{[%s]: %s}" % (",".join(all_indices), constraints)
    knl = lp.make_kernel(domain, [lp.Assignment(lhs, rhs)])

    # Print the generated OpenCL code on invocation (worksheet demonstration).
    # NOTE: no inames are tagged for parallel execution in this function.
    knl = lp.set_options(knl, write_cl=True)

    kwargs = {"arg%d" % argnum: arg for argnum, arg in enumerate(args)}

    evt, (out,) = knl(queue, **kwargs)
    return out

Now, let us test our implementation, using a simple matrix-matrix multiplication:

In [4]:
# Create an OpenCL context (interactive=True is passed explicitly) and a
# command queue on it.
cl_context = cl.create_some_context(interactive=True)
queue = cl.CommandQueue(cl_context)

# Two random 300x300 float64 operands, generated via pyopencl.clrandom on
# the queue.
a = cl.clrandom.rand(queue, (300, 300), dtype=np.float64)
b = cl.clrandom.rand(queue, (300, 300), dtype=np.float64)

# "ik,kj->ij" sums over k, i.e. a matrix-matrix product.
ab = loopy_einsum(queue, "ik,kj->ij", a, b)

# Reference: the same product computed with .dot() on the .get() results,
# minus the .get() result of the kernel output.
diff =  a.get().dot(b.get()) - ab.get()

# 2-norm of the difference.
print(la.norm(diff, 2))

Choose platform:
[0] <pyopencl.Platform 'Portable Computing Language' at 0x7f2748f856e8>
[1] <pyopencl.Platform 'Intel(R) OpenCL' at 0x21df538>


Choice [0]: 0


Set the environment variable PYOPENCL_CTX='0' to avoid being asked again.
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
#if __OPENCL_C_VERSION__ < 120
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#endif

__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(int const Ni, int const Nj, int const Nk, __global double const *__restrict__ arg0, __global double const *__restrict__ arg1, __global double *__restrict__ out)
{
  double acc_k

  knl = lp.make_kernel(domain, [lp.ExpressionInstruction(lhs, rhs)])


1.3624569501569665e-12
