# Mini Pytato 

In [8]:
import numpy as np
import numpy.linalg as la
import pymbolic.primitives as p
import loopy as lp
import pyopencl as cl
import pyopencl.array

%load_ext gvmagic

# Create an OpenCL context and a command queue on it. With interactive=True,
# pyopencl prompts for a platform choice unless the PYOPENCL_CTX environment
# variable is set (see the prompt text printed below).
ctx = cl.create_some_context(interactive=True)
queue = cl.CommandQueue(ctx)

Choose platform:
[0] <pyopencl.Platform 'Intel(R) OpenCL Graphics' at 0x4349d20>
[1] <pyopencl.Platform 'Portable Computing Language' at 0x7f7b3fdf5788>


Choice [0]: 1


Set the environment variable PYOPENCL_CTX='1' to avoid being asked again.


In this demo, we will deal with numpy-ish arrays that *all* have shape `(10, 10)` (think of them as $10\times 10$ matrices maybe) and contain floating point numbers.

## Building Expression Graphs

We would like to build a data structure that represents the following computation, so that:

- we can execute it later
- we can generate code for it

Reflect for a moment on what `result` *is*. Does it contain data? What is its "meaning"?

(`Placeholder` is not yet implemented. We will do this in the next cell. This cell is repeated below for convenience.)

In [9]:
# NOTE: Placeholder is only defined in the next code cell; on a fresh kernel
# this cell raises NameError until that cell has been run.
a = Placeholder("a")
b = Placeholder("b")

result = a + b * a

Now let's implement the `Array` base class, along with `Sum`, `Product`, and `Placeholder` subclasses. To do so, fill in code for the `...` ellipses:

In [15]:
class Array:
    """Base class for the nodes of an array expression graph.

    Every node carries the fixed shape ``(10, 10)`` and dtype ``np.float64``.
    The ``+`` and ``*`` operators perform no arithmetic: each one returns a
    new node (``Sum`` or ``Product``) that records its two operands.
    """

    def __init__(self):
        self.shape, self.dtype = (10, 10), np.float64

    def __add__(self, other):
        """Return a ``Sum`` node with operands ``self`` and ``other``."""
        #beginclear
        return Sum(self, other)
        #endclear

    def __mul__(self, other):
        """Return a ``Product`` node with operands ``self`` and ``other``."""
        #beginclear
        return Product(self, other)
        #endclear
        
class Sum(Array):
    """Graph node recording the sum of the operand nodes ``a`` and ``b``."""

    def __init__(self, a, b):
        super().__init__()
        #beginclear
        self.a, self.b = a, b
        #endclear
        
class Product(Array):
    """Graph node recording the product of the operand nodes ``a`` and ``b``.

    Both the evaluator and the code generator in this notebook apply ``*``
    entry by entry, so this is an elementwise product, not a matrix product.
    """

    def __init__(self, a, b):
        super().__init__()
        #beginclear
        self.a, self.b = a, b
        #endclear
        
class Placeholder(Array):
    """Leaf node standing for an input array that is identified by ``name``.

    The node itself stores only the name; a value is supplied separately
    when the expression is evaluated or executed.
    """

    def __init__(self, name):
        super().__init__()
        #beginclear
        self.name = name
        #endclear

Here is the cell from above once again:

In [16]:
a = Placeholder("a")
b = Placeholder("b")

# Python operator precedence groups this as a + (b * a).
result = a + b * a

Can you visualize the data structure that we have just created? (Execute this cell to see if your mental image was correct.)

In [17]:
%%dot
digraph {
    sum -> prod;
    sum -> a;
    prod -> a;
    prod -> b;
}

## Evaluating Expressions

Write code to evaluate the expression, provided some values for `a` and `b`:

In [19]:
def evaluate_expr(expr, values):
    """Recursively evaluate the expression graph rooted at ``expr``.

    ``values`` maps each placeholder name to the value substituted for it.
    Raises ``ValueError`` if a node is not a ``Placeholder``, ``Sum`` or
    ``Product``.
    """
    if isinstance(expr, Placeholder):
        #beginclear
        return values[expr.name]
        #endclear
    elif isinstance(expr, Sum):
        #beginclear
        lhs = evaluate_expr(expr.a, values)
        rhs = evaluate_expr(expr.b, values)
        return lhs + rhs
        #endclear
    #beginclear
    elif isinstance(expr, Product):
        lhs = evaluate_expr(expr.a, values)
        rhs = evaluate_expr(expr.b, values)
        return lhs * rhs
    #endclear
    else:
        raise ValueError(f"unexpected node type: {expr.__class__}")
    

Let's test if we got it right. If all is well, this should produce an array of zeroes:

In [20]:
# Random test inputs matching the fixed (10, 10) shape of the graph nodes.
a_val = np.random.randn(10, 10)
b_val = np.random.randn(10, 10)

# Difference between evaluating the graph and computing the same formula
# directly with numpy.
evaluate_expr(result, {"a": a_val, "b": b_val}) - (a_val + b_val * a_val)

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

Functions like `evaluate_expr` are hard to extend once they're written. Using a class with the "visitor pattern" can help, where we make one method per node type that needs to be handled. Make note of the `rec` method that dispatches each node to the appropriate method.

Fill in the missing method implementations:

In [37]:
class EvaluationMapper:
    """Visitor that evaluates an expression graph.

    ``values`` maps each placeholder name to the value substituted for it.
    There is one ``map_<ClassName>`` method per supported node type.
    """

    def __init__(self, values):
        self.values = values

    def rec(self, expr):
        """Dispatch ``expr`` to the ``map_*`` method named after its class.

        A node class without a matching method makes the attribute lookup
        fail with ``AttributeError``.
        """
        handler = getattr(self, "map_" + expr.__class__.__name__)
        return handler(expr)

    def map_Sum(self, expr):
        """Evaluate both operands and add the results."""
        #beginclear
        lhs, rhs = self.rec(expr.a), self.rec(expr.b)
        return lhs + rhs
        #endclear

    def map_Product(self, expr):
        """Evaluate both operands and multiply the results."""
        #beginclear
        lhs, rhs = self.rec(expr.a), self.rec(expr.b)
        return lhs * rhs
        #endclear

    def map_Placeholder(self, expr):
        """Look up the value registered under the placeholder's name."""
        #beginclear
        return self.values[expr.name]
        #endclear

Again, let's test that this does what we intend:

In [38]:
# Same zero-difference check as for evaluate_expr, now going through the mapper.
EvaluationMapper({"a": a_val, "b": b_val}).rec(result) - (a_val + b_val * a_val)

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

## Generating code

To generate code using our code generator Loopy, all we need to do is to transcribe our array-valued expression into a scalar one, using an existing expression tree library called `pymbolic`. We have imported this as `p` above. The equivalent of a `Placeholder` there is called a `Variable`. Let's write a mapper that does this. Again, fill in the blanks:

In [63]:
class CodegenMapper:
    """Visitor that turns an array expression graph into a scalar pymbolic
    expression in which every placeholder is subscripted with ``[i, j]``.
    """

    def rec(self, expr):
        """Dispatch ``expr`` to the ``map_*`` method named after its class."""
        handler = getattr(self, "map_" + expr.__class__.__name__)
        return handler(expr)

    def map_Placeholder(self, expr):
        """Return the pymbolic expression ``<name>[i, j]`` for the placeholder."""
        array_var = p.Variable(expr.name)
        index = (p.Variable("i"), p.Variable("j"))
        return array_var[index]

    def map_Sum(self, expr):
        """Translate both operands and combine them with ``+``."""
        #beginclear
        lhs, rhs = self.rec(expr.a), self.rec(expr.b)
        return lhs + rhs
        #endclear

    def map_Product(self, expr):
        """Translate both operands and combine them with ``*``."""
        #beginclear
        lhs, rhs = self.rec(expr.a), self.rec(expr.b)
        return lhs * rhs
        #endclear
        #endclear

Let's try this with some expressions:

In [64]:
x = Placeholder("x")
y = Placeholder("y")

expr = (x+x*y)*x

# Alternative to try instead: start from x+y and repeatedly multiply the
# expression by itself.
# expr = (x+y)
# expr = expr*expr
# expr = expr*expr
# expr = expr*expr
# expr = expr*expr
# expr = expr*expr

Generate code for these expressions:

In [65]:
#clear
# Show the scalar expression generated for expr.
print(CodegenMapper().rec(expr))

(x[i, j] + x[i, j]*y[i, j])*x[i, j]


Let's make a loopy kernel for the expression. Fill in the generated scalar expression for the RHS of the `Assignment`:

In [66]:
# Build a loopy kernel consisting of a single assignment
#   lhs[i, j] = <scalar expression generated from expr>
# over the index domain 0 <= i, j < 10.
knl = lp.make_kernel(
    "{[i,j]: 0<=i,j<10}",
    [lp.Assignment(
        p.Variable("lhs")[p.Variable("i"), p.Variable("j")], 
        #beginclear
        CodegenMapper().rec(expr)
        #endclear
    )])
print(knl)

---------------------------------------------------------------------------
KERNEL: loopy_kernel
---------------------------------------------------------------------------
ARGUMENTS:
lhs: type: <auto/runtime>, shape: (10, 10), dim_tags: (N1:stride:10, N0:stride:1) out aspace: global
x: type: <auto/runtime>, shape: (10, 10), dim_tags: (N1:stride:10, N0:stride:1) in aspace: global
y: type: <auto/runtime>, shape: (10, 10), dim_tags: (N1:stride:10, N0:stride:1) in aspace: global
---------------------------------------------------------------------------
DOMAINS:
{ [i, j] : 0 <= i <= 9 and 0 <= j <= 9 }
---------------------------------------------------------------------------
INAME TAGS:
i: None
j: None
---------------------------------------------------------------------------
INSTRUCTIONS:
for i, j
    [36mlhs[i, j][0m = [35m(x[i, j] + x[i, j]*y[i, j])*x[i, j][0m  {id=[32minsn[0m}
end i, j
---------------------------------------------------------------------------


Next, let's run the code on our OpenCL device:

In [67]:
# Random inputs for the kernel arguments x and y.
xval = np.random.randn(10, 10)
yval = np.random.randn(10, 10)

# The call returns a pair; its second item is unpacked as a one-element
# sequence of outputs (lhs is the only "out" argument in the kernel printout).
evt, (res,) = knl(queue, x=xval, y=yval)

  evt, (res,) = knl(queue, x=xval, y=yval)


Check the result:

In [68]:
# Norm of the difference between the kernel output and the same expression
# computed directly with numpy.
print(la.norm(res- (xval+xval*yval)*xval))

0.0


Look at the generated C code:

In [69]:
# Give x and y the dtypes of the numpy inputs; the kernel printout above lists
# all argument types as <auto/runtime>. Presumably this lets loopy infer the
# type of lhs before generating code -- TODO confirm against the loopy docs.
knl = lp.add_and_infer_dtypes(knl, {"x": xval.dtype, "y": yval.dtype})

code = lp.generate_code_v2(knl).device_code()
print(code)

#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
#if __OPENCL_C_VERSION__ < 120
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#endif

__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global double *__restrict__ lhs, __global double const *__restrict__ x, __global double const *__restrict__ y)
{
  for (int j = 0; j <= 9; ++j)
    for (int i = 0; i <= 9; ++i)
      lhs[10 * i + j] = (x[10 * i + j] + x[10 * i + j] * y[10 * i + j]) * x[10 * i + j];
}


And this is where you might start transforming the loopy code. Here is a simple example that tiles the loop:

In [76]:
# Split each of i and j into an outer and an inner loop with inner length 4
# (10 is not a multiple of 4, hence the shorter last tile in the generated
# bounds), then request the nesting order i_outer, j_outer, i_inner, j_inner.
tiled = lp.split_iname(knl, "i", 4)
tiled = lp.split_iname(tiled, "j", 4)
tiled = lp.prioritize_loops(tiled, "i_outer, j_outer, i_inner, j_inner")

code = lp.generate_code_v2(tiled).device_code()
print(code)

#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
#if __OPENCL_C_VERSION__ < 120
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#endif

__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global double *__restrict__ lhs, __global double const *__restrict__ x, __global double const *__restrict__ y)
{
  for (int i_outer = 0; i_outer <= 2; ++i_outer)
    for (int j_outer = 0; j_outer <= 2; ++j_outer)
      for (int i_inner = 0; i_inner <= ((-2 + i_outer == 0) ? 1 : 3); ++i_inner)
        for (int j_inner = 0; j_inner <= ((-2 + j_outer == 0) ? 1 : 3); ++j_inner)
          lhs[40 * i_outer + 10 * i_inner + 4 * j_outer + j_inner] = (x[40 * i_outer + 10 * i_inner + 4 * j_outer + j_inner] + x[40 * i_outer + 10 * i_inner + 4 * j_outer + j_inner] * y[40 * i_outer + 10 * i_inner + 4 * j_outer + j_inner]) * x[40 * i_outer + 10 * i_inner + 4 * j_outer + j_inner];
}
