# Hello Loopy: Computing a Rank-One Matrix

## Setup Code

In [16]:
import numpy as np
import pyopencl as cl
import pyopencl.array
import pyopencl.clrandom
import loopy as lp

In [21]:
# Create an OpenCL context and a command queue on it. The queue is what
# every kernel invocation and random-array creation below is submitted to.
# NOTE(review): create_some_context() may prompt for a platform/device when
# several are available -- confirm this is acceptable for unattended runs.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

In [22]:
# Problem size and the two random float32 input vectors of length n,
# generated on the device through `queue` (first draw -> a, second -> b).
n = 1024
a, b = (
    cl.clrandom.rand(queue, n, dtype=np.float32)
    for _ in range(2)
)

## The Initial Kernel

In [23]:
# Describe the computation to loopy:
#   * first string: the loop domain -- inames i and j each run over 0 <= . < n
#   * second string: the single instruction -- c[i, j] is set to a[i]*b[j],
#     i.e. c is the outer (rank-one) product of the vectors a and b
knl = lp.make_kernel(
    "{[i,j]: 0<=i,j<n}",
    "c[i, j] = a[i]*b[j]")

In [24]:
# Enable the write_cl option (presumably: print the generated OpenCL code on
# invocation -- confirm against the loopy Options docs). `knl` is rebound, and
# the later cells derive their kernels from this rebound `knl`.
knl = lp.set_options(knl, write_cl=True)
# Run the kernel on `queue` with the device arrays a and b. The call returns an
# event plus a one-element tuple of outputs, unpacked here into `mat`.
evt, (mat,) = knl(queue, a=a, b=b)

## Transforming kernels: Implementation Tags

Every loop axis ("iname") comes with an *implementation tag*.

In [29]:
# Start from the untransformed kernel; the commented-out lines are optional
# demo steps that can be enabled to compare the generated code.
isplit_knl = knl
# Optional: declare the assumption that n is a multiple of 4 before splitting.
#isplit_knl = lp.assume(isplit_knl, "n mod 4 = 0")
# Split iname "i" using an inner length of 4; the resulting inner iname is
# named "i_inner" (referenced by the optional tagging step below).
isplit_knl = lp.split_iname(isplit_knl, "i", 4)
# Optional: attach the "unr" implementation tag to the inner iname
# (presumably "unroll" -- confirm against the loopy iname-tag docs).
#isplit_knl = lp.tag_inames(isplit_knl, {"i_inner": "unr"})

# Run the transformed kernel; `mat` receives its single output.
evt, (mat,) = isplit_knl(queue, a=a, b=b)

----
"Map to GPU hw axis" is an iname tag as well.

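`split_iname` accepts `outer_tag` and `inner_tag` arguments as a shortcut, so the new inames can be tagged in the same call that creates them: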

In [8]:
# Split both inames into chunks of 16 and tag the pieces in the same call:
# "i" is split first (outer -> g.0, inner -> l.0), then "j" is split on the
# result (outer -> g.1, inner -> l.1).
split_knl = lp.split_iname(
    lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0"),
    "j", 16, outer_tag="g.1", inner_tag="l.1",
)

# Execute the tagged kernel; its single output is bound to `mat`.
evt, (mat,) = split_knl(queue, a=a, b=b)

## Targeting CPUs/KNL

[ISPC](https://ispc.github.io/)'s "low-level" interface provides access to SSE, AVX2, and AVX-512 (including Intel's Knights Landing hardware) from a scalar, CUDA-like programming model:

In [19]:
# Retarget a copy of the base kernel to ISPC instead of OpenCL.
ispc_knl = knl.copy(target=lp.ISPCTarget())

# Split "j" into chunks of 16, tagging the outer/inner inames g.0 / l.0.
ispc_knl = lp.split_iname(
    ispc_knl, "j", 16, outer_tag="g.0", inner_tag="l.0")

# No arrays are passed here (the kernel is not invoked), so the argument
# dtypes are supplied explicitly and the remaining ones are inferred.
ispc_knl = lp.add_and_infer_dtypes(
    ispc_knl, {"a": np.float32, "b": np.float32})

# Show the generated source rather than running it.
print(lp.generate_code_v2(ispc_knl).all_code())

## Transforming kernels: Leveraging data reuse

We would like to fetch the entire "access footprint" of a loop (all of the data that the loop reads) up front, so that it can be reused.

In [22]:
# Build on the 16x16 split kernel: first add a prefetch of `a` over the
# footprint swept by i_inner, then a prefetch of `b` over that of j_inner.
fetch_knl = lp.add_prefetch(
    lp.add_prefetch(split_knl, "a", ["i_inner"]),
    "b", ["j_inner"],
)

# Run the prefetching kernel; its single output is bound to `mat`.
evt, (mat,) = fetch_knl(queue, a=a, b=b)