# Tutorial Notebook

We'll go through the OCCA basics. To install, run

```bash
pip install occa
```

or from source inside the `occa.py/` directory

```bash
pip install -e .
```

Useful links

- [OCCA Documentation](https://libocca.org/#/guide/occa/introduction)
- [OKL (OCCA Kernel language) Documentation](https://libocca.org/#/guide/okl/introduction)
- [GitHub Repo](https://github.com/libocca/occa.py)

## Imports

In [1]:
import numpy as np
import occa

## Device

> A physical device we're communicating with, whether the same physical device as the host or an offload device. Examples include CPU processors, GPUs, and Xeon Phi.

A few options for setting up a Device:

- Kwargs

```python
occa.Device(mode='CUDA',
            device_id=0)
```

- Dict

```python
occa.Device({
    'mode': 'CUDA',
    'device_id': 0,
})
```

- String (handles JSON and most of JSON5. Note the initial `{}` can be omitted since it expects a JSON object)

```python
occa.Device("mode: 'CUDA', device_id: 0")
```

In [2]:
# Create the device that the rest of this tutorial allocates memory and builds kernels on.
# We'll use the default 'Serial' device
device = occa.Device(mode='Serial')

## Memory

We need to allocate memory in the device

To initialize data in the device, we can either

- Copy host data to the device
- Modify it inside a kernel call (explained in the [kernel section](#kernel))


In [3]:
# Number of entries in each vector
entries = 10

# Host arrays: a = [0, 1, ..., entries - 1] and b = 1 - a, so a + b is 1 for every entry
a  = np.arange(entries, dtype=np.float32)
b  = 1 - a
# Host array that will receive the result
ab = np.zeros(entries, dtype=np.float32)

# Allocate memory in device and copy over data
o_a  = device.malloc(a)
o_b  = device.malloc(b)
# No host array is given here: only room for `entries` float32 values is requested
o_ab = device.malloc(entries, dtype=np.float32)

## Kernel

Kernels are built at runtime so we require 2 things

- The kernel source code (string or file)
- The name of the kernel in the source code we wish to use

To learn more about the kernel language, check out the [OKL documentation](https://libocca.org/#/guide/okl/introduction)

In [4]:
# OKL source for the kernel: for each i in [0, entries) it stores a[i] + b[i] in ab[i]
add_vectors_source = r'''
@kernel void addVectors(const int entries,
                        const float *a,
                        const float *b,
                        float *ab) {
  for (int i = 0; i < entries; ++i; @tile(16, @outer, @inner)) {
    ab[i] = a[i] + b[i];
  }
}
'''

# Build the 'addVectors' kernel from the source string above, for `device`
add_vectors = device.build_kernel_from_string(add_vectors_source,
                                              'addVectors')

# Or you can build from a file
# add_vectors = device.build_kernel('addVectors.okl',
#                                   'addVectors')

## Launch Kernel

Run an OCCA kernel as a regular function


### Types

Make sure Python and C/C++ types match

For example
- Python's `int` is usually the same as C/C++'s `long`
- Python's `float` is usually the same as C/C++'s `double`

Prefer explicit numpy types, such as `np.intc` and `np.float32`, for these ambiguous types

In [5]:
# Launch the kernel on the device buffers; `entries` is wrapped in np.intc
# to match the kernel's `const int entries` argument
add_vectors(np.intc(entries),
            o_a, o_b, o_ab)

# Before: the host array `ab` has not been updated yet
ab

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)

## Syncing Host and Device

Copy over data using `copy_to()`, or create a new array using `to_ndarray()`

In [6]:
# Copy device data to host
o_ab.copy_to(ab)

# After: `ab` now holds the kernel result
ab

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], dtype=float32)

## Kernel + numpy.ndarray

Numpy ndarrays can be passed with the following performance penalties:
- Memory is temporarily allocated in the device
- Numpy data is copied to and from the device
- Kernel launches are no longer asynchronous

However, this feature is useful for fast prototyping

In [7]:
# Start again from a zeroed host result array
ab = np.zeros(entries, dtype=np.float32)
ab

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)

In [8]:
# Pass the host numpy arrays directly instead of device memory
add_vectors(np.intc(entries),
            a, b, ab)
# `ab` is filled in without an explicit copy back to the host
ab

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], dtype=float32)

## **⚠️ Experimental**: Python &rarr; OKL

In [9]:
from occa import okl
from occa.okl import Array, Const, Exclusive, Shared

In [10]:
# Python version of the addVectors kernel. `@okl.kernel` translates the function to OKL;
# print `py_add_vectors.source()` to see the generated code.
# `entries` is not a parameter: it is read from the enclosing scope.
@okl.kernel
def py_add_vectors(a: Const[Array[np.float32]],
                   b: Const[Array[np.float32]],
                   ab: Array[np.float32]) -> None:
    # One tiled loop over all entries, with a tile size of 16
    for i in okl.range(entries).tile(16):
        ab[i] = a[i] + b[i]

In [11]:
# Reset ab
ab = np.zeros(entries, dtype=np.float32)
# Overwrite the device result with the zeroed host array, then read it back to check
o_ab.copy_from(ab)
o_ab.to_ndarray()

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)

In [12]:
# Run the Python-defined kernel on `device`, then read the result back as an ndarray
py_add_vectors(o_a, o_b, o_ab, device=device)
o_ab.to_ndarray()

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], dtype=float32)

In [13]:
# Show the OKL source generated from the Python function
print(py_add_vectors.source())

@kernel void py_add_vectors(const float *a,
                            const float *b,
                            float *ab) {
  for (int i = 0; i < 10; ++i; @tile(16, @outer, @inner)) {
    ab[i] = a[i] + b[i];
  }
}


## **⚠️ Experimental**: Shared memory

In [14]:
# Vector addition again, this time staging each block's sums in a shared array
# (`@shared` in the generated OKL) before writing them to `ab`.
# `entries` is read from the enclosing scope.
@okl.kernel
def py_add_vectors(a: Const[Array[np.float32]],
                   b: Const[Array[np.float32]],
                   ab: Array[np.float32]) -> None:
    # Outer loop: one iteration per block of 16 entries
    for block in okl.range(0, entries, 16).outer:
        s_ab: Shared[Array[np.float32, 16]]
        # Store sum in a shared array
        for i in okl.range(0, 16).inner:
            # Index into the full arrays.
            # NOTE(review): `id` shadows the Python builtin; the name is carried into the generated OKL
            id: int = block + i
            # The last block may be partial, so skip indices past the end
            if id < entries:
                s_ab[i] = a[id] + b[id]
        # Copy over the shared array data
        for i in okl.range(0, 16).inner:
            id: int = block + i
            if id < entries:
                ab[id] = s_ab[i]

In [15]:
# Reset ab
ab = np.zeros(entries, dtype=np.float32)
# Overwrite the device result with the zeroed host array, then read it back to check
o_ab.copy_from(ab)
o_ab.to_ndarray()

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)

In [16]:
# Run the shared-memory kernel on `device`, then read the result back as an ndarray
py_add_vectors(o_a, o_b, o_ab, device=device)
o_ab.to_ndarray()

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], dtype=float32)

In [17]:
# Show the OKL source generated from the Python function
print(py_add_vectors.source())

@kernel void py_add_vectors(const float *a,
                            const float *b,
                            float *ab) {
  for (int block = 0; block < 10; block += 16; @outer) {
    @shared float s_ab[16];
    for (int i = 0; i < 16; ++i; @inner) {
      int id = block + i;
      if (id < 10) {
        s_ab[i] = a[id] + b[id];
      }
    }
    for (int i = 0; i < 16; ++i; @inner) {
      int id = block + i;
      if (id < 10) {
        ab[id] = s_ab[i];
      }
    }
  }
}


## **⚠️ Experimental**: Exclusive memory

In [18]:
# Vector addition again, this time holding each sum in an exclusive variable
# (`@exclusive` in the generated OKL) between the two inner loops.
# `entries` is read from the enclosing scope.
@okl.kernel
def py_add_vectors(a: Const[Array[np.float32]],
                   b: Const[Array[np.float32]],
                   ab: Array[np.float32]) -> None:
    # Outer loop: one iteration per block of 16 entries
    for block in okl.range(0, entries, 16).outer:
        e_ab: Exclusive[np.float32]
        # Store sum in an exclusive variable
        for i in okl.range(0, 16).inner:
            # Index into the full arrays.
            # NOTE(review): `id` shadows the Python builtin; the name is carried into the generated OKL
            id: int = block + i
            # The last block may be partial, so skip indices past the end
            if id < entries:
                e_ab = a[id] + b[id]
        # Copy over the exclusive variable value
        for i in okl.range(0, 16).inner:
            id: int = block + i
            if id < entries:
                ab[id] = e_ab

In [19]:
# Reset ab
ab = np.zeros(entries, dtype=np.float32)
# Overwrite the device result with the zeroed host array, then read it back to check
o_ab.copy_from(ab)
o_ab.to_ndarray()

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)

In [20]:
# Run the exclusive-memory kernel on `device`, then read the result back as an ndarray
py_add_vectors(o_a, o_b, o_ab, device=device)
o_ab.to_ndarray()

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], dtype=float32)

In [21]:
# Show the OKL source generated from the Python function
print(py_add_vectors.source())

@kernel void py_add_vectors(const float *a,
                            const float *b,
                            float *ab) {
  for (int block = 0; block < 10; block += 16; @outer) {
    @exclusive float e_ab;
    for (int i = 0; i < 16; ++i; @inner) {
      int id = block + i;
      if (id < 10) {
        e_ab = a[id] + b[id];
      }
    }
    for (int i = 0; i < 16; ++i; @inner) {
      int id = block + i;
      if (id < 10) {
        ab[id] = e_ab;
      }
    }
  }
}
