### Imports

In [1]:
import argparse
import numpy as np
import occa

### Initialize Data

In [2]:
# Create device (Serial mode)
device = occa.Device(mode='Serial')

# Alternatively, try out:
#
#   device.setup(mode='OpenMP',
#                schedule='compact',
#                chunk=10)
#
#   device.setup(mode='OpenCL',
#                platform_id=0,
#                device_id=0)
#
#   device.setup(mode='CUDA',
#                device_id=0)

# Allocate host-side (NumPy) arrays: two inputs and a zero-filled result buffer
entries = 10

a  = np.arange(entries, dtype=np.float32)
b  = 1 - a  # chosen so that a[i] + b[i] == 1 for every i
ab = np.zeros(entries, dtype=np.float32)

# Allocate memory in device and copy over data:
# o_a and o_b are created from the host arrays, while o_ab is allocated
# from an entry count and dtype only (no host data is passed for it)
o_a  = device.malloc(a)
o_b  = device.malloc(b)
o_ab = device.malloc(entries, dtype=np.float32)

### Build Kernel

In [3]:
# Kernel source, kept in a raw string so it is handed to the builder verbatim.
# The kernel writes ab[i] = a[i] + b[i] for every i in [0, entries).
add_vectors_source = r'''
@kernel void addVectors(const int entries,
                        const float *a,
                        const float *b,
                        float *ab) {
  for (int i = 0; i < entries; ++i; @tile(16, @outer, @inner)) {
    ab[i] = a[i] + b[i];
  }
}
'''

# Build the kernel named 'addVectors' from the source string on `device`
add_vectors = device.build_kernel_from_string(add_vectors_source,
                                              'addVectors')

# Or you can build from a file
# add_vectors = device.build_kernel('addVectors.okl',
#                                   'addVectors')

### Launch Kernel and Copy to Host

In [4]:
# Launch the kernel on the device buffers; the entry count is wrapped in
# np.intc to match the kernel's `const int entries` parameter
add_vectors(np.intc(entries),
            o_a, o_b, o_ab)
# Before copying back: the host array `ab` still holds the zeros it was
# initialized with, because the kernel wrote to the device buffer o_ab
ab

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)

In [5]:
# Copy device data to host
o_ab.copy_to(ab)

# Sanity check: the kernel computes ab[i] = a[i] + b[i], so the copied-back
# result must match the same sum computed on the host with NumPy. This makes
# a broken kernel build or copy fail loudly on Restart & Run All.
np.testing.assert_allclose(ab, a + b)

# Display the result now that it lives in host memory
ab

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], dtype=float32)