# CUDA to SYCL with Buffers and Ranges

``` sh
# Build
cd src
./scripts/alcf/sycl.sh
cd "build_sycl_${USER}" # We can change this
make

# Run (replace each <...> placeholder with one of the listed values)
cd bin
./raja-perf.exe -od output -k <DAXPY/LTIMES> -v <Base_Sycl/RAJA_Sycl>
# -od: directory for output
# -k: Kernels to run via pattern match
# -v: Variant to run (Base, RAJA / SYCL,CUDA,OpenMP), not all are built
```

## Starting Point
### Existing CUDA code

```c
namespace rajaperf
{
namespace basic
{

  //
  // Define thread block size for CUDA execution
  // (threads per block; used by both the raw kernel launch and the
  //  RAJA::cuda_exec policy below)
  //
  const size_t block_size = 256;


// Device-side setup: for each of x and y, allocate a device array of iend
// elements and initialize it from the corresponding host array (m_x, m_y).
// Expects x, y, m_x, m_y and iend to be in scope where the macro is expanded.
#define DAXPY_DATA_SETUP_CUDA \
  allocAndInitCudaDeviceData(x, m_x, iend); \
  allocAndInitCudaDeviceData(y, m_y, iend);

// Device-side teardown: copy the iend results in y back to the host array
// m_y, then release both device arrays. Only y is copied back; x is just freed.
#define DAXPY_DATA_TEARDOWN_CUDA \
  getCudaDeviceData(m_y, y, iend); \
  deallocCudaDeviceData(x); \
  deallocCudaDeviceData(y);

//
// CUDA kernel for DAXPY: each thread applies DAXPY_BODY to at most one index.
//
//   y, x  - arrays referenced by DAXPY_BODY (macro defined elsewhere;
//           presumably y[i] += a * x[i] -- confirm against DAXPY.hpp)
//   a     - scalar referenced by DAXPY_BODY
//   iend  - number of elements; threads whose index is >= iend do nothing
//
// Expects a 1D launch (only the .x components of the built-in indices are used)
// with at least iend threads in total.
//
__global__ void daxpy(Real_ptr y, Real_ptr x,
                      Real_type a,
                      Index_type iend)
{
   // Flat global thread index. The name `i` is required by DAXPY_BODY.
   // NOTE(review): blockIdx.x * blockDim.x is evaluated in 32-bit unsigned
   // arithmetic before the conversion to Index_type -- confirm iend can never
   // be large enough for that product to wrap.
   Index_type i = blockIdx.x * blockDim.x + threadIdx.x;
   // Guard the grid tail: the grid is rounded up to whole blocks.
   if (i < iend) {
     DAXPY_BODY;
   }
}


//
// Run the DAXPY kernel with the CUDA variant selected by vid.
//
//   vid - Base_CUDA launches the hand-written daxpy kernel;
//         RAJA_CUDA runs the same body through RAJA::forall.
//         Any other value only prints a diagnostic to std::cout.
//
// Both variants repeat the kernel getRunReps() times over [0, getRunSize())
// between startTimer() and stopTimer(); device data setup and teardown happen
// outside the timed region.
//
void DAXPY::runCudaVariant(VariantID vid)
{
  const Index_type run_reps = getRunReps();
  const Index_type ibegin = 0;
  const Index_type iend = getRunSize();

  // Macro defined elsewhere; presumably declares the x, y and a used below
  // from the host-side members -- TODO confirm against DAXPY.hpp.
  DAXPY_DATA_SETUP;

  if ( vid == Base_CUDA ) {

    DAXPY_DATA_SETUP_CUDA;

    startTimer();
    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

      // Enough blocks of block_size threads to cover all iend elements
      // (presumably a rounded-up integer division -- confirm the macro).
      const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
      // NOTE(review): no cudaGetLastError() check follows the launch, so a
      // bad launch configuration would go unnoticed here.
      daxpy<<<grid_size, block_size>>>( y, x, a,
                                        iend );

    }
    // NOTE(review): kernel launches are asynchronous and no device
    // synchronization is visible before this call -- confirm stopTimer()
    // synchronizes, otherwise only launch overhead is timed.
    stopTimer();

    DAXPY_DATA_TEARDOWN_CUDA;

  } else if ( vid == RAJA_CUDA ) {

    DAXPY_DATA_SETUP_CUDA;

    startTimer();
    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

      // Same loop body, expressed as a device lambda over [ibegin, iend);
      // the policy carries the block size and requests asynchronous execution.
      RAJA::forall< RAJA::cuda_exec<block_size, true /*async*/> >(
        RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) {
        DAXPY_BODY;
      });

    }
    stopTimer();

    DAXPY_DATA_TEARDOWN_CUDA;

  } else {
     std::cout << "\n  DAXPY : Unknown Cuda variant id = " << vid << std::endl;
  }
}
```

In [None]:
# Code to run DAXPY, maybe just sequential version?

## Data Setup

The `DAXPY_DATA_SETUP_CUDA` macro calls `allocAndInitCudaDeviceData` which is used to simplify the memory calls.

```c
  cudaMalloc( (void**)&dptr,
              len * sizeof(typename std::remove_pointer<T>::type) );
              
  cudaMemcpy( dptr, hptr,
              len * sizeof(typename std::remove_pointer<T>::type),
              cudaMemcpyHostToDevice );
                          
```

With SYCL we are able to use implicit memory management using `sycl::buffer` and `sycl::accessor`.  We are able to declare a buffer with attached source data.  For the above example, `allocAndInitCudaDeviceData(x, m_x, iend);`, we will declare a buffer with `m_x` as our source data and `iend` as our length.

```c
  sycl::buffer<Real_type> d_x { m_x, iend };
```

Note that this does not attach memory to a specific device.


## Kernel 

The CUDA kernel is defined within our namespace
```c
// One thread per element: each thread applies DAXPY_BODY to at most one index.
__global__ void daxpy(Real_ptr y, Real_ptr x,
                      Real_type a,
                      Index_type iend)
{
   // Flat global index of this thread in a 1D launch.
   Index_type i = blockIdx.x * blockDim.x + threadIdx.x;
   // The grid may contain more threads than elements; skip the excess.
   if (i < iend) {
     DAXPY_BODY;
   }
}
```
In SYCL we can write our kernel inline using a lambda.
```c
  // Kernel body as a lambda: invoked once per index of the iteration space.
  [=] (sycl::item<1> item ) {

    // DAXPY_BODY refers to the index by the name `i`.
    Index_type i = item.get_id(0);
    DAXPY_BODY

  } // end of lambda (passed as the last argument to parallel_for)
```

Note that we use the `sycl::item` to access our iteration space.

## Kernel Launch
The CUDA kernel launch defines a grid size and a block size.
```c
      // Number of blocks needed so that grid_size * block_size covers all
      // iend elements (presumably a rounded-up division -- confirm the macro).
      const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
      // 1D launch: grid_size blocks of block_size threads each.
      daxpy<<<grid_size, block_size>>>( y, x, a,
                                        iend );
```
For now we will leave this to the SYCL runtime and instead pass in our global range.  This is given to the `parallel_for` as `sycl::range<1>(iend)`.  The template parameter defines the number of dimensions, 1.
```c
  // Submit one unit of work to the queue; the handler h is used both to
  // request data access and to launch the kernel.
  qu.submit([&] (sycl::handler& h) {
    // Accessors declare which buffers the kernel uses and how:
    // x is only read, y is read and written.
    auto x = d_x.get_access<sycl::access::mode::read>(h);
    auto y = d_y.get_access<sycl::access::mode::read_write>(h);

    // Global range of iend indices in one dimension; no work-group size is given.
    h.parallel_for(sycl::range<1>(iend), 
      // Here is where we use the above lambda
      [=] (sycl::item<1> item ) {

        // DAXPY_BODY refers to the index by the name `i`.
        Index_type i = item.get_id(0);
        DAXPY_BODY

    });
  });
```
Using our queue `qu`, we submit the accessors used by our kernel along with the `parallel_for`. The accessors, e.g. `auto x = d_x.get_access<sycl::access::mode::read>(h);`, tell the runtime what data is needed by our kernel and how it will be used, e.g. `read` or `read_write`. This enables us to manage our memory implicitly.  The `qu.submit` call submits the work asynchronously and returns a `sycl::event`.


## Data Teardown

Before the data moves back to the host, we want to finish all of our asynchronous execution.
```c
  qu.wait();
```
In CUDA we call `getCudaDeviceData` and `deallocCudaDeviceData` which are wrappers for:
```c
  cudaMemcpy( hptr, dptr,
              len * sizeof(typename std::remove_pointer<T>::type),
              cudaMemcpyDeviceToHost );
  // and
  cudaFree( dptr );
              
```
When using buffers in SYCL, we allow the buffer to fall out of scope, triggering a copy back to the source data location.

```c
{ // Scope for our buffer
  // The buffer is attached to host_data for the lifetime of this scope.
  sycl::buffer<type> buf(host_data, len);
  /* Do our work
   * Update buf via accessors
   * Finish our work */
} // Trigger data movement back to host_data
```

There is support for changing where the data is written back to, or triggering an update before the buffer falls out of scope.

## Let's put it all together


```c
#include "DAXPY.hpp"

#include "RAJA/RAJA.hpp"

#if defined(RAJA_ENABLE_SYCL)

#include "common/SyclDataUtils.hpp"

#include <iostream>

namespace rajaperf
{
namespace basic
{

  //
  // Define thread block size for SYCL execution
  //
  const size_t block_size = 256; // We could query our device for this

// Wrap the host arrays m_x and m_y (iend elements each) in SYCL buffers.
// Expects m_x, m_y and iend to be in scope where the macro is expanded, and
// declares d_x and d_y in that scope.
#define DAXPY_DATA_SETUP_SYCL \
  sycl::buffer<Real_type> d_x { m_x, iend }; \
  sycl::buffer<Real_type> d_y { m_y, iend };


// Nothing to do here: the buffers write their data back to the host arrays
// when they fall out of scope, so this macro intentionally expands to nothing.
#define DAXPY_DATA_TEARDOWN_SYCL

//
// Run the DAXPY kernel with the SYCL variant selected by vid.
//
//   vid - Base_SYCL submits the kernel directly to the queue qu;
//         RAJA_SYCL is not implemented yet and does nothing;
//         any other value only prints a diagnostic to std::cout.
//
// NOTE(review): qu is not declared in this listing; presumably a sycl::queue
// provided by common/SyclDataUtils.hpp -- confirm.
//
void DAXPY::runSyclVariant(VariantID vid)
{
  const Index_type run_reps = getRunReps();
  const Index_type ibegin = 0; // not used by the Base_SYCL branch below
  const Index_type iend = getRunSize();

  DAXPY_DATA_SETUP; // This sets up our host data. m_x, m_y.

  if ( vid == Base_SYCL ) {
    { // Create a scope for our buffers

      DAXPY_DATA_SETUP_SYCL;

      startTimer();
      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

        // One submission per repetition; submissions are asynchronous.
        qu.submit([&] (sycl::handler& h) {
          // Accessors declare how the kernel uses each buffer:
          // x is only read, y is read and written.
          auto x = d_x.get_access<sycl::access::mode::read>(h);
          auto y = d_y.get_access<sycl::access::mode::read_write>(h);

          // Global range of iend indices in one dimension; block_size is not
          // passed, so the work-group size is left to the runtime.
          h.parallel_for(sycl::range<1>(iend), [=] (sycl::item<1> item ) {

            // DAXPY_BODY refers to the index by the name `i`. It is used here
            // without a trailing semicolon, so the macro presumably supplies
            // its own -- confirm against DAXPY.hpp.
            Index_type i = item.get_id(0);
            DAXPY_BODY

          });
        });
      }
      qu.wait(); // Wait for computation to finish before stopping timer
      stopTimer();

    } // End of buffer scope: d_x and d_y are destroyed here, after the timer
      // has stopped, which triggers the copy back to the host arrays

    DAXPY_DATA_TEARDOWN_SYCL;

  } else if ( vid == RAJA_SYCL ) {

  // We will do this later

  } else {
     std::cout << "\n  DAXPY : Unknown Sycl variant id = " << vid << std::endl;
  }

}

} // end namespace basic
} // end namespace rajaperf
```

In [None]:
# Now Run It !!!