Skip to content

Commit

Permalink
Added support for SYCL indirect kernel execution.
Browse files Browse the repository at this point in the history
An indirect kernel is one where we have a functor
that is not trivially copyable and so is explicitly
constructed by the host in USM shared memory before being passed
"by pointer" (inside a reference_wrapper) to SYCL parallel_for.

This is to address the limitation that SYCL
data types can only be implicitly copied to the device if they
are trivially copyable.
  • Loading branch information
nliber authored and masterleinad committed Oct 14, 2020
1 parent 68bfe5b commit 2f3f8e7
Show file tree
Hide file tree
Showing 8 changed files with 97 additions and 145 deletions.
1 change: 0 additions & 1 deletion core/src/Kokkos_SYCL.hpp
Expand Up @@ -50,7 +50,6 @@
#ifdef KOKKOS_ENABLE_SYCL
#include <CL/sycl.hpp>
#include <Kokkos_SYCL_Space.hpp>
#include <SYCL/Kokkos_SYCL_Parallel_Range.hpp>
#include <Kokkos_Layout.hpp>
#include <Kokkos_ScratchSpace.hpp>
#include <impl/Kokkos_ExecSpaceInitializer.hpp>
Expand Down
5 changes: 3 additions & 2 deletions core/src/Kokkos_SYCL_Space.hpp
Expand Up @@ -46,9 +46,10 @@
#define KOKKOS_SYCLSPACE_HPP

#include <Kokkos_Core_fwd.hpp>
#include <impl/Kokkos_SharedAlloc.hpp>

#ifdef KOKKOS_ENABLE_SYCL
#include <SYCL/Kokkos_SYCL_Instance.hpp>
#include <impl/Kokkos_SharedAlloc.hpp>

namespace Kokkos {
namespace Experimental {
Expand All @@ -58,7 +59,7 @@ class SYCLDeviceUSMSpace {
using execution_space = SYCL;
using memory_space = SYCLDeviceUSMSpace;
using device_type = Kokkos::Device<execution_space, memory_space>;
using size_type = unsigned int;
using size_type = Impl::SYCLInternal::size_type;

SYCLDeviceUSMSpace();

Expand Down
3 changes: 3 additions & 0 deletions core/src/SYCL/Kokkos_SYCL_Instance.cpp
Expand Up @@ -98,6 +98,7 @@ void SYCLInternal::initialize(const cl::sycl::device& d) {
if (ok_init && ok_dev) {
m_queue = std::make_unique<cl::sycl::queue>(d);
std::cout << SYCL::SYCLDevice(d) << '\n';
m_indirectKernel.emplace(IndirectKernelAllocator(*m_queue));
} else {
std::ostringstream msg;
msg << "Kokkos::Experimental::SYCL::initialize(...) FAILED";
Expand All @@ -116,6 +117,8 @@ void SYCLInternal::finalize() {
// FIXME_SYCL
std::abort();
}

m_indirectKernel.reset();
m_queue.reset();
}

Expand Down
15 changes: 14 additions & 1 deletion core/src/SYCL/Kokkos_SYCL_Instance.hpp
Expand Up @@ -54,7 +54,7 @@ namespace Impl {

class SYCLInternal {
public:
using size_type = unsigned int;
using size_type = int;

SYCLInternal() = default;
~SYCLInternal();
Expand All @@ -70,6 +70,19 @@ class SYCLInternal {

std::unique_ptr<cl::sycl::queue> m_queue;

// An indirect kernel is one where the functor to be executed is explicitly
// created in USM shared memory before being executed, to get around the
// trivially copyable limitation of SYCL.
//
// m_indirectKernel just manages the memory as a reuseable buffer. It is
// stored in an optional because the allocator contains a queue
using IndirectKernelAllocator =
sycl::usm_allocator<std::byte, sycl::usm::alloc::shared>;
using IndirectKernelMemory =
std::vector<IndirectKernelAllocator::value_type, IndirectKernelAllocator>;
using IndirectKernel = std::optional<IndirectKernelMemory>;
IndirectKernel m_indirectKernel;

static int was_finalized;

static SYCLInternal& singleton();
Expand Down
116 changes: 0 additions & 116 deletions core/src/SYCL/Kokkos_SYCL_KernelLaunch.hpp

This file was deleted.

82 changes: 57 additions & 25 deletions core/src/SYCL/Kokkos_SYCL_Parallel_Range.hpp
Expand Up @@ -45,51 +45,83 @@
#ifndef KOKKOS_SYCL_PARALLEL_RANGE_HPP_
#define KOKKOS_SYCL_PARALLEL_RANGE_HPP_

#include <SYCL/Kokkos_SYCL_KernelLaunch.hpp>
//#include <algorithm>
//#include <functional>

template <class FunctorType, class ExecPolicy>
class Kokkos::Impl::ParallelFor<FunctorType, ExecPolicy,
Kokkos::Experimental::SYCL> {
public:
typedef ExecPolicy Policy;
using Policy = ExecPolicy;

private:
typedef typename Policy::member_type Member;
typedef typename Policy::work_tag WorkTag;
typedef typename Policy::launch_bounds LaunchBounds;
using Member = typename Policy::member_type;
using WorkTag = typename Policy::work_tag;
using LaunchBounds = typename Policy::launch_bounds;

public:
const FunctorType m_functor;
const Policy m_policy;

private:
ParallelFor() = delete;
ParallelFor& operator=(const ParallelFor&) = delete;

template <class TagType>
typename std::enable_if<std::is_same<TagType, void>::value>::type exec_range(
const Member i) const {
m_functor(i);
}
static void sycl_direct_launch(const Policy& policy,
const FunctorType& functor) {
// Convenience references
const Kokkos::Experimental::SYCL& space = policy.space();
Kokkos::Experimental::Impl::SYCLInternal& instance =
*space.impl_internal_space_instance();
cl::sycl::queue& q = *instance.m_queue;

q.wait();

q.submit([functor, policy](cl::sycl::handler& cgh) {
cl::sycl::range<1> range(policy.end() - policy.begin());

cgh.parallel_for(range, [=](cl::sycl::item<1> item) {
const typename Policy::index_type id = item.get_linear_id();
if constexpr (std::is_same<WorkTag, void>::value)
functor(id);
else
functor(WorkTag(), id);
});
});

template <class TagType>
typename std::enable_if<!std::is_same<TagType, void>::value>::type exec_range(
const Member i) const {
m_functor(TagType(), i);
q.wait();
}

public:
typedef FunctorType functor_type;
// Indirectly launch a functor by explicitly creating it in USM shared memory
void sycl_indirect_launch() const {
// Convenience references
const Kokkos::Experimental::SYCL& space = m_policy.space();
Kokkos::Experimental::Impl::SYCLInternal& instance =
*space.impl_internal_space_instance();
Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMemory& kernelMem =
*instance.m_indirectKernel;

// Allocate USM shared memory for the functor
kernelMem.resize(std::max(kernelMem.size(), sizeof(m_functor)));

// Placement new a copy of functor into USM shared memory
//
// Store it in a unique_ptr to call its destructor on scope exit
std::unique_ptr<FunctorType, Kokkos::Impl::destruct_delete>
kernelFunctorPtr(new (kernelMem.data()) FunctorType(m_functor));

inline void operator()(cl::sycl::item<1> item) const {
int id = item.get_linear_id();
m_functor(id);
// Use reference_wrapper (because it is both trivially copyable and
// invocable) and launch it
sycl_direct_launch(m_policy, std::reference_wrapper(*kernelFunctorPtr));
}

inline void execute() const {
Kokkos::Experimental::Impl::sycl_launch(*this);
public:
using functor_type = FunctorType;

void execute() const {
// if the functor is trivially copyable, we can launch it directly;
// otherwise, we will launch it indirectly via explicitly creating
// it in USM shared memory.
if constexpr (std::is_trivially_copyable_v<decltype(m_functor)>)
sycl_direct_launch(m_policy, m_functor);
else
sycl_indirect_launch();
}

ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
Expand Down
1 change: 1 addition & 0 deletions core/src/SYCL/Kokkos_SYCL_Space.cpp
Expand Up @@ -47,6 +47,7 @@
#include <Kokkos_HostSpace.hpp>
#include <impl/Kokkos_Profiling.hpp>
#include <Kokkos_SYCL.hpp>
#include <SYCL/Kokkos_SYCL_Instance.hpp>
#include <impl/Kokkos_MemorySpace.hpp>

/*--------------------------------------------------------------------------*/
Expand Down
19 changes: 19 additions & 0 deletions core/src/impl/Kokkos_Utilities.hpp
Expand Up @@ -474,6 +474,25 @@ emulate_fold_comma_operator(Ts&&...) noexcept {
// </editor-fold> end Folding emulation }}}1
//==============================================================================

//==============================================================================
// destruct_delete is a unique_ptr deleter for objects
// created by placement new into already allocated memory
// by only calling the destructor on the object.
//
// Because unique_ptr never calls its deleter with a nullptr value,
// no need to check if p == nullptr.
//
// Note: This differs in interface from std::default_delete in that the
// function call operator is templated instead of the class, to make
// it easier to use and disallow specialization.
struct destruct_delete {
template <typename T>
KOKKOS_INLINE_FUNCTION constexpr void operator()(T* p) const noexcept {
p->~T();
}
};
//==============================================================================

} // namespace Impl
} // namespace Kokkos

Expand Down

0 comments on commit 2f3f8e7

Please sign in to comment.