Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYCL][PI][CUDA] Update queries for atomic order and scope for CUDA #4853

Merged
merged 10 commits into from
Jan 26, 2022
14 changes: 11 additions & 3 deletions sycl/include/CL/sycl/detail/pi.h
Original file line number Diff line number Diff line change
Expand Up @@ -304,8 +304,8 @@ typedef enum {
PI_EXT_ONEAPI_DEVICE_INFO_MAX_GLOBAL_WORK_GROUPS = 0x20000,
PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_1D = 0x20001,
PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_2D = 0x20002,
PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_3D = 0x20003

PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_3D = 0x20003,
PI_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES = 0x11000
t4c1 marked this conversation as resolved.
Show resolved Hide resolved
} _pi_device_info;

typedef enum {
Expand All @@ -326,7 +326,8 @@ typedef enum {
PI_CONTEXT_INFO_PROPERTIES = CL_CONTEXT_PROPERTIES,
PI_CONTEXT_INFO_REFERENCE_COUNT = CL_CONTEXT_REFERENCE_COUNT,
// Atomics capabilities extensions
PI_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES = 0x10010
PI_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES = 0x10010,
PI_CONTEXT_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES = 0x10011
} _pi_context_info;

typedef enum {
Expand Down Expand Up @@ -533,6 +534,13 @@ constexpr pi_memory_order_capabilities PI_MEMORY_ORDER_RELEASE = 0x04;
constexpr pi_memory_order_capabilities PI_MEMORY_ORDER_ACQ_REL = 0x08;
constexpr pi_memory_order_capabilities PI_MEMORY_ORDER_SEQ_CST = 0x10;

using pi_memory_scope_capabilities = pi_bitfield;
constexpr pi_memory_scope_capabilities PI_MEMORY_SCOPE_WORK_ITEM = 0x01;
constexpr pi_memory_scope_capabilities PI_MEMORY_SCOPE_SUB_GROUP = 0x02;
constexpr pi_memory_scope_capabilities PI_MEMORY_SCOPE_WORK_GROUP = 0x04;
constexpr pi_memory_scope_capabilities PI_MEMORY_SCOPE_DEVICE = 0x08;
constexpr pi_memory_scope_capabilities PI_MEMORY_SCOPE_SYSTEM = 0x10;

typedef enum {
PI_PROFILING_INFO_COMMAND_QUEUED = CL_PROFILING_COMMAND_QUEUED,
PI_PROFILING_INFO_COMMAND_SUBMIT = CL_PROFILING_COMMAND_SUBMIT,
Expand Down
1 change: 1 addition & 0 deletions sycl/include/CL/sycl/info/context_traits.def
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ __SYCL_PARAM_TRAITS_SPEC(context, reference_count, cl_uint)
__SYCL_PARAM_TRAITS_SPEC(context, platform, cl::sycl::platform)
__SYCL_PARAM_TRAITS_SPEC(context, devices, std::vector<cl::sycl::device>)
__SYCL_PARAM_TRAITS_SPEC(context, atomic_memory_order_capabilities, std::vector<cl::sycl::memory_order>)
__SYCL_PARAM_TRAITS_SPEC(context, atomic_memory_scope_capabilities, std::vector<cl::sycl::memory_scope>)
2 changes: 2 additions & 0 deletions sycl/include/CL/sycl/info/device_traits.def
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ __SYCL_PARAM_TRAITS_SPEC(device, image_support, bool)
__SYCL_PARAM_TRAITS_SPEC(device, atomic64, bool)
__SYCL_PARAM_TRAITS_SPEC(device, atomic_memory_order_capabilities,
std::vector<cl::sycl::memory_order>)
__SYCL_PARAM_TRAITS_SPEC(device, atomic_memory_scope_capabilities,
std::vector<cl::sycl::memory_scope>)
__SYCL_PARAM_TRAITS_SPEC(device, max_read_image_args, pi_uint32)
__SYCL_PARAM_TRAITS_SPEC(device, max_write_image_args, pi_uint32)
__SYCL_PARAM_TRAITS_SPEC(device, image2d_max_width, size_t)
Expand Down
7 changes: 6 additions & 1 deletion sycl/include/CL/sycl/info/info_desc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ class program;
#endif
class device;
class platform;
enum class memory_scope;

// TODO: stop using OpenCL directly, use PI.
namespace info {
Expand All @@ -43,6 +44,8 @@ enum class context : cl_context_info {
devices = CL_CONTEXT_DEVICES,
atomic_memory_order_capabilities =
PI_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES,
atomic_memory_scope_capabilities =
PI_CONTEXT_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES,
};

// A.3 Device information descriptors
Expand Down Expand Up @@ -164,7 +167,9 @@ enum class device : cl_device_info {
PI_EXT_ONEAPI_DEVICE_INFO_MAX_GLOBAL_WORK_GROUPS,
ext_oneapi_max_work_groups_1d = PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_1D,
ext_oneapi_max_work_groups_2d = PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_2D,
ext_oneapi_max_work_groups_3d = PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_3D
ext_oneapi_max_work_groups_3d = PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_3D,
atomic_memory_scope_capabilities =
PI_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES
};

enum class device_type : pi_uint64 {
Expand Down
16 changes: 16 additions & 0 deletions sycl/include/CL/sycl/memory_enums.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,22 @@ readMemoryOrderBitfield(pi_memory_order_capabilities bits) {
return result;
}

inline std::vector<memory_scope>
readMemoryScopeBitfield(pi_memory_scope_capabilities bits) {
std::vector<memory_scope> result;
if (bits & PI_MEMORY_SCOPE_WORK_ITEM)
result.push_back(memory_scope::work_item);
if (bits & PI_MEMORY_SCOPE_SUB_GROUP)
result.push_back(memory_scope::sub_group);
if (bits & PI_MEMORY_SCOPE_WORK_GROUP)
result.push_back(memory_scope::work_group);
if (bits & PI_MEMORY_SCOPE_DEVICE)
result.push_back(memory_scope::device);
if (bits & PI_MEMORY_SCOPE_SYSTEM)
result.push_back(memory_scope::system);
return result;
}

#ifndef __SYCL_DEVICE_ONLY__
static constexpr std::memory_order getStdMemoryOrder(sycl::memory_order order) {
switch (order) {
Expand Down
55 changes: 51 additions & 4 deletions sycl/plugins/cuda/pi_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -849,6 +849,33 @@ pi_result cuda_piContextGetInfo(pi_context context, pi_context_info param_name,
case PI_CONTEXT_INFO_REFERENCE_COUNT:
return getInfo(param_value_size, param_value, param_value_size_ret,
context->get_reference_count());
case PI_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: {
int major = 0;
cl::sycl::detail::pi::assertion(
cuDeviceGetAttribute(&major,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
context->get_device()->get()) == CUDA_SUCCESS);
pi_memory_order_capabilities capabilities =
(major >= 6) ? PI_MEMORY_ORDER_RELAXED | PI_MEMORY_ORDER_ACQUIRE |
PI_MEMORY_ORDER_RELEASE | PI_MEMORY_ORDER_ACQ_REL
: PI_MEMORY_ORDER_RELAXED;
return getInfo(param_value_size, param_value, param_value_size_ret,
capabilities);
}
case PI_CONTEXT_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: {
int major = 0;
cl::sycl::detail::pi::assertion(
cuDeviceGetAttribute(&major,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
context->get_device()->get()) == CUDA_SUCCESS);
pi_memory_order_capabilities capabilities =
(major >= 5) ? PI_MEMORY_SCOPE_WORK_ITEM | PI_MEMORY_SCOPE_SUB_GROUP |
PI_MEMORY_SCOPE_WORK_GROUP | PI_MEMORY_SCOPE_DEVICE |
PI_MEMORY_SCOPE_SYSTEM
: PI_MEMORY_SCOPE_DEVICE;
return getInfo(param_value_size, param_value, param_value_size_ret,
capabilities);
}
default:
__SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
}
Expand Down Expand Up @@ -1110,11 +1137,31 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
atomic64);
}
case PI_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: {
// NVPTX currently only support at most monotonic atomic load/store.
// Acquire and release is present in newer PTX, but is not yet supported
// in LLVM NVPTX.
int major = 0;
cl::sycl::detail::pi::assertion(
cuDeviceGetAttribute(&major,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
device->get()) == CUDA_SUCCESS);
pi_memory_order_capabilities capabilities =
(major >= 6) ? PI_MEMORY_ORDER_RELAXED | PI_MEMORY_ORDER_ACQUIRE |
PI_MEMORY_ORDER_RELEASE | PI_MEMORY_ORDER_ACQ_REL
: PI_MEMORY_ORDER_RELAXED;
return getInfo(param_value_size, param_value, param_value_size_ret,
capabilities);
}
case PI_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: {
int major = 0;
cl::sycl::detail::pi::assertion(
cuDeviceGetAttribute(&major,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
device->get()) == CUDA_SUCCESS);
pi_memory_order_capabilities capabilities =
(major >= 5) ? PI_MEMORY_SCOPE_WORK_ITEM | PI_MEMORY_SCOPE_SUB_GROUP |
PI_MEMORY_SCOPE_WORK_GROUP | PI_MEMORY_SCOPE_DEVICE |
PI_MEMORY_SCOPE_SYSTEM
: PI_MEMORY_SCOPE_DEVICE;
return getInfo(param_value_size, param_value, param_value_size_ret,
PI_MEMORY_ORDER_RELAXED);
capabilities);
}
case PI_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: {
// NVIDIA devices only support one sub-group size (the warp size)
Expand Down
18 changes: 18 additions & 0 deletions sycl/source/detail/context_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,24 @@ context_impl::get_info<info::context::atomic_memory_order_capabilities>()
sizeof(Result), &Result, nullptr);
return readMemoryOrderBitfield(Result);
}
template <>
std::vector<cl::sycl::memory_scope>
context_impl::get_info<info::context::atomic_memory_scope_capabilities>()
const {
if (is_host())
return {cl::sycl::memory_scope::work_item,
cl::sycl::memory_scope::sub_group,
cl::sycl::memory_scope::work_group, cl::sycl::memory_scope::device,
cl::sycl::memory_scope::system};

pi_memory_scope_capabilities Result;
getPlugin().call<PiApiKind::piContextGetInfo>(
MContext,
pi::cast<pi_context_info>(
info::context::atomic_memory_scope_capabilities),
sizeof(Result), &Result, nullptr);
return readMemoryScopeBitfield(Result);
}

RT::PiContext &context_impl::getHandleRef() { return MContext; }
const RT::PiContext &context_impl::getHandleRef() const { return MContext; }
Expand Down
22 changes: 22 additions & 0 deletions sycl/source/detail/device_info.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,21 @@ struct get_device_info<std::vector<memory_order>,
}
};

// Specialization for atomic_memory_scope_capabilities, PI returns a bitfield
template <>
struct get_device_info<std::vector<memory_scope>,
info::device::atomic_memory_scope_capabilities> {
static std::vector<memory_scope> get(RT::PiDevice dev, const plugin &Plugin) {
pi_memory_scope_capabilities result;
Plugin.call_nocheck<PiApiKind::piDeviceGetInfo>(
dev,
pi::cast<RT::PiDeviceInfo>(
info::device::atomic_memory_scope_capabilities),
sizeof(pi_memory_scope_capabilities), &result, nullptr);
return readMemoryScopeBitfield(result);
}
};

// Specialization for exec_capabilities, OpenCL returns a bitfield
template <>
struct get_device_info<std::vector<info::execution_capability>,
Expand Down Expand Up @@ -744,6 +759,13 @@ get_device_info_host<info::device::atomic_memory_order_capabilities>() {
memory_order::acq_rel, memory_order::seq_cst};
}

template <>
inline std::vector<memory_scope>
get_device_info_host<info::device::atomic_memory_scope_capabilities>() {
return {memory_scope::work_item, memory_scope::sub_group,
memory_scope::work_group, memory_scope::device, memory_scope::system};
}

template <>
inline cl_uint get_device_info_host<info::device::max_read_image_args>() {
// current value is the required minimum
Expand Down