diff --git a/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td b/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td index bbaad03a27479..4210a0d150abf 100644 --- a/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td +++ b/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td @@ -82,6 +82,7 @@ def AspectExt_oneapi_limited_graph : Aspect<"ext_oneapi_limited_graph">; def AspectExt_oneapi_private_alloca : Aspect<"ext_oneapi_private_alloca">; def AspectExt_oneapi_queue_profiling_tag : Aspect<"ext_oneapi_queue_profiling_tag">; def AspectExt_oneapi_virtual_mem : Aspect<"ext_oneapi_virtual_mem">; +def AspectExt_oneapi_cuda_cluster_group : Aspect<"ext_oneapi_cuda_cluster_group">; // Deprecated aspects def AspectInt64_base_atomics : Aspect<"int64_base_atomics">; def AspectInt64_extended_atomics : Aspect<"int64_extended_atomics">; @@ -139,7 +140,7 @@ def : TargetInfo<"__TestAspectList", AspectExt_oneapi_ballot_group, AspectExt_oneapi_fixed_size_group, AspectExt_oneapi_opportunistic_group, AspectExt_oneapi_tangle_group, AspectExt_intel_matrix, AspectExt_oneapi_is_composite, AspectExt_oneapi_is_component, AspectExt_oneapi_graph, AspectExt_intel_fpga_task_sequence, AspectExt_oneapi_limited_graph, - AspectExt_oneapi_private_alloca, AspectExt_oneapi_queue_profiling_tag, AspectExt_oneapi_virtual_mem], + AspectExt_oneapi_private_alloca, AspectExt_oneapi_queue_profiling_tag, AspectExt_oneapi_virtual_mem, AspectExt_oneapi_cuda_cluster_group], []>; // This definition serves the only purpose of testing whether the deprecated aspect list defined in here and in SYCL RT // match. @@ -205,9 +206,9 @@ def : CudaTargetInfo<"nvidia_gpu_sm_87", !listconcat(CudaMinAspects, CudaBindles def : CudaTargetInfo<"nvidia_gpu_sm_89", !listconcat(CudaMinAspects, CudaBindlessImagesAspects, [AspectFp16, AspectAtomic64, AspectExt_oneapi_cuda_async_barrier])>; def : CudaTargetInfo<"nvidia_gpu_sm_90", !listconcat(CudaMinAspects, CudaBindlessImagesAspects, - [AspectFp16, AspectAtomic64, AspectExt_oneapi_cuda_async_barrier])>; + [AspectFp16, AspectAtomic64, AspectExt_oneapi_cuda_async_barrier, AspectExt_oneapi_cuda_cluster_group])>; def : CudaTargetInfo<"nvidia_gpu_sm_90a", !listconcat(CudaMinAspects, CudaBindlessImagesAspects, - [AspectFp16, AspectAtomic64, AspectExt_oneapi_cuda_async_barrier])>; + [AspectFp16, AspectAtomic64, AspectExt_oneapi_cuda_async_barrier, AspectExt_oneapi_cuda_cluster_group])>; // // HIP / AMDGPU device aspects diff --git a/sycl/doc/design/DeviceConfigFile.md b/sycl/doc/design/DeviceConfigFile.md index 505c65226a809..35273bbddcc84 100644 --- a/sycl/doc/design/DeviceConfigFile.md +++ b/sycl/doc/design/DeviceConfigFile.md @@ -180,6 +180,7 @@ def AspectExt_intel_free_memory : Aspect<"ext_intel_free_memory">; def AspectExt_intel_device_id : Aspect<"ext_intel_device_id">; def AspectExt_intel_memory_clock_rate : Aspect<"ext_intel_memory_clock_rate">; def AspectExt_intel_memory_bus_width : Aspect<"ext_intel_memory_bus_width">; +def AspectExt_oneapi_cuda_cluster_group : Aspect<"ext_oneapi_cuda_cluster_group">; def AspectEmulated : Aspect<"emulated">; def TargetTable : DynamicTable { diff --git a/sycl/include/sycl/detail/cg.hpp b/sycl/include/sycl/detail/cg.hpp index c1fdc6857b4a0..b6955af412b3a 100644 --- a/sycl/include/sycl/detail/cg.hpp +++ b/sycl/include/sycl/detail/cg.hpp @@ -178,6 +178,7 @@ class CGExecKernel : public CG { std::vector> MAuxiliaryResources; sycl::detail::pi::PiKernelCacheConfig MKernelCacheConfig; bool MKernelIsCooperative = false; + bool MKernelUsesClusterLaunch = false; CGExecKernel(NDRDescT NDRDesc, std::shared_ptr HKernel, std::shared_ptr SyclKernel, @@ -188,7 +189,8 @@ class CGExecKernel : public CG { std::vector> AuxiliaryResources, CGTYPE Type, sycl::detail::pi::PiKernelCacheConfig KernelCacheConfig, - bool KernelIsCooperative, detail::code_location loc = {}) + bool KernelIsCooperative, bool MKernelUsesClusterLaunch, + detail::code_location loc = {}) : CG(Type, std::move(CGData), std::move(loc)), MNDRDesc(std::move(NDRDesc)), MHostKernel(std::move(HKernel)), MSyclKernel(std::move(SyclKernel)), @@ -196,7 +198,8 @@ class CGExecKernel : public CG { MKernelName(std::move(KernelName)), MStreams(std::move(Streams)), MAuxiliaryResources(std::move(AuxiliaryResources)), MKernelCacheConfig(std::move(KernelCacheConfig)), - MKernelIsCooperative(KernelIsCooperative) { + MKernelIsCooperative(KernelIsCooperative), + MKernelUsesClusterLaunch(MKernelUsesClusterLaunch) { assert(getType() == Kernel && "Wrong type of exec kernel CG."); } diff --git a/sycl/include/sycl/detail/cg_types.hpp b/sycl/include/sycl/detail/cg_types.hpp index 9da1f0b664d46..162510440b661 100644 --- a/sycl/include/sycl/detail/cg_types.hpp +++ b/sycl/include/sycl/detail/cg_types.hpp @@ -50,8 +50,8 @@ class ArgDesc { int MIndex; }; -// The structure represents NDRange - global, local sizes, global offset and -// number of dimensions. +// The structure represents NDRange - global, local sizes, global offset, +// number of dimensions, and the cluster dimensions if applicable. class NDRDescT { // The method initializes all sizes for dimensions greater than the passed one // to the default values, so they will not affect execution. @@ -128,6 +128,17 @@ class NDRDescT { Dims = Dims_; } + template void setClusterDimensions(sycl::range N) { + if (Dims_ != Dims) { + throw std::runtime_error( + "Dimensionality of cluster, global and local ranges must be same"); + } + + for (int I = 0; I < Dims_; ++I) { + ClusterDimensions[I] = N[I]; + } + } + sycl::range<3> GlobalSize; sycl::range<3> LocalSize; sycl::id<3> GlobalOffset; @@ -135,6 +146,7 @@ class NDRDescT { /// simplest form of parallel_for_work_group. If set, all other fields must be /// zero sycl::range<3> NumWorkGroups; + sycl::range<3> ClusterDimensions{1, 1, 1}; size_t Dims; }; diff --git a/sycl/include/sycl/detail/pi.def b/sycl/include/sycl/detail/pi.def index 3a41c4881febf..d1d76ddd8ed6f 100644 --- a/sycl/include/sycl/detail/pi.def +++ b/sycl/include/sycl/detail/pi.def @@ -230,4 +230,7 @@ _PI_API(piextVirtualMemGetInfo) // Enqueue native command _PI_API(piextEnqueueNativeCommand) +// Kernel Launch Properties +_PI_API(piextEnqueueKernelLaunchCustom) + #undef _PI_API diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h index 66f0a57a3060b..8ead3c6f355e8 100644 --- a/sycl/include/sycl/detail/pi.h +++ b/sycl/include/sycl/detail/pi.h @@ -197,9 +197,11 @@ // pi_virtual_access_flags bit flags. // 15.55 Added piextEnqueueNativeCommand as well as associated types and enums // 16.56 Replaced piextUSMEnqueueMemset with piextUSMEnqueueFill +// 16.57 Added mappings to UR launch properties extension +// (piextEnqueueKernelLaunchCustom) #define _PI_H_VERSION_MAJOR 16 -#define _PI_H_VERSION_MINOR 56 +#define _PI_H_VERSION_MINOR 57 #define _PI_STRING_HELPER(a) #a #define _PI_CONCAT(a, b) _PI_STRING_HELPER(a.b) @@ -514,8 +516,12 @@ typedef enum { // Virtual memory support PI_EXT_ONEAPI_DEVICE_INFO_SUPPORTS_VIRTUAL_MEM = 0x2011E, + // Native enqueue PI_EXT_ONEAPI_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT = 0x2011F, + + // Return whether cluster launch is supported by device + PI_EXT_ONEAPI_DEVICE_INFO_CLUSTER_LAUNCH = 0x2021, } _pi_device_info; typedef enum { @@ -1317,8 +1323,28 @@ typedef enum { ///< P2P link, otherwise such operations are not supported. } _pi_peer_attr; +typedef enum { + PI_LAUNCH_PROPERTY_IGNORE = 0x0, + PI_LAUNCH_PROPERTY_COOPERATIVE = 0x1, + PI_LAUNCH_PROPERTY_CLUSTER_DIMENSION = 0x2, +} _pi_launch_property_id; + +typedef union { + int cooperative; + int32_t cluster_dims[3]; +} _pi_launch_property_value; + using pi_mem_info = _pi_mem_info; using pi_peer_attr = _pi_peer_attr; +using pi_launch_property_id = _pi_launch_property_id; +using pi_launch_property_value = _pi_launch_property_value; + +typedef struct { + pi_launch_property_id id; + pi_launch_property_value value; +} _pi_launch_property; + +using pi_launch_property = _pi_launch_property; // // Following section contains SYCL RT Plugin Interface (PI) functions. @@ -1933,6 +1959,14 @@ __SYCL_EXPORT pi_result piextEnqueueCooperativeKernelLaunch( const size_t *local_work_size, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event); +__SYCL_EXPORT pi_result piextEnqueueKernelLaunchCustom( + pi_queue queue, pi_kernel kernel, pi_uint32 work_dim, + const size_t *global_work_size, const size_t *local_work_size, + pi_uint32 num_props_in_launch_prop_list, + const pi_launch_property *launch_prop_list, + pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, + pi_event *event); + __SYCL_EXPORT pi_result piEnqueueEventsWait(pi_queue command_queue, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, diff --git a/sycl/include/sycl/detail/pi.hpp b/sycl/include/sycl/detail/pi.hpp index 1fe21d36a8aaa..83fc2996f42e8 100644 --- a/sycl/include/sycl/detail/pi.hpp +++ b/sycl/include/sycl/detail/pi.hpp @@ -148,6 +148,7 @@ using PiImageOffset = ::pi_image_offset_struct; using PiImageRegion = ::pi_image_region_struct; using PiPhysicalMem = ::pi_physical_mem; using PiVirtualAccessFlags = ::pi_virtual_access_flags; +using PiLaunchProperty = ::pi_launch_property; __SYCL_EXPORT void contextSetExtendedDeleter(const sycl::context &constext, pi_context_extended_deleter func, diff --git a/sycl/include/sycl/device_aspect_macros.hpp b/sycl/include/sycl/device_aspect_macros.hpp index 10f61c1e48435..55ed162863079 100644 --- a/sycl/include/sycl/device_aspect_macros.hpp +++ b/sycl/include/sycl/device_aspect_macros.hpp @@ -381,6 +381,11 @@ #define __SYCL_ALL_DEVICES_HAVE_ext_oneapi_virtual_mem__ 0 #endif +#ifndef __SYCL_ALL_DEVICES_HAVE_ext_oneapi_cuda_cluster_group__ +// __SYCL_ASPECT(ext_oneapi_cuda_cluster_group, 75) +#define __SYCL_ALL_DEVICES_HAVE_ext_oneapi_cuda_cluster_group__ 0 +#endif + #ifndef __SYCL_ANY_DEVICE_HAS_host__ // __SYCL_ASPECT(host, 0) #define __SYCL_ANY_DEVICE_HAS_host__ 0 @@ -750,3 +755,8 @@ // __SYCL_ASPECT(ext_oneapi_virtual_mem, 74) #define __SYCL_ANY_DEVICE_HAS_ext_oneapi_virtual_mem__ 0 #endif + +#ifndef __SYCL_ANY_DEVICE_HAS_ext_oneapi_cuda_cluster_group__ +// __SYCL_ASPECT(ext_oneapi_cuda_cluster_group, 75) +#define __SYCL_ANY_DEVICE_HAS_ext_oneapi_cuda_cluster_group__ 0 +#endif diff --git a/sycl/include/sycl/ext/oneapi/experimental/cluster_group_prop.hpp b/sycl/include/sycl/ext/oneapi/experimental/cluster_group_prop.hpp new file mode 100644 index 0000000000000..31487beffd810 --- /dev/null +++ b/sycl/include/sycl/ext/oneapi/experimental/cluster_group_prop.hpp @@ -0,0 +1,94 @@ +//==--- cluster_group_prop.hpp --- SYCL extension for cuda cluster group ---==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include + +namespace sycl { +inline namespace _V1 { +namespace ext::oneapi::experimental { + +namespace cuda { +template +struct cluster_size + : ::sycl::ext::oneapi::experimental::detail::run_time_property_key< + ::sycl::ext::oneapi::experimental::detail::ClusterLaunch> { + cluster_size(const range &size) : size(size) {} + sycl::range get_cluster_size() { return size; } + +private: + range size; +}; + +template using cluster_size_key = cluster_size; + +} // namespace cuda + +template <> +struct is_property_key> : std::true_type {}; +template <> +struct is_property_key> : std::true_type {}; +template <> +struct is_property_key> : std::true_type {}; + +template +struct is_property_key_of, T> : std::true_type {}; + +template +struct is_property_key_of, T> : std::true_type {}; + +template +struct is_property_key_of, T> : std::true_type {}; + +template <> +struct is_property_value> + : is_property_key> {}; +template <> +struct is_property_value> + : is_property_key> {}; +template <> +struct is_property_value> + : is_property_key> {}; + +template +struct is_property_value_of, O> + : is_property_key_of, O> {}; + +template +struct is_property_value_of, O> + : is_property_key_of, O> {}; + +template +struct is_property_value_of, O> + : is_property_key_of, O> {}; + +namespace detail { +template constexpr std::size_t getClusterDim() { + if constexpr (PropertiesT::template has_property< + sycl::ext::oneapi::experimental::cuda::cluster_size_key< + 1>>()) { + return 1; + } + if constexpr (PropertiesT::template has_property< + sycl::ext::oneapi::experimental::cuda::cluster_size_key< + 2>>()) { + return 2; + } + if constexpr (PropertiesT::template has_property< + sycl::ext::oneapi::experimental::cuda::cluster_size_key< + 3>>()) { + return 3; + } + return 0; +} +} // namespace detail +} // namespace ext::oneapi::experimental +} // namespace _V1 +} // namespace sycl diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index 7f10dd7f79e85..7d463d7c66eca 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -149,9 +149,9 @@ void parallel_for(handler &CGH, ext::oneapi::experimental::detail::LaunchConfigAccess, Properties> ConfigAccess(Config); - CGH.parallel_for(ConfigAccess.getRange(), - std::forward(Reductions)..., - KernelObj); + CGH.parallel_for( + ConfigAccess.getRange(), ConfigAccess.getProperties(), + std::forward(Reductions)..., KernelObj); } template , Properties> ConfigAccess(Config); - CGH.parallel_for(ConfigAccess.getRange(), - std::forward(Reductions)..., - KernelObj); + CGH.parallel_for( + ConfigAccess.getRange(), ConfigAccess.getProperties(), + std::forward(Reductions)..., KernelObj); } template #include #include +#include #include #include #include @@ -958,6 +959,21 @@ class __SYCL_EXPORT handler { sycl::ext::oneapi::experimental::execution_scope threadScope, sycl::ext::oneapi::experimental::execution_scope coordinationScope); + template + void checkAndSetClusterRange(const Properties &Props) { + namespace syclex = sycl::ext::oneapi::experimental; + constexpr std::size_t cluster_dim = + syclex::detail::getClusterDim(); + if constexpr (cluster_dim > 0) { + setKernelUsesClusterLaunch(); + MNDRDesc.setClusterDimensions( + Props + .template get_property< + syclex::cuda::cluster_size_key>() + .get_cluster_size()); + } + } + /// Process kernel properties. /// /// Stores information about kernel properties into the handler. @@ -1026,6 +1042,8 @@ class __SYCL_EXPORT handler { sycl::ext::oneapi::experimental::execution_scope::work_item, prop.coordinationScope); } + + checkAndSetClusterRange(Props); } /// Checks whether it is possible to copy the source shape to the destination @@ -1409,9 +1427,9 @@ class __SYCL_EXPORT handler { kernel_parallel_for_wrapper(KernelFunc); #ifndef __SYCL_DEVICE_ONLY__ - processProperties(Props); detail::checkValueRange(ExecutionRange); MNDRDesc.set(std::move(ExecutionRange)); + processProperties(Props); StoreLambda( std::move(KernelFunc)); setType(detail::CG::Kernel); @@ -3624,6 +3642,9 @@ class __SYCL_EXPORT handler { // Set value of the kernel is cooperative flag void setKernelIsCooperative(bool); + // Set using cuda thread block cluster launch flag true + void setKernelUsesClusterLaunch(); + template < ext::oneapi::experimental::detail::UnsupportedGraphFeatures FeatureT> void throwIfGraphAssociated() const { diff --git a/sycl/include/sycl/info/aspects.def b/sycl/include/sycl/info/aspects.def index 2151776ddb8e7..f9a14e40dcd3e 100644 --- a/sycl/include/sycl/info/aspects.def +++ b/sycl/include/sycl/info/aspects.def @@ -68,3 +68,4 @@ __SYCL_ASPECT(ext_oneapi_bindless_sampled_image_fetch_3d_usm, 71) __SYCL_ASPECT(ext_oneapi_bindless_sampled_image_fetch_3d, 72) __SYCL_ASPECT(ext_oneapi_queue_profiling_tag, 73) __SYCL_ASPECT(ext_oneapi_virtual_mem, 74) +__SYCL_ASPECT(ext_oneapi_cuda_cluster_group, 75) diff --git a/sycl/include/sycl/info/device_traits.def b/sycl/include/sycl/info/device_traits.def index 730ef88e335d4..14126c3ee5e2c 100644 --- a/sycl/include/sycl/info/device_traits.def +++ b/sycl/include/sycl/info/device_traits.def @@ -231,6 +231,8 @@ __SYCL_PARAM_TRAITS_SPEC(device, ext_oneapi_max_work_groups_3d, id<3>, PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_3D) __SYCL_PARAM_TRAITS_SPEC(device, ext_oneapi_max_global_work_groups, size_t, PI_EXT_ONEAPI_DEVICE_INFO_MAX_GLOBAL_WORK_GROUPS) +__SYCL_PARAM_TRAITS_SPEC(device, ext_oneapi_cuda_cluster_group, bool, + PI_EXT_ONEAPI_DEVICE_INFO_CLUSTER_LAUNCH) #ifdef __SYCL_PARAM_TRAITS_TEMPLATE_SPEC_NEEDS_UNDEF #undef __SYCL_PARAM_TRAITS_TEMPLATE_SPEC diff --git a/sycl/include/sycl/sycl.hpp b/sycl/include/sycl/sycl.hpp index b4b121a2815c4..1bdb816efcb39 100644 --- a/sycl/include/sycl/sycl.hpp +++ b/sycl/include/sycl/sycl.hpp @@ -88,6 +88,7 @@ #include #include #include +#include #include #include #include diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index e58d59547c310..92d1eb4e4c825 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -564,6 +564,19 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, NumEventsInWaitList, EventWaitList, OutEvent); } +pi_result piextEnqueueKernelLaunchCustom( + pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, + const size_t *GlobalWorkSize, const size_t *LocalWorkSize, + pi_uint32 NumPropsInLaunchPropList, + const pi_launch_property *LaunchPropList, pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, pi_event *OutEvent) { + + return pi2ur::piextEnqueueKernelLaunchCustom( + Queue, Kernel, WorkDim, GlobalWorkSize, LocalWorkSize, + NumPropsInLaunchPropList, LaunchPropList, NumEventsInWaitList, + EventsWaitList, OutEvent); +} + pi_result piextEnqueueCooperativeKernelLaunch( pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp index 97c1e72fde422..671df35c30ac4 100644 --- a/sycl/plugins/hip/pi_hip.cpp +++ b/sycl/plugins/hip/pi_hip.cpp @@ -577,6 +577,26 @@ pi_result piextEnqueueCooperativeKernelLaunch( NumEventsInWaitList, EventWaitList, OutEvent); } +pi_result piextEnqueueKernelLaunchCustom( + pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, + const size_t *GlobalWorkSize, const size_t *LocalWorkSize, + pi_uint32 NumPropsInLaunchPropList, + const pi_launch_property *LaunchPropList, pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, pi_event *OutEvent) { + (void)Queue; + (void)Kernel; + (void)WorkDim; + (void)GlobalWorkSize; + (void)LocalWorkSize; + (void)NumPropsInLaunchPropList; + (void)LaunchPropList; + (void)NumEventsInWaitList; + (void)EventsWaitList; + (void)OutEvent; + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + return PI_ERROR_UNSUPPORTED_FEATURE; +} + pi_result piextKernelCreateWithNativeHandle(pi_native_handle NativeHandle, pi_context Context, pi_program Program, diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 93fab8d0fabb8..daff872067f86 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -578,6 +578,26 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, NumEventsInWaitList, EventWaitList, OutEvent); } +pi_result piextEnqueueKernelLaunchCustom( + pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, + const size_t *GlobalWorkSize, const size_t *LocalWorkSize, + pi_uint32 NumPropsInLaunchPropList, + const pi_launch_property *LaunchPropList, pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, pi_event *OutEvent) { + (void)Queue; + (void)Kernel; + (void)WorkDim; + (void)GlobalWorkSize; + (void)LocalWorkSize; + (void)NumPropsInLaunchPropList; + (void)LaunchPropList; + (void)NumEventsInWaitList; + (void)EventsWaitList; + (void)OutEvent; + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + return PI_ERROR_UNSUPPORTED_FEATURE; +} + pi_result piextEnqueueCooperativeKernelLaunch( pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, diff --git a/sycl/plugins/native_cpu/pi_native_cpu.cpp b/sycl/plugins/native_cpu/pi_native_cpu.cpp index 8b577c4c997e3..4e61f0a9da031 100644 --- a/sycl/plugins/native_cpu/pi_native_cpu.cpp +++ b/sycl/plugins/native_cpu/pi_native_cpu.cpp @@ -573,6 +573,26 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, NumEventsInWaitList, EventWaitList, OutEvent); } +pi_result piextEnqueueKernelLaunchCustom( + pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, + const size_t *GlobalWorkSize, const size_t *LocalWorkSize, + pi_uint32 NumPropsInLaunchPropList, + const pi_launch_property *LaunchPropList, pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, pi_event *OutEvent) { + (void)Queue; + (void)Kernel; + (void)WorkDim; + (void)GlobalWorkSize; + (void)LocalWorkSize; + (void)NumPropsInLaunchPropList; + (void)LaunchPropList; + (void)NumEventsInWaitList; + (void)EventsWaitList; + (void)OutEvent; + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + return PI_ERROR_UNSUPPORTED_FEATURE; +} + pi_result piextKernelCreateWithNativeHandle(pi_native_handle NativeHandle, pi_context Context, pi_program Program, diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index 890d0fdecf79b..e3345d4cb5237 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -554,6 +554,26 @@ pi_result piextEnqueueCooperativeKernelLaunch( NumEventsInWaitList, EventWaitList, OutEvent); } +pi_result piextEnqueueKernelLaunchCustom( + pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, + const size_t *GlobalWorkSize, const size_t *LocalWorkSize, + pi_uint32 NumPropsInLaunchPropList, + const pi_launch_property *LaunchPropList, pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, pi_event *OutEvent) { + (void)Queue; + (void)Kernel; + (void)WorkDim; + (void)GlobalWorkSize; + (void)LocalWorkSize; + (void)NumPropsInLaunchPropList; + (void)LaunchPropList; + (void)NumEventsInWaitList; + (void)EventsWaitList; + (void)OutEvent; + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + return PI_ERROR_UNSUPPORTED_FEATURE; +} + pi_result piextKernelCreateWithNativeHandle(pi_native_handle NativeHandle, pi_context Context, pi_program Program, diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 7dd2e0bbbac8c..b576e38ee47d4 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -1341,6 +1341,8 @@ inline pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP) PI_TO_UR_MAP_DEVICE_INFO(PI_EXT_ONEAPI_DEVICE_INFO_SUPPORTS_VIRTUAL_MEM, UR_DEVICE_INFO_VIRTUAL_MEMORY_SUPPORT) + PI_TO_UR_MAP_DEVICE_INFO(PI_EXT_ONEAPI_DEVICE_INFO_CLUSTER_LAUNCH, + UR_DEVICE_INFO_CLUSTER_LAUNCH_EXP) #undef PI_TO_UR_MAP_DEVICE_INFO default: return PI_ERROR_UNKNOWN; @@ -3780,6 +3782,57 @@ inline pi_result piextEnqueueCooperativeKernelLaunch( return PI_SUCCESS; } +inline pi_result piextEnqueueKernelLaunchCustom( + pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, + const size_t *GlobalWorkSize, const size_t *LocalWorkSize, + pi_uint32 NumPropsInLaunchPropList, + const pi_launch_property *LaunchPropList, pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, pi_event *OutEvent) { + PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + PI_ASSERT((WorkDim > 0) && (WorkDim < 4), PI_ERROR_INVALID_WORK_DIMENSION); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + ur_kernel_handle_t UrKernel = reinterpret_cast(Kernel); + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UREvent = reinterpret_cast(OutEvent); + + std::vector props(NumPropsInLaunchPropList); + for (pi_uint32 i = 0; i < NumPropsInLaunchPropList; i++) { + switch (LaunchPropList[i].id) { + case PI_LAUNCH_PROPERTY_IGNORE: { + props[i].id = UR_EXP_LAUNCH_PROPERTY_ID_IGNORE; + break; + } + case PI_LAUNCH_PROPERTY_CLUSTER_DIMENSION: { + + props[i].id = UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION; + props[i].value.clusterDim[0] = LaunchPropList[i].value.cluster_dims[0]; + props[i].value.clusterDim[1] = LaunchPropList[i].value.cluster_dims[1]; + props[i].value.clusterDim[2] = LaunchPropList[i].value.cluster_dims[2]; + break; + } + case PI_LAUNCH_PROPERTY_COOPERATIVE: { + props[i].id = UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE; + props[i].value.cooperative = LaunchPropList[i].value.cooperative; + break; + } + default: { + return PI_ERROR_INVALID_VALUE; + } + } + } + + HANDLE_ERRORS(urEnqueueKernelLaunchCustomExp( + UrQueue, UrKernel, WorkDim, GlobalWorkSize, LocalWorkSize, + NumPropsInLaunchPropList, &props[0], NumEventsInWaitList, + UrEventsWaitList, UREvent)); + + return PI_SUCCESS; +} + inline pi_result piEnqueueMemImageWrite(pi_queue Queue, pi_mem Image, pi_bool BlockingWrite, pi_image_offset Origin, pi_image_region Region, diff --git a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp index 717fd6c895331..b75a5abd60327 100644 --- a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp +++ b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp @@ -273,6 +273,19 @@ __SYCL_EXPORT pi_result piEnqueueKernelLaunch( NumEventsInWaitList, EventWaitList, OutEvent); } +__SYCL_EXPORT pi_result piextEnqueueKernelLaunchCustom( + pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, + const size_t *GlobalWorkSize, const size_t *LocalWorkSize, + pi_uint32 NumPropsInLaunchPropList, + const pi_launch_property *LaunchPropList, pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, pi_event *OutEvent) { + + return pi2ur::piextEnqueueKernelLaunchCustom( + Queue, Kernel, WorkDim, GlobalWorkSize, LocalWorkSize, + NumPropsInLaunchPropList, LaunchPropList, NumEventsInWaitList, + EventsWaitList, OutEvent); +} + __SYCL_EXPORT pi_result piEnqueueMemImageWrite( pi_queue Queue, pi_mem Image, pi_bool BlockingWrite, pi_image_offset Origin, pi_image_region Region, size_t InputRowPitch, size_t InputSlicePitch, @@ -1632,6 +1645,9 @@ __SYCL_EXPORT pi_result piPluginInit(pi_plugin *PluginInit) { _PI_API(piextDisablePeerAccess) _PI_API(piextPeerAccessGetInfo) + // Launch Properties + _PI_API(piextEnqueueKernelLaunchCustom) + _PI_API(piextPluginGetOpaqueData) _PI_API(piTearDown) diff --git a/sycl/source/detail/device_impl.cpp b/sycl/source/detail/device_impl.cpp index e22d87be839ae..c268987bf57f6 100644 --- a/sycl/source/detail/device_impl.cpp +++ b/sycl/source/detail/device_impl.cpp @@ -366,6 +366,8 @@ bool device_impl::has(aspect Aspect) const { return get_info(); case aspect::ext_intel_mem_channel: return get_info(); + case aspect::ext_oneapi_cuda_cluster_group: + return get_info(); case aspect::usm_atomic_host_allocations: return (get_device_info_impl:: diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp index 98d9d171f5737..5be5a76e68e69 100644 --- a/sycl/source/detail/device_info.hpp +++ b/sycl/source/detail/device_info.hpp @@ -301,6 +301,25 @@ struct get_device_info_impl, } }; +// Specialization for cuda cluster group +template <> +struct get_device_info_impl { + static bool get(const DeviceImplPtr &Dev) { + bool result = false; + if (Dev->getBackend() == backend::ext_oneapi_cuda) { + sycl::detail::pi::PiResult Err = + Dev->getPlugin()->call_nocheck( + Dev->getHandleRef(), + PiInfoCode::value, + sizeof(result), &result, nullptr); + if (Err != PI_SUCCESS) { + return false; + } + } + return result; + } +}; + // Specialization for exec_capabilities, OpenCL returns a bitfield template <> struct get_device_info_impl, diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index de4a6fd3bd22e..51d9d65de79f7 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -973,7 +973,8 @@ exec_graph_impl::enqueue(const std::shared_ptr &Queue, // TODO: Pass accessor mem allocations nullptr, // TODO: Extract from handler - PI_EXT_KERNEL_EXEC_INFO_CACHE_DEFAULT, CG->MKernelIsCooperative); + PI_EXT_KERNEL_EXEC_INFO_CACHE_DEFAULT, CG->MKernelIsCooperative, + CG->MKernelUsesClusterLaunch); if (Res != pi_result::PI_SUCCESS) { throw sycl::exception( sycl::make_error_code(sycl::errc::kernel), diff --git a/sycl/source/detail/handler_impl.hpp b/sycl/source/detail/handler_impl.hpp index f0df55d5e069b..daea9816deac5 100644 --- a/sycl/source/detail/handler_impl.hpp +++ b/sycl/source/detail/handler_impl.hpp @@ -116,6 +116,7 @@ class handler_impl { PI_EXT_KERNEL_EXEC_INFO_CACHE_DEFAULT; bool MKernelIsCooperative = false; + bool MKernelUsesClusterLaunch = false; // Extra information for bindless image copy sycl::detail::pi::PiMemImageDesc MImageDesc; diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp index 952482e42d79f..51631d0d3e4f6 100644 --- a/sycl/source/detail/jit_compiler.cpp +++ b/sycl/source/detail/jit_compiler.cpp @@ -970,7 +970,8 @@ jit_compiler::fuseKernels(QueueImplPtr Queue, FusedCG.reset(new detail::CGExecKernel( NDRDesc, nullptr, nullptr, std::move(KernelBundleImplPtr), std::move(CGData), std::move(FusedArgs), FusedOrCachedKernelName, {}, {}, - CG::CGTYPE::Kernel, KernelCacheConfig, false /* KernelIsCooperative */)); + CG::CGTYPE::Kernel, KernelCacheConfig, false /* KernelIsCooperative */, + false /* KernelUsesClusterLaunch*/)); return FusedCG; } diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 583a3c5f0c386..563915598c128 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -303,7 +303,8 @@ bool Command::isFusable() const { } const auto &CG = (static_cast(*this)).getCG(); return (CG.getType() == CG::CGTYPE::Kernel) && - (!static_cast(CG).MKernelIsCooperative); + (!static_cast(CG).MKernelIsCooperative) && + (!static_cast(CG).MKernelUsesClusterLaunch); } static void flushCrossQueueDeps(const std::vector &EventImpls, @@ -2370,7 +2371,7 @@ static pi_result SetKernelParamsAndLaunch( const detail::EventImplPtr &OutEventImpl, const KernelArgMask *EliminatedArgMask, const std::function &getMemAllocationFunc, - bool IsCooperative) { + bool IsCooperative, bool KernelUsesClusterLaunch) { assert(Queue && "Kernel submissions should have an associated queue"); const PluginPtr &Plugin = Queue->getPlugin(); @@ -2408,6 +2409,35 @@ static pi_result SetKernelParamsAndLaunch( } if (OutEventImpl != nullptr) OutEventImpl->setHostEnqueueTime(); + if (KernelUsesClusterLaunch) { + std::vector property_list; + + pi_launch_property_value launch_property_value_cluster_range; + launch_property_value_cluster_range.cluster_dims[0] = + NDRDesc.ClusterDimensions[0]; + launch_property_value_cluster_range.cluster_dims[1] = + NDRDesc.ClusterDimensions[1]; + launch_property_value_cluster_range.cluster_dims[2] = + NDRDesc.ClusterDimensions[2]; + + property_list.push_back( + {pi_launch_property_id::PI_LAUNCH_PROPERTY_CLUSTER_DIMENSION, + launch_property_value_cluster_range}); + + if (IsCooperative) { + pi_launch_property_value launch_property_value_cooperative; + launch_property_value_cooperative.cooperative = 1; + property_list.push_back( + {pi_launch_property_id::PI_LAUNCH_PROPERTY_COOPERATIVE, + launch_property_value_cooperative}); + } + + return Plugin->call_nocheck( + Queue->getHandleRef(), Kernel, NDRDesc.Dims, &NDRDesc.GlobalSize[0], + LocalSize, property_list.size(), property_list.data(), RawEvents.size(), + RawEvents.empty() ? nullptr : &RawEvents[0], + OutEventImpl ? &OutEventImpl->getHandleRef() : nullptr); + } pi_result Error = [&](auto... Args) { if (IsCooperative) { @@ -2561,7 +2591,7 @@ pi_int32 enqueueImpKernel( const detail::EventImplPtr &OutEventImpl, const std::function &getMemAllocationFunc, sycl::detail::pi::PiKernelCacheConfig KernelCacheConfig, - const bool KernelIsCooperative) { + const bool KernelIsCooperative, const bool KernelUsesClusterLaunch) { assert(Queue && "Kernel submissions should have an associated queue"); // Run OpenCL kernel auto ContextImpl = Queue->getContextImplPtr(); @@ -2650,10 +2680,10 @@ pi_int32 enqueueImpKernel( sizeof(sycl::detail::pi::PiKernelCacheConfig), &KernelCacheConfig); } - Error = SetKernelParamsAndLaunch(Queue, Args, DeviceImageImpl, Kernel, - NDRDesc, EventsWaitList, OutEventImpl, - EliminatedArgMask, getMemAllocationFunc, - KernelIsCooperative); + Error = SetKernelParamsAndLaunch( + Queue, Args, DeviceImageImpl, Kernel, NDRDesc, EventsWaitList, + OutEventImpl, EliminatedArgMask, getMemAllocationFunc, + KernelIsCooperative, KernelUsesClusterLaunch); const PluginPtr &Plugin = Queue->getPlugin(); if (!SyclKernelImpl && !MSyclKernel) { @@ -3006,7 +3036,8 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { return enqueueImpKernel( MQueue, NDRDesc, Args, ExecKernel->getKernelBundle(), SyclKernel, KernelName, RawEvents, EventImpl, getMemAllocationFunc, - ExecKernel->MKernelCacheConfig, ExecKernel->MKernelIsCooperative); + ExecKernel->MKernelCacheConfig, ExecKernel->MKernelIsCooperative, + ExecKernel->MKernelUsesClusterLaunch); } case CG::CGTYPE::CopyUSM: { CGCopyUSM *Copy = (CGCopyUSM *)MCommandGroup.get(); diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index 471bf66264c54..3ed072b9bb643 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -633,7 +633,7 @@ pi_int32 enqueueImpKernel( const detail::EventImplPtr &Event, const std::function &getMemAllocationFunc, sycl::detail::pi::PiKernelCacheConfig KernelCacheConfig, - bool KernelIsCooperative); + bool KernelIsCooperative, const bool KernelUsesClusterLaunch); class KernelFusionCommand; diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 7dae7fdbf5726..d6d488c45f019 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -278,7 +278,8 @@ event handler::finalize() { Result = enqueueImpKernel(MQueue, MNDRDesc, MArgs, KernelBundleImpPtr, MKernel, MKernelName.c_str(), RawEvents, NewEvent, nullptr, MImpl->MKernelCacheConfig, - MImpl->MKernelIsCooperative); + MImpl->MKernelIsCooperative, + MImpl->MKernelUsesClusterLaunch); #ifdef XPTI_ENABLE_INSTRUMENTATION // Emit signal only when event is created if (NewEvent != nullptr) { @@ -341,7 +342,8 @@ event handler::finalize() { std::move(MImpl->MKernelBundle), std::move(CGData), std::move(MArgs), MKernelName.c_str(), std::move(MStreamStorage), std::move(MImpl->MAuxiliaryResources), MCGType, - MImpl->MKernelCacheConfig, MImpl->MKernelIsCooperative, MCodeLoc)); + MImpl->MKernelCacheConfig, MImpl->MKernelIsCooperative, + MImpl->MKernelUsesClusterLaunch, MCodeLoc)); break; } case detail::CG::CopyAccToPtr: @@ -1728,6 +1730,13 @@ void handler::setKernelIsCooperative(bool KernelIsCooperative) { MImpl->MKernelIsCooperative = KernelIsCooperative; } +void handler::setKernelUsesClusterLaunch() { + throwIfGraphAssociated< + syclex::detail::UnsupportedGraphFeatures:: + sycl_ext_oneapi_experimental_cuda_cluster_launch>(); + MImpl->MKernelUsesClusterLaunch = true; +} + void handler::ext_oneapi_graph( ext::oneapi::experimental::command_graph< ext::oneapi::experimental::graph_state::executable> diff --git a/sycl/test-e2e/Basic/aspects.cpp b/sycl/test-e2e/Basic/aspects.cpp index bd47d1b4bf1cb..f9b3187882d47 100644 --- a/sycl/test-e2e/Basic/aspects.cpp +++ b/sycl/test-e2e/Basic/aspects.cpp @@ -84,6 +84,9 @@ int main() { if (plt.has(aspect::usm_system_allocations)) { std::cout << " USM system allocations" << std::endl; } + if (plt.has(aspect::ext_oneapi_cuda_cluster_group)) { + std::cout << " ext_oneapi_cuda_cluster_group" << std::endl; + } } std::cout << "Passed." << std::endl; return 0; diff --git a/sycl/test-e2e/ClusterLaunch/cluster_launch_enqueue_functions.cpp b/sycl/test-e2e/ClusterLaunch/cluster_launch_enqueue_functions.cpp new file mode 100644 index 0000000000000..e639260be5714 --- /dev/null +++ b/sycl/test-e2e/ClusterLaunch/cluster_launch_enqueue_functions.cpp @@ -0,0 +1,85 @@ +// Tests whether or not cluster launch was successful, with the correct ranges +// that were passed via enqueue functions extension +// REQUIRES: aspect-ext_oneapi_cuda_cluster_group +// RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_90 -o %t.out +// RUN: %{run} %t.out + +#include +#include +#include +#include + +#include + +template +int test_cluster_launch_enqueue_functions(sycl::queue &Queue, + sycl::range GlobalRange, + sycl::range LocalRange, + sycl::range ClusterRange) { + using namespace sycl::ext::oneapi::experimental; + + cuda::cluster_size ClusterDims(ClusterRange); + properties ClusterLaunchProperty{ClusterDims}; + + int *CorrectResultFlag = sycl::malloc_device(1, Queue); + Queue.memset(CorrectResultFlag, 0, sizeof(int)).wait(); + + submit_with_event(Queue, [&](sycl::handler &CGH) { + nd_launch(CGH, + launch_config(sycl::nd_range(GlobalRange, LocalRange), + ClusterLaunchProperty), + [=](sycl::nd_item It) { + uint32_t ClusterDimX, ClusterDimY, ClusterDimZ; +// Temporary solution till cluster group class is implemented +#if defined(__SYCL_DEVICE_ONLY__) && defined(__SYCL_CUDA_ARCH__) && \ + (__SYCL_CUDA_ARCH__ >= 900) + asm volatile("\n\t" + "mov.u32 %0, %%cluster_nctaid.x; \n\t" + "mov.u32 %1, %%cluster_nctaid.y; \n\t" + "mov.u32 %2, %%cluster_nctaid.z; \n\t" + : "=r"(ClusterDimZ), "=r"(ClusterDimY), + "=r"(ClusterDimX)); +#endif + if constexpr (Dim == 1) { + if (ClusterDimZ == ClusterRange[0] && ClusterDimY == 1 && + ClusterDimX == 1) { + *CorrectResultFlag = 1; + } + } else if constexpr (Dim == 2) { + if (ClusterDimZ == ClusterRange[1] && + ClusterDimY == ClusterRange[0] && ClusterDimX == 1) { + *CorrectResultFlag = 1; + } + } else { + if (ClusterDimZ == ClusterRange[2] && + ClusterDimY == ClusterRange[1] && + ClusterDimX == ClusterRange[0]) { + *CorrectResultFlag = 1; + } + } + }); + }).wait_and_throw(); + + int CorrectResultFlagHost = 0; + Queue.copy(CorrectResultFlag, &CorrectResultFlagHost, 1).wait(); + return CorrectResultFlagHost; +} + +int main() { + + sycl::queue Queue; + + int HostCorrectFlag = + test_cluster_launch_enqueue_functions(Queue, sycl::range{128, 128, 128}, + sycl::range{16, 16, 2}, + sycl::range{2, 4, 1}) && + test_cluster_launch_enqueue_functions(Queue, sycl::range{512, 1024}, + sycl::range{32, 32}, + sycl::range{4, 2}) && + test_cluster_launch_enqueue_functions(Queue, sycl::range{128}, + sycl::range{32}, sycl::range{2}) && + test_cluster_launch_enqueue_functions(Queue, sycl::range{16384}, + sycl::range{32}, sycl::range{16}); + + return !HostCorrectFlag; +} diff --git a/sycl/test-e2e/ClusterLaunch/cluster_launch_parallel_for.cpp b/sycl/test-e2e/ClusterLaunch/cluster_launch_parallel_for.cpp new file mode 100644 index 0000000000000..e37d4ea1f1fb3 --- /dev/null +++ b/sycl/test-e2e/ClusterLaunch/cluster_launch_parallel_for.cpp @@ -0,0 +1,85 @@ +// Tests whether or not cluster launch was successful, with the correct ranges +// that were passed via parallel for overload +// REQUIRES: aspect-ext_oneapi_cuda_cluster_group +// RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_90 -o %t.out +// RUN: %{run} %t.out + +#include +#include +#include + +#include + +template +int test_cluster_launch_parallel_for(sycl::queue &Queue, + sycl::range GlobalRange, + sycl::range LocalRange, + sycl::range ClusterRange) { + using namespace sycl::ext::oneapi::experimental; + + cuda::cluster_size ClusterDims(ClusterRange); + properties ClusterLaunchProperty{ClusterDims}; + + int *CorrectResultFlag = sycl::malloc_device(1, Queue); + Queue.memset(CorrectResultFlag, 0, sizeof(int)).wait(); + + Queue + .submit([&](sycl::handler &CGH) { + CGH.parallel_for(sycl::nd_range(GlobalRange, LocalRange), + ClusterLaunchProperty, [=](sycl::nd_item It) { + uint32_t ClusterDimX, ClusterDimY, ClusterDimZ; +// Temporary solution till cluster group class is implemented +#if defined(__SYCL_DEVICE_ONLY__) && defined(__SYCL_CUDA_ARCH__) && \ + (__SYCL_CUDA_ARCH__ >= 900) + asm volatile("\n\t" + "mov.u32 %0, %%cluster_nctaid.x; \n\t" + "mov.u32 %1, %%cluster_nctaid.y; \n\t" + "mov.u32 %2, %%cluster_nctaid.z; \n\t" + : "=r"(ClusterDimZ), "=r"(ClusterDimY), + "=r"(ClusterDimX)); +#endif + if constexpr (Dim == 1) { + if (ClusterDimZ == ClusterRange[0] && + ClusterDimY == 1 && ClusterDimX == 1) { + *CorrectResultFlag = 1; + } + } else if constexpr (Dim == 2) { + if (ClusterDimZ == ClusterRange[1] && + ClusterDimY == ClusterRange[0] && + ClusterDimX == 1) { + *CorrectResultFlag = 1; + } + } else { + if (ClusterDimZ == ClusterRange[2] && + ClusterDimY == ClusterRange[1] && + ClusterDimX == ClusterRange[0]) { + *CorrectResultFlag = 1; + } + } + }); + }) + .wait_and_throw(); + + int CorrectResultFlagHost = 0; + Queue.copy(CorrectResultFlag, &CorrectResultFlagHost, 1).wait(); + return CorrectResultFlagHost; +} + +int main() { + + sycl::queue Queue; + + int HostCorrectFlag = + test_cluster_launch_parallel_for(Queue, sycl::range{128, 128, 128}, + sycl::range{16, 16, 2}, + sycl::range{2, 4, 1}) && + test_cluster_launch_parallel_for(Queue, sycl::range{512, 1024}, + sycl::range{32, 32}, + sycl::range{4, 2}) && + test_cluster_launch_parallel_for(Queue, sycl::range{128}, sycl::range{32}, + sycl::range{2}) && + test_cluster_launch_parallel_for(Queue, sycl::range{16384}, + sycl::range{32}, sycl::range{16}); + + return !HostCorrectFlag; +} diff --git a/sycl/test-e2e/ClusterLaunch/enqueueLaunchCustom_check_event_deps.cpp b/sycl/test-e2e/ClusterLaunch/enqueueLaunchCustom_check_event_deps.cpp new file mode 100644 index 0000000000000..e8bc71f4fc465 --- /dev/null +++ b/sycl/test-e2e/ClusterLaunch/enqueueLaunchCustom_check_event_deps.cpp @@ -0,0 +1,69 @@ +// Checks whether or not event Dependencies are honored by +// piExtEnqueueLaunchKernelCustom +// REQUIRES: aspect-ext_oneapi_cuda_cluster_group +// RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_90 -o %t.out +// RUN: %{run} %t.out + +#include +#include +#include +#include +#include + +#include + +template void dummy_kernel(T *Input, int N, sycl::nd_item<1> It) { +#if defined(__SYCL_CUDA_ARCH__) && (__SYCL_CUDA_ARCH__ >= 900) + auto ID = It.get_global_linear_id(); + uint32_t ClusterDim; + asm volatile("mov.u32 %0, %%cluster_nctaid.x;" : "=r"(ClusterDim)); + + if (ID < N) { + Input[ID] += static_cast(ClusterDim); + } +#endif +} + +int main() { + + std::vector HostArray(4096, -20); + sycl::queue Queue; + + if (!Queue.get_device().has(sycl::aspect::ext_oneapi_cuda_cluster_group)) { + printf("Cluster group not supported on this arch, exiting...\n"); + return 0; + } + + { + sycl::buffer Buff(HostArray.data(), 4096); + Queue.submit([&](sycl::handler &CGH) { + auto Acc = Buff.template get_access(CGH); + CGH.parallel_for(4096, [=](auto i) { Acc[i] = 1; }); + }); + Queue.submit([&](sycl::handler &CGH) { + using namespace sycl::ext::oneapi::experimental; + + cuda::cluster_size ClusterDims(sycl::range{2}); + properties ClusterLaunchProperty{ClusterDims}; + auto Acc = Buff.template get_access(CGH); + CGH.parallel_for( + sycl::nd_range({4096}, {32}), ClusterLaunchProperty, + [=](sycl::nd_item<1> It) { + dummy_kernel( + Acc.get_multi_ptr().get(), 4096, + It); + }); + }); + Queue.submit([&](sycl::handler &CGH) { + auto Acc = Buff.template get_access(CGH); + CGH.parallel_for(4096, [=](auto i) { Acc[i] *= 5; }); + }); + } + + for (const auto &V : HostArray) { + if (V != 15) { + return 1; + } + } + return 0; +} diff --git a/sycl/test/abi/layout_handler.cpp b/sycl/test/abi/layout_handler.cpp index cc073064c724c..dc1e3f82eddaa 100644 --- a/sycl/test/abi/layout_handler.cpp +++ b/sycl/test/abi/layout_handler.cpp @@ -103,85 +103,89 @@ void foo() { // CHECK-NEXT: 320 | class sycl::range<3> NumWorkGroups // CHECK-NEXT: 320 | class sycl::detail::array<3> (base) // CHECK-NEXT: 320 | size_t[3] common_array -// CHECK-NEXT: 344 | size_t Dims -// CHECK-NEXT: 352 | class sycl::detail::string MKernelName -// CHECK-NEXT: 352 | char * str -// CHECK-NEXT: 360 | class std::shared_ptr MKernel -// CHECK-NEXT: 360 | class std::__shared_ptr (base) -// CHECK-NEXT: 360 | class std::__shared_ptr_access (base) (empty) -// CHECK-NEXT: 360 | element_type * _M_ptr -// CHECK-NEXT: 368 | class std::__shared_count<> _M_refcount -// CHECK-NEXT: 368 | _Sp_counted_base<(_Lock_policy)2U> * _M_pi -// CHECK-NEXT: 376 | detail::class CG::CGTYPE MCGType -// CHECK-NEXT: 384 | void * MSrcPtr -// CHECK-NEXT: 392 | void * MDstPtr -// CHECK-NEXT: 400 | size_t MLength -// CHECK-NEXT: 408 | class std::vector MPattern -// CHECK-NEXT: 408 | struct std::_Vector_base > (base) -// CHECK-NEXT: 408 | struct std::_Vector_base >::_Vector_impl _M_impl -// CHECK-NEXT: 408 | class std::allocator (base) (empty) -// CHECK: 408 | pointer _M_start -// CHECK-NEXT: 416 | pointer _M_finish -// CHECK-NEXT: 424 | pointer _M_end_of_storage -// CHECK-NEXT: 432 | class std::unique_ptr MHostKernel -// CHECK: 432 | class std::__uniq_ptr_impl > -// CHECK-NEXT: 432 | class std::tuple > _M_t -// CHECK-NEXT: 432 | struct std::_Tuple_impl<0, class sycl::detail::HostKernelBase *, struct std::default_delete > (base) -// CHECK-NEXT: 432 | struct std::_Tuple_impl<1, struct std::default_delete > (base) (empty) -// CHECK: 432 | struct std::_Head_base<0, class sycl::detail::HostKernelBase *> (base) -// CHECK-NEXT: 432 | class sycl::detail::HostKernelBase * _M_head_impl -// CHECK-NEXT: 440 | class std::unique_ptr MHostTask -// CHECK: 440 | class std::__uniq_ptr_impl > -// CHECK-NEXT: 440 | class std::tuple > _M_t -// CHECK-NEXT: 440 | struct std::_Tuple_impl<0, class sycl::detail::HostTask *, struct std::default_delete > (base) -// CHECK-NEXT: 440 | struct std::_Tuple_impl<1, struct std::default_delete > (base) (empty) -// CHECK: 440 | struct std::_Head_base<0, class sycl::detail::HostTask *> (base) -// CHECK-NEXT: 440 | class sycl::detail::HostTask * _M_head_impl -// CHECK-NEXT: 448 | class std::vector > MEventsWaitWithBarrier -// CHECK-NEXT: 448 | struct std::_Vector_base, class std::allocator > > (base) -// CHECK-NEXT: 448 | struct std::_Vector_base, class std::allocator > >::_Vector_impl _M_impl -// CHECK-NEXT: 448 | class std::allocator > (base) (empty) -// CHECK: 448 | pointer _M_start -// CHECK-NEXT: 456 | pointer _M_finish -// CHECK-NEXT: 464 | pointer _M_end_of_storage -// CHECK-NEXT: 472 | class std::shared_ptr MGraph -// CHECK-NEXT: 472 | class std::__shared_ptr (base) -// CHECK-NEXT: 472 | class std::__shared_ptr_access (base) (empty) -// CHECK-NEXT: 472 | element_type * _M_ptr -// CHECK-NEXT: 480 | class std::__shared_count<> _M_refcount -// CHECK-NEXT: 480 | _Sp_counted_base<(_Lock_policy)2U> * _M_pi -// CHECK-NEXT: 488 | class std::shared_ptr MExecGraph -// CHECK-NEXT: 488 | class std::__shared_ptr (base) -// CHECK-NEXT: 488 | class std::__shared_ptr_access (base) (empty) -// CHECK-NEXT: 488 | element_type * _M_ptr -// CHECK-NEXT: 496 | class std::__shared_count<> _M_refcount -// CHECK-NEXT: 496 | _Sp_counted_base<(_Lock_policy)2U> * _M_pi -// CHECK-NEXT: 504 | class std::shared_ptr MSubgraphNode -// CHECK-NEXT: 504 | class std::__shared_ptr (base) -// CHECK-NEXT: 504 | class std::__shared_ptr_access (base) (empty) -// CHECK-NEXT: 504 | element_type * _M_ptr -// CHECK-NEXT: 512 | class std::__shared_count<> _M_refcount -// CHECK-NEXT: 512 | _Sp_counted_base<(_Lock_policy)2U> * _M_pi -// CHECK-NEXT: 520 | class std::unique_ptr MGraphNodeCG -// CHECK: 520 | class std::__uniq_ptr_impl > -// CHECK-NEXT: 520 | class std::tuple > _M_t -// CHECK-NEXT: 520 | struct std::_Tuple_impl<0, class sycl::detail::CG *, struct std::default_delete > (base) -// CHECK-NEXT: 520 | struct std::_Tuple_impl<1, struct std::default_delete > (base) (empty) -// CHECK: 520 | struct std::_Head_base<0, class sycl::detail::CG *> (base) -// CHECK-NEXT: 520 | class sycl::detail::CG * _M_head_impl -// CHECK-NEXT: 528 | struct sycl::detail::code_location MCodeLoc -// CHECK-NEXT: 528 | const char * MFileName -// CHECK-NEXT: 536 | const char * MFunctionName -// CHECK-NEXT: 544 | unsigned long MLineNo -// CHECK-NEXT: 552 | unsigned long MColumnNo -// CHECK-NEXT: 560 | _Bool MIsFinalized -// CHECK-NEXT: 568 | class sycl::event MLastEvent -// CHECK-NEXT: 568 | class sycl::detail::OwnerLessBase (base) (empty) -// CHECK-NEXT: 568 | class std::shared_ptr impl -// CHECK-NEXT: 568 | class std::__shared_ptr (base) -// CHECK-NEXT: 568 | class std::__shared_ptr_access (base) (empty) -// CHECK-NEXT: 568 | element_type * _M_ptr -// CHECK-NEXT: 576 | class std::__shared_count<> _M_refcount -// CHECK-NEXT: 576 | _Sp_counted_base<(_Lock_policy)2U> * _M_pi -// CHECK-NEXT: | [sizeof=584, dsize=584, align=8, -// CHECK-NEXT: | nvsize=584, nvalign=8] +// CHECK-NEXT: 344 | class sycl::range<3> ClusterDimensions +// CHECK-NEXT: 344 | class sycl::detail::array<3> (base) +// CHECK-NEXT: 344 | size_t[3] common_array +// CHECK-NEXT: 368 | size_t Dims +// CHECK-NEXT: 376 | class sycl::detail::string MKernelName +// CHECK-NEXT: 376 | char * str +// CHECK-NEXT: 384 | class std::shared_ptr MKernel +// CHECK-NEXT: 384 | class std::__shared_ptr (base) +// CHECK-NEXT: 384 | class std::__shared_ptr_access (base) (empty) +// CHECK-NEXT: 384 | element_type * _M_ptr +// CHECK-NEXT: 392 | class std::__shared_count<> _M_refcount +// CHECK-NEXT: 392 | _Sp_counted_base<(_Lock_policy)2U> * _M_pi +// CHECK-NEXT: 400 | detail::class CG::CGTYPE MCGType +// CHECK-NEXT: 408 | void * MSrcPtr +// CHECK-NEXT: 416 | void * MDstPtr +// CHECK-NEXT: 424 | size_t MLength +// CHECK-NEXT: 432 | class std::vector MPattern +// CHECK-NEXT: 432 | struct std::_Vector_base > (base) +// CHECK-NEXT: 432 | struct std::_Vector_base >::_Vector_impl _M_impl +// CHECK-NEXT: 432 | class std::allocator (base) (empty) +// CHECK: 432 | pointer _M_start +// CHECK-NEXT: 440 | pointer _M_finish +// CHECK-NEXT: 448 | pointer _M_end_of_storage +// CHECK-NEXT: 456 | class std::unique_ptr MHostKernel +// CHECK: 456 | class std::__uniq_ptr_impl > +// CHECK-NEXT: 456 | class std::tuple > _M_t +// CHECK-NEXT: 456 | struct std::_Tuple_impl<0, class sycl::detail::HostKernelBase *, struct std::default_delete > (base) +// CHECK-NEXT: 456 | struct std::_Tuple_impl<1, struct std::default_delete > (base) (empty) +// CHECK: 456 | struct std::_Head_base<0, class sycl::detail::HostKernelBase *> (base) +// CHECK-NEXT: 456 | class sycl::detail::HostKernelBase * _M_head_impl +// CHECK-NEXT: 464 | class std::unique_ptr MHostTask +// CHECK: 464 | class std::__uniq_ptr_impl > +// CHECK-NEXT: 464 | class std::tuple > _M_t +// CHECK-NEXT: 464 | struct std::_Tuple_impl<0, class sycl::detail::HostTask *, struct std::default_delete > (base) +// CHECK-NEXT: 464 | struct std::_Tuple_impl<1, struct std::default_delete > (base) (empty) +// CHECK: 464 | struct std::_Head_base<0, class sycl::detail::HostTask *> (base) +// CHECK-NEXT: 464 | class sycl::detail::HostTask * _M_head_impl +// CHECK-NEXT: 472 | class std::vector > MEventsWaitWithBarrier +// CHECK-NEXT: 472 | struct std::_Vector_base, class std::allocator > > (base) +// CHECK-NEXT: 472 | struct std::_Vector_base, class std::allocator > >::_Vector_impl _M_impl +// CHECK-NEXT: 472 | class std::allocator > (base) (empty) +// CHECK: 472 | pointer _M_start +// CHECK-NEXT: 480 | pointer _M_finish +// CHECK-NEXT: 488 | pointer _M_end_of_storage +// CHECK-NEXT: 496 | class std::shared_ptr MGraph +// CHECK-NEXT: 496 | class std::__shared_ptr (base) +// CHECK-NEXT: 496 | class std::__shared_ptr_access (base) (empty) +// CHECK-NEXT: 496 | element_type * _M_ptr +// CHECK-NEXT: 504 | class std::__shared_count<> _M_refcount +// CHECK-NEXT: 504 | _Sp_counted_base<(_Lock_policy)2U> * _M_pi +// CHECK-NEXT: 512 | class std::shared_ptr MExecGraph +// CHECK-NEXT: 512 | class std::__shared_ptr (base) +// CHECK-NEXT: 512 | class std::__shared_ptr_access (base) (empty) +// CHECK-NEXT: 512 | element_type * _M_ptr +// CHECK-NEXT: 520 | class std::__shared_count<> _M_refcount +// CHECK-NEXT: 520 | _Sp_counted_base<(_Lock_policy)2U> * _M_pi +// CHECK-NEXT: 528 | class std::shared_ptr MSubgraphNode +// CHECK-NEXT: 528 | class std::__shared_ptr (base) +// CHECK-NEXT: 528 | class std::__shared_ptr_access (base) (empty) +// CHECK-NEXT: 528 | element_type * _M_ptr +// CHECK-NEXT: 536 | class std::__shared_count<> _M_refcount +// CHECK-NEXT: 536 | _Sp_counted_base<(_Lock_policy)2U> * _M_pi +// CHECK-NEXT: 544 | class std::unique_ptr MGraphNodeCG +// CHECK: 544 | class std::__uniq_ptr_impl > +// CHECK-NEXT: 544 | class std::tuple > _M_t +// CHECK-NEXT: 544 | struct std::_Tuple_impl<0, class sycl::detail::CG *, struct std::default_delete > (base) +// CHECK-NEXT: 544 | struct std::_Tuple_impl<1, struct std::default_delete > (base) (empty) +// CHECK: 544 | struct std::_Head_base<0, class sycl::detail::CG *> (base) +// CHECK-NEXT: 544 | class sycl::detail::CG * _M_head_impl +// CHECK-NEXT: 552 | struct sycl::detail::code_location MCodeLoc +// CHECK-NEXT: 552 | const char * MFileName +// CHECK-NEXT: 560 | const char * MFunctionName +// CHECK-NEXT: 568 | unsigned long MLineNo +// CHECK-NEXT: 576 | unsigned long MColumnNo +// CHECK-NEXT: 584 | _Bool MIsFinalized +// CHECK-NEXT: 592 | class sycl::event MLastEvent +// CHECK-NEXT: 592 | class sycl::detail::OwnerLessBase (base) (empty) +// CHECK-NEXT: 592 | class std::shared_ptr impl +// CHECK-NEXT: 592 | class std::__shared_ptr (base) +// CHECK-NEXT: 592 | class std::__shared_ptr_access (base) (empty) +// CHECK-NEXT: 592 | element_type * _M_ptr +// CHECK-NEXT: 600 | class std::__shared_count<> _M_refcount +// CHECK-NEXT: 600 | _Sp_counted_base<(_Lock_policy)2U> * _M_pi +// CHECK-NEXT: | [sizeof=608, dsize=608, align=8, +// CHECK-NEXT: | nvsize=608, nvalign=8] + diff --git a/sycl/test/abi/pi_cuda_symbol_check.dump b/sycl/test/abi/pi_cuda_symbol_check.dump index 7ec2569938519..f85c8a8b1b55f 100644 --- a/sycl/test/abi/pi_cuda_symbol_check.dump +++ b/sycl/test/abi/pi_cuda_symbol_check.dump @@ -183,3 +183,4 @@ piextVirtualMemReserve piextVirtualMemSetAccess piextVirtualMemUnmap piextWaitExternalSemaphore +piextEnqueueKernelLaunchCustom diff --git a/sycl/test/abi/pi_hip_symbol_check.dump b/sycl/test/abi/pi_hip_symbol_check.dump index a75fc9af455c9..f3706312ae9d6 100644 --- a/sycl/test/abi/pi_hip_symbol_check.dump +++ b/sycl/test/abi/pi_hip_symbol_check.dump @@ -183,3 +183,4 @@ piextVirtualMemReserve piextVirtualMemSetAccess piextVirtualMemUnmap piextWaitExternalSemaphore +piextEnqueueKernelLaunchCustom diff --git a/sycl/test/abi/pi_level_zero_symbol_check.dump b/sycl/test/abi/pi_level_zero_symbol_check.dump index 53c7f0e2ed531..88fc7c1e80cf7 100644 --- a/sycl/test/abi/pi_level_zero_symbol_check.dump +++ b/sycl/test/abi/pi_level_zero_symbol_check.dump @@ -182,3 +182,4 @@ piextVirtualMemReserve piextVirtualMemSetAccess piextVirtualMemUnmap piextWaitExternalSemaphore +piextEnqueueKernelLaunchCustom diff --git a/sycl/test/abi/pi_nativecpu_symbol_check.dump b/sycl/test/abi/pi_nativecpu_symbol_check.dump index 3d347196f757d..760f222f9a11e 100644 --- a/sycl/test/abi/pi_nativecpu_symbol_check.dump +++ b/sycl/test/abi/pi_nativecpu_symbol_check.dump @@ -183,3 +183,4 @@ piextVirtualMemReserve piextVirtualMemSetAccess piextVirtualMemUnmap piextWaitExternalSemaphore +piextEnqueueKernelLaunchCustom diff --git a/sycl/test/abi/pi_opencl_symbol_check.dump b/sycl/test/abi/pi_opencl_symbol_check.dump index 7e174ffbb2a4c..75fba0db0809a 100644 --- a/sycl/test/abi/pi_opencl_symbol_check.dump +++ b/sycl/test/abi/pi_opencl_symbol_check.dump @@ -182,3 +182,4 @@ piextVirtualMemReserve piextVirtualMemSetAccess piextVirtualMemUnmap piextWaitExternalSemaphore +piextEnqueueKernelLaunchCustom diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index e96b3ec995331..137275aab3795 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3515,6 +3515,7 @@ _ZN4sycl3_V17handler22memcpyFromDeviceGlobalEPvPKvbmm _ZN4sycl3_V17handler22setHandlerKernelBundleENS0_6kernelE _ZN4sycl3_V17handler22setHandlerKernelBundleERKSt10shared_ptrINS0_6detail18kernel_bundle_implEE _ZN4sycl3_V17handler22setKernelIsCooperativeEb +_ZN4sycl3_V17handler26setKernelUsesClusterLaunchEv _ZN4sycl3_V17handler24GetRangeRoundingSettingsERmS2_S2_ _ZN4sycl3_V17handler24ext_intel_read_host_pipeENS0_6detail11string_viewEPvmb _ZN4sycl3_V17handler24ext_oneapi_memcpy2d_implEPvmPKvmmm @@ -3979,6 +3980,7 @@ _ZNK4sycl3_V16device13get_info_implINS0_4info6device33ext_oneapi_max_global_work _ZNK4sycl3_V16device13get_info_implINS0_4info6device33usm_restricted_shared_allocationsEEENS0_6detail11ABINeutralTINS6_19is_device_info_descIT_E11return_typeEE4typeEv _ZNK4sycl3_V16device13get_info_implINS0_4info6device35ext_intel_gpu_eu_count_per_subsliceEEENS0_6detail11ABINeutralTINS6_19is_device_info_descIT_E11return_typeEE4typeEv _ZNK4sycl3_V16device13get_info_implINS0_4info6device38sub_group_independent_forward_progressEEENS0_6detail11ABINeutralTINS6_19is_device_info_descIT_E11return_typeEE4typeEv +_ZNK4sycl3_V16device13get_info_implINS0_4info6device29ext_oneapi_cuda_cluster_groupEEENS0_6detail11ABINeutralTINS6_19is_device_info_descIT_E11return_typeEE4typeEv _ZNK4sycl3_V16device13get_info_implINS0_4info6device4nameEEENS0_6detail11ABINeutralTINS6_19is_device_info_descIT_E11return_typeEE4typeEv _ZNK4sycl3_V16device13get_info_implINS0_4info6device6vendorEEENS0_6detail11ABINeutralTINS6_19is_device_info_descIT_E11return_typeEE4typeEv _ZNK4sycl3_V16device13get_info_implINS0_4info6device7aspectsEEENS0_6detail11ABINeutralTINS6_19is_device_info_descIT_E11return_typeEE4typeEv diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index 0bc26c55b8a73..d82585c9c8112 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -142,6 +142,7 @@ ??$get_info_impl@Uext_oneapi_max_work_groups_2d@device@info@_V1@sycl@@@device@_V1@sycl@@AEBA?AV?$id@$01@12@XZ ??$get_info_impl@Uext_oneapi_max_work_groups_3d@device@info@_V1@sycl@@@device@_V1@sycl@@AEBA?AV?$id@$02@12@XZ ??$get_info_impl@Uext_oneapi_srgb@device@info@_V1@sycl@@@device@_V1@sycl@@AEBA_NXZ +??$get_info_impl@Uext_oneapi_cuda_cluster_group@device@info@_V1@sycl@@@device@_V1@sycl@@AEBA_NXZ ??$get_info_impl@Uextensions@device@info@_V1@sycl@@@device@_V1@sycl@@AEBA?AV?$vector@Vstring@detail@_V1@sycl@@V?$allocator@Vstring@detail@_V1@sycl@@@std@@@std@@XZ ??$get_info_impl@Uextensions@platform@info@_V1@sycl@@@platform@_V1@sycl@@AEBA?AV?$vector@Vstring@detail@_V1@sycl@@V?$allocator@Vstring@detail@_V1@sycl@@@std@@@std@@XZ ??$get_info_impl@Ufree_memory@device@info@intel@ext@_V1@sycl@@@device@_V1@sycl@@AEBA_KXZ @@ -4410,6 +4411,7 @@ ?setHandlerKernelBundle@handler@_V1@sycl@@AEAAXVkernel@23@@Z ?setKernelCacheConfig@handler@_V1@sycl@@AEAAXW4_pi_kernel_cache_config@@@Z ?setKernelIsCooperative@handler@_V1@sycl@@AEAAX_N@Z +?setKernelUsesClusterLaunch@handler@_V1@sycl@@AEAAXXZ ?setLocalAccessorArgHelper@handler@_V1@sycl@@AEAAXHAEAVLocalAccessorBaseHost@detail@23@@Z ?setNDRangeUsed@handler@_V1@sycl@@AEAAX_N@Z ?setStateExplicitKernelBundle@handler@_V1@sycl@@AEAAXXZ diff --git a/sycl/test/abi/symbol_size_alignment.cpp b/sycl/test/abi/symbol_size_alignment.cpp index 0d3d43faf3f35..db310b6291924 100644 --- a/sycl/test/abi/symbol_size_alignment.cpp +++ b/sycl/test/abi/symbol_size_alignment.cpp @@ -52,9 +52,9 @@ int main() { check(); check(); #ifdef _MSC_VER - check(); + check(); #else - check(); + check(); #endif check, 16, 8>(); check(); diff --git a/sycl/test/include_deps/sycl_detail_core.hpp.cpp b/sycl/test/include_deps/sycl_detail_core.hpp.cpp index 2e3718035d5d3..db06078bc5ba5 100644 --- a/sycl/test/include_deps/sycl_detail_core.hpp.cpp +++ b/sycl/test/include_deps/sycl_detail_core.hpp.cpp @@ -178,6 +178,7 @@ // CHECK-NEXT: ext/intel/experimental/kernel_execution_properties.hpp // CHECK-NEXT: ext/oneapi/bindless_images_interop.hpp // CHECK-NEXT: ext/oneapi/bindless_images_mem_handle.hpp +// CHECK-NEXT: ext/oneapi/experimental/cluster_group_prop.hpp // CHECK-NEXT: ext/oneapi/experimental/raw_kernel_arg.hpp // CHECK-NEXT: ext/oneapi/experimental/use_root_sync_prop.hpp // CHECK-NEXT: ext/oneapi/experimental/virtual_functions.hpp diff --git a/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp b/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp index 04f306dc0cfb8..1b76c23fe64c5 100644 --- a/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp +++ b/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp @@ -606,3 +606,25 @@ TEST_F(CommandGraphTest, ProfilingExceptionProperty) { } ASSERT_EQ(Success, false); } + +TEST_F(CommandGraphTest, ClusterLaunchException) { + namespace syclex = sycl::ext::oneapi::experimental; + + syclex::properties cluster_launch_property{ + syclex::cuda::cluster_size<1>(sycl::range<1>{4})}; + + std::error_code ExceptionCode = make_error_code(sycl::errc::success); + try { + Graph.begin_recording(Queue); + auto Event1 = Queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for>(sycl::nd_range<1>({4096}, {32}), + cluster_launch_property, + [&](sycl::nd_item<1> it) {}); + }); + Queue.wait(); + Graph.end_recording(Queue); + } catch (exception &Exception) { + ExceptionCode = Exception.code(); + } + ASSERT_EQ(ExceptionCode, sycl::errc::invalid); +} diff --git a/sycl/unittests/helpers/PiMockPlugin.hpp b/sycl/unittests/helpers/PiMockPlugin.hpp index 5541db5688c53..24a33ee98d97b 100644 --- a/sycl/unittests/helpers/PiMockPlugin.hpp +++ b/sycl/unittests/helpers/PiMockPlugin.hpp @@ -1658,3 +1658,13 @@ inline pi_result mock_piextUSMImport(const void *HostPtr, size_t Size, inline pi_result mock_piextUSMRelease(const void *HostPtr, pi_context Context) { return PI_SUCCESS; } + +inline pi_result mock_piextEnqueueKernelLaunchCustom( + pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, + const size_t *GlobalWorkSize, const size_t *LocalWorkSize, + pi_uint32 NumPropsInLaunchPropList, + const pi_launch_property *LaunchPropList, pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, pi_event *OutEvent) { + + return PI_SUCCESS; +} diff --git a/sycl/unittests/program_manager/arg_mask/EliminatedArgMask.cpp b/sycl/unittests/program_manager/arg_mask/EliminatedArgMask.cpp index 649e7a6126729..27708bb2efba5 100644 --- a/sycl/unittests/program_manager/arg_mask/EliminatedArgMask.cpp +++ b/sycl/unittests/program_manager/arg_mask/EliminatedArgMask.cpp @@ -124,7 +124,8 @@ class MockHandler : public sycl::handler { std::move(CGH->CGData), std::move(CGH->MArgs), CGH->MKernelName.c_str(), std::move(CGH->MStreamStorage), std::move(MImpl->MAuxiliaryResources), CGH->MCGType, {}, - MImpl->MKernelIsCooperative, CGH->MCodeLoc)); + MImpl->MKernelIsCooperative, MImpl->MKernelUsesClusterLaunch, + CGH->MCodeLoc)); break; } default: diff --git a/sycl/unittests/scheduler/SchedulerTestUtils.hpp b/sycl/unittests/scheduler/SchedulerTestUtils.hpp index 9d89a970c9a09..ab20050f250a6 100644 --- a/sycl/unittests/scheduler/SchedulerTestUtils.hpp +++ b/sycl/unittests/scheduler/SchedulerTestUtils.hpp @@ -307,7 +307,8 @@ class MockHandlerCustomFinalize : public MockHandler { getNDRDesc(), std::move(getHostKernel()), getKernel(), std::move(MImpl->MKernelBundle), std::move(CGData), getArgs(), getKernelName(), getStreamStorage(), MImpl->MAuxiliaryResources, - getCGType(), {}, MImpl->MKernelIsCooperative, getCodeLoc())); + getCGType(), {}, MImpl->MKernelIsCooperative, + MImpl->MKernelUsesClusterLaunch, getCodeLoc())); break; } case sycl::detail::CG::CodeplayHostTask: { diff --git a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp index d4f65caddf56c..29b54fc31dad2 100644 --- a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp +++ b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp @@ -36,7 +36,8 @@ class MockHandlerStreamInit : public MockHandler { getRequirements(), getEvents()), getArgs(), getKernelName(), getStreamStorage(), std::move(MImpl->MAuxiliaryResources), getCGType(), {}, - MImpl->MKernelIsCooperative, getCodeLoc())); + MImpl->MKernelIsCooperative, MImpl->MKernelUsesClusterLaunch, + getCodeLoc())); break; } default: