From 1b730e458f237daf03759fdcb936ed5a411853ff Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Thu, 13 Nov 2025 11:46:46 +0000 Subject: [PATCH 1/7] Support for range-based handler-less kernel submission --- sycl/include/sycl/handler.hpp | 50 +- sycl/include/sycl/queue.hpp | 440 +++++++++++++----- sycl/include/sycl/range_rounding.hpp | 161 +++++++ sycl/source/detail/queue_impl.cpp | 2 + sycl/source/handler.cpp | 9 +- sycl/source/queue.cpp | 34 +- .../test-e2e/Basic/test_num_kernel_copies.cpp | 3 +- sycl/test/abi/sycl_symbols_linux.dump | 5 + sycl/test/abi/sycl_symbols_windows.dump | 5 + .../include_deps/sycl_detail_core.hpp.cpp | 1 + .../sycl_khr_includes_queue.hpp.cpp | 1 + .../sycl_khr_includes_reduction.hpp.cpp | 1 + .../sycl_khr_includes_usm.hpp.cpp | 1 + 13 files changed, 563 insertions(+), 150 deletions(-) create mode 100644 sycl/include/sycl/range_rounding.hpp diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index d8d46d2a27814..58c0f11c627db 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -362,6 +363,24 @@ class RoundedRangeKernelWithKH { } }; +template ::value> * = nullptr> +auto getRangeRoundedKernelLambda(KernelType KernelFunc, range UserRange) { + return detail::RoundedRangeKernelWithKH{ + UserRange, KernelFunc}; +} + +template ::value> * = nullptr> +auto getRangeRoundedKernelLambda(KernelType KernelFunc, range UserRange) { + return detail::RoundedRangeKernel{ + UserRange, KernelFunc}; +} + using std::enable_if_t; using sycl::detail::queue_impl; @@ -1017,6 +1036,8 @@ class __SYCL_EXPORT handler { bool eventNeeded() const; + device get_device() const; + template struct TransformUserItemType { using type = std::conditional_t< std::is_convertible_v, LambdaArgType>, nd_item, @@ -1024,6 +1045,7 @@ class __SYCL_EXPORT handler { item, LambdaArgType>>; }; +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES std::optional> getMaxWorkGroups(); // We need to use this version to support gcc 7.5.0. Remove when minimal // supported gcc version is bumped. @@ -1152,6 +1174,7 @@ class __SYCL_EXPORT handler { return {range{}, false}; return {RoundedRange, true}; } +#endif /// Defines and invokes a SYCL kernel function for the specified range. /// @@ -1214,11 +1237,12 @@ class __SYCL_EXPORT handler { // Range rounding is supported only for newer SYCL standards. #if !defined(__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__) && \ SYCL_LANGUAGE_VERSION >= 202012L - auto [RoundedRange, HasRoundedRange] = getRoundedRange(UserRange); + auto [RoundedRange, HasRoundedRange] = + detail::getRoundedRange(UserRange, get_device()); if (HasRoundedRange) { using NameWT = typename detail::get_kernel_wrapper_name_t::name; auto Wrapper = - getRangeRoundedKernelLambda( + detail::getRangeRoundedKernelLambda( KernelFunc, UserRange); using KName = std::conditional_t::value, @@ -3258,6 +3282,7 @@ class __SYCL_EXPORT handler { friend class ext::oneapi::experimental::detail::dynamic_parameter_impl; friend class ext::oneapi::experimental::detail::dynamic_command_group_impl; +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES bool DisableRangeRounding(); bool RangeRoundingTrace(); @@ -3265,27 +3290,6 @@ class __SYCL_EXPORT handler { void GetRangeRoundingSettings(size_t &MinFactor, size_t &GoodFactor, size_t &MinRange); - template ::value> * = nullptr> - auto getRangeRoundedKernelLambda(KernelType KernelFunc, - range UserRange) { - return detail::RoundedRangeKernelWithKH{UserRange, KernelFunc}; - } - - template ::value> * = nullptr> - auto getRangeRoundedKernelLambda(KernelType KernelFunc, - range UserRange) { - return detail::RoundedRangeKernel{ - UserRange, KernelFunc}; - } - -#ifndef __INTEL_PREVIEW_BREAKING_CHANGES const std::shared_ptr &getContextImplPtr() const; #endif detail::context_impl &getContextImpl() const; diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 0b348ec7ff256..b86c28663d0ae 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -41,6 +41,7 @@ #include // for nd_range #include // for property_list #include // for range +#include #include // for sycl::span #include // for size_t @@ -161,8 +162,15 @@ class __SYCL_EXPORT SubmissionInfo { } // namespace v1 -template struct TransformUserItemType { + using type = std::conditional_t< + std::is_convertible_v, LambdaArgType>, nd_item, + std::conditional_t, LambdaArgType>, + item, LambdaArgType>>; +}; + +template auto submit_kernel_direct( @@ -170,78 +178,7 @@ auto submit_kernel_direct( KernelTypeUniversalRef &&KernelFunc, sycl::span DepEvents, const PropertiesT &ExtraProps = ext::oneapi::experimental::empty_properties_t{}, - const detail::code_location &CodeLoc = detail::code_location::current()) { - detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - - using KernelType = - std::remove_const_t>; - - using NameT = - typename detail::get_kernel_name_t::name; - - detail::KernelWrapper::wrap(KernelFunc); - - HostKernelRef - HostKernel(std::forward(KernelFunc)); - - // Instantiating the kernel on the host improves debugging. - // Passing this pointer to another translation unit prevents optimization. -#ifndef NDEBUG - // TODO: call library to prevent dropping call due to optimization. - (void) - detail::GetInstantiateKernelOnHostPtr(); -#endif - - detail::DeviceKernelInfo *DeviceKernelInfoPtr = - &detail::getDeviceKernelInfo(); - constexpr auto Info = detail::CompileTimeKernelInfo; - - assert(Info.Name != std::string_view{} && "Kernel must have a name!"); - - static_assert( - Info.Name == std::string_view{} || sizeof(KernelType) == Info.KernelSize, - "Unexpected kernel lambda size. This can be caused by an " - "external host compiler producing a lambda with an " - "unexpected layout. This is a limitation of the compiler." - "In many cases the difference is related to capturing constexpr " - "variables. In such cases removing constexpr specifier aligns the " - "captures between the host compiler and the device compiler." - "\n" - "In case of MSVC, passing " - "-fsycl-host-compiler-options='/std:c++latest' " - "might also help."); - - detail::KernelPropertyHolderStructTy ParsedProperties; - if constexpr (ext::oneapi::experimental::detail::HasKernelPropertiesGetMethod< - const KernelType &>::value) { - // Merge properties via get() and manually specified properties. - // get() method is used for specifying kernel properties but properties - // passed via launch_config (ExtraProps) should be kernel launch properties. - // They are mutually exclusive, so there should not be any conflict when - // merging properties. merge_properties() throws if there's a conflict. - auto MergedProps = - sycl::ext::oneapi::experimental::detail::merge_properties( - ExtraProps, - KernelFunc.get(ext::oneapi::experimental::properties_tag{})); - - ParsedProperties = extractKernelProperties(MergedProps); - } else { - ParsedProperties = extractKernelProperties(ExtraProps); - } - - if constexpr (EventNeeded) { - return submit_kernel_direct_with_event_impl( - Queue, RangeView, HostKernel, DeviceKernelInfoPtr, DepEvents, - ParsedProperties, TlsCodeLocCapture.query(), - TlsCodeLocCapture.isToplevel()); - } else { - submit_kernel_direct_without_event_impl( - Queue, RangeView, HostKernel, DeviceKernelInfoPtr, DepEvents, - ParsedProperties, TlsCodeLocCapture.query(), - TlsCodeLocCapture.isToplevel()); - } -} + const detail::code_location &CodeLoc = detail::code_location::current()); template Range, KernelTypeUniversalRef &&KernelFunc, sycl::span DepEvents = {}, const PropertiesT &Props = ext::oneapi::experimental::empty_properties_t{}, - const detail::code_location &CodeLoc = detail::code_location::current()) { + const detail::code_location &CodeLoc = detail::code_location::current()); - using KernelType = - std::remove_const_t>; - - using LambdaArgType = - sycl::detail::lambda_arg_type>; - static_assert( - std::is_convertible_v, LambdaArgType>, - "Kernel argument of a sycl::parallel_for with sycl::nd_range " - "must be either sycl::nd_item or be convertible from sycl::nd_item"); - using TransformedArgType = sycl::nd_item; - -#ifndef __SYCL_DEVICE_ONLY__ - detail::checkValueRange(Range); -#endif - - return submit_kernel_direct( - Queue, detail::nd_range_view(Range), - std::forward(KernelFunc), DepEvents, Props, - CodeLoc); -} +template +auto submit_kernel_direct_parallel_for( + const queue &Queue, range Range, KernelTypeUniversalRef &&KernelFunc, + sycl::span DepEvents = {}, + const PropertiesT &Props = ext::oneapi::experimental::empty_properties_t{}, + const detail::code_location &CodeLoc = detail::code_location::current()); template DepEvents = {}, const PropertiesT &Props = ext::oneapi::experimental::empty_properties_t{}, - const detail::code_location &CodeLoc = detail::code_location::current()) { - - return submit_kernel_direct( - Queue, detail::nd_range_view(), - std::forward(KernelFunc), DepEvents, Props, - CodeLoc); -} + const detail::code_location &CodeLoc = detail::code_location::current()); } // namespace detail @@ -3977,11 +3892,27 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { RestT &&...Rest) { constexpr detail::code_location CodeLoc = getCodeLocation(); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - return submit( - [&](handler &CGH) { - CGH.template parallel_for(Range, Properties, Rest...); - }, - TlsCodeLocCapture.query()); + using KernelType = std::remove_const_t< + std::remove_reference_t>>>; + using LambdaArgType = sycl::detail::lambda_arg_type>; + using TransformedArgType = std::conditional_t< + std::is_integral::value && Dims == 1, item, + typename detail::TransformUserItemType::type>; + + // TODO The handler-less path does not support reductions, and + // kernel functions with the kernel_handler type argument yet. + if constexpr (sizeof...(RestT) == 1 && + !(detail::KernelLambdaHasKernelHandlerArgT< + KernelType, TransformedArgType>::value)) { + return detail::submit_kernel_direct_parallel_for( + *this, Range, Rest..., {}, Properties, TlsCodeLocCapture.query()); + } else { + return submit( + [&](handler &CGH) { + CGH.template parallel_for(Range, Properties, Rest...); + }, + TlsCodeLocCapture.query()); + } } /// parallel_for_impl with a kernel represented as a lambda + range that @@ -4011,12 +3942,29 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { RestT &&...Rest) { constexpr detail::code_location CodeLoc = getCodeLocation(); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - return submit( - [&](handler &CGH) { - CGH.depends_on(DepEvent); - CGH.template parallel_for(Range, Properties, Rest...); - }, - TlsCodeLocCapture.query()); + using KernelType = std::remove_const_t< + std::remove_reference_t>>>; + using LambdaArgType = sycl::detail::lambda_arg_type>; + using TransformedArgType = std::conditional_t< + std::is_integral::value && Dims == 1, item, + typename detail::TransformUserItemType::type>; + + // TODO The handler-less path does not support reductions, and + // kernel functions with the kernel_handler type argument yet. + if constexpr (sizeof...(RestT) == 1 && + !(detail::KernelLambdaHasKernelHandlerArgT< + KernelType, TransformedArgType>::value)) { + return detail::submit_kernel_direct_parallel_for( + *this, Range, Rest..., sycl::span(&DepEvent, 1), + Properties, TlsCodeLocCapture.query()); + } else { + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvent); + CGH.template parallel_for(Range, Properties, Rest...); + }, + TlsCodeLocCapture.query()); + } } /// parallel_for_impl with a kernel represented as a lambda + range that @@ -4048,12 +3996,29 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { PropertiesT Properties, RestT &&...Rest) { constexpr detail::code_location CodeLoc = getCodeLocation(); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - return submit( - [&](handler &CGH) { - CGH.depends_on(DepEvents); - CGH.template parallel_for(Range, Properties, Rest...); - }, - TlsCodeLocCapture.query()); + using KernelType = std::remove_const_t< + std::remove_reference_t>>>; + using LambdaArgType = sycl::detail::lambda_arg_type>; + using TransformedArgType = std::conditional_t< + std::is_integral::value && Dims == 1, item, + typename detail::TransformUserItemType::type>; + + // TODO The handler-less path does not support reductions, and + // kernel functions with the kernel_handler type argument yet. + if constexpr (sizeof...(RestT) == 1 && + !(detail::KernelLambdaHasKernelHandlerArgT< + KernelType, TransformedArgType>::value)) { + return detail::submit_kernel_direct_parallel_for( + *this, Range, Rest..., DepEvents, Properties, + TlsCodeLocCapture.query()); + } else { + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvents); + CGH.template parallel_for(Range, Properties, Rest...); + }, + TlsCodeLocCapture.query()); + } } /// parallel_for_impl version with a kernel represented as a lambda + range @@ -4095,6 +4060,235 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { } }; +namespace detail { + +template +auto submit_kernel_direct(const queue &Queue, detail::nd_range_view RangeView, + KernelTypeUniversalRef &&KernelFunc, + sycl::span DepEvents, + const PropertiesT &ExtraProps, + const detail::code_location &CodeLoc) { + detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + + using KernelType = + std::remove_const_t>; + + detail::KernelWrapper::wrap(KernelFunc); + + HostKernelRef + HostKernel(std::forward(KernelFunc)); + + // Instantiating the kernel on the host improves debugging. + // Passing this pointer to another translation unit prevents optimization. +#ifndef NDEBUG + // TODO: call library to prevent dropping call due to optimization. + (void) + detail::GetInstantiateKernelOnHostPtr(); +#endif + + detail::DeviceKernelInfo *DeviceKernelInfoPtr = + &detail::getDeviceKernelInfo(); + constexpr auto Info = detail::CompileTimeKernelInfo; + + assert(Info.Name != std::string_view{} && "Kernel must have a name!"); + + static_assert( + Info.Name == std::string_view{} || sizeof(KernelType) == Info.KernelSize, + "Unexpected kernel lambda size. This can be caused by an " + "external host compiler producing a lambda with an " + "unexpected layout. This is a limitation of the compiler." + "In many cases the difference is related to capturing constexpr " + "variables. In such cases removing constexpr specifier aligns the " + "captures between the host compiler and the device compiler." + "\n" + "In case of MSVC, passing " + "-fsycl-host-compiler-options='/std:c++latest' " + "might also help."); + + detail::KernelPropertyHolderStructTy ParsedProperties; + if constexpr (ext::oneapi::experimental::detail::HasKernelPropertiesGetMethod< + const KernelType &>::value) { + // Merge properties via get() and manually specified properties. + // get() method is used for specifying kernel properties but properties + // passed via launch_config (ExtraProps) should be kernel launch properties. + // They are mutually exclusive, so there should not be any conflict when + // merging properties. merge_properties() throws if there's a conflict. + auto MergedProps = + sycl::ext::oneapi::experimental::detail::merge_properties( + ExtraProps, + KernelFunc.get(ext::oneapi::experimental::properties_tag{})); + + ParsedProperties = extractKernelProperties(MergedProps); + } else { + ParsedProperties = extractKernelProperties(ExtraProps); + } + + if constexpr (EventNeeded) { + return submit_kernel_direct_with_event_impl( + Queue, RangeView, HostKernel, DeviceKernelInfoPtr, DepEvents, + ParsedProperties, TlsCodeLocCapture.query(), + TlsCodeLocCapture.isToplevel()); + } else { + submit_kernel_direct_without_event_impl( + Queue, RangeView, HostKernel, DeviceKernelInfoPtr, DepEvents, + ParsedProperties, TlsCodeLocCapture.query(), + TlsCodeLocCapture.isToplevel()); + } +} + +template +auto submit_kernel_direct_parallel_for(const queue &Queue, nd_range Range, + KernelTypeUniversalRef &&KernelFunc, + sycl::span DepEvents, + const PropertiesT &Props, + const detail::code_location &CodeLoc) { + + using KernelType = + std::remove_const_t>; + using NameT = + typename detail::get_kernel_name_t::name; + + using LambdaArgType = + sycl::detail::lambda_arg_type>; + static_assert( + std::is_convertible_v, LambdaArgType>, + "Kernel argument of a sycl::parallel_for with sycl::nd_range " + "must be either sycl::nd_item or be convertible from sycl::nd_item"); + using TransformedArgType = sycl::nd_item; + +#ifndef __SYCL_DEVICE_ONLY__ + detail::checkValueRange(Range); +#endif + + return submit_kernel_direct( + Queue, detail::nd_range_view(Range), + std::forward(KernelFunc), DepEvents, Props, + CodeLoc); +} + +template +auto submit_kernel_direct_parallel_for(const queue &Queue, range Range, + KernelTypeUniversalRef &&KernelFunc, + sycl::span DepEvents, + const PropertiesT &Props, + const detail::code_location &CodeLoc) { + +#ifndef __SYCL_DEVICE_ONLY__ + if (!range_size_fits_in_size_t(Range)) + throw sycl::exception(make_error_code(errc::runtime), + "The total number of work-items in " + "a range must fit within size_t"); +#endif + + using KernelType = + std::remove_const_t>; + using NameT = + typename detail::get_kernel_name_t::name; + using LambdaArgType = sycl::detail::lambda_arg_type>; + + // If 1D kernel argument is an integral type, convert it to sycl::item<1> + // If user type is convertible from sycl::item/sycl::nd_item, use + // sycl::item/sycl::nd_item to transport item information + using TransformedArgType = std::conditional_t< + std::is_integral::value && Dims == 1, item, + typename TransformUserItemType::type>; + + static_assert(!std::is_same_v>, + "Kernel argument cannot have a sycl::nd_item type in " + "sycl::parallel_for with sycl::range"); + + static_assert(std::is_convertible_v, LambdaArgType> || + std::is_convertible_v, LambdaArgType>, + "sycl::parallel_for(sycl::range) kernel must have the " + "first argument of sycl::item type, or of a type which is " + "implicitly convertible from sycl::item"); + + using RefLambdaArgType = std::add_lvalue_reference_t; + static_assert( + (std::is_invocable_v), + "SYCL kernel lambda/functor has an unexpected signature, it should be " + "invocable with sycl::item"); + + // Range rounding can be disabled by the user. + // Range rounding is supported only for newer SYCL standards. +#if !defined(__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__) && \ + SYCL_LANGUAGE_VERSION >= 202012L + auto [RoundedRange, HasRoundedRange] = + detail::getRoundedRange(Range, Queue.get_device()); + if (HasRoundedRange) { + using NameWT = typename detail::get_kernel_wrapper_name_t::name; + auto Wrapper = + detail::getRangeRoundedKernelLambda( + KernelFunc, Range); + + using KTypeWrapper = decltype(Wrapper); + using KName = std::conditional_t::value, + KTypeWrapper, NameWT>; +#ifndef __SYCL_DEVICE_ONLY__ + // We are executing over the rounded range, but there are still + // items/ids that are are constructed in ther range rounded + // kernel use items/ids in the user range, which means that + // __SYCL_ASSUME_INT can still be violated. So check the bounds + // of the user range, instead of the rounded range. + detail::checkValueRange(Range); +#endif + return submit_kernel_direct( + Queue, detail::nd_range_view(RoundedRange), std::move(Wrapper), + DepEvents, Props, CodeLoc); + } else +#endif // !__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ && + // SYCL_LANGUAGE_VERSION >= 202012L + { +#ifndef __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ +#ifndef __SYCL_DEVICE_ONLY__ + detail::checkValueRange(Range); +#endif + return submit_kernel_direct( + Queue, detail::nd_range_view(Range), + std::forward(KernelFunc), DepEvents, Props, + CodeLoc); + +#else + (void)Range; + (void)Props; + (void)KernelFunc; +#endif // __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ + } +} + +template +auto submit_kernel_direct_single_task(const queue &Queue, + KernelTypeUniversalRef &&KernelFunc, + sycl::span DepEvents, + const PropertiesT &Props, + const detail::code_location &CodeLoc) { + + using KernelType = + std::remove_const_t>; + using NameT = + typename detail::get_kernel_name_t::name; + + return submit_kernel_direct( + Queue, detail::nd_range_view(), + std::forward(KernelFunc), DepEvents, Props, + CodeLoc); +} +} // namespace detail + } // namespace _V1 } // namespace sycl diff --git a/sycl/include/sycl/range_rounding.hpp b/sycl/include/sycl/range_rounding.hpp new file mode 100644 index 0000000000000..1e30269fb20bb --- /dev/null +++ b/sycl/include/sycl/range_rounding.hpp @@ -0,0 +1,161 @@ +//==----------- range_rounding.hpp --- SYCL range rounding utils -----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +#include // for size_t + +namespace sycl { +inline namespace _V1 { + +namespace detail { + +void __SYCL_EXPORT GetRangeRoundingSettings(size_t &MinFactor, + size_t &GoodFactor, + size_t &MinRange); + +std::tuple, bool> + __SYCL_EXPORT getMaxWorkGroups(const device &Device); + +bool __SYCL_EXPORT DisableRangeRounding(); + +bool __SYCL_EXPORT RangeRoundingTrace(); + +template +std::tuple, bool> getRoundedRange(range UserRange, + const device &Device) { + range RoundedRange = UserRange; + // Disable the rounding-up optimizations under these conditions: + // 1. The env var SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING is set. + // 2. The kernel is provided via an interoperability method (this uses a + // different code path). + // 3. The range is already a multiple of the rounding factor. + // + // Cases 2 and 3 could be supported with extra effort. + // As an optimization for the common case it is an + // implementation choice to not support those scenarios. + // Note that "this_item" is a free function, i.e. not tied to any + // specific id or item. When concurrent parallel_fors are executing + // on a device it is difficult to tell which parallel_for the call is + // being made from. One could replicate portions of the + // call-graph to make this_item calls kernel-specific but this is + // not considered worthwhile. + + // Perform range rounding if rounding-up is enabled. + if (DisableRangeRounding()) + return {range{}, false}; + + // Range should be a multiple of this for reasonable performance. + size_t MinFactorX = 16; + // Range should be a multiple of this for improved performance. + size_t GoodFactor = 32; + // Range should be at least this to make rounding worthwhile. + size_t MinRangeX = 1024; + + // Check if rounding parameters have been set through environment: + // SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=MinRound:PreferredRound:MinRange + GetRangeRoundingSettings(MinFactorX, GoodFactor, MinRangeX); + + // In SYCL, each dimension of a global range size is specified by + // a size_t, which can be up to 64 bits. All backends should be + // able to accept a kernel launch with a 32-bit global range size + // (i.e. do not throw an error). The OpenCL CPU backend will + // accept every 64-bit global range, but the GPU backends will not + // generally accept every 64-bit global range. So, when we get a + // non-32-bit global range, we wrap the old kernel in a new kernel + // that has each work item peform multiple invocations the old + // kernel in a 32-bit global range. + id MaxNWGs = [&] { + auto [MaxWGs, HasMaxWGs] = getMaxWorkGroups(Device); + if (!HasMaxWGs) { + id Default; + for (int i = 0; i < Dims; ++i) + Default[i] = (std::numeric_limits::max)(); + return Default; + } + + id IdResult; + size_t Limit = (std::numeric_limits::max)(); + for (int i = 0; i < Dims; ++i) + IdResult[i] = (std::min)(Limit, MaxWGs[Dims - i - 1]); + return IdResult; + }(); + auto M = (std::numeric_limits::max)(); + range MaxRange; + for (int i = 0; i < Dims; ++i) { + auto DesiredSize = MaxNWGs[i] * GoodFactor; + MaxRange[i] = + DesiredSize <= M ? DesiredSize : (M / GoodFactor) * GoodFactor; + } + + bool DidAdjust = false; + auto Adjust = [&](int Dim, size_t Value) { + if (RangeRoundingTrace()) + std::cout << "parallel_for range adjusted at dim " << Dim << " from " + << RoundedRange[Dim] << " to " << Value << std::endl; + RoundedRange[Dim] = Value; + DidAdjust = true; + }; + +#ifdef __SYCL_EXP_PARALLEL_FOR_RANGE_ROUNDING__ + size_t GoodExpFactor = 1; + switch (Dims) { + case 1: + GoodExpFactor = 32; // Make global range multiple of {32} + break; + case 2: + GoodExpFactor = 16; // Make global range multiple of {16, 16} + break; + case 3: + GoodExpFactor = 8; // Make global range multiple of {8, 8, 8} + break; + } + + // Check if rounding parameters have been set through environment: + // SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=MinRound:PreferredRound:MinRange + GetRangeRoundingSettings(MinFactorX, GoodExpFactor, MinRangeX); + + for (auto i = 0; i < Dims; ++i) + if (UserRange[i] % GoodExpFactor) { + Adjust(i, ((UserRange[i] / GoodExpFactor) + 1) * GoodExpFactor); + } +#else + // Perform range rounding if there are sufficient work-items to + // need rounding and the user-specified range is not a multiple of + // a "good" value. + if (RoundedRange[0] % MinFactorX != 0 && RoundedRange[0] >= MinRangeX) { + // It is sufficient to round up just the first dimension. + // Multiplying the rounded-up value of the first dimension + // by the values of the remaining dimensions (if any) + // will yield a rounded-up value for the total range. + Adjust(0, ((RoundedRange[0] + GoodFactor - 1) / GoodFactor) * GoodFactor); + } +#endif // __SYCL_EXP_PARALLEL_FOR_RANGE_ROUNDING__ +#ifdef __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ + // If we are forcing range rounding kernels to be used, we always want the + // rounded range kernel to be generated, even if rounding isn't needed + DidAdjust = true; +#endif // __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ + + for (int i = 0; i < Dims; ++i) + if (RoundedRange[i] > MaxRange[i]) + Adjust(i, MaxRange[i]); + + if (!DidAdjust) + return {range{}, false}; + return {RoundedRange, true}; +} + +} // namespace detail +} // namespace _V1 +} // namespace sycl \ No newline at end of file diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 7f38c9266a37e..3d2ade10ed496 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -630,6 +630,8 @@ queue_impl::submit_direct(bool CallerNeedsEvent, detail::CG::StorageInitHelper CGData; std::unique_lock Lock(MMutex); + NestedCallsTracker tracker; + // Used by queue_empty() and getLastEvent() MEmpty.store(false, std::memory_order_release); diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index fbcd88f1bd42a..7f673801b4024 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -1093,6 +1093,8 @@ void handler::ext_oneapi_barrier(const std::vector &WaitList) { } using namespace sycl::detail; + +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES bool handler::DisableRangeRounding() { return SYCLConfig::get(); } @@ -1106,6 +1108,7 @@ void handler::GetRangeRoundingSettings(size_t &MinFactor, size_t &GoodFactor, SYCLConfig::GetSettings( MinFactor, GoodFactor, MinRange); } +#endif void handler::memcpy(void *Dest, const void *Src, size_t Count) { throwIfActionIsCreated(); @@ -1980,6 +1983,7 @@ kernel_bundle handler::getKernelBundle() const { *KernelBundleImplPtr); } +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES std::optional> handler::getMaxWorkGroups() { device_impl &DeviceImpl = impl->get_device(); std::array UrResult = {}; @@ -2001,7 +2005,6 @@ std::tuple, bool> handler::getMaxWorkGroups_v2() { return {std::array{0, 0, 0}, false}; } -#ifndef __INTEL_PREVIEW_BREAKING_CHANGES void handler::setNDRangeUsed(bool Value) { (void)Value; } #endif @@ -2038,6 +2041,10 @@ void handler::registerDynamicParameter( bool handler::eventNeeded() const { return impl->MEventNeeded; } +device handler::get_device() const { + return detail::createSyclObjFromImpl(impl->get_device()); +} + void *handler::storeRawArg(const void *Ptr, size_t Size) { impl->CGData.MArgsStorage.emplace_back(Size); void *Storage = static_cast(impl->CGData.MArgsStorage.back().data()); diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index f5858217d23e7..2fd57dcf90dbd 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -20,8 +20,8 @@ namespace sycl { inline namespace _V1 { -#ifndef __INTEL_PREVIEW_BREAKING_CHANGES namespace detail { +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES SubmissionInfo::SubmissionInfo() : impl{std::make_shared()} {} @@ -58,10 +58,40 @@ const ext::oneapi::experimental::event_mode_enum & SubmissionInfo::EventMode() const { return impl->MEventMode; } -} // namespace detail #endif // __INTEL_PREVIEW_BREAKING_CHANGES +void GetRangeRoundingSettings(size_t &MinFactor, size_t &GoodFactor, + size_t &MinRange) { + SYCLConfig::GetSettings( + MinFactor, GoodFactor, MinRange); +} + +std::tuple, bool> getMaxWorkGroups(const device &Device) { + std::array UrResult = {}; + auto &DeviceImpl = getSyclObjImpl(Device); + + auto Ret = DeviceImpl->getAdapter().call_nocheck( + DeviceImpl->getHandleRef(), + UrInfoCode< + ext::oneapi::experimental::info::device::max_work_groups<3>>::value, + sizeof(UrResult), &UrResult, nullptr); + if (Ret == UR_RESULT_SUCCESS) { + return {UrResult, true}; + } + return {std::array{0, 0, 0}, false}; +} + +bool DisableRangeRounding() { + return SYCLConfig::get(); +} + +bool RangeRoundingTrace() { + return SYCLConfig::get(); +} + +} // namespace detail + queue::queue(const context &SyclContext, const device_selector &DeviceSelector, const async_handler &AsyncHandler, const property_list &PropList) { const std::vector Devs = SyclContext.get_devices(); diff --git a/sycl/test-e2e/Basic/test_num_kernel_copies.cpp b/sycl/test-e2e/Basic/test_num_kernel_copies.cpp index 82f8477a10962..d0770a56696be 100644 --- a/sycl/test-e2e/Basic/test_num_kernel_copies.cpp +++ b/sycl/test-e2e/Basic/test_num_kernel_copies.cpp @@ -23,7 +23,8 @@ int main(int argc, char **argv) { kernel<0> krn0; q.parallel_for(sycl::range<1>{1}, krn0); - assert(copy_count == 1); + // The kernel is copied on the scheduler-based path only + assert(copy_count == 0); assert(move_count == 0); copy_count = 0; diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 6d8e358604eca..667e6aa204fb8 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3310,6 +3310,7 @@ _ZN4sycl3_V16detail16AccessorBaseHostC1ENS0_2idILi3EEENS0_5rangeILi3EEES6_NS0_6a _ZN4sycl3_V16detail16AccessorBaseHostC1ENS0_2idILi3EEENS0_5rangeILi3EEES6_NS0_6access4modeEPviimbRKNS0_13property_listE _ZN4sycl3_V16detail16AccessorBaseHostC2ENS0_2idILi3EEENS0_5rangeILi3EEES6_NS0_6access4modeEPviibmbRKNS0_13property_listE _ZN4sycl3_V16detail16AccessorBaseHostC2ENS0_2idILi3EEENS0_5rangeILi3EEES6_NS0_6access4modeEPviimbRKNS0_13property_listE +_ZN4sycl3_V16detail16getMaxWorkGroupsERKNS0_6deviceE _ZN4sycl3_V16detail16get_pointer_typeEPKvRNS1_12context_implE _ZN4sycl3_V16detail16openIPCMemHandleEPKSt4bytemRKNS0_7contextERKNS0_6deviceE _ZN4sycl3_V16detail16reduGetMaxWGSizeERNS0_7handlerEm @@ -3318,6 +3319,7 @@ _ZN4sycl3_V16detail17HostProfilingInfo3endEv _ZN4sycl3_V16detail17HostProfilingInfo5startEv _ZN4sycl3_V16detail17device_global_map3addEPKvPKc _ZN4sycl3_V16detail17reduComputeWGSizeEmmRm +_ZN4sycl3_V16detail18RangeRoundingTraceEv _ZN4sycl3_V16detail18get_kernel_id_implENS1_11string_viewE _ZN4sycl3_V16detail18make_kernel_bundleEmRKNS0_7contextENS0_12bundle_stateENS0_7backendE _ZN4sycl3_V16detail18make_kernel_bundleEmRKNS0_7contextEbNS0_12bundle_stateENS0_7backendE @@ -3329,6 +3331,7 @@ _ZN4sycl3_V16detail19kernel_bundle_plain30ext_oneapi_get_raw_kernel_nameENS1_11s _ZN4sycl3_V16detail19kernel_bundle_plain32set_specialization_constant_implEPKcPvm _ZN4sycl3_V16detail19kernel_bundle_plain33ext_oneapi_get_device_global_sizeENS1_11string_viewE _ZN4sycl3_V16detail19kernel_bundle_plain36ext_oneapi_get_device_global_addressENS1_11string_viewERKNS0_6deviceE +_ZN4sycl3_V16detail20DisableRangeRoundingEv _ZN4sycl3_V16detail20associateWithHandlerERNS0_7handlerEPNS1_16AccessorBaseHostENS0_6access6targetE _ZN4sycl3_V16detail20associateWithHandlerERNS0_7handlerEPNS1_28SampledImageAccessorBaseHostENS0_12image_targetE _ZN4sycl3_V16detail20associateWithHandlerERNS0_7handlerEPNS1_30UnsampledImageAccessorBaseHostENS0_12image_targetE @@ -3354,6 +3357,7 @@ _ZN4sycl3_V16detail22reduGetPreferredWGSizeERNS0_7handlerEm _ZN4sycl3_V16detail22reduGetPreferredWGSizeERSt10shared_ptrINS1_10queue_implEEm _ZN4sycl3_V16detail22removeDuplicateDevicesERKSt6vectorINS0_6deviceESaIS3_EE _ZN4sycl3_V16detail23constructorNotificationEPvS2_NS0_6access6targetENS3_4modeERKNS1_13code_locationE +_ZN4sycl3_V16detail24GetRangeRoundingSettingsERmS2_S2_ _ZN4sycl3_V16detail24find_device_intersectionERKSt6vectorINS0_13kernel_bundleILNS0_12bundle_stateE1EEESaIS5_EE _ZN4sycl3_V16detail26createKernelNameBasedCacheEv _ZN4sycl3_V16detail26isDeviceGlobalUsedInKernelEPKv @@ -4111,6 +4115,7 @@ _ZNK4sycl3_V17context8get_infoINS0_4info7context32atomic_memory_scope_capabiliti _ZNK4sycl3_V17context8get_infoINS0_4info7context7devicesEEENS0_6detail20is_context_info_descIT_E11return_typeEv _ZNK4sycl3_V17context8get_infoINS0_4info7context8platformEEENS0_6detail20is_context_info_descIT_E11return_typeEv _ZNK4sycl3_V17context9getNativeEv +_ZNK4sycl3_V17handler10get_deviceEv _ZNK4sycl3_V17handler11eventNeededEv _ZNK4sycl3_V17handler14getContextImplEv _ZNK4sycl3_V17handler15getCommandGraphEv diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index fdb3575bd9790..6886aa2c5fd0e 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -710,6 +710,7 @@ ?AccessTargetMask@handler@_V1@sycl@@0HB ?Clear@exception_list@_V1@sycl@@AEAAXXZ ?DirSep@OSUtil@detail@_V1@sycl@@2QEBDEB +?DisableRangeRounding@detail@_V1@sycl@@YA_NXZ ?DisableRangeRounding@handler@_V1@sycl@@AEAA_NXZ ?EventMode@SubmissionInfo@detail@_V1@sycl@@QEAAAEAW4event_mode_enum@experimental@oneapi@ext@34@XZ ?EventMode@SubmissionInfo@detail@_V1@sycl@@QEBAAEBW4event_mode_enum@experimental@oneapi@ext@34@XZ @@ -717,6 +718,7 @@ ?EventMode@SubmissionInfo@v1@detail@_V1@sycl@@QEBAAEBW4event_mode_enum@experimental@oneapi@ext@45@XZ ?GDBMethodsAnchor@SampledImageAccessorBaseHost@detail@_V1@sycl@@IEAAXXZ ?GDBMethodsAnchor@UnsampledImageAccessorBaseHost@detail@_V1@sycl@@IEAAXXZ +?GetRangeRoundingSettings@detail@_V1@sycl@@YAXAEA_K00@Z ?GetRangeRoundingSettings@handler@_V1@sycl@@AEAAXAEA_K00@Z ?HasAssociatedAccessor@handler@_V1@sycl@@AEBA_NPEAVAccessorImplHost@detail@23@W4target@access@23@@Z ?PostProcessorFunc@SubmissionInfo@detail@_V1@sycl@@QEAAAEAV?$optional@V?$function@$$A6AX_N0AEAVevent@_V1@sycl@@@Z@std@@@234@XZ @@ -725,6 +727,7 @@ ?PostProcessorFunc@SubmissionInfo@v1@detail@_V1@sycl@@QEBAAEBV?$optional@V?$function@$$A6AX_N0AEAVevent@_V1@sycl@@@Z@std@@@345@XZ ?PushBack@exception_list@_V1@sycl@@AEAAX$$QEAVexception_ptr@std@@@Z ?PushBack@exception_list@_V1@sycl@@AEAAXAEBVexception_ptr@std@@@Z +?RangeRoundingTrace@detail@_V1@sycl@@YA_NXZ ?RangeRoundingTrace@handler@_V1@sycl@@AEAA_NXZ ?SecondaryQueue@SubmissionInfo@detail@_V1@sycl@@QEAAAEAV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@XZ ?SecondaryQueue@SubmissionInfo@detail@_V1@sycl@@QEBAAEBV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@XZ @@ -4076,6 +4079,7 @@ ?getEndTime@HostProfilingInfo@detail@_V1@sycl@@QEBA_KXZ ?getKernelBundle@handler@_V1@sycl@@AEBA?AV?$kernel_bundle@$0A@@23@XZ ?getKernelName@handler@_V1@sycl@@AEAA?AVstring@detail@23@XZ +?getMaxWorkGroups@detail@_V1@sycl@@YA?AV?$tuple@V?$array@_K$02@std@@_N@std@@AEBVdevice@23@@Z ?getMaxWorkGroups@handler@_V1@sycl@@AEAA?AV?$optional@V?$array@_K$02@std@@@std@@XZ ?getMaxWorkGroups_v2@handler@_V1@sycl@@AEAA?AV?$tuple@V?$array@_K$02@std@@_N@std@@XZ ?getMemoryObject@AccessorBaseHost@detail@_V1@sycl@@QEBAPEAXXZ @@ -4168,6 +4172,7 @@ ?get_coordinate_normalization_mode@sampler@_V1@sycl@@QEBA?AW4coordinate_normalization_mode@23@XZ ?get_count@image_plain@detail@_V1@sycl@@IEBA_KXZ ?get_descriptor@image_mem@experimental@oneapi@ext@_V1@sycl@@QEBAAEBUimage_descriptor@23456@XZ +?get_device@handler@_V1@sycl@@AEBA?AVdevice@23@XZ ?get_device@image_mem@experimental@oneapi@ext@_V1@sycl@@QEBA?AVdevice@56@XZ ?get_device@memory_pool@experimental@oneapi@ext@_V1@sycl@@QEBA?AVdevice@56@XZ ?get_device@physical_mem@experimental@oneapi@ext@_V1@sycl@@QEBA?AVdevice@56@XZ diff --git a/sycl/test/include_deps/sycl_detail_core.hpp.cpp b/sycl/test/include_deps/sycl_detail_core.hpp.cpp index a0bc25739c465..2d0cc14cd5c26 100644 --- a/sycl/test/include_deps/sycl_detail_core.hpp.cpp +++ b/sycl/test/include_deps/sycl_detail_core.hpp.cpp @@ -152,6 +152,7 @@ // CHECK-NEXT: ext/oneapi/experimental/raw_kernel_arg.hpp // CHECK-NEXT: kernel.hpp // CHECK-NEXT: sampler.hpp +// CHECK-NEXT: range_rounding.hpp // CHECK-NEXT: sycl_span.hpp // CHECK-NEXT: feature_test.hpp // CHECK-EMPTY: diff --git a/sycl/test/include_deps/sycl_khr_includes_queue.hpp.cpp b/sycl/test/include_deps/sycl_khr_includes_queue.hpp.cpp index 9cd74ac24ca78..846bd0ed4a436 100644 --- a/sycl/test/include_deps/sycl_khr_includes_queue.hpp.cpp +++ b/sycl/test/include_deps/sycl_khr_includes_queue.hpp.cpp @@ -156,5 +156,6 @@ // CHECK-NEXT: ext/oneapi/experimental/raw_kernel_arg.hpp // CHECK-NEXT: kernel.hpp // CHECK-NEXT: sampler.hpp +// CHECK-NEXT: range_rounding.hpp // CHECK-NEXT: sycl_span.hpp // CHECK-EMPTY: diff --git a/sycl/test/include_deps/sycl_khr_includes_reduction.hpp.cpp b/sycl/test/include_deps/sycl_khr_includes_reduction.hpp.cpp index 65e5d95389e72..e6b167d5102a0 100644 --- a/sycl/test/include_deps/sycl_khr_includes_reduction.hpp.cpp +++ b/sycl/test/include_deps/sycl_khr_includes_reduction.hpp.cpp @@ -183,6 +183,7 @@ // CHECK-NEXT: detail/nd_range_view.hpp // CHECK-NEXT: detail/optional.hpp // CHECK-NEXT: ext/oneapi/experimental/event_mode_property.hpp +// CHECK-NEXT: range_rounding.hpp // CHECK-NEXT: sycl_span.hpp // CHECK-NEXT: usm.hpp // CHECK-NEXT: usm/usm_pointer_info.hpp diff --git a/sycl/test/include_deps/sycl_khr_includes_usm.hpp.cpp b/sycl/test/include_deps/sycl_khr_includes_usm.hpp.cpp index 5534c9b9fe6ee..b24f3577906ae 100644 --- a/sycl/test/include_deps/sycl_khr_includes_usm.hpp.cpp +++ b/sycl/test/include_deps/sycl_khr_includes_usm.hpp.cpp @@ -171,6 +171,7 @@ // CHECK-NEXT: ext/oneapi/experimental/raw_kernel_arg.hpp // CHECK-NEXT: kernel.hpp // CHECK-NEXT: sampler.hpp +// CHECK-NEXT: range_rounding.hpp // CHECK-NEXT: sycl_span.hpp // CHECK-NEXT: usm/usm_pointer_info.hpp // CHECK-NEXT: usm/usm_allocator.hpp From b470005b3edba968b369d67e177d3bf26262915e Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Mon, 24 Nov 2025 13:46:56 +0000 Subject: [PATCH 2/7] Update range rounding tests and nd_range_view --- sycl/include/sycl/detail/nd_range_view.hpp | 4 + sycl/include/sycl/queue.hpp | 7 +- sycl/source/detail/ndrange_desc.hpp | 36 +- .../parallel_for_disable_range_roundup.cpp | 65 +-- .../Basic/parallel_for_range_roundup.cpp | 400 ++++++++++++------ sycl/test-e2e/helpers.hpp | 15 + 6 files changed, 355 insertions(+), 172 deletions(-) diff --git a/sycl/include/sycl/detail/nd_range_view.hpp b/sycl/include/sycl/detail/nd_range_view.hpp index effc7a1bc8fe4..c69f94e1ed9e8 100644 --- a/sycl/include/sycl/detail/nd_range_view.hpp +++ b/sycl/include/sycl/detail/nd_range_view.hpp @@ -29,6 +29,10 @@ class nd_range_view { nd_range_view &operator=(const nd_range_view &Desc) = default; nd_range_view &operator=(nd_range_view &&Desc) = default; + template + nd_range_view(sycl::range &N) + : MGlobalSize(&(N[0])), MDims(size_t(Dims_)) {} + template nd_range_view(sycl::nd_range &ExecutionRange) : MGlobalSize(&(ExecutionRange.globalSize[0])), diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index b86c28663d0ae..81890a3b6e9a7 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -4065,7 +4065,8 @@ namespace detail { template -auto submit_kernel_direct(const queue &Queue, detail::nd_range_view RangeView, +auto submit_kernel_direct(const queue &Queue, + const detail::nd_range_view &RangeView, KernelTypeUniversalRef &&KernelFunc, sycl::span DepEvents, const PropertiesT &ExtraProps, @@ -4233,8 +4234,8 @@ auto submit_kernel_direct_parallel_for(const queue &Queue, range Range, KTypeWrapper, NameWT>; #ifndef __SYCL_DEVICE_ONLY__ // We are executing over the rounded range, but there are still - // items/ids that are are constructed in ther range rounded - // kernel use items/ids in the user range, which means that + // items/ids that are constructed in the range rounded + // kernel, use items/ids in the user range, which means that // __SYCL_ASSUME_INT can still be violated. So check the bounds // of the user range, instead of the rounded range. detail::checkValueRange(Range); diff --git a/sycl/source/detail/ndrange_desc.hpp b/sycl/source/detail/ndrange_desc.hpp index 1a18aa97a31f2..825383851d6aa 100644 --- a/sycl/source/detail/ndrange_desc.hpp +++ b/sycl/source/detail/ndrange_desc.hpp @@ -33,9 +33,13 @@ class NDRDescT { NDRDescT(const NDRDescT &Desc) = default; NDRDescT(NDRDescT &&Desc) = default; - NDRDescT(const detail::nd_range_view &NDRangeView) : Dims{NDRangeView.MDims} { + NDRDescT(const detail::nd_range_view &NDRangeView, + bool SetNumWorkGroups = false) + : Dims{NDRangeView.MDims} { if (!NDRangeView.MGlobalSize) { init(); + } else if (!NDRangeView.MLocalSize) { + init(&(NDRangeView.MGlobalSize[0]), SetNumWorkGroups); } else { init(NDRangeView.MGlobalSize, NDRangeView.MLocalSize, NDRangeView.MOffset); @@ -44,19 +48,7 @@ class NDRDescT { template NDRDescT(sycl::range N, bool SetNumWorkGroups) : Dims{size_t(Dims_)} { - if (SetNumWorkGroups) { - for (size_t I = 0; I < Dims_; ++I) { - NumWorkGroups[I] = N[I]; - } - } else { - for (size_t I = 0; I < Dims_; ++I) { - GlobalSize[I] = N[I]; - } - - for (size_t I = Dims_; I < 3; ++I) { - GlobalSize[I] = 1; - } - } + init(&(N[0]), SetNumWorkGroups); } template @@ -109,6 +101,22 @@ class NDRDescT { size_t Dims = 0; private: + void init(const size_t *N, bool SetNumWorkGroups) { + if (SetNumWorkGroups) { + for (size_t I = 0; I < Dims; ++I) { + NumWorkGroups[I] = N[I]; + } + } else { + for (size_t I = 0; I < Dims; ++I) { + GlobalSize[I] = N[I]; + } + + for (size_t I = Dims; I < 3; ++I) { + GlobalSize[I] = 1; + } + } + } + void init(const size_t *NumWorkItems, const size_t *LocalSizes, const size_t *Offset) { for (size_t I = 0; I < Dims; ++I) { diff --git a/sycl/test-e2e/Basic/parallel_for_disable_range_roundup.cpp b/sycl/test-e2e/Basic/parallel_for_disable_range_roundup.cpp index 676188b39e9e0..b3f4d06103138 100644 --- a/sycl/test-e2e/Basic/parallel_for_disable_range_roundup.cpp +++ b/sycl/test-e2e/Basic/parallel_for_disable_range_roundup.cpp @@ -6,52 +6,61 @@ // RUN: %{build} -sycl-std=2020 -o %t2.out // RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE=1 %{run} %t2.out | FileCheck %s --check-prefix CHECK-ENABLED -#include +#include #include +#include + +#include "../helpers.hpp" #include using namespace sycl; -range<1> Range1 = {0}; - void check(const char *msg, size_t v, size_t ref) { std::cout << msg << v << std::endl; assert(v == ref); } -int try_rounding_off(size_t size) { - range<1> Size{size}; - int Counter = 0; - { - buffer, 1> BufRange(&Range1, 1); - buffer BufCounter(&Counter, 1); - queue myQueue; - - std::cout << "Run parallel_for" << std::endl; - myQueue.submit([&](handler &cgh) { - auto AccRange = BufRange.get_access(cgh); - auto AccCounter = BufCounter.get_access(cgh); - cgh.parallel_for(Size, [=](item<1> ITEM) { - AccCounter[0].fetch_add(1); - AccRange[0] = ITEM.get_range(0); - }); - }); - myQueue.wait(); - } - check("Size seen by user = ", Range1.get(0), size); - check("Counter = ", Counter, size); - return 0; +void try_rounding_off(size_t size, bool useShortcutFunction) { + range<1> Range{size}; + queue Queue; + range<1> *RangePtr = malloc_shared>(1, Queue); + int *CounterPtr = malloc_shared(1, Queue); + + std::cout << "Run parallel_for" << std::endl; + auto KernelFunc = [=](item<1> id) { + auto atm = atomic_ref(*CounterPtr); + atm.fetch_add(1); + (*RangePtr) = id.get_range(0); + }; + command_submit_wrappers::parallel_for_wrapper( + useShortcutFunction, Queue, Range, KernelFunc); + + Queue.wait(); + + auto Context = Queue.get_context(); + + check("Size seen by user = ", RangePtr->get(0), size); + check("Counter = ", *CounterPtr, size); + + free(RangePtr, Context); + free(CounterPtr, Context); } int main() { int x; x = 1500; - try_rounding_off(x); + try_rounding_off(x, true); + try_rounding_off(x, false); return 0; } +// CHECK-DISABLED: Run parallel_for +// CHECK-DISABLED-NOT: parallel_for range adjusted at dim 0 from 1500 +// CHECK-DISABLED: Size seen by user = 1500 +// CHECK-DISABLED-NEXT: Counter = 1500 // CHECK-DISABLED: Run parallel_for // CHECK-DISABLED-NOT: parallel_for range adjusted at dim 0 from 1500 // CHECK-DISABLED: Size seen by user = 1500 @@ -61,3 +70,7 @@ int main() { // CHECK-ENABLED-NEXT: parallel_for range adjusted at dim 0 from 1500 // CHECK-ENABLED-NEXT: Size seen by user = 1500 // CHECK-ENABLED-NEXT: Counter = 1500 +// CHECK-ENABLED: Run parallel_for +// CHECK-ENABLED-NEXT: parallel_for range adjusted at dim 0 from 1500 +// CHECK-ENABLED-NEXT: Size seen by user = 1500 +// CHECK-ENABLED-NEXT: Counter = 1500 diff --git a/sycl/test-e2e/Basic/parallel_for_range_roundup.cpp b/sycl/test-e2e/Basic/parallel_for_range_roundup.cpp index 67123d393269d..2d529041b2c61 100644 --- a/sycl/test-e2e/Basic/parallel_for_range_roundup.cpp +++ b/sycl/test-e2e/Basic/parallel_for_range_roundup.cpp @@ -13,22 +13,25 @@ // // These tests test 3 things: // -// 1. The user range is the same as the in kernel range (using BufRange) as +// 1. The user range is the same as the in kernel range (using RangePtr) as // reported by get_range(). // 2. That the effective range is the same as the reported range (using -// BufCounter). i.e. check that the mapping of effective range to user range +// CouterPtr). i.e. check that the mapping of effective range to user range // is "onto". // 3. That every index in a 1, 2, or 3 dimension range is active the execution -// (using BufIndexes). i.e. check that the mapping of effective range to user -// range is "one-to-one". +// (using ItemIndexesPtr). i.e. check that the mapping of effective range to +// user range is "one-to-one". // // UNSUPPORTED: hip // UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/17077 // -#include +#include #include +#include #include +#include "../helpers.hpp" + #include using namespace sycl; @@ -58,158 +61,223 @@ template void checkVec(vec a, vec b) { assert(a[2] == b[2]); } -template void try_1d_range(size_t size) { +template +void try_1d_range(size_t size, bool useShortcutFunction) { using IndexCheckT = int; - range<1> Size{size}; - int Counter = 0; - std::vector ItemIndexes(Size[0]); - { - buffer, 1> BufRange(&Range1, 1); - buffer BufCounter(&Counter, 1); - buffer BufIndexes(ItemIndexes); - queue myQueue; - - myQueue.submit([&](handler &cgh) { - auto AccRange = BufRange.get_access(cgh); - auto AccCounter = BufCounter.get_access(cgh); - auto AccIndexes = BufIndexes.get_access(cgh); - cgh.parallel_for>(Size, [=](KernelIdT I) { - AccCounter[0].fetch_add(1); - if constexpr (std::is_same_v>) - AccRange[0] = sycl::range<1>(I.get_range(0)); - int Idx = I[0]; - AccIndexes[Idx] = IndexCheckT(I[0]); - }); - }); - myQueue.wait(); - } + range<1> Range{size}; + queue Queue; + + range<1> *RangePtr = malloc_shared>(1, Queue); + int *CounterPtr = malloc_shared(1, Queue); + IndexCheckT *ItemIndexesPtr = malloc_shared(Range[0], Queue); + + auto KernelFunc = [=](KernelIdT I) { + auto atm = atomic_ref(*CounterPtr); + atm.fetch_add(1); + if constexpr (std::is_same_v>) + (*RangePtr) = range<1>(I.get_range(0)); + int Idx = I[0]; + ItemIndexesPtr[Idx] = IndexCheckT(I[0]); + }; + + command_submit_wrappers::parallel_for_wrapper>( + useShortcutFunction, Queue, Range, KernelFunc); + + Queue.wait(); + if constexpr (std::is_same_v>) { - check("Size seen by user at Dim 0 = ", Range1.get(0), size); + check("Size seen by user at Dim 0 = ", RangePtr->get(0), size); } - check("Counter = ", Counter, size); - for (auto i = 0; i < Size[0]; ++i) { - checkVec<1>(vec(ItemIndexes[i]), vec(i)); + check("Counter = ", *CounterPtr, size); + for (auto i = 0; i < Range[0]; ++i) { + checkVec<1>(vec(ItemIndexesPtr[i]), vec(i)); } std::cout << "Correct kernel indexes used\n"; + + auto Context = Queue.get_context(); + free(RangePtr, Context); + free(CounterPtr, Context); + free(ItemIndexesPtr, Context); } -template void try_2d_range(size_t size) { +template +void try_2d_range(size_t size, bool useShortcutFunction) { using IndexCheckT = int2; - range<2> Size{size, MagicY}; - int Counter = 0; - std::vector ItemIndexes(Size[0] * Size[1]); - { - buffer, 1> BufRange(&Range2, 1); - buffer BufCounter(&Counter, 1); - buffer BufIndexes(ItemIndexes); - queue myQueue; - - myQueue.submit([&](handler &cgh) { - auto AccRange = BufRange.get_access(cgh); - auto AccCounter = BufCounter.get_access(cgh); - auto AccIndexes = BufIndexes.get_access(cgh); - cgh.parallel_for>(Size, [=](KernelIdT I) { - AccCounter[0].fetch_add(1); - if constexpr (std::is_same_v>) - AccRange[0] = sycl::range<2>(I.get_range(0), I.get_range(1)); - int Idx = I[0] * Size[1] + I[1]; - AccIndexes[Idx] = IndexCheckT(I[0], I[1]); - }); - }); - myQueue.wait(); - } + range<2> Range{size, MagicY}; + queue Queue; + + range<2> *RangePtr = malloc_shared>(1, Queue); + int *CounterPtr = malloc_shared(1, Queue); + IndexCheckT *ItemIndexesPtr = + malloc_shared(Range[0] * Range[1], Queue); + + auto KernelFunc = [=](KernelIdT I) { + auto atm = atomic_ref(*CounterPtr); + atm.fetch_add(1); + + if constexpr (std::is_same_v>) + (*RangePtr) = range<2>(I.get_range(0), I.get_range(1)); + int Idx = I[0] * Range[1] + I[1]; + ItemIndexesPtr[Idx] = IndexCheckT(I[0], I[1]); + }; + + command_submit_wrappers::parallel_for_wrapper>( + useShortcutFunction, Queue, Range, KernelFunc); + + Queue.wait(); + if constexpr (std::is_same_v>) { - check("Size seen by user at Dim 0 = ", Range2.get(0), Size[0]); - check("Size seen by user at Dim 1 = ", Range2.get(1), Size[1]); + check("Size seen by user at Dim 0 = ", RangePtr->get(0), Range[0]); + check("Size seen by user at Dim 1 = ", RangePtr->get(1), Range[1]); } - check("Counter = ", Counter, size * MagicY); - for (auto i = 0; i < Size[0]; ++i) - for (auto j = 0; j < Size[1]; ++j) - checkVec<2>(ItemIndexes[i * Size[1] + j], IndexCheckT(i, j)); + check("Counter = ", *CounterPtr, size * MagicY); + for (auto i = 0; i < Range[0]; ++i) + for (auto j = 0; j < Range[1]; ++j) + checkVec<2>(ItemIndexesPtr[i * Range[1] + j], IndexCheckT(i, j)); std::cout << "Correct kernel indexes used\n"; + + auto Context = Queue.get_context(); + free(RangePtr, Context); + free(CounterPtr, Context); + free(ItemIndexesPtr, Context); } -template void try_3d_range(size_t size) { +template +void try_3d_range(size_t size, bool useShortcutFunction) { using IndexCheckT = int3; - range<3> Size{size, MagicY, MagicZ}; - int Counter = 0; - std::vector ItemIndexes(Size[0] * Size[1] * Size[2]); - { - buffer, 1> BufRange(&Range3, 1); - buffer BufCounter(&Counter, 1); - buffer BufIndexes(ItemIndexes); - queue myQueue; - - myQueue.submit([&](handler &cgh) { - auto AccRange = BufRange.get_access(cgh); - auto AccCounter = BufCounter.get_access(cgh); - auto AccIndexes = BufIndexes.get_access(cgh); - cgh.parallel_for>(Size, [=](KernelIdT I) { - AccCounter[0].fetch_add(1); - if constexpr (std::is_same_v>) - AccRange[0] = - sycl::range<3>(I.get_range(0), I.get_range(1), I.get_range(2)); - int Idx = I[0] * Size[1] * Size[2] + I[1] * Size[2] + I[2]; - AccIndexes[Idx] = IndexCheckT(I[0], I[1], I[2]); - }); - }); - myQueue.wait(); - } + range<3> Range{size, MagicY, MagicZ}; + queue Queue; + + range<3> *RangePtr = malloc_shared>(1, Queue); + int *CounterPtr = malloc_shared(1, Queue); + IndexCheckT *ItemIndexesPtr = + malloc_shared(Range[0] * Range[1] * Range[2], Queue); + + auto KernelFunc = [=](KernelIdT I) { + auto atm = atomic_ref(*CounterPtr); + atm.fetch_add(1); + + if constexpr (std::is_same_v>) + (*RangePtr) = range<3>(I.get_range(0), I.get_range(1), I.get_range(2)); + int Idx = I[0] * Range[1] * Range[2] + I[1] * Range[2] + I[2]; + ItemIndexesPtr[Idx] = IndexCheckT(I[0], I[1], I[2]); + }; + + command_submit_wrappers::parallel_for_wrapper>( + useShortcutFunction, Queue, Range, KernelFunc); + + Queue.wait(); + if constexpr (std::is_same_v>) { - check("Size seen by user at Dim 0 = ", Range3.get(0), Size[0]); - check("Size seen by user at Dim 1 = ", Range3.get(1), Size[1]); - check("Size seen by user at Dim 2 = ", Range3.get(2), Size[2]); + check("Size seen by user at Dim 0 = ", RangePtr->get(0), Range[0]); + check("Size seen by user at Dim 1 = ", RangePtr->get(1), Range[1]); + check("Size seen by user at Dim 2 = ", RangePtr->get(2), Range[2]); } - check("Counter = ", Counter, size * MagicY * MagicZ); - for (auto i = 0; i < Size[0]; ++i) - for (auto j = 0; j < Size[1]; ++j) - for (auto k = 0; k < Size[2]; ++k) - checkVec<3>(ItemIndexes[i * Size[1] * Size[2] + j * Size[2] + k], + check("Counter = ", *CounterPtr, size * MagicY * MagicZ); + for (auto i = 0; i < Range[0]; ++i) + for (auto j = 0; j < Range[1]; ++j) + for (auto k = 0; k < Range[2]; ++k) + checkVec<3>(ItemIndexesPtr[i * Range[1] * Range[2] + j * Range[2] + k], IndexCheckT(i, j, k)); std::cout << "Correct kernel indexes used\n"; + + auto Context = Queue.get_context(); + free(RangePtr, Context); + free(CounterPtr, Context); + free(ItemIndexesPtr, Context); } -void try_unnamed_lambda(size_t size) { - range<3> Size{size, MagicY, MagicZ}; - int Counter = 0; - { - buffer, 1> BufRange(&Range3, 1); - buffer BufCounter(&Counter, 1); - queue myQueue; - - myQueue.submit([&](handler &cgh) { - auto AccRange = BufRange.get_access(cgh); - auto AccCounter = BufCounter.get_access(cgh); - cgh.parallel_for(Size, [=](id<3> ID) { - AccCounter[0].fetch_add(1); - AccRange[0][0] = ID[0]; - }); - }); - myQueue.wait(); - } - check("Counter = ", Counter, size * MagicY * MagicZ); +void try_unnamed_lambda(size_t size, bool useShortcutFunction) { + range<3> Range{size, MagicY, MagicZ}; + queue Queue; + + range<3> *RangePtr = malloc_shared>(1, Queue); + int *CounterPtr = malloc_shared(1, Queue); + + auto KernelFunc = [=](id<3> ID) { + auto atm = atomic_ref(*CounterPtr); + atm.fetch_add(1); + (*RangePtr)[0] = ID[0]; + }; + + command_submit_wrappers::parallel_for_wrapper( + useShortcutFunction, Queue, Range, KernelFunc); + + Queue.wait(); + + check("Counter = ", *CounterPtr, size * MagicY * MagicZ); + + auto Context = Queue.get_context(); + free(RangePtr, Context); + free(CounterPtr, Context); } int main() { int x = 1500; - try_1d_range>(x); - try_1d_range>(x); - try_2d_range>(x); - try_2d_range>(x); - try_3d_range>(x); - try_3d_range>(x); - try_unnamed_lambda(x); + try_1d_range>(x, true); + try_1d_range>(x, true); + try_2d_range>(x, true); + try_2d_range>(x, true); + try_3d_range>(x, true); + try_3d_range>(x, true); + try_unnamed_lambda(x, true); + + try_1d_range>(x, false); + try_1d_range>(x, false); + try_2d_range>(x, false); + try_2d_range>(x, false); + try_3d_range>(x, false); + try_3d_range>(x, false); + try_unnamed_lambda(x, false); x = 256; - try_1d_range>(x); - try_1d_range>(x); - try_2d_range>(x); - try_2d_range>(x); - try_3d_range>(x); - try_3d_range>(x); - try_unnamed_lambda(x); + try_1d_range>(x, true); + try_1d_range>(x, true); + try_2d_range>(x, true); + try_2d_range>(x, true); + try_3d_range>(x, true); + try_3d_range>(x, true); + try_unnamed_lambda(x, true); + + try_1d_range>(x, false); + try_1d_range>(x, false); + try_2d_range>(x, false); + try_2d_range>(x, false); + try_3d_range>(x, false); + try_3d_range>(x, false); + try_unnamed_lambda(x, false); } +// CHECK-DEFAULT: parallel_for range adjusted at dim 0 from 1500 to 1504 +// CHECK-DEFAULT-NEXT: Size seen by user at Dim 0 = 1500 +// CHECK-DEFAULT-NEXT: Counter = 1500 +// CHECK-DEFAULT-NEXT: Correct kernel indexes used +// CHECK-DEFAULT-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504 +// CHECK-DEFAULT-NEXT: Counter = 1500 +// CHECK-DEFAULT-NEXT: Correct kernel indexes used +// CHECK-DEFAULT-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504 +// CHECK-DEFAULT-NEXT: Size seen by user at Dim 0 = 1500 +// CHECK-DEFAULT-NEXT: Size seen by user at Dim 1 = 33 +// CHECK-DEFAULT-NEXT: Counter = 49500 +// CHECK-DEFAULT-NEXT: Correct kernel indexes used +// CHECK-DEFAULT-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504 +// CHECK-DEFAULT-NEXT: Counter = 49500 +// CHECK-DEFAULT-NEXT: Correct kernel indexes used +// CHECK-DEFAULT-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504 +// CHECK-DEFAULT-NEXT: Size seen by user at Dim 0 = 1500 +// CHECK-DEFAULT-NEXT: Size seen by user at Dim 1 = 33 +// CHECK-DEFAULT-NEXT: Size seen by user at Dim 2 = 64 +// CHECK-DEFAULT-NEXT: Counter = 3168000 +// CHECK-DEFAULT-NEXT: Correct kernel indexes used +// CHECK-DEFAULT-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504 +// CHECK-DEFAULT-NEXT: Counter = 3168000 +// CHECK-DEFAULT-NEXT: Correct kernel indexes used +// CHECK-DEFAULT-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504 +// CHECK-DEFAULT-NEXT: Counter = 3168000 // CHECK-DEFAULT: parallel_for range adjusted at dim 0 from 1500 to 1504 // CHECK-DEFAULT-NEXT: Size seen by user at Dim 0 = 1500 // CHECK-DEFAULT-NEXT: Counter = 1500 @@ -255,6 +323,25 @@ int main() { // CHECK-DEFAULT-NEXT: Counter = 540672 // CHECK-DEFAULT-NEXT: Correct kernel indexes used // CHECK-DEFAULT-NEXT: Counter = 540672 +// CHECK-DEFAULT-NEXT: Size seen by user at Dim 0 = 256 +// CHECK-DEFAULT-NEXT: Counter = 256 +// CHECK-DEFAULT-NEXT: Correct kernel indexes used +// CHECK-DEFAULT-NEXT: Counter = 256 +// CHECK-DEFAULT-NEXT: Correct kernel indexes used +// CHECK-DEFAULT-NEXT: Size seen by user at Dim 0 = 256 +// CHECK-DEFAULT-NEXT: Size seen by user at Dim 1 = 33 +// CHECK-DEFAULT-NEXT: Counter = 8448 +// CHECK-DEFAULT-NEXT: Correct kernel indexes used +// CHECK-DEFAULT-NEXT: Counter = 8448 +// CHECK-DEFAULT-NEXT: Correct kernel indexes used +// CHECK-DEFAULT-NEXT: Size seen by user at Dim 0 = 256 +// CHECK-DEFAULT-NEXT: Size seen by user at Dim 1 = 33 +// CHECK-DEFAULT-NEXT: Size seen by user at Dim 2 = 64 +// CHECK-DEFAULT-NEXT: Counter = 540672 +// CHECK-DEFAULT-NEXT: Correct kernel indexes used +// CHECK-DEFAULT-NEXT: Counter = 540672 +// CHECK-DEFAULT-NEXT: Correct kernel indexes used +// CHECK-DEFAULT-NEXT: Counter = 540672 // CHECK-EXP: parallel_for range adjusted at dim 0 from 1500 to 1504 // CHECK-EXP-NEXT: Size seen by user at Dim 0 = 1500 @@ -287,6 +374,61 @@ int main() { // CHECK-EXP-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504 // CHECK-EXP-NEXT: parallel_for range adjusted at dim 1 from 33 to 40 // CHECK-EXP-NEXT: Counter = 3168000 +// CHECK-EXP: parallel_for range adjusted at dim 0 from 1500 to 1504 +// CHECK-EXP-NEXT: Size seen by user at Dim 0 = 1500 +// CHECK-EXP-NEXT: Counter = 1500 +// CHECK-EXP-NEXT: Correct kernel indexes used +// CHECK-EXP-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504 +// CHECK-EXP-NEXT: Counter = 1500 +// CHECK-EXP-NEXT: Correct kernel indexes used +// CHECK-EXP-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504 +// CHECK-EXP-NEXT: parallel_for range adjusted at dim 1 from 33 to 48 +// CHECK-EXP-NEXT: Size seen by user at Dim 0 = 1500 +// CHECK-EXP-NEXT: Size seen by user at Dim 1 = 33 +// CHECK-EXP-NEXT: Counter = 49500 +// CHECK-EXP-NEXT: Correct kernel indexes used +// CHECK-EXP-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504 +// CHECK-EXP-NEXT: parallel_for range adjusted at dim 1 from 33 to 48 +// CHECK-EXP-NEXT: Counter = 49500 +// CHECK-EXP-NEXT: Correct kernel indexes used +// CHECK-EXP-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504 +// CHECK-EXP-NEXT: parallel_for range adjusted at dim 1 from 33 to 40 +// CHECK-EXP-NEXT: Size seen by user at Dim 0 = 1500 +// CHECK-EXP-NEXT: Size seen by user at Dim 1 = 33 +// CHECK-EXP-NEXT: Size seen by user at Dim 2 = 64 +// CHECK-EXP-NEXT: Counter = 3168000 +// CHECK-EXP-NEXT: Correct kernel indexes used +// CHECK-EXP-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504 +// CHECK-EXP-NEXT: parallel_for range adjusted at dim 1 from 33 to 40 +// CHECK-EXP-NEXT: Counter = 3168000 +// CHECK-EXP-NEXT: Correct kernel indexes used +// CHECK-EXP-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504 +// CHECK-EXP-NEXT: parallel_for range adjusted at dim 1 from 33 to 40 +// CHECK-EXP-NEXT: Counter = 3168000 +// CHECK-EXP-NEXT: Size seen by user at Dim 0 = 256 +// CHECK-EXP-NEXT: Counter = 256 +// CHECK-EXP-NEXT: Correct kernel indexes used +// CHECK-EXP-NEXT: Counter = 256 +// CHECK-EXP-NEXT: Correct kernel indexes used +// CHECK-EXP-NEXT: parallel_for range adjusted at dim 1 from 33 to 48 +// CHECK-EXP-NEXT: Size seen by user at Dim 0 = 256 +// CHECK-EXP-NEXT: Size seen by user at Dim 1 = 33 +// CHECK-EXP-NEXT: Counter = 8448 +// CHECK-EXP-NEXT: Correct kernel indexes used +// CHECK-EXP-NEXT: parallel_for range adjusted at dim 1 from 33 to 48 +// CHECK-EXP-NEXT: Counter = 8448 +// CHECK-EXP-NEXT: Correct kernel indexes used +// CHECK-EXP-NEXT: parallel_for range adjusted at dim 1 from 33 to 40 +// CHECK-EXP-NEXT: Size seen by user at Dim 0 = 256 +// CHECK-EXP-NEXT: Size seen by user at Dim 1 = 33 +// CHECK-EXP-NEXT: Size seen by user at Dim 2 = 64 +// CHECK-EXP-NEXT: Counter = 540672 +// CHECK-EXP-NEXT: Correct kernel indexes used +// CHECK-EXP-NEXT: parallel_for range adjusted at dim 1 from 33 to 40 +// CHECK-EXP-NEXT: Counter = 540672 +// CHECK-EXP-NEXT: Correct kernel indexes used +// CHECK-EXP-NEXT: parallel_for range adjusted at dim 1 from 33 to 40 +// CHECK-EXP-NEXT: Counter = 540672 // CHECK-EXP-NEXT: Size seen by user at Dim 0 = 256 // CHECK-EXP-NEXT: Counter = 256 // CHECK-EXP-NEXT: Correct kernel indexes used diff --git a/sycl/test-e2e/helpers.hpp b/sycl/test-e2e/helpers.hpp index 0b684b486a8ab..62726474fbf85 100644 --- a/sycl/test-e2e/helpers.hpp +++ b/sycl/test-e2e/helpers.hpp @@ -134,3 +134,18 @@ std::string getVal(const char *name) { return res; } } // namespace env + +namespace command_submit_wrappers { +template +sycl::event parallel_for_wrapper(bool UseShortcutFunction, sycl::queue &Q, + sycl::range Range, + const KernelType &KernelFunc) { + if (UseShortcutFunction) { + return Q.parallel_for(Range, KernelFunc); + } else { + return Q.submit([&](sycl::handler &cgh) { + cgh.parallel_for(Range, KernelFunc); + }); + } +} +} // namespace command_submit_wrappers From 315711533fb59b294962c34a59f39e0ee6e28ac8 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Mon, 24 Nov 2025 14:38:04 +0000 Subject: [PATCH 3/7] Minor test changes --- .../Basic/parallel_for_disable_range_roundup.cpp | 1 + sycl/test-e2e/Basic/parallel_for_range_roundup.cpp | 14 +++++++++----- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/sycl/test-e2e/Basic/parallel_for_disable_range_roundup.cpp b/sycl/test-e2e/Basic/parallel_for_disable_range_roundup.cpp index b3f4d06103138..fd78bc48313fd 100644 --- a/sycl/test-e2e/Basic/parallel_for_disable_range_roundup.cpp +++ b/sycl/test-e2e/Basic/parallel_for_disable_range_roundup.cpp @@ -25,6 +25,7 @@ void try_rounding_off(size_t size, bool useShortcutFunction) { queue Queue; range<1> *RangePtr = malloc_shared>(1, Queue); int *CounterPtr = malloc_shared(1, Queue); + (*CounterPtr) = 0; std::cout << "Run parallel_for" << std::endl; auto KernelFunc = [=](item<1> id) { diff --git a/sycl/test-e2e/Basic/parallel_for_range_roundup.cpp b/sycl/test-e2e/Basic/parallel_for_range_roundup.cpp index 2d529041b2c61..054ad17407c3d 100644 --- a/sycl/test-e2e/Basic/parallel_for_range_roundup.cpp +++ b/sycl/test-e2e/Basic/parallel_for_range_roundup.cpp @@ -38,10 +38,6 @@ using namespace sycl; constexpr size_t MagicY = 33, MagicZ = 64; -range<1> Range1 = {0}; -range<2> Range2 = {0, 0}; -range<3> Range3 = {0, 0, 0}; - template class Kernel1; template class Kernel2; template class Kernel3; @@ -71,6 +67,8 @@ void try_1d_range(size_t size, bool useShortcutFunction) { int *CounterPtr = malloc_shared(1, Queue); IndexCheckT *ItemIndexesPtr = malloc_shared(Range[0], Queue); + (*CounterPtr) = 0; + auto KernelFunc = [=](KernelIdT I) { auto atm = atomic_ref(*CounterPtr); @@ -112,6 +110,8 @@ void try_2d_range(size_t size, bool useShortcutFunction) { IndexCheckT *ItemIndexesPtr = malloc_shared(Range[0] * Range[1], Queue); + (*CounterPtr) = 0; + auto KernelFunc = [=](KernelIdT I) { auto atm = atomic_ref(*CounterPtr); @@ -155,6 +155,8 @@ void try_3d_range(size_t size, bool useShortcutFunction) { IndexCheckT *ItemIndexesPtr = malloc_shared(Range[0] * Range[1] * Range[2], Queue); + (*CounterPtr) = 0; + auto KernelFunc = [=](KernelIdT I) { auto atm = atomic_ref(*CounterPtr); @@ -166,7 +168,7 @@ void try_3d_range(size_t size, bool useShortcutFunction) { ItemIndexesPtr[Idx] = IndexCheckT(I[0], I[1], I[2]); }; - command_submit_wrappers::parallel_for_wrapper>( + command_submit_wrappers::parallel_for_wrapper>( useShortcutFunction, Queue, Range, KernelFunc); Queue.wait(); @@ -197,6 +199,8 @@ void try_unnamed_lambda(size_t size, bool useShortcutFunction) { range<3> *RangePtr = malloc_shared>(1, Queue); int *CounterPtr = malloc_shared(1, Queue); + (*CounterPtr) = 0; + auto KernelFunc = [=](id<3> ID) { auto atm = atomic_ref(*CounterPtr); From 8e490a8040f93f71373e38aa6c0daa1adb8ec04f Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Mon, 24 Nov 2025 15:43:37 +0000 Subject: [PATCH 4/7] Fix include tests --- sycl/test/include_deps/sycl_detail_core.hpp.cpp | 2 +- sycl/test/include_deps/sycl_khr_includes_handler.hpp.cpp | 1 + sycl/test/include_deps/sycl_khr_includes_kernel_bundle.hpp.cpp | 1 + sycl/test/include_deps/sycl_khr_includes_queue.hpp.cpp | 2 +- sycl/test/include_deps/sycl_khr_includes_reduction.hpp.cpp | 2 +- sycl/test/include_deps/sycl_khr_includes_stream.hpp.cpp | 1 + sycl/test/include_deps/sycl_khr_includes_usm.hpp.cpp | 2 +- 7 files changed, 7 insertions(+), 4 deletions(-) diff --git a/sycl/test/include_deps/sycl_detail_core.hpp.cpp b/sycl/test/include_deps/sycl_detail_core.hpp.cpp index 2d0cc14cd5c26..eb375275c85b9 100644 --- a/sycl/test/include_deps/sycl_detail_core.hpp.cpp +++ b/sycl/test/include_deps/sycl_detail_core.hpp.cpp @@ -151,8 +151,8 @@ // CHECK-NEXT: ext/oneapi/bindless_images_mem_handle.hpp // CHECK-NEXT: ext/oneapi/experimental/raw_kernel_arg.hpp // CHECK-NEXT: kernel.hpp -// CHECK-NEXT: sampler.hpp // CHECK-NEXT: range_rounding.hpp +// CHECK-NEXT: sampler.hpp // CHECK-NEXT: sycl_span.hpp // CHECK-NEXT: feature_test.hpp // CHECK-EMPTY: diff --git a/sycl/test/include_deps/sycl_khr_includes_handler.hpp.cpp b/sycl/test/include_deps/sycl_khr_includes_handler.hpp.cpp index 862394bd4f656..ee2b40f5db78b 100644 --- a/sycl/test/include_deps/sycl_khr_includes_handler.hpp.cpp +++ b/sycl/test/include_deps/sycl_khr_includes_handler.hpp.cpp @@ -148,5 +148,6 @@ // CHECK-NEXT: ext/oneapi/device_global/properties.hpp // CHECK-NEXT: ext/oneapi/experimental/raw_kernel_arg.hpp // CHECK-NEXT: kernel.hpp +// CHECK-NEXT: range_rounding.hpp // CHECK-NEXT: sampler.hpp // CHECK-EMPTY: diff --git a/sycl/test/include_deps/sycl_khr_includes_kernel_bundle.hpp.cpp b/sycl/test/include_deps/sycl_khr_includes_kernel_bundle.hpp.cpp index 7f606a58d8a68..a4d934ca352ea 100644 --- a/sycl/test/include_deps/sycl_khr_includes_kernel_bundle.hpp.cpp +++ b/sycl/test/include_deps/sycl_khr_includes_kernel_bundle.hpp.cpp @@ -149,6 +149,7 @@ // CHECK-NEXT: ext/oneapi/device_global/device_global.hpp // CHECK-NEXT: ext/oneapi/device_global/properties.hpp // CHECK-NEXT: ext/oneapi/experimental/raw_kernel_arg.hpp +// CHECK-NEXT: range_rounding.hpp // CHECK-NEXT: sampler.hpp // CHECK-NEXT: sycl_span.hpp // CHECK-NEXT: ext/oneapi/experimental/free_function_traits.hpp diff --git a/sycl/test/include_deps/sycl_khr_includes_queue.hpp.cpp b/sycl/test/include_deps/sycl_khr_includes_queue.hpp.cpp index 846bd0ed4a436..4635b3a1194a4 100644 --- a/sycl/test/include_deps/sycl_khr_includes_queue.hpp.cpp +++ b/sycl/test/include_deps/sycl_khr_includes_queue.hpp.cpp @@ -155,7 +155,7 @@ // CHECK-NEXT: ext/oneapi/bindless_images_mem_handle.hpp // CHECK-NEXT: ext/oneapi/experimental/raw_kernel_arg.hpp // CHECK-NEXT: kernel.hpp -// CHECK-NEXT: sampler.hpp // CHECK-NEXT: range_rounding.hpp +// CHECK-NEXT: sampler.hpp // CHECK-NEXT: sycl_span.hpp // CHECK-EMPTY: diff --git a/sycl/test/include_deps/sycl_khr_includes_reduction.hpp.cpp b/sycl/test/include_deps/sycl_khr_includes_reduction.hpp.cpp index e6b167d5102a0..9ca3f8adcb46e 100644 --- a/sycl/test/include_deps/sycl_khr_includes_reduction.hpp.cpp +++ b/sycl/test/include_deps/sycl_khr_includes_reduction.hpp.cpp @@ -178,12 +178,12 @@ // CHECK-NEXT: ext/oneapi/device_global/properties.hpp // CHECK-NEXT: ext/oneapi/experimental/raw_kernel_arg.hpp // CHECK-NEXT: kernel.hpp +// CHECK-NEXT: range_rounding.hpp // CHECK-NEXT: sampler.hpp // CHECK-NEXT: queue.hpp // CHECK-NEXT: detail/nd_range_view.hpp // CHECK-NEXT: detail/optional.hpp // CHECK-NEXT: ext/oneapi/experimental/event_mode_property.hpp -// CHECK-NEXT: range_rounding.hpp // CHECK-NEXT: sycl_span.hpp // CHECK-NEXT: usm.hpp // CHECK-NEXT: usm/usm_pointer_info.hpp diff --git a/sycl/test/include_deps/sycl_khr_includes_stream.hpp.cpp b/sycl/test/include_deps/sycl_khr_includes_stream.hpp.cpp index efaf1605c801f..9981a03d1ec99 100644 --- a/sycl/test/include_deps/sycl_khr_includes_stream.hpp.cpp +++ b/sycl/test/include_deps/sycl_khr_includes_stream.hpp.cpp @@ -167,5 +167,6 @@ // CHECK-NEXT: ext/oneapi/device_global/properties.hpp // CHECK-NEXT: ext/oneapi/experimental/raw_kernel_arg.hpp // CHECK-NEXT: kernel.hpp +// CHECK-NEXT: range_rounding.hpp // CHECK-NEXT: sampler.hpp // CHECK-EMPTY: diff --git a/sycl/test/include_deps/sycl_khr_includes_usm.hpp.cpp b/sycl/test/include_deps/sycl_khr_includes_usm.hpp.cpp index b24f3577906ae..fe334f7aca888 100644 --- a/sycl/test/include_deps/sycl_khr_includes_usm.hpp.cpp +++ b/sycl/test/include_deps/sycl_khr_includes_usm.hpp.cpp @@ -170,8 +170,8 @@ // CHECK-NEXT: ext/oneapi/bindless_images_mem_handle.hpp // CHECK-NEXT: ext/oneapi/experimental/raw_kernel_arg.hpp // CHECK-NEXT: kernel.hpp -// CHECK-NEXT: sampler.hpp // CHECK-NEXT: range_rounding.hpp +// CHECK-NEXT: sampler.hpp // CHECK-NEXT: sycl_span.hpp // CHECK-NEXT: usm/usm_pointer_info.hpp // CHECK-NEXT: usm/usm_allocator.hpp From 17729b84c406d75aa1ffa826013b5512d4a081f1 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Tue, 25 Nov 2025 09:55:25 +0000 Subject: [PATCH 5/7] Move TransformUserItemType --- sycl/include/sycl/handler.hpp | 13 ++++++++++--- sycl/include/sycl/queue.hpp | 9 +-------- sycl/include/sycl/range_rounding.hpp | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 58c0f11c627db..04137d59879e4 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -403,6 +403,13 @@ template bool range_size_fits_in_size_t(const range &r) { return true; } +template struct TransformUserItemType { + using type = std::conditional_t< + std::is_convertible_v, LambdaArgType>, nd_item, + std::conditional_t, LambdaArgType>, + item, LambdaArgType>>; +}; + } // namespace detail /// Command group handler class. @@ -1038,6 +1045,7 @@ class __SYCL_EXPORT handler { device get_device() const; +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES template struct TransformUserItemType { using type = std::conditional_t< std::is_convertible_v, LambdaArgType>, nd_item, @@ -1045,7 +1053,6 @@ class __SYCL_EXPORT handler { item, LambdaArgType>>; }; -#ifndef __INTEL_PREVIEW_BREAKING_CHANGES std::optional> getMaxWorkGroups(); // We need to use this version to support gcc 7.5.0. Remove when minimal // supported gcc version is bumped. @@ -1214,7 +1221,7 @@ class __SYCL_EXPORT handler { // sycl::item/sycl::nd_item to transport item information using TransformedArgType = std::conditional_t< std::is_integral::value && Dims == 1, item, - typename TransformUserItemType::type>; + typename detail::TransformUserItemType::type>; static_assert(!std::is_same_v>, "Kernel argument cannot have a sycl::nd_item type in " @@ -1765,7 +1772,7 @@ class __SYCL_EXPORT handler { using LambdaArgType = sycl::detail::lambda_arg_type>; using TransformedArgType = std::conditional_t< std::is_integral::value && Dims == 1, item, - typename TransformUserItemType::type>; + typename detail::TransformUserItemType::type>; wrap_kernel(KernelFunc, {} /*Props*/, NumWorkItems, WorkItemOffset); } diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 81890a3b6e9a7..76d50e222abef 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -162,13 +162,6 @@ class __SYCL_EXPORT SubmissionInfo { } // namespace v1 -template struct TransformUserItemType { - using type = std::conditional_t< - std::is_convertible_v, LambdaArgType>, nd_item, - std::conditional_t, LambdaArgType>, - item, LambdaArgType>>; -}; - template Range, // sycl::item/sycl::nd_item to transport item information using TransformedArgType = std::conditional_t< std::is_integral::value && Dims == 1, item, - typename TransformUserItemType::type>; + typename detail::TransformUserItemType::type>; static_assert(!std::is_same_v>, "Kernel argument cannot have a sycl::nd_item type in " diff --git a/sycl/include/sycl/range_rounding.hpp b/sycl/include/sycl/range_rounding.hpp index 1e30269fb20bb..f19877760bc83 100644 --- a/sycl/include/sycl/range_rounding.hpp +++ b/sycl/include/sycl/range_rounding.hpp @@ -158,4 +158,4 @@ std::tuple, bool> getRoundedRange(range UserRange, } // namespace detail } // namespace _V1 -} // namespace sycl \ No newline at end of file +} // namespace sycl From 08f74311d5c806fdefee1a5cb26b2d753898df03 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Tue, 25 Nov 2025 11:27:59 +0000 Subject: [PATCH 6/7] Move some range rounding utils from handler.hpp to range_rounding.hpp --- sycl/include/sycl/handler.hpp | 118 ------------------------- sycl/include/sycl/range_rounding.hpp | 125 ++++++++++++++++++++++++++- 2 files changed, 124 insertions(+), 119 deletions(-) diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 04137d59879e4..cf9df9e5f1734 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -263,124 +263,6 @@ __SYCL_EXPORT void *getValueFromDynamicParameter( ext::oneapi::experimental::detail::dynamic_parameter_base &DynamicParamBase); -template class RoundedRangeIDGenerator { - id Id; - id InitId; - range UserRange; - range RoundedRange; - bool Done = false; - -public: - RoundedRangeIDGenerator(const id &Id, const range &UserRange, - const range &RoundedRange) - : Id(Id), InitId(Id), UserRange(UserRange), RoundedRange(RoundedRange) { - for (int i = 0; i < Dims; ++i) - if (Id[i] >= UserRange[i]) - Done = true; - } - - explicit operator bool() { return !Done; } - - void updateId() { - for (int i = 0; i < Dims; ++i) { - Id[i] += RoundedRange[i]; - if (Id[i] < UserRange[i]) - return; - Id[i] = InitId[i]; - } - Done = true; - } - - id getId() { return Id; } - - template auto getItem() { - if constexpr (std::is_invocable_v &> || - std::is_invocable_v &, kernel_handler>) - return detail::Builder::createItem(UserRange, getId(), {}); - else { - static_assert(std::is_invocable_v &> || - std::is_invocable_v &, - kernel_handler>, - "Kernel must be invocable with an item!"); - return detail::Builder::createItem(UserRange, getId()); - } - } -}; - -// TODO: The wrappers can be optimized further so that the body -// essentially looks like this: -// for (auto z = it[2]; z < UserRange[2]; z += it.get_range(2)) -// for (auto y = it[1]; y < UserRange[1]; y += it.get_range(1)) -// for (auto x = it[0]; x < UserRange[0]; x += it.get_range(0)) -// KernelFunc({x,y,z}); -template -class RoundedRangeKernel { -public: - range UserRange; - KernelType KernelFunc; - void operator()(item It) const { - auto RoundedRange = It.get_range(); - for (RoundedRangeIDGenerator Gen(It.get_id(), UserRange, RoundedRange); Gen; - Gen.updateId()) { - auto item = Gen.template getItem(); - KernelFunc(item); - } - } - - // Copy the properties_tag getter from the original kernel to propagate - // property(s) - template < - typename T = KernelType, - typename = std::enable_if_t::value>> - auto get(ext::oneapi::experimental::properties_tag) const { - return KernelFunc.get(ext::oneapi::experimental::properties_tag{}); - } -}; - -template -class RoundedRangeKernelWithKH { -public: - range UserRange; - KernelType KernelFunc; - void operator()(item It, kernel_handler KH) const { - auto RoundedRange = It.get_range(); - for (RoundedRangeIDGenerator Gen(It.get_id(), UserRange, RoundedRange); Gen; - Gen.updateId()) { - auto item = Gen.template getItem(); - KernelFunc(item, KH); - } - } - - // Copy the properties_tag getter from the original kernel to propagate - // property(s) - template < - typename T = KernelType, - typename = std::enable_if_t::value>> - auto get(ext::oneapi::experimental::properties_tag) const { - return KernelFunc.get(ext::oneapi::experimental::properties_tag{}); - } -}; - -template ::value> * = nullptr> -auto getRangeRoundedKernelLambda(KernelType KernelFunc, range UserRange) { - return detail::RoundedRangeKernelWithKH{ - UserRange, KernelFunc}; -} - -template ::value> * = nullptr> -auto getRangeRoundedKernelLambda(KernelType KernelFunc, range UserRange) { - return detail::RoundedRangeKernel{ - UserRange, KernelFunc}; -} - using std::enable_if_t; using sycl::detail::queue_impl; diff --git a/sycl/include/sycl/range_rounding.hpp b/sycl/include/sycl/range_rounding.hpp index f19877760bc83..13e8f64f03b75 100644 --- a/sycl/include/sycl/range_rounding.hpp +++ b/sycl/include/sycl/range_rounding.hpp @@ -9,17 +9,140 @@ #pragma once #include +#include #include #include +#include +#include + #include +#include -#include // for size_t +#include namespace sycl { inline namespace _V1 { namespace detail { +template class RoundedRangeIDGenerator { + id Id; + id InitId; + range UserRange; + range RoundedRange; + bool Done = false; + +public: + RoundedRangeIDGenerator(const id &Id, const range &UserRange, + const range &RoundedRange) + : Id(Id), InitId(Id), UserRange(UserRange), RoundedRange(RoundedRange) { + for (int i = 0; i < Dims; ++i) + if (Id[i] >= UserRange[i]) + Done = true; + } + + explicit operator bool() { return !Done; } + + void updateId() { + for (int i = 0; i < Dims; ++i) { + Id[i] += RoundedRange[i]; + if (Id[i] < UserRange[i]) + return; + Id[i] = InitId[i]; + } + Done = true; + } + + id getId() { return Id; } + + template auto getItem() { + if constexpr (std::is_invocable_v &> || + std::is_invocable_v &, kernel_handler>) + return detail::Builder::createItem(UserRange, getId(), {}); + else { + static_assert(std::is_invocable_v &> || + std::is_invocable_v &, + kernel_handler>, + "Kernel must be invocable with an item!"); + return detail::Builder::createItem(UserRange, getId()); + } + } +}; + +// TODO: The wrappers can be optimized further so that the body +// essentially looks like this: +// for (auto z = it[2]; z < UserRange[2]; z += it.get_range(2)) +// for (auto y = it[1]; y < UserRange[1]; y += it.get_range(1)) +// for (auto x = it[0]; x < UserRange[0]; x += it.get_range(0)) +// KernelFunc({x,y,z}); +template +class RoundedRangeKernel { +public: + range UserRange; + KernelType KernelFunc; + void operator()(item It) const { + auto RoundedRange = It.get_range(); + for (RoundedRangeIDGenerator Gen(It.get_id(), UserRange, RoundedRange); Gen; + Gen.updateId()) { + auto item = Gen.template getItem(); + KernelFunc(item); + } + } + + // Copy the properties_tag getter from the original kernel to propagate + // property(s) + template < + typename T = KernelType, + typename = std::enable_if_t::value>> + auto get(ext::oneapi::experimental::properties_tag) const { + return KernelFunc.get(ext::oneapi::experimental::properties_tag{}); + } +}; + +template +class RoundedRangeKernelWithKH { +public: + range UserRange; + KernelType KernelFunc; + void operator()(item It, kernel_handler KH) const { + auto RoundedRange = It.get_range(); + for (RoundedRangeIDGenerator Gen(It.get_id(), UserRange, RoundedRange); Gen; + Gen.updateId()) { + auto item = Gen.template getItem(); + KernelFunc(item, KH); + } + } + + // Copy the properties_tag getter from the original kernel to propagate + // property(s) + template < + typename T = KernelType, + typename = std::enable_if_t::value>> + auto get(ext::oneapi::experimental::properties_tag) const { + return KernelFunc.get(ext::oneapi::experimental::properties_tag{}); + } +}; + +template ::value> * = nullptr> +auto getRangeRoundedKernelLambda(KernelType KernelFunc, range UserRange) { + return detail::RoundedRangeKernelWithKH{ + UserRange, KernelFunc}; +} + +template ::value> * = nullptr> +auto getRangeRoundedKernelLambda(KernelType KernelFunc, range UserRange) { + return detail::RoundedRangeKernel{ + UserRange, KernelFunc}; +} + void __SYCL_EXPORT GetRangeRoundingSettings(size_t &MinFactor, size_t &GoodFactor, size_t &MinRange); From d633b7584508634a9c323c971369435dcae8005d Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Tue, 25 Nov 2025 11:48:07 +0000 Subject: [PATCH 7/7] Add missing includes --- sycl/include/sycl/range_rounding.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sycl/include/sycl/range_rounding.hpp b/sycl/include/sycl/range_rounding.hpp index 13e8f64f03b75..30408c495cd31 100644 --- a/sycl/include/sycl/range_rounding.hpp +++ b/sycl/include/sycl/range_rounding.hpp @@ -8,11 +8,15 @@ #pragma once +#include #include #include #include #include +#include #include +#include +#include #include #include