Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
74 commits
Select commit Hold shift + click to select a range
3223842
[SYCL] Handler-less kernel submit API
slawekptak Jul 3, 2025
fde19ca
Fix formatting
slawekptak Jul 3, 2025
13424de
Fix formatting
slawekptak Jul 4, 2025
fbc789d
Change the ExtendedSubmissionInfo to KernelRuntimeInfo,
slawekptak Jul 7, 2025
591b3ec
Added copy/move constructor and assignment operator
slawekptak Jul 8, 2025
d235b7c
Merge branch 'sycl' into no_handler_lib_entry
slawekptak Jul 8, 2025
6641601
Add a no event submit and no handler compile flag
slawekptak Jul 11, 2025
0f41d5a
Merge branch 'sycl' into no_handler_lib_entry
slawekptak Jul 14, 2025
a6e711e
Added a new configure option to build no handler submit path, changed
slawekptak Jul 14, 2025
9c8040e
Host task dependency test
slawekptak Jul 17, 2025
31cbdb9
Add a check for special captures
slawekptak Jul 18, 2025
c5cd091
Merge branch 'sycl' into no_handler_lib_entry
slawekptak Aug 11, 2025
998d592
Switch to the common kernel wrappers, fix the KRInfo function call
slawekptak Aug 11, 2025
4000c07
Enable no handler in the preview lib build, add no handler unit
slawekptak Aug 12, 2025
f8e9cd6
Merge branch 'sycl' into no_handler_lib_entry
slawekptak Aug 12, 2025
01af8bb
Unused argument fix and IsTopCodeLoc assignment
slawekptak Aug 12, 2025
4469e59
Implemented the barrier and un-enqueued commands synchronization
slawekptak Aug 13, 2025
ac1a5cf
Fix formatting
slawekptak Aug 13, 2025
5865f3a
Fixed #ifdef, added comment to a new function.
slawekptak Aug 13, 2025
072803c
Merge branch 'sycl' into no_handler_lib_entry
slawekptak Aug 19, 2025
27b3110
Address review comments
slawekptak Aug 20, 2025
9041e94
Updated Linux symbols
slawekptak Aug 21, 2025
ac2c5bb
Addressed more review comments
slawekptak Aug 21, 2025
8e155fb
Fix formatting
slawekptak Aug 21, 2025
502f637
Fix formatting, remove unused properties argument
slawekptak Aug 21, 2025
d708c93
Fix ProcessKernelRuntimeInfo call
slawekptak Aug 21, 2025
e9f6e4e
Fix unit test build and ProcessKernelRuntimeInfo calls
slawekptak Aug 21, 2025
057a7a5
Fix formatting
slawekptak Aug 21, 2025
77d92ca
Added single_task shortcut function support for no-handler
slawekptak Aug 22, 2025
85aaa5c
Fix formatting
slawekptak Aug 25, 2025
a54422a
Merge branch 'sycl' into no_handler_lib_entry
slawekptak Aug 29, 2025
967d35e
Update KernelRuntimeInfo - change kernel name based cache pointer
slawekptak Aug 29, 2025
ec1ef89
Switch to DeviceKernelInfo use on the no-handler path
slawekptak Aug 29, 2025
1f95b9b
KernelName fix
slawekptak Aug 29, 2025
12ef6da
Update Windows symbols
slawekptak Aug 29, 2025
2980531
Split the kernel submit code into a command submission lambda and gen…
slawekptak Sep 1, 2025
01e0f9f
Fix formatting
slawekptak Sep 2, 2025
63d1345
Rename submit_generic_direct to submit_direct
slawekptak Sep 2, 2025
4001fea
Fix unused Props argument
slawekptak Sep 4, 2025
6c9525b
Update Linux symbols
slawekptak Sep 4, 2025
f871b10
Merge branch 'sycl' into no_handler_lib_entry
slawekptak Sep 16, 2025
18df56b
Define the SubmitCommandFuncType template type and rebase
slawekptak Sep 16, 2025
3375e77
Use the KernelData structure in the no-handler path
slawekptak Sep 17, 2025
72dc199
Rename KernelRuntimeInfo to KernelDataDesc
slawekptak Sep 17, 2025
9715916
Merge branch 'sycl' into no_handler_lib_entry
slawekptak Sep 17, 2025
177277b
Minor changes in the template variables
slawekptak Sep 17, 2025
eb9a5d6
Remove unused type
slawekptak Sep 17, 2025
1f8ea92
Remove KernelDataDesc and pass the arguments directly,
slawekptak Sep 22, 2025
74438ae
Merge branch 'sycl' into no_handler_lib_entry
slawekptak Sep 22, 2025
0e48eb6
Code cleanup
slawekptak Sep 22, 2025
0d47ac7
Update Linux symbols
slawekptak Sep 22, 2025
ab6812a
Update Windows symbols
slawekptak Sep 22, 2025
a35286d
Address review comments
slawekptak Sep 22, 2025
27a5cf5
Rvalue reference for KernelData
slawekptak Sep 22, 2025
9144f84
Add a static_assert to check if properties are empty.
slawekptak Sep 23, 2025
ca0b632
Remove unused function
slawekptak Sep 23, 2025
42e2b30
Fix unused parameter
slawekptak Sep 23, 2025
943f1f7
Merge two overloads of submit_kernel_direct_impl
slawekptak Sep 23, 2025
76bcaf2
Template instantiations for submit_kernel_direct_with_event_impl
slawekptak Sep 24, 2025
6588fe8
Change kernel direct submit functions in queue_impl to templates
slawekptak Sep 24, 2025
3c0e33c
Update Linux symbols
slawekptak Sep 24, 2025
8a20b8a
Update Windows symbols
slawekptak Sep 24, 2025
2be3d3d
Convert the kernel direct submit functions to free functions
slawekptak Sep 25, 2025
f139c93
Minor fixes
slawekptak Sep 25, 2025
8023ec1
Add missing calls.
slawekptak Sep 25, 2025
de94db6
Remove extern template definitions
slawekptak Sep 26, 2025
066b421
Consolidate the event-based and event-less functions
slawekptak Sep 26, 2025
eed0591
Make free functions from the queue kernel direct submit methods
slawekptak Sep 26, 2025
552f448
Address review comments
slawekptak Sep 26, 2025
f5c0d77
Update Linux symbols
slawekptak Sep 26, 2025
b442d37
ifdef fix
slawekptak Sep 26, 2025
5fa8ccc
Removed unused function declaration
slawekptak Sep 26, 2025
fa6d2f8
Export template instantiations
slawekptak Sep 26, 2025
f04ed3f
Update Windows symbols
slawekptak Sep 26, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 22 additions & 6 deletions sycl/cmake/modules/AddSYCLUnitTest.cmake
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Internal function to create SYCL unit tests with code reuse
# add_sycl_unittest_internal(test_dirname SHARED|OBJECT is_preview file1.cpp, file2.cpp ...)
function(add_sycl_unittest_internal test_dirname link_variant is_preview)
# add_sycl_unittest_internal(test_dirname SHARED|OBJECT is_preview is_no_cgh file1.cpp, file2.cpp ...)
function(add_sycl_unittest_internal test_dirname link_variant is_preview is_no_cgh)
# Enable exception handling for these unit tests
set(LLVM_REQUIRES_EH ON)
set(LLVM_REQUIRES_RTTI ON)
Expand Down Expand Up @@ -34,7 +34,11 @@ function(add_sycl_unittest_internal test_dirname link_variant is_preview)
# Chaning CMAKE_CURRENT_BINARY_DIR should not affect this variable in its
# parent scope.
if (${is_preview})
set(CMAKE_CURRENT_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/Preview")
set(CMAKE_CURRENT_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/Preview")
endif()

if (${is_no_cgh})
set(CMAKE_CURRENT_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/NoCGH")
endif()

if ("${link_variant}" MATCHES "SHARED")
Expand Down Expand Up @@ -65,6 +69,18 @@ function(add_sycl_unittest_internal test_dirname link_variant is_preview)
set(sycl_cache_suffix "_preview")
endif()

if (${is_no_cgh})
set(sycl_cache_suffix "_no_cgh")
endif()

if (${is_no_cgh})
target_compile_definitions(
${test_dirname}
PRIVATE
__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
)
endif()

if (SYCL_ENABLE_XPTI_TRACING)
target_compile_definitions(${test_dirname}
PRIVATE XPTI_ENABLE_INSTRUMENTATION XPTI_STATIC_LIBRARY)
Expand Down Expand Up @@ -150,7 +166,6 @@ function(add_sycl_unittest_internal test_dirname link_variant is_preview)
-Wno-inconsistent-missing-override
)
endif()

target_compile_definitions(${test_dirname} PRIVATE SYCL_DISABLE_FSYCL_SYCLHPP_WARNING)
endfunction()

Expand All @@ -160,6 +175,7 @@ endfunction()
# the SYCL preview features enabled.
# Produces two binaries, named `basename(test_name_prefix_non_preview)` and `basename(test_name_prefix_preview)`
macro(add_sycl_unittest test_name_prefix link_variant)
add_sycl_unittest_internal(${test_name_prefix}_non_preview ${link_variant} FALSE ${ARGN})
add_sycl_unittest_internal(${test_name_prefix}_preview ${link_variant} TRUE ${ARGN})
add_sycl_unittest_internal(${test_name_prefix}_non_preview ${link_variant} FALSE FALSE ${ARGN})
add_sycl_unittest_internal(${test_name_prefix}_no_cgh ${link_variant} FALSE TRUE ${ARGN})
add_sycl_unittest_internal(${test_name_prefix}_preview ${link_variant} TRUE FALSE ${ARGN})
endmacro()
38 changes: 30 additions & 8 deletions sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -259,10 +259,19 @@ template <typename KernelName = sycl::detail::auto_name, int Dimensions,
typename KernelType, typename... ReductionsT>
void nd_launch(queue Q, nd_range<Dimensions> Range, const KernelType &KernelObj,
ReductionsT &&...Reductions) {
submit(std::move(Q), [&](handler &CGH) {
nd_launch<KernelName>(CGH, Range, KernelObj,
std::forward<ReductionsT>(Reductions)...);
});
#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
// TODO The handler-less path does not support reductions yet.
if constexpr (sizeof...(ReductionsT) == 0) {
detail::submit_kernel_direct<KernelName>(std::move(Q), empty_properties_t{},
Range, KernelObj);
} else
#endif
{
submit(std::move(Q), [&](handler &CGH) {
nd_launch<KernelName>(CGH, Range, KernelObj,
std::forward<ReductionsT>(Reductions)...);
});
}
}

template <typename KernelName = sycl::detail::auto_name, int Dimensions,
Expand All @@ -283,10 +292,23 @@ template <typename KernelName = sycl::detail::auto_name, int Dimensions,
typename Properties, typename KernelType, typename... ReductionsT>
void nd_launch(queue Q, launch_config<nd_range<Dimensions>, Properties> Config,
const KernelType &KernelObj, ReductionsT &&...Reductions) {
submit(std::move(Q), [&](handler &CGH) {
nd_launch<KernelName>(CGH, Config, KernelObj,
std::forward<ReductionsT>(Reductions)...);
});
#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
// TODO The handler-less path does not support reductions yet.
if constexpr (sizeof...(ReductionsT) == 0) {
ext::oneapi::experimental::detail::LaunchConfigAccess<nd_range<Dimensions>,
Properties>
ConfigAccess(Config);
detail::submit_kernel_direct<KernelName>(
std::move(Q), ConfigAccess.getProperties(), ConfigAccess.getRange(),
KernelObj);
} else
#endif
{
submit(std::move(Q), [&](handler &CGH) {
nd_launch<KernelName>(CGH, Config, KernelObj,
std::forward<ReductionsT>(Reductions)...);
});
}
}

template <int Dimensions, typename... ArgsT>
Expand Down
18 changes: 18 additions & 0 deletions sycl/include/sycl/khr/free_function_commands.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -153,27 +153,45 @@ void launch_grouped(const queue &q, range<1> r, range<1> size,
const KernelType &k,
const sycl::detail::code_location &codeLoc =
sycl::detail::code_location::current()) {
#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
detail::submit_kernel_direct(q,
ext::oneapi::experimental::empty_properties_t{},
nd_range<1>(r, size), k);
#else
submit(
q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); },
codeLoc);
#endif
}
template <typename KernelType>
void launch_grouped(const queue &q, range<2> r, range<2> size,
const KernelType &k,
const sycl::detail::code_location &codeLoc =
sycl::detail::code_location::current()) {
#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
detail::submit_kernel_direct(q,
ext::oneapi::experimental::empty_properties_t{},
nd_range<2>(r, size), k);
#else
submit(
q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); },
codeLoc);
#endif
}
template <typename KernelType>
void launch_grouped(const queue &q, range<3> r, range<3> size,
const KernelType &k,
const sycl::detail::code_location &codeLoc =
sycl::detail::code_location::current()) {
#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
detail::submit_kernel_direct(q,
ext::oneapi::experimental::empty_properties_t{},
nd_range<3>(r, size), k);
#else
submit(
q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); },
codeLoc);
#endif
}

template <typename... Args>
Expand Down
79 changes: 74 additions & 5 deletions sycl/include/sycl/queue.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,20 @@ template <backend BackendName, class SyclObjectT>
auto get_native(const SyclObjectT &Obj)
-> backend_return_t<BackendName, SyclObjectT>;

template <int Dims>
event __SYCL_EXPORT submit_kernel_direct_with_event_impl(
const queue &Queue, const nd_range<Dims> &Range,
std::shared_ptr<detail::HostKernelBase> &HostKernel,
detail::DeviceKernelInfo *DeviceKernelInfo,
const detail::code_location &CodeLoc, bool IsTopCodeLoc);

template <int Dims>
void __SYCL_EXPORT submit_kernel_direct_without_event_impl(
const queue &Queue, const nd_range<Dims> &Range,
std::shared_ptr<detail::HostKernelBase> &HostKernel,
detail::DeviceKernelInfo *DeviceKernelInfo,
const detail::code_location &CodeLoc, bool IsTopCodeLoc);

namespace detail {
class queue_impl;

Expand Down Expand Up @@ -141,6 +155,51 @@ class __SYCL_EXPORT SubmissionInfo {
};

} // namespace v1

template <typename KernelName = detail::auto_name, bool EventNeeded = false,
typename PropertiesT, typename KernelType, int Dims>
auto submit_kernel_direct(
const queue &Queue, PropertiesT Props, const nd_range<Dims> &Range,
const KernelType &KernelFunc,
const detail::code_location &CodeLoc = detail::code_location::current()) {
// TODO Properties not supported yet
(void)Props;
static_assert(
std::is_same_v<PropertiesT,
ext::oneapi::experimental::empty_properties_t>,
"Setting properties not supported yet for no-CGH kernel submit.");
detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);

using NameT =
typename detail::get_kernel_name_t<KernelName, KernelType>::name;
using LambdaArgType =
sycl::detail::lambda_arg_type<KernelType, nd_item<Dims>>;
static_assert(
std::is_convertible_v<sycl::nd_item<Dims>, LambdaArgType>,
"Kernel argument of a sycl::parallel_for with sycl::nd_range "
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could the text be altered in the subsequent patches, as this code can be called not only from parallel_for?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, the plan is to extend this to other functions once parallel_for(nd_range) is complete.

"must be either sycl::nd_item or be convertible from sycl::nd_item");
using TransformedArgType = sycl::nd_item<Dims>;

std::shared_ptr<detail::HostKernelBase> HostKernel = std::make_shared<
detail::HostKernel<KernelType, TransformedArgType, Dims>>(KernelFunc);

detail::DeviceKernelInfo *DeviceKernelInfoPtr =
&detail::getDeviceKernelInfo<NameT>();

detail::KernelWrapper<detail::WrapAs::parallel_for, NameT, KernelType,
TransformedArgType, PropertiesT>::wrap(KernelFunc);

if constexpr (EventNeeded) {
return submit_kernel_direct_with_event_impl(
Queue, Range, HostKernel, DeviceKernelInfoPtr,
TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel());
} else {
submit_kernel_direct_without_event_impl(
Queue, Range, HostKernel, DeviceKernelInfoPtr,
TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel());
}
}

} // namespace detail

namespace ext ::oneapi ::experimental {
Expand Down Expand Up @@ -3203,11 +3262,21 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
parallel_for(nd_range<Dims> Range, RestT &&...Rest) {
constexpr detail::code_location CodeLoc = getCodeLocation<KernelName>();
detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
return submit(
[&](handler &CGH) {
CGH.template parallel_for<KernelName>(Range, Rest...);
},
TlsCodeLocCapture.query());
#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
// TODO The handler-less path does not support reductions yet.
if constexpr (sizeof...(RestT) == 1) {
return detail::submit_kernel_direct<KernelName, true>(
*this, ext::oneapi::experimental::empty_properties_t{}, Range,
Rest...);
} else
#endif
{
return submit(
[&](handler &CGH) {
CGH.template parallel_for<KernelName>(Range, Rest...);
},
TlsCodeLocCapture.query());
}
}

/// parallel_for version with a kernel represented as a lambda + nd_range that
Expand Down
100 changes: 100 additions & 0 deletions sycl/source/detail/queue_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -417,6 +417,106 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
return EventImpl;
}

detail::EventImplPtr queue_impl::submit_kernel_direct_impl(
const NDRDescT &NDRDesc,
std::shared_ptr<detail::HostKernelBase> &HostKernel,
detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent,
const detail::code_location &CodeLoc, bool IsTopCodeLoc) {

KernelData KData;

KData.setDeviceKernelInfoPtr(DeviceKernelInfo);
KData.setKernelFunc(HostKernel->getPtr());
KData.setNDRDesc(NDRDesc);

auto SubmitKernelFunc =
[&](detail::CG::StorageInitHelper &CGData) -> EventImplPtr {
std::unique_ptr<detail::CG> CommandGroup;
std::vector<std::shared_ptr<detail::stream_impl>> StreamStorage;
std::vector<std::shared_ptr<const void>> AuxiliaryResources;

KData.extractArgsAndReqsFromLambda();

CommandGroup.reset(new detail::CGExecKernel(
KData.getNDRDesc(), HostKernel,
nullptr, // Kernel
nullptr, // KernelBundle
std::move(CGData), std::move(KData).getArgs(),
*KData.getDeviceKernelInfoPtr(), std::move(StreamStorage),
std::move(AuxiliaryResources), detail::CGType::Kernel,
UR_KERNEL_CACHE_CONFIG_DEFAULT,
false, // KernelIsCooperative
false, // KernelUsesClusterLaunch
0, // KernelWorkGroupMemorySize
CodeLoc));
CommandGroup->MIsTopCodeLoc = IsTopCodeLoc;

return detail::Scheduler::getInstance().addCG(std::move(CommandGroup),
*this, true);
};

return submit_direct(CallerNeedsEvent, SubmitKernelFunc);
}

template <typename SubmitCommandFuncType>
detail::EventImplPtr
queue_impl::submit_direct(bool CallerNeedsEvent,
SubmitCommandFuncType &SubmitCommandFunc) {
detail::CG::StorageInitHelper CGData;
std::unique_lock<std::mutex> Lock(MMutex);

// Graphs are not supported yet for the no-handler path
assert(!hasCommandGraph());

// Set the No Last Event Mode to false, since the no-handler path
// does not support it yet.
MNoLastEventMode.store(false, std::memory_order_relaxed);

// Used by queue_empty() and getLastEvent()
MEmpty.store(false, std::memory_order_release);

// Sync with an external event
std::optional<event> ExternalEvent = popExternalEvent();
if (ExternalEvent) {
CGData.MEvents.push_back(getSyclObjImpl(*ExternalEvent));
}

// Sync with the last event for in order queue
EventImplPtr &LastEvent = MDefaultGraphDeps.LastEventPtr;
if (isInOrder() && LastEvent) {
CGData.MEvents.push_back(LastEvent);
}

// Barrier and un-enqueued commands synchronization for out or order queue
if (!isInOrder()) {
MMissedCleanupRequests.unset(
[&](MissedCleanupRequestsType &MissedCleanupRequests) {
for (auto &UpdatedGraph : MissedCleanupRequests)
doUnenqueuedCommandCleanup(UpdatedGraph);
MissedCleanupRequests.clear();
});

if (MDefaultGraphDeps.LastBarrier &&
!MDefaultGraphDeps.LastBarrier->isEnqueued()) {
CGData.MEvents.push_back(MDefaultGraphDeps.LastBarrier);
}
}

EventImplPtr EventImpl = SubmitCommandFunc(CGData);

// Sync with the last event for in order queue
if (isInOrder() && !EventImpl->isDiscarded()) {
LastEvent = EventImpl;
}

// Barrier and un-enqueued commands synchronization for out or order queue
if (!isInOrder() && !EventImpl->isEnqueued()) {
MDefaultGraphDeps.UnenqueuedCmdEvents.push_back(EventImpl);
}

return CallerNeedsEvent ? EventImpl : nullptr;
}

template <typename HandlerFuncT>
event queue_impl::submitWithHandler(const std::vector<event> &DepEvents,
bool CallerNeedsEvent,
Expand Down
Loading