From 1d1bf166284cc72a5285c6938319cf846b29922f Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Mon, 29 Sep 2025 09:39:16 +0000 Subject: [PATCH 01/16] [SYCL] Add scheduler-bypass for handler-less kernel submission path The handler-less kernel submission path has been extended to support the fast, scheduler-bypass submission. --- sycl/source/detail/queue_impl.cpp | 112 ++++++++++++++++++++++-------- 1 file changed, 83 insertions(+), 29 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 79769d8819000..fc62f0ab3fd80 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -432,30 +432,75 @@ detail::EventImplPtr queue_impl::submit_kernel_direct_impl( KData.setKernelFunc(HostKernel->getPtr()); KData.setNDRDesc(NDRDesc); - auto SubmitKernelFunc = - [&](detail::CG::StorageInitHelper &CGData) -> EventImplPtr { - std::unique_ptr CommandGroup; - std::vector> StreamStorage; - std::vector> AuxiliaryResources; - - KData.extractArgsAndReqsFromLambda(); - - CommandGroup.reset(new detail::CGExecKernel( - KData.getNDRDesc(), HostKernel, - nullptr, // Kernel - nullptr, // KernelBundle - std::move(CGData), std::move(KData).getArgs(), - *KData.getDeviceKernelInfoPtr(), std::move(StreamStorage), - std::move(AuxiliaryResources), detail::CGType::Kernel, - UR_KERNEL_CACHE_CONFIG_DEFAULT, - false, // KernelIsCooperative - false, // KernelUsesClusterLaunch - 0, // KernelWorkGroupMemorySize - CodeLoc)); - CommandGroup->MIsTopCodeLoc = IsTopCodeLoc; - - return detail::Scheduler::getInstance().addCG(std::move(CommandGroup), - *this, true); + auto SubmitKernelFunc = [&](detail::CG::StorageInitHelper &CGData, + bool SchedulerBypass) -> EventImplPtr { + if (SchedulerBypass) { + bool DiscardEvent = !CallerNeedsEvent && supportsDiscardingPiEvents(); + std::vector RawEvents; + + if (CGData.MEvents.size() > 0) { + RawEvents = detail::Command::getUrEvents(CGData.MEvents, this, false); + } + + std::shared_ptr ResultEvent = + DiscardEvent ? nullptr + : detail::event_impl::create_device_event(*this); + + if (!DiscardEvent) { + ResultEvent->setWorkerQueue(weak_from_this()); + ResultEvent->setStateIncomplete(); + ResultEvent->setSubmissionTime(); + } + + enqueueImpKernel(*this, KData.getNDRDesc(), KData.getArgs(), + nullptr, // KernelBundle + nullptr, // Kernel + *KData.getDeviceKernelInfoPtr(), RawEvents, + ResultEvent.get(), + nullptr, // getMemAllocationFunc + UR_KERNEL_CACHE_CONFIG_DEFAULT, + false, // KernelIsCooperative + false, // KernelUsesClusterLaunch + 0, // WorkGroupMemorySize + nullptr, // BinImage + KData.getKernelFuncPtr()); + + if (!DiscardEvent) { + ResultEvent->setEnqueued(); + // connect returned event with dependent events + if (!isInOrder()) { + // MEvents is not used anymore, so can move. + ResultEvent->getPreparedDepsEvents() = std::move(CGData.MEvents); + // ResultEvent is local for current thread, no need to lock. + ResultEvent->cleanDepEventsThroughOneLevelUnlocked(); + } + } + + return ResultEvent; + } else { + std::unique_ptr CommandGroup; + std::vector> StreamStorage; + std::vector> AuxiliaryResources; + + KData.extractArgsAndReqsFromLambda(); + + CommandGroup.reset(new detail::CGExecKernel( + KData.getNDRDesc(), HostKernel, + nullptr, // Kernel + nullptr, // KernelBundle + std::move(CGData), std::move(KData).getArgs(), + *KData.getDeviceKernelInfoPtr(), std::move(StreamStorage), + std::move(AuxiliaryResources), detail::CGType::Kernel, + UR_KERNEL_CACHE_CONFIG_DEFAULT, + false, // KernelIsCooperative + false, // KernelUsesClusterLaunch + 0, // KernelWorkGroupMemorySize + CodeLoc)); + CommandGroup->MIsTopCodeLoc = IsTopCodeLoc; + + return detail::Scheduler::getInstance().addCG(std::move(CommandGroup), + *this, true); + } }; return submit_direct(CallerNeedsEvent, SubmitKernelFunc); @@ -505,15 +550,24 @@ queue_impl::submit_direct(bool CallerNeedsEvent, } } - EventImplPtr EventImpl = SubmitCommandFunc(CGData); + bool SchedulerBypass = + CGData.MEvents.size() > 0 + ? detail::Scheduler::areEventsSafeForSchedulerBypass(CGData.MEvents, + getContextImpl()) + : true; - // Sync with the last event for in order queue - if (isInOrder() && !EventImpl->isDiscarded()) { - LastEvent = EventImpl; + EventImplPtr EventImpl = SubmitCommandFunc(CGData, SchedulerBypass); + + // Sync with the last event for in order queue. For scheduler-bypass flow, + // the ordering is done at the layers below the SYCL runtime, + // but for the scheduler-based flow, it needs to be done here, as the + // scheduler handles host task submissions. + if (isInOrder()) { + LastEvent = SchedulerBypass ? nullptr : EventImpl; } // Barrier and un-enqueued commands synchronization for out or order queue - if (!isInOrder() && !EventImpl->isEnqueued()) { + if (!isInOrder() && EventImpl && !EventImpl->isEnqueued()) { MDefaultGraphDeps.UnenqueuedCmdEvents.push_back(EventImpl); } From 91ad6dd9068179b7c10431cd3e233c5139e54ea4 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Mon, 29 Sep 2025 09:52:53 +0000 Subject: [PATCH 02/16] Remove unnecessary EventImpl check --- sycl/source/detail/queue_impl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index fc62f0ab3fd80..c853163a32d1d 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -567,7 +567,7 @@ queue_impl::submit_direct(bool CallerNeedsEvent, } // Barrier and un-enqueued commands synchronization for out or order queue - if (!isInOrder() && EventImpl && !EventImpl->isEnqueued()) { + if (!isInOrder() && !EventImpl->isEnqueued()) { MDefaultGraphDeps.UnenqueuedCmdEvents.push_back(EventImpl); } From 4d06579cf19293bee58cc60c139b48ce6ee0646f Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Tue, 30 Sep 2025 10:38:56 +0000 Subject: [PATCH 03/16] Extract the scheduler bypass logic into a separate function and call it from the handler and handler-less functions --- sycl/source/detail/queue_impl.cpp | 149 +++++++++++++++++++++--------- sycl/source/detail/queue_impl.hpp | 24 ++++- sycl/source/handler.cpp | 99 ++------------------ 3 files changed, 131 insertions(+), 141 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index c853163a32d1d..665e924a8344c 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -420,13 +420,110 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF, return EventImpl; } -detail::EventImplPtr queue_impl::submit_kernel_direct_impl( +EventImplPtr queue_impl::submit_kernel_scheduler_bypass( + KernelData &KData, std::vector &DepEvents, + bool EventNeeded, std::shared_ptr &Kernel, + detail::kernel_bundle_impl *KernelBundleImpPtr, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + std::vector RawEvents; + + // TODO checking the size of the events vector and avoiding the call is + // more efficient here at this point + if (DepEvents.size() > 0) { + RawEvents = detail::Command::getUrEvents(DepEvents, this, false); + } + + bool DiscardEvent = !EventNeeded && supportsDiscardingPiEvents(); + if (DiscardEvent) { + // Kernel only uses assert if it's non interop one + bool KernelUsesAssert = + !(Kernel && Kernel->isInterop()) && KData.usesAssert(); + DiscardEvent = !KernelUsesAssert; + } + + std::shared_ptr ResultEvent = + DiscardEvent ? nullptr : detail::event_impl::create_device_event(*this); + + auto EnqueueKernel = [&]() { +#ifdef XPTI_ENABLE_INSTRUMENTATION + xpti_td *CmdTraceEvent = nullptr; + uint64_t InstanceID = 0; + auto StreamID = detail::getActiveXPTIStreamID(); + // Only enable instrumentation if there are subscribes to the SYCL + // stream + const bool xptiEnabled = xptiCheckTraceEnabled(StreamID); + if (xptiEnabled) { + std::tie(CmdTraceEvent, InstanceID) = emitKernelInstrumentationData( + StreamID, Kernel, CodeLoc, IsTopCodeLoc, + *KData.getDeviceKernelInfoPtr(), this, KData.getNDRDesc(), + KernelBundleImpPtr, KData.getArgs()); + detail::emitInstrumentationGeneral(StreamID, InstanceID, CmdTraceEvent, + xpti::trace_task_begin, nullptr); + } +#endif + const detail::RTDeviceBinaryImage *BinImage = nullptr; + if (detail::SYCLConfig::get()) { + BinImage = detail::retrieveKernelBinary(*this, KData.getKernelName()); + assert(BinImage && "Failed to obtain a binary image."); + } + enqueueImpKernel(*this, KData.getNDRDesc(), KData.getArgs(), + KernelBundleImpPtr, Kernel.get(), + *KData.getDeviceKernelInfoPtr(), RawEvents, + ResultEvent.get(), nullptr, KData.getKernelCacheConfig(), + KData.isCooperative(), KData.usesClusterLaunch(), + KData.getKernelWorkGroupMemorySize(), BinImage, + KData.getKernelFuncPtr()); +#ifdef XPTI_ENABLE_INSTRUMENTATION + if (xptiEnabled) { + // Emit signal only when event is created + if (!DiscardEvent) { + detail::emitInstrumentationGeneral( + StreamID, InstanceID, CmdTraceEvent, xpti::trace_signal, + static_cast(ResultEvent->getHandle())); + } + detail::emitInstrumentationGeneral(StreamID, InstanceID, CmdTraceEvent, + xpti::trace_task_end, nullptr); + } +#endif + }; + + if (DiscardEvent) { + EnqueueKernel(); + } else { + ResultEvent->setWorkerQueue(weak_from_this()); + ResultEvent->setStateIncomplete(); + ResultEvent->setSubmissionTime(); + + EnqueueKernel(); + ResultEvent->setEnqueued(); + // connect returned event with dependent events + if (!isInOrder()) { + // MEvents is not used anymore, so can move. + ResultEvent->getPreparedDepsEvents() = std::move(DepEvents); + // ResultEvent is local for current thread, no need to lock. + ResultEvent->cleanDepEventsThroughOneLevelUnlocked(); + } + } + + return ResultEvent; +} + +EventImplPtr queue_impl::submit_kernel_direct_impl( const NDRDescT &NDRDesc, std::shared_ptr &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { KernelData KData; + detail::code_location CLoc; + bool IsTopCLoc = true; + +#ifdef XPTI_ENABLE_INSTRUMENTATION + if (xptiTraceEnabled()) { + CLoc = CodeLoc; + IsTopCLoc = IsTopCodeLoc; + } +#endif KData.setDeviceKernelInfoPtr(DeviceKernelInfo); KData.setKernelFunc(HostKernel->getPtr()); @@ -435,48 +532,10 @@ detail::EventImplPtr queue_impl::submit_kernel_direct_impl( auto SubmitKernelFunc = [&](detail::CG::StorageInitHelper &CGData, bool SchedulerBypass) -> EventImplPtr { if (SchedulerBypass) { - bool DiscardEvent = !CallerNeedsEvent && supportsDiscardingPiEvents(); - std::vector RawEvents; - - if (CGData.MEvents.size() > 0) { - RawEvents = detail::Command::getUrEvents(CGData.MEvents, this, false); - } - - std::shared_ptr ResultEvent = - DiscardEvent ? nullptr - : detail::event_impl::create_device_event(*this); - - if (!DiscardEvent) { - ResultEvent->setWorkerQueue(weak_from_this()); - ResultEvent->setStateIncomplete(); - ResultEvent->setSubmissionTime(); - } - - enqueueImpKernel(*this, KData.getNDRDesc(), KData.getArgs(), - nullptr, // KernelBundle - nullptr, // Kernel - *KData.getDeviceKernelInfoPtr(), RawEvents, - ResultEvent.get(), - nullptr, // getMemAllocationFunc - UR_KERNEL_CACHE_CONFIG_DEFAULT, - false, // KernelIsCooperative - false, // KernelUsesClusterLaunch - 0, // WorkGroupMemorySize - nullptr, // BinImage - KData.getKernelFuncPtr()); - - if (!DiscardEvent) { - ResultEvent->setEnqueued(); - // connect returned event with dependent events - if (!isInOrder()) { - // MEvents is not used anymore, so can move. - ResultEvent->getPreparedDepsEvents() = std::move(CGData.MEvents); - // ResultEvent is local for current thread, no need to lock. - ResultEvent->cleanDepEventsThroughOneLevelUnlocked(); - } - } - - return ResultEvent; + std::shared_ptr Kernel; + return submit_kernel_scheduler_bypass(KData, CGData.MEvents, + CallerNeedsEvent, Kernel, nullptr, + CLoc, IsTopCLoc); } else { std::unique_ptr CommandGroup; std::vector> StreamStorage; @@ -495,8 +554,8 @@ detail::EventImplPtr queue_impl::submit_kernel_direct_impl( false, // KernelIsCooperative false, // KernelUsesClusterLaunch 0, // KernelWorkGroupMemorySize - CodeLoc)); - CommandGroup->MIsTopCodeLoc = IsTopCodeLoc; + CLoc)); + CommandGroup->MIsTopCodeLoc = IsTopCLoc; return detail::Scheduler::getInstance().addCG(std::move(CommandGroup), *this, true); diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index c3d6748695423..9fad6c1568d19 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -388,6 +388,24 @@ class queue_impl : public std::enable_shared_from_this { submit_impl(CGF, /*CallerNeedsEvent=*/false, Loc, IsTopCodeLoc, SubmitInfo); } + /// Submits a kernel using the scheduler bypass fast path + /// + /// \param KData an object storing data related to the kernel. + /// \param DepEvents list of event dependencies. + /// \param EventNeeded true, if the resulting event is needed. + /// \param Kernel used, if kernel defined as a kernel object. + /// \param KernelBundleImpPtr used, if kernel bundle defined. + /// \param CodeLoc is the code location of the submit call. + /// \param IsTopCodeLoc used to determine if the object is in a local + /// scope or in the top level scope. + /// + /// \return a SYCL event representing submitted command or nullptr. + EventImplPtr submit_kernel_scheduler_bypass( + KernelData &KData, std::vector &DepEvents, + bool EventNeeded, std::shared_ptr &Kernel, + detail::kernel_bundle_impl *KernelBundleImpPtr, + const detail::code_location &CodeLoc, bool IsTopCodeLoc); + /// Performs a blocking wait for the completion of all enqueued tasks in the /// queue. /// @@ -904,15 +922,15 @@ class queue_impl : public std::enable_shared_from_this { /// scope or in the top level scope. /// /// \return a SYCL event representing submitted command group or nullptr. - detail::EventImplPtr submit_kernel_direct_impl( + EventImplPtr submit_kernel_direct_impl( const NDRDescT &NDRDesc, std::shared_ptr &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template - detail::EventImplPtr submit_direct(bool CallerNeedsEvent, - SubmitCommandFuncType &SubmitCommandFunc); + EventImplPtr submit_direct(bool CallerNeedsEvent, + SubmitCommandFuncType &SubmitCommandFunc); /// Helper function for submitting a memory operation with a handler. /// \param DepEvents is a vector of dependencies of the operation. diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index ffa7d80eda4d0..a5132e76f0e00 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -638,103 +638,16 @@ event handler::finalize() { // the graph is not changed, then this faster path is used to submit // kernel bypassing scheduler and avoiding CommandGroup, Command objects // creation. - std::vector RawEvents; - // TODO checking the size of the events vector and avoiding the call is - // more efficient here at this point - if (impl->CGData.MEvents.size() > 0) { - RawEvents = detail::Command::getUrEvents( - impl->CGData.MEvents, impl->get_queue_or_null(), false); - } - - bool DiscardEvent = - !impl->MEventNeeded && impl->get_queue().supportsDiscardingPiEvents(); - if (DiscardEvent) { - // Kernel only uses assert if it's non interop one - bool KernelUsesAssert = !(MKernel && MKernel->isInterop()) && - impl->MKernelData.usesAssert(); - DiscardEvent = !KernelUsesAssert; - } - - std::shared_ptr ResultEvent = - DiscardEvent - ? nullptr - : detail::event_impl::create_device_event(impl->get_queue()); - auto EnqueueKernel = [&]() { -#ifdef XPTI_ENABLE_INSTRUMENTATION - xpti_td *CmdTraceEvent = nullptr; - uint64_t InstanceID = 0; - auto StreamID = detail::getActiveXPTIStreamID(); - // Only enable instrumentation if there are subscribes to the SYCL - // stream - const bool xptiEnabled = xptiCheckTraceEnabled(StreamID); - if (xptiEnabled) { - std::tie(CmdTraceEvent, InstanceID) = emitKernelInstrumentationData( - StreamID, MKernel, MCodeLoc, impl->MIsTopCodeLoc, - *impl->MKernelData.getDeviceKernelInfoPtr(), - impl->get_queue_or_null(), impl->MKernelData.getNDRDesc(), - KernelBundleImpPtr, impl->MKernelData.getArgs()); - detail::emitInstrumentationGeneral(StreamID, InstanceID, - CmdTraceEvent, - xpti::trace_task_begin, nullptr); - } -#endif - const detail::RTDeviceBinaryImage *BinImage = nullptr; - if (detail::SYCLConfig::get()) { - BinImage = detail::retrieveKernelBinary(impl->get_queue(), - impl->getKernelName()); - assert(BinImage && "Failed to obtain a binary image."); - } - enqueueImpKernel(impl->get_queue(), impl->MKernelData.getNDRDesc(), - impl->MKernelData.getArgs(), KernelBundleImpPtr, - MKernel.get(), - *impl->MKernelData.getDeviceKernelInfoPtr(), RawEvents, - ResultEvent.get(), nullptr, - impl->MKernelData.getKernelCacheConfig(), - impl->MKernelData.isCooperative(), - impl->MKernelData.usesClusterLaunch(), - impl->MKernelData.getKernelWorkGroupMemorySize(), - BinImage, impl->MKernelData.getKernelFuncPtr()); -#ifdef XPTI_ENABLE_INSTRUMENTATION - if (xptiEnabled) { - // Emit signal only when event is created - if (!DiscardEvent) { - detail::emitInstrumentationGeneral( - StreamID, InstanceID, CmdTraceEvent, xpti::trace_signal, - static_cast(ResultEvent->getHandle())); - } - detail::emitInstrumentationGeneral(StreamID, InstanceID, - CmdTraceEvent, - xpti::trace_task_end, nullptr); - } -#endif - }; - - if (DiscardEvent) { - EnqueueKernel(); - } else { - detail::queue_impl &Queue = impl->get_queue(); - ResultEvent->setWorkerQueue(Queue.weak_from_this()); - ResultEvent->setStateIncomplete(); - ResultEvent->setSubmissionTime(); - - EnqueueKernel(); - ResultEvent->setEnqueued(); - // connect returned event with dependent events - if (!Queue.isInOrder()) { - // MEvents is not used anymore, so can move. - ResultEvent->getPreparedDepsEvents() = - std::move(impl->CGData.MEvents); - // ResultEvent is local for current thread, no need to lock. - ResultEvent->cleanDepEventsThroughOneLevelUnlocked(); - } - } + detail::EventImplPtr EventImpl = + impl->get_queue().submit_kernel_scheduler_bypass( + impl->MKernelData, impl->CGData.MEvents, impl->MEventNeeded, + MKernel, KernelBundleImpPtr, MCodeLoc, impl->MIsTopCodeLoc); #ifdef __INTEL_PREVIEW_BREAKING_CHANGES - return ResultEvent; + return EventImpl; #else return detail::createSyclObjFromImpl( - ResultEvent ? ResultEvent - : detail::event_impl::create_discarded_event()); + EventImpl ? EventImpl : detail::event_impl::create_discarded_event()); #endif } } From 03deefe9cae433e5bcd008d08dfe4244f2594dfa Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Tue, 30 Sep 2025 14:42:29 +0000 Subject: [PATCH 04/16] Change the EventImpl var name back to original --- sycl/source/handler.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index a5132e76f0e00..40806c4155f78 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -639,15 +639,16 @@ event handler::finalize() { // kernel bypassing scheduler and avoiding CommandGroup, Command objects // creation. - detail::EventImplPtr EventImpl = + detail::EventImplPtr ResultEvent = impl->get_queue().submit_kernel_scheduler_bypass( impl->MKernelData, impl->CGData.MEvents, impl->MEventNeeded, MKernel, KernelBundleImpPtr, MCodeLoc, impl->MIsTopCodeLoc); #ifdef __INTEL_PREVIEW_BREAKING_CHANGES - return EventImpl; + return ResultEvent; #else return detail::createSyclObjFromImpl( - EventImpl ? EventImpl : detail::event_impl::create_discarded_event()); + ResultEvent ? ResultEvent + : detail::event_impl::create_discarded_event()); #endif } } From 956d27a31c9b7ec0b26ee639edd633840919423a Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Wed, 1 Oct 2025 08:01:28 +0000 Subject: [PATCH 05/16] Address review comments --- sycl/source/detail/queue_impl.cpp | 54 ++++++++++++------------------- 1 file changed, 21 insertions(+), 33 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 665e924a8344c..95f76541250f6 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -515,15 +515,6 @@ EventImplPtr queue_impl::submit_kernel_direct_impl( const detail::code_location &CodeLoc, bool IsTopCodeLoc) { KernelData KData; - detail::code_location CLoc; - bool IsTopCLoc = true; - -#ifdef XPTI_ENABLE_INSTRUMENTATION - if (xptiTraceEnabled()) { - CLoc = CodeLoc; - IsTopCLoc = IsTopCodeLoc; - } -#endif KData.setDeviceKernelInfoPtr(DeviceKernelInfo); KData.setKernelFunc(HostKernel->getPtr()); @@ -535,31 +526,28 @@ EventImplPtr queue_impl::submit_kernel_direct_impl( std::shared_ptr Kernel; return submit_kernel_scheduler_bypass(KData, CGData.MEvents, CallerNeedsEvent, Kernel, nullptr, - CLoc, IsTopCLoc); - } else { - std::unique_ptr CommandGroup; - std::vector> StreamStorage; - std::vector> AuxiliaryResources; - - KData.extractArgsAndReqsFromLambda(); - - CommandGroup.reset(new detail::CGExecKernel( - KData.getNDRDesc(), HostKernel, - nullptr, // Kernel - nullptr, // KernelBundle - std::move(CGData), std::move(KData).getArgs(), - *KData.getDeviceKernelInfoPtr(), std::move(StreamStorage), - std::move(AuxiliaryResources), detail::CGType::Kernel, - UR_KERNEL_CACHE_CONFIG_DEFAULT, - false, // KernelIsCooperative - false, // KernelUsesClusterLaunch - 0, // KernelWorkGroupMemorySize - CLoc)); - CommandGroup->MIsTopCodeLoc = IsTopCLoc; - - return detail::Scheduler::getInstance().addCG(std::move(CommandGroup), - *this, true); + CodeLoc, IsTopCodeLoc); } + std::unique_ptr CommandGroup; + std::vector> StreamStorage; + std::vector> AuxiliaryResources; + + KData.extractArgsAndReqsFromLambda(); + + CommandGroup.reset(new detail::CGExecKernel( + KData.getNDRDesc(), HostKernel, + nullptr, // Kernel + nullptr, // KernelBundle + std::move(CGData), std::move(KData).getArgs(), + *KData.getDeviceKernelInfoPtr(), std::move(StreamStorage), + std::move(AuxiliaryResources), detail::CGType::Kernel, + KData.getKernelCacheConfig(), KData.isCooperative(), + KData.usesClusterLaunch(), KData.getKernelWorkGroupMemorySize(), + CodeLoc)); + CommandGroup->MIsTopCodeLoc = IsTopCodeLoc; + + return detail::Scheduler::getInstance().addCG(std::move(CommandGroup), + *this, true); }; return submit_direct(CallerNeedsEvent, SubmitKernelFunc); From f15f842aded7d14f2a153bdb2244e201c31b3725 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Fri, 3 Oct 2025 09:17:50 +0000 Subject: [PATCH 06/16] Allocate HostKernel on the scheduler path only --- sycl/source/detail/queue_impl.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index c088c0c0f6ce2..6c52e076d659d 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -516,11 +516,8 @@ EventImplPtr queue_impl::submit_kernel_direct_impl( KernelData KData; - std::shared_ptr HostKernelPtr = - HostKernel.takeOrCopyOwnership(); - KData.setDeviceKernelInfoPtr(DeviceKernelInfo); - KData.setKernelFunc(HostKernelPtr->getPtr()); + KData.setKernelFunc(HostKernel.getPtr()); KData.setNDRDesc(NDRDesc); auto SubmitKernelFunc = [&](detail::CG::StorageInitHelper &CGData, @@ -534,6 +531,9 @@ EventImplPtr queue_impl::submit_kernel_direct_impl( std::vector> StreamStorage; std::vector> AuxiliaryResources; + std::shared_ptr HostKernelPtr = + HostKernel.takeOrCopyOwnership(); + KData.extractArgsAndReqsFromLambda(); CommandGroup.reset(new detail::CGExecKernel( From e8dc2296b7edfbed42aecbfa05cefcb7a58ee513 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Fri, 3 Oct 2025 09:20:48 +0000 Subject: [PATCH 07/16] Fix formatting --- sycl/source/detail/queue_impl.cpp | 3 +-- sycl/source/detail/queue_impl.hpp | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 6c52e076d659d..6c6cef89cc691 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -509,8 +509,7 @@ EventImplPtr queue_impl::submit_kernel_scheduler_bypass( } EventImplPtr queue_impl::submit_kernel_direct_impl( - const NDRDescT &NDRDesc, - detail::HostKernelRefBase &HostKernel, + const NDRDescT &NDRDesc, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 764d8545c2429..83d2c7668ad5c 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -921,8 +921,7 @@ class queue_impl : public std::enable_shared_from_this { /// /// \return a SYCL event representing submitted command group or nullptr. EventImplPtr submit_kernel_direct_impl( - const NDRDescT &NDRDesc, - detail::HostKernelRefBase &HostKernel, + const NDRDescT &NDRDesc, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent, const detail::code_location &CodeLoc, bool IsTopCodeLoc); From bcf270f0149425a980da49cb071f855e3b08549f Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Fri, 3 Oct 2025 09:36:51 +0000 Subject: [PATCH 08/16] Address review comments --- sycl/source/detail/queue_impl.cpp | 2 +- sycl/source/detail/queue_impl.hpp | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 6c6cef89cc691..d4cb21d7db5aa 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -498,7 +498,7 @@ EventImplPtr queue_impl::submit_kernel_scheduler_bypass( ResultEvent->setEnqueued(); // connect returned event with dependent events if (!isInOrder()) { - // MEvents is not used anymore, so can move. + // DepEvents is not used anymore, so can move. ResultEvent->getPreparedDepsEvents() = std::move(DepEvents); // ResultEvent is local for current thread, no need to lock. ResultEvent->cleanDepEventsThroughOneLevelUnlocked(); diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 83d2c7668ad5c..212c612b5a1d3 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -388,13 +388,13 @@ class queue_impl : public std::enable_shared_from_this { /// Submits a kernel using the scheduler bypass fast path /// - /// \param KData an object storing data related to the kernel. - /// \param DepEvents list of event dependencies. - /// \param EventNeeded true, if the resulting event is needed. - /// \param Kernel used, if kernel defined as a kernel object. - /// \param KernelBundleImpPtr used, if kernel bundle defined. + /// \param KData is an object storing data related to the kernel. + /// \param DepEvents is a list of event dependencies. + /// \param EventNeeded should be true, if the resulting event is needed. + /// \param Kernel to be used, if kernel defined as a kernel object. + /// \param KernelBundleImpPtr to be used, if kernel bundle defined. /// \param CodeLoc is the code location of the submit call. - /// \param IsTopCodeLoc used to determine if the object is in a local + /// \param IsTopCodeLoc is used to determine if the object is in a local /// scope or in the top level scope. /// /// \return a SYCL event representing submitted command or nullptr. From 43b4b3a6f6b9a3af4615c06a7f60819626829130 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Fri, 3 Oct 2025 13:14:20 +0000 Subject: [PATCH 09/16] Change the LaunchGroupedShortcutMoveKernelNoEvent unit test, to reflect the new logic behind HostKernel construction. --- .../FreeFunctionCommandsEvents.cpp | 36 +++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionCommandsEvents.cpp b/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionCommandsEvents.cpp index 360bdca27e73f..ea523283064ea 100644 --- a/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionCommandsEvents.cpp +++ b/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionCommandsEvents.cpp @@ -235,14 +235,46 @@ TEST_F(FreeFunctionCommandsEventsTests, TestMoveFunctor::MoveCtorCalls = 0; TestMoveFunctor MoveOnly; + std::mutex CvMutex; + std::condition_variable Cv; + bool ready = false; + + // This kernel submission uses scheduler-bypass path, so the HostKernel + // shouldn't be constructed. + + sycl::khr::launch_grouped(Queue, sycl::range<1>{32}, sycl::range<1>{32}, + std::move(MoveOnly)); + + ASSERT_EQ(TestMoveFunctor::MoveCtorCalls, 0); + ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + + // Another kernel submission is queued behind a host task, + // to force the scheduler-based submission. In this case, the HostKernel + // should be constructed. + + Queue.submit([&](sycl::handler &CGH) { + CGH.host_task([&] { + std::unique_lock lk(CvMutex); + Cv.wait(lk, [&ready] { return ready; }); + }); + }); + sycl::khr::launch_grouped(Queue, sycl::range<1>{32}, sycl::range<1>{32}, std::move(MoveOnly)); + + { + std::unique_lock lk(CvMutex); + ready = true; + } + Cv.notify_one(); + + Queue.wait(); + // Move ctor for TestMoveFunctor is called during move construction of // HostKernel. Copy ctor is called by InstantiateKernelOnHost, can't delete // it. ASSERT_EQ(TestMoveFunctor::MoveCtorCalls, 1); - - ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{2}); } #endif From 2f0280d33c2415c27f8f458cba48632bad26885d Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Mon, 6 Oct 2025 13:57:50 +0000 Subject: [PATCH 10/16] [SYCL] Fallback path for handler-less kernel properties Add a fallback path (handler-based submission) for the handler-less kernel submission path, if kernel function properties are provided. --- .../oneapi/experimental/enqueue_functions.hpp | 29 +++++++------------ sycl/include/sycl/queue.hpp | 10 +++++-- 2 files changed, 18 insertions(+), 21 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index 3dc28532b2372..0e7c85caa47be 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -260,8 +260,12 @@ template Range, const KernelType &KernelObj, ReductionsT &&...Reductions) { #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - // TODO The handler-less path does not support reductions yet. - if constexpr (sizeof...(ReductionsT) == 0) { + // TODO The handler-less path does not support reductions and kernel function + // properties yet. + if constexpr (sizeof...(ReductionsT) == 0 && + !(ext::oneapi::experimental::detail:: + HasKernelPropertiesGetMethod< + const KernelType &>::value)) { detail::submit_kernel_direct(std::move(Q), empty_properties_t{}, Range, KernelObj); } else @@ -292,23 +296,10 @@ template void nd_launch(queue Q, launch_config, Properties> Config, const KernelType &KernelObj, ReductionsT &&...Reductions) { -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - // TODO The handler-less path does not support reductions yet. - if constexpr (sizeof...(ReductionsT) == 0) { - ext::oneapi::experimental::detail::LaunchConfigAccess, - Properties> - ConfigAccess(Config); - detail::submit_kernel_direct( - std::move(Q), ConfigAccess.getProperties(), ConfigAccess.getRange(), - KernelObj); - } else -#endif - { - submit(std::move(Q), [&](handler &CGH) { - nd_launch(CGH, Config, KernelObj, - std::forward(Reductions)...); - }); - } + submit(std::move(Q), [&](handler &CGH) { + nd_launch(CGH, Config, KernelObj, + std::forward(Reductions)...); + }); } template diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 69911bec229fc..f6dce0d01accc 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -3276,8 +3276,14 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { constexpr detail::code_location CodeLoc = getCodeLocation(); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - // TODO The handler-less path does not support reductions yet. - if constexpr (sizeof...(RestT) == 1) { + using KernelType = std::tuple_element_t<0, std::tuple>; + + // TODO The handler-less path does not support reductions and kernel + // function properties yet. + if constexpr (sizeof...(RestT) == 1 && + !(ext::oneapi::experimental::detail:: + HasKernelPropertiesGetMethod< + const KernelType &>::value)) { return detail::submit_kernel_direct( *this, ext::oneapi::experimental::empty_properties_t{}, Range, Rest...); From 48324199170cfcb914d8e1fda90d6d08a1b3dfbe Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Mon, 6 Oct 2025 14:24:55 +0000 Subject: [PATCH 11/16] Add properties check to free function extension --- .../sycl/khr/free_function_commands.hpp | 60 ++++++++++++------- 1 file changed, 39 insertions(+), 21 deletions(-) diff --git a/sycl/include/sycl/khr/free_function_commands.hpp b/sycl/include/sycl/khr/free_function_commands.hpp index 31464ba588dfc..9ecd30c881c89 100644 --- a/sycl/include/sycl/khr/free_function_commands.hpp +++ b/sycl/include/sycl/khr/free_function_commands.hpp @@ -158,14 +158,20 @@ void launch_grouped(const queue &q, range<1> r, range<1> size, KernelType &&k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - detail::submit_kernel_direct( - q, ext::oneapi::experimental::empty_properties_t{}, nd_range<1>(r, size), - std::forward(k)); -#else - submit( - q, [&](handler &h) { launch_grouped(h, r, size, k); }, - codeLoc); + // TODO The handler-less path does not support kernel function properties yet. + if constexpr (!(ext::oneapi::experimental::detail:: + HasKernelPropertiesGetMethod< + const KernelType &>::value)) { + detail::submit_kernel_direct( + q, ext::oneapi::experimental::empty_properties_t{}, + nd_range<1>(r, size), std::forward(k)); + } else #endif + { + submit( + q, [&](handler &h) { launch_grouped(h, r, size, k); }, + codeLoc); + } } template >> @@ -173,14 +179,20 @@ void launch_grouped(const queue &q, range<2> r, range<2> size, KernelType &&k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - detail::submit_kernel_direct( - q, ext::oneapi::experimental::empty_properties_t{}, nd_range<2>(r, size), - std::forward(k)); -#else - submit( - q, [&](handler &h) { launch_grouped(h, r, size, k); }, - codeLoc); + // TODO The handler-less path does not support kernel function properties yet. + if constexpr (!(ext::oneapi::experimental::detail:: + HasKernelPropertiesGetMethod< + const KernelType &>::value)) { + detail::submit_kernel_direct( + q, ext::oneapi::experimental::empty_properties_t{}, + nd_range<2>(r, size), std::forward(k)); + } else #endif + { + submit( + q, [&](handler &h) { launch_grouped(h, r, size, k); }, + codeLoc); + } } template >> @@ -188,14 +200,20 @@ void launch_grouped(const queue &q, range<3> r, range<3> size, KernelType &&k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - detail::submit_kernel_direct( - q, ext::oneapi::experimental::empty_properties_t{}, nd_range<3>(r, size), - std::forward(k)); -#else - submit( - q, [&](handler &h) { launch_grouped(h, r, size, k); }, - codeLoc); + // TODO The handler-less path does not support kernel function properties yet. + if constexpr (!(ext::oneapi::experimental::detail:: + HasKernelPropertiesGetMethod< + const KernelType &>::value)) { + detail::submit_kernel_direct( + q, ext::oneapi::experimental::empty_properties_t{}, + nd_range<3>(r, size), std::forward(k)); + } else #endif + { + submit( + q, [&](handler &h) { launch_grouped(h, r, size, k); }, + codeLoc); + } } template From 2bd29d0e654f3e610c2e513f75c76a561dd777e8 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Mon, 6 Oct 2025 16:37:21 +0000 Subject: [PATCH 12/16] [SYCL] Remove assertion for graph support for handler-less kernel submit --- sycl/source/detail/queue_impl.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 8243896cf76d1..e635d7ba9a926 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -530,9 +530,6 @@ queue_impl::submit_direct(bool CallerNeedsEvent, detail::CG::StorageInitHelper CGData; std::unique_lock Lock(MMutex); - // Graphs are not supported yet for the no-handler path - assert(!hasCommandGraph()); - // Set the No Last Event Mode to false, since the no-handler path // does not support it yet. MNoLastEventMode.store(false, std::memory_order_relaxed); From 36fa3114481c24a8949c95e3a16c1bbcfc600b5a Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Mon, 6 Oct 2025 17:36:00 +0000 Subject: [PATCH 13/16] Use scheduler bypass path only if no graph associated with the queue --- sycl/source/detail/queue_impl.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 4134913cce7dd..4d6893067afe9 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -659,10 +659,11 @@ queue_impl::submit_direct(bool CallerNeedsEvent, } bool SchedulerBypass = - CGData.MEvents.size() > 0 - ? detail::Scheduler::areEventsSafeForSchedulerBypass(CGData.MEvents, - getContextImpl()) - : true; + (CGData.MEvents.size() > 0 + ? detail::Scheduler::areEventsSafeForSchedulerBypass( + CGData.MEvents, getContextImpl()) + : true) && + !hasCommandGraph(); EventImplPtr EventImpl = SubmitCommandFunc(CGData, SchedulerBypass); From 3b867c26ef646fdb2d3d93f30a86e9e4a191ea81 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Tue, 7 Oct 2025 08:26:29 +0000 Subject: [PATCH 14/16] Temp - Remove the no-handler macro --- .../ext/oneapi/experimental/enqueue_functions.hpp | 5 +---- sycl/include/sycl/khr/free_function_commands.hpp | 15 +++------------ sycl/include/sycl/queue.hpp | 6 ++---- .../FreeFunctionCommandsEvents.cpp | 2 -- 4 files changed, 6 insertions(+), 22 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index 0e7c85caa47be..718c5850b3f9a 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -259,7 +259,6 @@ template void nd_launch(queue Q, nd_range Range, const KernelType &KernelObj, ReductionsT &&...Reductions) { -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT // TODO The handler-less path does not support reductions and kernel function // properties yet. if constexpr (sizeof...(ReductionsT) == 0 && @@ -268,9 +267,7 @@ void nd_launch(queue Q, nd_range Range, const KernelType &KernelObj, const KernelType &>::value)) { detail::submit_kernel_direct(std::move(Q), empty_properties_t{}, Range, KernelObj); - } else -#endif - { + } else { submit(std::move(Q), [&](handler &CGH) { nd_launch(CGH, Range, KernelObj, std::forward(Reductions)...); diff --git a/sycl/include/sycl/khr/free_function_commands.hpp b/sycl/include/sycl/khr/free_function_commands.hpp index 9ecd30c881c89..e1afe00672f21 100644 --- a/sycl/include/sycl/khr/free_function_commands.hpp +++ b/sycl/include/sycl/khr/free_function_commands.hpp @@ -157,7 +157,6 @@ template r, range<1> size, KernelType &&k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT // TODO The handler-less path does not support kernel function properties yet. if constexpr (!(ext::oneapi::experimental::detail:: HasKernelPropertiesGetMethod< @@ -165,9 +164,7 @@ void launch_grouped(const queue &q, range<1> r, range<1> size, KernelType &&k, detail::submit_kernel_direct( q, ext::oneapi::experimental::empty_properties_t{}, nd_range<1>(r, size), std::forward(k)); - } else -#endif - { + } else { submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, codeLoc); @@ -178,7 +175,6 @@ template r, range<2> size, KernelType &&k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT // TODO The handler-less path does not support kernel function properties yet. if constexpr (!(ext::oneapi::experimental::detail:: HasKernelPropertiesGetMethod< @@ -186,9 +182,7 @@ void launch_grouped(const queue &q, range<2> r, range<2> size, KernelType &&k, detail::submit_kernel_direct( q, ext::oneapi::experimental::empty_properties_t{}, nd_range<2>(r, size), std::forward(k)); - } else -#endif - { + } else { submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, codeLoc); @@ -199,7 +193,6 @@ template r, range<3> size, KernelType &&k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT // TODO The handler-less path does not support kernel function properties yet. if constexpr (!(ext::oneapi::experimental::detail:: HasKernelPropertiesGetMethod< @@ -207,9 +200,7 @@ void launch_grouped(const queue &q, range<3> r, range<3> size, KernelType &&k, detail::submit_kernel_direct( q, ext::oneapi::experimental::empty_properties_t{}, nd_range<3>(r, size), std::forward(k)); - } else -#endif - { + } else { submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, codeLoc); diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index f6dce0d01accc..6e3f6b00f67ef 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -3275,7 +3275,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { parallel_for(nd_range Range, RestT &&...Rest) { constexpr detail::code_location CodeLoc = getCodeLocation(); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + using KernelType = std::tuple_element_t<0, std::tuple>; // TODO The handler-less path does not support reductions and kernel @@ -3287,9 +3287,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { return detail::submit_kernel_direct( *this, ext::oneapi::experimental::empty_properties_t{}, Range, Rest...); - } else -#endif - { + } else { return submit( [&](handler &CGH) { CGH.template parallel_for(Range, Rest...); diff --git a/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionCommandsEvents.cpp b/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionCommandsEvents.cpp index ea523283064ea..ca0753013c03b 100644 --- a/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionCommandsEvents.cpp +++ b/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionCommandsEvents.cpp @@ -227,7 +227,6 @@ TEST_F(FreeFunctionCommandsEventsTests, LaunchGroupedShortcutNoEvent) { ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); } -#if __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT TEST_F(FreeFunctionCommandsEventsTests, LaunchGroupedShortcutMoveKernelNoEvent) { mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch", @@ -276,7 +275,6 @@ TEST_F(FreeFunctionCommandsEventsTests, ASSERT_EQ(TestMoveFunctor::MoveCtorCalls, 1); ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{2}); } -#endif TEST_F(FreeFunctionCommandsEventsTests, SubmitLaunchGroupedKernelNoEvent) { mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch", From fa18fcd162416e95026e371a2f71278c0c6b3a69 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Tue, 7 Oct 2025 09:02:33 +0000 Subject: [PATCH 15/16] Comment out the event check --- .../FreeFunctionEventsHelpers.hpp | 46 +++++++++---------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionEventsHelpers.hpp b/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionEventsHelpers.hpp index e0a5c9be50c15..c275f7d03cc9d 100644 --- a/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionEventsHelpers.hpp +++ b/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionEventsHelpers.hpp @@ -26,53 +26,53 @@ inline ur_result_t after_urKernelGetInfo(void *pParams) { static thread_local size_t counter_urEnqueueKernelLaunch = 0; inline ur_result_t redefined_urEnqueueKernelLaunch(void *pParams) { ++counter_urEnqueueKernelLaunch; -// TODO The no-handler scheduler submission includes a fix for the event return, -// where the event is returned by the scheduler on every submission. This fix -// is not yet applied to the handler-based path. -#ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - auto params = *static_cast(pParams); - EXPECT_EQ(*params.pphEvent, nullptr); -#endif + // TODO The no-handler scheduler submission includes a fix for the event + // return, where the event is returned by the scheduler on every submission. + // This fix is not yet applied to the handler-based path. #ifndef + // __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + // auto params = *static_cast(pParams); + // EXPECT_EQ(*params.pphEvent, nullptr); + // #endif return UR_RESULT_SUCCESS; } static thread_local size_t counter_urUSMEnqueueMemcpy = 0; inline ur_result_t redefined_urUSMEnqueueMemcpy(void *pParams) { ++counter_urUSMEnqueueMemcpy; -#ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - auto params = *static_cast(pParams); - EXPECT_EQ(*params.pphEvent, nullptr); -#endif + // #ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + // auto params = *static_cast(pParams); + // EXPECT_EQ(*params.pphEvent, nullptr); + // #endif return UR_RESULT_SUCCESS; } static thread_local size_t counter_urUSMEnqueueFill = 0; inline ur_result_t redefined_urUSMEnqueueFill(void *pParams) { ++counter_urUSMEnqueueFill; -#ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - auto params = *static_cast(pParams); - EXPECT_EQ(*params.pphEvent, nullptr); -#endif + // #ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + // auto params = *static_cast(pParams); + // EXPECT_EQ(*params.pphEvent, nullptr); + // #endif return UR_RESULT_SUCCESS; } static thread_local size_t counter_urUSMEnqueuePrefetch = 0; inline ur_result_t redefined_urUSMEnqueuePrefetch(void *pParams) { ++counter_urUSMEnqueuePrefetch; -#ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - auto params = *static_cast(pParams); - EXPECT_EQ(*params.pphEvent, nullptr); -#endif + // #ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + // auto params = *static_cast(pParams); + // EXPECT_EQ(*params.pphEvent, nullptr); + // #endif return UR_RESULT_SUCCESS; } static thread_local size_t counter_urUSMEnqueueMemAdvise = 0; inline ur_result_t redefined_urUSMEnqueueMemAdvise(void *pParams) { ++counter_urUSMEnqueueMemAdvise; -#ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - auto params = *static_cast(pParams); - EXPECT_EQ(*params.pphEvent, nullptr); -#endif + // #ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + // auto params = *static_cast(pParams); + // EXPECT_EQ(*params.pphEvent, nullptr); + // #endif return UR_RESULT_SUCCESS; } From f464e173692c0bb0781245b5c81df31df1118c2f Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Tue, 7 Oct 2025 10:01:35 +0000 Subject: [PATCH 16/16] Change the expected copy_count in test_num_kernel_copies --- sycl/test-e2e/Basic/test_num_kernel_copies.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test-e2e/Basic/test_num_kernel_copies.cpp b/sycl/test-e2e/Basic/test_num_kernel_copies.cpp index 7c1781e873a39..98987c13c196e 100644 --- a/sycl/test-e2e/Basic/test_num_kernel_copies.cpp +++ b/sycl/test-e2e/Basic/test_num_kernel_copies.cpp @@ -29,7 +29,7 @@ int main(int argc, char **argv) { kernel<1> krn1; q.parallel_for(sycl::nd_range<1>{1, 1}, krn1); - assert(copy_count == 1); + assert(copy_count == 0); assert(move_count == 0); copy_count = 0;