diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp
index 0a91065a71e58..b352c7984bcd1 100644
--- a/sycl/include/CL/sycl/handler.hpp
+++ b/sycl/include/CL/sycl/handler.hpp
@@ -62,8 +62,6 @@ class __copyAcc2Acc;
 namespace cl {
 namespace sycl {
 
-namespace csd = cl::sycl::detail;
-
 // Forward declaration
 template <typename T, int dimensions, typename AllocatorT> class buffer;
 
@@ -106,7 +104,7 @@ template <typename Name, typename Type> struct get_kernel_name_t {
 };
 
 /// Specialization for the case when \c Name is undefined.
-template <typename Type> struct get_kernel_name_t<csd::auto_name, Type> {
+template <typename Type> struct get_kernel_name_t<detail::auto_name, Type> {
   using name = Type;
 };
 
@@ -409,9 +407,9 @@ class handler {
       break;
     case detail::CG::PREFETCH_USM:
       CommandGroup.reset(new detail::CGPrefetchUSM(
-          MDstPtr, MLength, std::move(MArgsStorage),
-          std::move(MAccStorage), std::move(MSharedPtrStorage),
-          std::move(MRequirements), std::move(MEvents)));
+          MDstPtr, MLength, std::move(MArgsStorage), std::move(MAccStorage),
+          std::move(MSharedPtrStorage), std::move(MRequirements),
+          std::move(MEvents)));
       break;
     case detail::CG::NONE:
       throw runtime_error("Command group submitted without a kernel or a "
@@ -666,7 +664,7 @@ class handler {
       extractArgsAndReqsFromLambda(MHostKernel->getPtr(), KI::getNumParams(),
                                    &KI::getParamDesc(0));
       MKernelName = KI::getName();
-      MOSModuleHandle = csd::OSUtil::getOSModuleHandle(KI::getName());
+      MOSModuleHandle = detail::OSUtil::getOSModuleHandle(KI::getName());
     } else {
       // In case w/o the integration header it is necessary to process
       // accessors from the list(which are associated with this handler) as
@@ -676,9 +674,10 @@ class handler {
   }
 
   // single_task version with a kernel represented as a lambda.
-  template <typename KernelName = csd::auto_name, typename KernelType>
+  template <typename KernelName = detail::auto_name, typename KernelType>
   void single_task(KernelType KernelFunc) {
-    using NameT = typename csd::get_kernel_name_t<KernelName, KernelType>::name;
+    using NameT =
+        typename detail::get_kernel_name_t<KernelName, KernelType>::name;
 #ifdef __SYCL_DEVICE_ONLY__
     kernel_single_task<NameT>(KernelFunc);
 #else
@@ -691,9 +690,11 @@ class handler {
 
   // parallel_for version with a kernel represented as a lambda + range that
   // specifies global size only.
-  template <typename KernelName = csd::auto_name, typename KernelType, int Dims>
+  template <typename KernelName = detail::auto_name, typename KernelType,
+            int Dims>
   void parallel_for(range<Dims> NumWorkItems, KernelType KernelFunc) {
-    using NameT = typename csd::get_kernel_name_t<KernelName, KernelType>::name;
+    using NameT =
+        typename detail::get_kernel_name_t<KernelName, KernelType>::name;
 #ifdef __SYCL_DEVICE_ONLY__
     kernel_parallel_for<NameT, KernelType, Dims>(KernelFunc);
 #else
@@ -708,17 +709,18 @@ class handler {
 
     MNDRDesc.set(range<1>{1});
     MArgs = std::move(MAssociatedAccesors);
-    MHostKernel.reset(
-        new detail::HostKernel<FuncT, void, 1>(std::move(Func)));
+    MHostKernel.reset(new detail::HostKernel<FuncT, void, 1>(std::move(Func)));
     MCGType = detail::CG::RUN_ON_HOST_INTEL;
   }
 
   // parallel_for version with a kernel represented as a lambda + range and
   // offset that specify global size and global offset correspondingly.
-  template <typename KernelName = csd::auto_name, typename KernelType, int Dims>
+  template <typename KernelName = detail::auto_name, typename KernelType,
+            int Dims>
   void parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
                     KernelType KernelFunc) {
-    using NameT = typename csd::get_kernel_name_t<KernelName, KernelType>::name;
+    using NameT =
+        typename detail::get_kernel_name_t<KernelName, KernelType>::name;
 #ifdef __SYCL_DEVICE_ONLY__
     kernel_parallel_for<NameT, KernelType, Dims>(KernelFunc);
 #else
@@ -730,9 +732,11 @@ class handler {
 
   // parallel_for version with a kernel represented as a lambda + nd_range that
   // specifies global, local sizes and offset.
-  template <typename KernelName = csd::auto_name, typename KernelType, int Dims>
+  template <typename KernelName = detail::auto_name, typename KernelType,
+            int Dims>
   void parallel_for(nd_range<Dims> ExecutionRange, KernelType KernelFunc) {
-    using NameT = typename csd::get_kernel_name_t<KernelName, KernelType>::name;
+    using NameT =
+        typename detail::get_kernel_name_t<KernelName, KernelType>::name;
 #ifdef __SYCL_DEVICE_ONLY__
     kernel_parallel_for<NameT, KernelType, Dims>(KernelFunc);
 #else
@@ -742,10 +746,12 @@ class handler {
 #endif
   }
 
-  template <typename KernelName = csd::auto_name, typename KernelType, int Dims>
+  template <typename KernelName = detail::auto_name, typename KernelType,
+            int Dims>
   void parallel_for_work_group(range<Dims> NumWorkGroups,
                                KernelType KernelFunc) {
-    using NameT = typename csd::get_kernel_name_t<KernelName, KernelType>::name;
+    using NameT =
+        typename detail::get_kernel_name_t<KernelName, KernelType>::name;
 #ifdef __SYCL_DEVICE_ONLY__
     kernel_parallel_for_work_group<NameT, KernelType, Dims>(KernelFunc);
 #else
@@ -755,11 +761,13 @@ class handler {
 #endif // __SYCL_DEVICE_ONLY__
   }
 
-  template <typename KernelName = csd::auto_name, typename KernelType, int Dims>
+  template <typename KernelName = detail::auto_name, typename KernelType,
+            int Dims>
   void parallel_for_work_group(range<Dims> NumWorkGroups,
                                range<Dims> WorkGroupSize,
                                KernelType KernelFunc) {
-    using NameT = typename csd::get_kernel_name_t<KernelName, KernelType>::name;
+    using NameT =
+        typename detail::get_kernel_name_t<KernelName, KernelType>::name;
 #ifdef __SYCL_DEVICE_ONLY__
     kernel_parallel_for_work_group<NameT, KernelType, Dims>(KernelFunc);
 #else
@@ -823,9 +831,10 @@ class handler {
   // single_task version which takes two "kernels". One is a lambda which is
   // used if device, queue is bound to, is host device. Second is a sycl::kernel
   // which is used otherwise.
-  template <typename KernelName = csd::auto_name, typename KernelType>
+  template <typename KernelName = detail::auto_name, typename KernelType>
   void single_task(kernel SyclKernel, KernelType KernelFunc) {
-    using NameT = typename csd::get_kernel_name_t<KernelName, KernelType>::name;
+    using NameT =
+        typename detail::get_kernel_name_t<KernelName, KernelType>::name;
 #ifdef __SYCL_DEVICE_ONLY__
     kernel_single_task<NameT>(KernelFunc);
 #else
@@ -842,10 +851,12 @@ class handler {
   // parallel_for version which takes two "kernels". One is a lambda which is
   // used if device, queue is bound to, is host device. Second is a sycl::kernel
   // which is used otherwise. range argument specifies global size.
-  template <typename KernelName = csd::auto_name, typename KernelType, int Dims>
+  template <typename KernelName = detail::auto_name, typename KernelType,
+            int Dims>
   void parallel_for(kernel SyclKernel, range<Dims> NumWorkItems,
                     KernelType KernelFunc) {
-    using NameT = typename csd::get_kernel_name_t<KernelName, KernelType>::name;
+    using NameT =
+        typename detail::get_kernel_name_t<KernelName, KernelType>::name;
 #ifdef __SYCL_DEVICE_ONLY__
     kernel_parallel_for<NameT, KernelType, Dims>(KernelFunc);
 #else
@@ -862,10 +873,12 @@ class handler {
   // parallel_for version which takes two "kernels". One is a lambda which is
   // used if device, queue is bound to, is host device. Second is a sycl::kernel
   // which is used otherwise. range and id specify global size and offset.
-  template <typename KernelName = csd::auto_name, typename KernelType, int Dims>
+  template <typename KernelName = detail::auto_name, typename KernelType,
+            int Dims>
   void parallel_for(kernel SyclKernel, range<Dims> NumWorkItems,
                     id<Dims> WorkItemOffset, KernelType KernelFunc) {
-    using NameT = typename csd::get_kernel_name_t<KernelName, KernelType>::name;
+    using NameT =
+        typename detail::get_kernel_name_t<KernelName, KernelType>::name;
 #ifdef __SYCL_DEVICE_ONLY__
     kernel_parallel_for<NameT, KernelType, Dims>(KernelFunc);
 #else
@@ -882,10 +895,12 @@ class handler {
   // parallel_for version which takes two "kernels". One is a lambda which is
   // used if device, queue is bound to, is host device. Second is a sycl::kernel
   // which is used otherwise. nd_range specifies global, local size and offset.
-  template <typename KernelName = csd::auto_name, typename KernelType, int Dims>
+  template <typename KernelName = detail::auto_name, typename KernelType,
+            int Dims>
   void parallel_for(kernel SyclKernel, nd_range<Dims> NDRange,
                     KernelType KernelFunc) {
-    using NameT = typename csd::get_kernel_name_t<KernelName, KernelType>::name;
+    using NameT =
+        typename detail::get_kernel_name_t<KernelName, KernelType>::name;
 #ifdef __SYCL_DEVICE_ONLY__
     kernel_parallel_for<NameT, KernelType, Dims>(KernelFunc);
 #else
@@ -905,10 +920,12 @@ class handler {
   /// of the kernel. The same source kernel can be compiled multiple times
   /// yielding multiple kernel class objects accessible via the \c program class
   /// interface.
-  template <typename KernelName = csd::auto_name, typename KernelType, int Dims>
+  template <typename KernelName = detail::auto_name, typename KernelType,
+            int Dims>
   void parallel_for_work_group(kernel SyclKernel, range<Dims> NumWorkGroups,
                                KernelType KernelFunc) {
-    using NameT = typename csd::get_kernel_name_t<KernelName, KernelType>::name;
+    using NameT =
+        typename detail::get_kernel_name_t<KernelName, KernelType>::name;
 #ifdef __SYCL_DEVICE_ONLY__
     kernel_parallel_for_work_group<NameT, KernelType, Dims>(KernelFunc);
 #else
@@ -921,11 +938,13 @@ class handler {
 
   /// Two-kernel version of the \c parallel_for_work_group with group and local
   /// range.
-  template <typename KernelName = csd::auto_name, typename KernelType, int Dims>
+  template <typename KernelName = detail::auto_name, typename KernelType,
+            int Dims>
   void parallel_for_work_group(kernel SyclKernel, range<Dims> NumWorkGroups,
                                range<Dims> WorkGroupSize,
                                KernelType KernelFunc) {
-    using NameT = typename csd::get_kernel_name_t<KernelName, KernelType>::name;
+    using NameT =
+        typename detail::get_kernel_name_t<KernelName, KernelType>::name;
 #ifdef __SYCL_DEVICE_ONLY__
     kernel_parallel_for_work_group<NameT, KernelType, Dims>(KernelFunc);
 #else
@@ -1083,7 +1102,7 @@ class handler {
   // Shapes can be 1, 2 or 3 dimensional rectangles.
   template <int Dims_Src, int Dims_Dst>
   static bool IsCopyingRectRegionAvailable(const range<Dims_Src> Src,
-                                          const range<Dims_Dst> Dst) {
+                                           const range<Dims_Dst> Dst) {
     if (Dims_Src > Dims_Dst)
       return false;
     for (size_t I = 0; I < Dims_Src; ++I)
@@ -1092,7 +1111,7 @@ class handler {
     return true;
   }
 
-// copy memory pointed by accessor to the memory pointed by another accessor
+  // copy memory pointed by accessor to the memory pointed by another accessor
   template <
       typename T_Src, int Dims_Src, access::mode AccessMode_Src,
       access::target AccessTarget_Src, typename T_Dst, int Dims_Dst,
@@ -1209,7 +1228,7 @@ class handler {
   }
 
   // Copy memory from the source to the destination.
-  void memcpy(void* Dest, const void* Src, size_t Count) {
+  void memcpy(void *Dest, const void *Src, size_t Count) {
     MSrcPtr = const_cast<void *>(Src);
     MDstPtr = Dest;
     MLength = Count;
diff --git a/sycl/include/CL/sycl/ordered_queue.hpp b/sycl/include/CL/sycl/ordered_queue.hpp
index 376df16a4b667..1a50be13606bf 100644
--- a/sycl/include/CL/sycl/ordered_queue.hpp
+++ b/sycl/include/CL/sycl/ordered_queue.hpp
@@ -29,34 +29,40 @@ class ordered_queue {
   explicit ordered_queue(const property_list &propList = {})
       : ordered_queue(default_selector(), async_handler{}, propList) {}
 
-  ordered_queue(const async_handler &asyncHandler, const property_list &propList = {})
+  ordered_queue(const async_handler &asyncHandler,
+                const property_list &propList = {})
       : ordered_queue(default_selector(), asyncHandler, propList) {}
 
   ordered_queue(const device_selector &deviceSelector,
-               const property_list &propList = {})
-      : ordered_queue(deviceSelector.select_device(), async_handler{}, propList) {}
+                const property_list &propList = {})
+      : ordered_queue(deviceSelector.select_device(), async_handler{},
+                      propList) {}
 
   ordered_queue(const device_selector &deviceSelector,
-               const async_handler &asyncHandler, const property_list &propList = {})
+                const async_handler &asyncHandler,
+                const property_list &propList = {})
       : ordered_queue(deviceSelector.select_device(), asyncHandler, propList) {}
 
   ordered_queue(const device &syclDevice, const property_list &propList = {})
      : ordered_queue(syclDevice, async_handler{}, propList) {}
 
   ordered_queue(const device &syclDevice, const async_handler &asyncHandler,
-               const property_list &propList = {});
+                const property_list &propList = {});
 
-  ordered_queue(const context &syclContext, const device_selector &deviceSelector,
-               const property_list &propList = {})
+  ordered_queue(const context &syclContext,
+                const device_selector &deviceSelector,
+                const property_list &propList = {})
       : ordered_queue(syclContext, deviceSelector,
-                     detail::getSyclObjImpl(syclContext)->get_async_handler(),
-                     propList) {}
+                      detail::getSyclObjImpl(syclContext)->get_async_handler(),
+                      propList) {}
 
-  ordered_queue(const context &syclContext, const device_selector &deviceSelector,
-               const async_handler &asyncHandler, const property_list &propList = {});
+  ordered_queue(const context &syclContext,
+                const device_selector &deviceSelector,
+                const async_handler &asyncHandler,
+                const property_list &propList = {});
 
   ordered_queue(cl_command_queue cl_Queue, const context &syclContext,
-               const async_handler &asyncHandler = {});
+                const async_handler &asyncHandler = {});
 
   ordered_queue(const ordered_queue &rhs) = default;
 
@@ -104,22 +110,20 @@ class ordered_queue {
     return impl->get_property<propertyT>();
   }
 
-  event memset(void* ptr, int value, size_t count) {
+  event memset(void *ptr, int value, size_t count) {
     return impl->memset(impl, ptr, value, count);
   }
 
-  event memcpy(void* dest, const void* src, size_t count) {
+  event memcpy(void *dest, const void *src, size_t count) {
     return impl->memcpy(impl, dest, src, count);
   }
 
-  event prefetch(const void* Ptr, size_t Count) {
-    return submit([=](handler &cgh) {
-      cgh.prefetch(Ptr, Count);
-    });
+  event prefetch(const void *Ptr, size_t Count) {
+    return submit([=](handler &cgh) { cgh.prefetch(Ptr, Count); });
   }
 
   // single_task version with a kernel represented as a lambda.
-  template <typename KernelName = csd::auto_name, typename KernelType>
+  template <typename KernelName = detail::auto_name, typename KernelType>
   void single_task(KernelType KernelFunc) {
     submit([&](handler &cgh) {
       cgh.template single_task<KernelName, KernelType>(KernelFunc);
@@ -128,7 +132,8 @@ class ordered_queue {
 
   // parallel_for version with a kernel represented as a lambda + range that
   // specifies global size only.
-  template <typename KernelName = csd::auto_name, typename KernelType, int Dims>
+  template <typename KernelName = detail::auto_name, typename KernelType,
+            int Dims>
   void parallel_for(range<Dims> NumWorkItems, KernelType KernelFunc) {
     // By-value or By-reference for this?
     submit([&](handler &cgh) {
@@ -139,7 +144,8 @@ class ordered_queue {
 
   // parallel_for version with a kernel represented as a lambda + range and
   // offset that specify global size and global offset correspondingly.
-  template <typename KernelName = csd::auto_name, typename KernelType, int Dims>
+  template <typename KernelName = detail::auto_name, typename KernelType,
+            int Dims>
   void parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
                     KernelType KernelFunc) {
     submit([&](handler &cgh) {
@@ -150,7 +156,8 @@ class ordered_queue {
 
   // parallel_for version with a kernel represented as a lambda + nd_range that
   // specifies global, local sizes and offset.
-  template <typename KernelName = csd::auto_name, typename KernelType, int Dims>
+  template <typename KernelName = detail::auto_name, typename KernelType,
+            int Dims>
   void parallel_for(nd_range<Dims> ExecutionRange, KernelType KernelFunc) {
     submit([&](handler &cgh) {
       cgh.template parallel_for<KernelName, KernelType, Dims>(ExecutionRange,
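
For reference, a minimal caller-side sketch of the interfaces this patch touches. The kernel name `class FillKernel`, the 16-element buffer, and `main` are illustrative only, not part of the patch; leaving the name template argument off exercises the `detail::auto_name` default, in which case `get_kernel_name_t` names the kernel after the lambda type:

    #include <CL/sycl.hpp>

    int main() {
      cl::sycl::ordered_queue Q; // in-order queue declared in ordered_queue.hpp
      int Data[16] = {0};
      {
        cl::sycl::buffer<int, 1> Buf(Data, cl::sycl::range<1>(16));
        Q.submit([&](cl::sycl::handler &Cgh) {
          auto Acc = Buf.get_access<cl::sycl::access::mode::write>(Cgh);
          // handler::parallel_for with an explicit kernel name; omitting
          // <class FillKernel> falls back to detail::auto_name.
          Cgh.parallel_for<class FillKernel>(
              cl::sycl::range<1>(16), [=](cl::sycl::id<1> I) { Acc[I] = 42; });
        });
      } // ~buffer() waits for the kernel and copies results back to Data
      return 0;
    }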