diff --git a/sycl/CMakeLists.txt b/sycl/CMakeLists.txt index c78c34d839042..3586a8d725ab2 100644 --- a/sycl/CMakeLists.txt +++ b/sycl/CMakeLists.txt @@ -28,8 +28,7 @@ if (NOT DEFINED SYCL_ENABLE_BACKENDS) set(SYCL_ENABLE_BACKENDS "opencl;level_zero;level_zero_v2") endif() -# Option to enable JIT, this in turn makes kernel fusion and spec constant -# materialization possible. +# Option to enable JIT, this in turn makes spec constant materialization possible. option(SYCL_ENABLE_EXTENSION_JIT "Enable extension to JIT kernels" ON) if (NOT XPTI_INCLUDES) diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc index 40b08cf7a6e54..a8f9c8d670835 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc @@ -124,7 +124,6 @@ dependency graph to the SYCL runtime prior to execution: * Unlock DMA hardware features through graph analysis by the runtime. * Graph optimizations become available, including but not limited to: -** Kernel fusion/fission. ** Inter-node memory reuse from data staying resident on device. ** Identification of the peak intermediate output memory requirement, used for more optimal memory allocation. @@ -150,8 +149,7 @@ requirements were considered: 4. Integrate sub-graphs (previously constructed graphs) when constructing a new graph. 5. Support the USM model of memory as well as buffer/accessor model. -6. Compatible with other SYCL extensions and features, e.g. kernel fusion & - built-in kernels. +6. Compatible with other SYCL extensions and features, e.g. built-in kernels. 7. Ability to record a graph with commands submitted to different devices in the same context. 8. Capability to serialize graphs to a binary format which can then be diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph_fusion.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph_fusion.asciidoc deleted file mode 100644 index 91722042c0ada..0000000000000 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_graph_fusion.asciidoc +++ /dev/null @@ -1,779 +0,0 @@ -= sycl_ext_oneapi_graph_fusion - -:source-highlighter: coderay -:coderay-linenums-mode: table - -// This section needs to be after the document title. -:doctype: book -:toc2: -:toc: left -:encoding: utf-8 -:lang: en -:dpcpp: pass:[DPC++] -:sectnums: -:sectnumlevels: 4 - -// Set the default source code type in this document to C++, -// for syntax highlighting purposes. This is needed because -// docbook uses c++ and html5 uses cpp. -:language: {basebackend@docbook:c++:cpp} - - -== Notice - -[%hardbreaks] -Copyright (C) Codeplay Software Limited. All rights reserved. - -Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks -of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by -permission by Khronos. - - -== Contact - -To report problems with this extension, please open a new issue at: - -https://github.com/intel/llvm/issues - -== Contributors - -Lukas Sommer, Codeplay + -Victor Lomüller, Codeplay + -Victor Perez, Codeplay + -Julian Oppermann, Codeplay + -Ewan Crawford, Codeplay + -Ben Tracy, Codeplay + -John Pennycook, Intel + -Greg Lueck, Intel + - -== Dependencies - -This extension is written against the SYCL 2020 revision 7 specification. All -references below to the "core SYCL specification" or to section numbers in the -SYCL specification refer to that revision. - -This extension builds on top of the experimental SYCL graphs -https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc[extension -proposal]. All references to the "graphs proposal" refer to this proposal. - -In addition, this extension also depends on the following other SYCL extensions: - -* link:../experimental/sycl_ext_oneapi_properties.asciidoc[sycl_ext_oneapi_properties] -extension. -* link:../experimental/sycl_ext_oneapi_annotated_ptr.asciidoc[sycl_ext_oneapi_annotated_ptr] -extension. - -== Status - -This is a proposed extension specification, intended to gather community -feedback. Interfaces defined in this specification may not be implemented yet -or may be in a preliminary state. The specification itself may also change in -incompatible ways before it is finalized. *Shipping software products should -not rely on APIs defined in this specification.* - -== Overview - -The SYCL graph -https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc[extension -proposal] seeks to reduce the runtime overhead linked to SYCL kernel submission -and expose additional optimization opportunities. - -One of those further optimizations enabled by the graphs proposal is _kernel -fusion_. Fusing two or more kernels executing on the same device into a single -kernel launch can further reduce runtime overhead and enable further kernel -optimizations such as dataflow internalization discussed below. - -This proposal is a continuation of many of the ideas of the initial -https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_codeplay_kernel_fusion.asciidoc[experimental -kernel fusion proposal] for SYCL. However, instead of defining its own -SYCL-based API to record a sequence of kernels to fuse, this proposal builds on -top of the graphs proposal to allow the fusion of graphs. This not only unifies -the APIs, making sure users only need to familiarize themselves with a single -API, but also provides additional advantages. - -The graph proposal defines two APIs to create graphs: a proposal using a -recording mechanism, similar to the initial kernel fusion proposal; and another -one using explicit graph building. Thus, future users will be able to choose -from two different mechanisms to construct the sequence of kernels to fuse. As -there is an explicit step for finalization of graphs before being submitted for -execution, fusion can happen in this step, which also eliminates many of the -synchronization concerns that needed to be covered in the experimental kernel -fusion proposal. - -The aim of this document is to propose a mechanism for users to request the -fusion of two or more kernels in a SYCL graph into a single kernel **at -runtime**. This requires the extension of the runtime with some sort of JIT -compiler to allow for the fusion of kernel functions at runtime. - -== Specification - -=== Feature test macro - -This extension provides a feature-test macro as described in the core SYCL -specification. An implementation supporting this extension must predefine the -macro `SYCL_EXT_ONEAPI_GRAPH_FUSION` to one of the values defined in the -table below. Applications can test for the existence of this macro to determine -if the implementation supports this feature, or applications can test the -macro's value to determine which of the extension's features the implementation -supports. - -[%header,cols="1,5"] -|=== -|Value -|Description - -|1 -|Initial version of this extension. -|=== - -=== API modifications - -==== Properties - -===== Graph Fusion Properties - -The API for `command_graph::finalize()` includes a -`property_list` parameter. The following properties, defined by this extension, -can be added to the property list to indicate that the kernels in the -command-graph can or should be fused. - -```c++ -sycl::ext::oneapi::experimental::property::graph::enable_fusion -``` - -This property is only descriptive, not prescriptive. Implementations are free to -not perform fusion if it is not possible -(see below section <>), fusion is not -supported by the implementation, or the implementation decides not to perform -fusion for other reasons. It is not an error if an implementation does not -perform fusion even though the property is passed. - -Implementations can provide a diagnostic message in case fusion was not -performed through an implementation-specified mechanism, but are not required to -do so. - -```c++ -sycl::ext::oneapi::experimental::property::graph::require_fusion -``` - -This property is prescriptive, i.e., in contrast to the property above, -implementations _must_ perform fusion. If fusion is not supported by the -implementation at all, the implementation must raise an error with error code -`errc::feature_not_supported`. If the implementation is unable to perform fusion -for this graph (see below section <>), the -implementation must raise an error with error code `errc::kernel_not_supported`. - -===== Barrier property - -The following property can be added to the `property_list` of the -`command_graph::finalize()` API. - -```c++ -sycl::ext::oneapi::experimental::property::graph::insert_barriers -``` - -By default, graph fusion will not introduce any additional barriers to the -fused kernel. Existing group barriers inside the code will be retained (see -below). - -If the property list contains this property, additional work-group barriers are -introduced between kernels in the fused kernel (see below section on -synchronization in kernels). - -The property only takes effect if either the -`sycl::ext::oneapi::experimental::property::graph::enable_fusion` -property or the -`sycl::ext::oneapi::experimental::property::graph::require_fusion` property is -also part of the `property_list` of the same invocation of -`command_graph<...>::finalize()`. - -[NOTE] -==== -By adding the `insert_barriers` property, a _work-group barrier_ will be -inserted between the kernels. To achieve a device-wide synchronization, i.e., -a synchronization between different work-groups that is implicit between two -kernels when executed separately, users should leverage the subgraph feature of -the SYCL graph proposal, as device-wide synchronization inside the fused kernel -is not achievable. By creating two subgraphs, fusing each and adding both to -the same graph, a device-wide synchronization between two fused parts can be -achieved if necessary. -==== - -===== Access scope property - -Specializations of the following property template can be passed to three -different APIs, namely: - -* The `accessor` constructor, giving a more granular control. -* The `buffer` constructor, in which case all the `accessors` derived from -this buffer will inherit this property (unless overridden). -* The property list parameter of `annotated_ptr`, to apply the property to a -USM pointer. - -```c++ -namespace sycl::ext::oneapi::experimental::property{ - - template - struct access_scope {}; - - inline constexpr auto access_scope_work_group = - access_scope; - - inline constexpr auto access_scope_work_item = - access_scope; - -} // namespace sycl::ext::oneapi::experimental::property -``` - -Specializations of the `access_scope` property template can be used to express -the access pattern of kernels to a buffer or USM allocation. - -The specializations of the property are an assertion by the application that -each element in the buffer or allocated device memory is at most accessed in -the given memory scope in the kernel submitted by this command-group (in case -the property is specified on an accessor) or in any kernel in the graph (in case -the property is specified on a buffer or an USM pointer). - -More concretely, the two shortcuts express the following semantics: - -* `access_scope_work_group`: Applying this specialization asserts that each -element in the buffer or allocated device memory is accessed by no more than one -work-group. - -* `access_scope_work_item`: Applying this specialization asserts that each -element in the buffer or allocated device memory is accessed by no more than one -work-item. - -Implementations may treat specializations of the access scope property as a -hint to promote the elements of the buffer or allocated device memory to a -different type of memory (see below section on local and private -internalization). - -If different specializations are applied to accessors to the same buffer or -device memory allocation, the resolution rules specified below apply. - -The property is not prescriptive, implementations are free to not perform -internalization and it is no error if they do not perform internalization. -Implementations can provide a diagnostic message in case internalization was -not performed through an implementation-specified mechanism, but are not -required to do so. - -In case the `access_scope` property is attached to `annotated_ptr`, the -properties should be inspected by an implementation when the `annotated_ptr` is -captured by a kernel lambda or otherwise passed as an argument to a kernel -function. Implementations are not required to track internalization-related -information from other USM pointers that may be used by a kernel, such as those -stored inside of structs or other data structures. - -===== Internal memory property - -The following property can be passed to three different APIs, namely: - -* The `accessor` constructor, giving a more granular control. -* The `buffer` constructor, in which case all the `accessors` derived from -this buffer will inherit this property (unless overridden). -* The property list parameter of `annotated_ptr`, to apply the property to a -USM pointer. - -```c++ -sycl::ext::oneapi::experimental::property::fusion_internal_memory -``` - -By applying this property, the application asserts that the updates made to the -buffer or allocated device memory by the kernel submitted by this command-group -(in case the property is specified on an accessor) or in any kernel in the -graph (in case the property is specified on a buffer or an USM pointer) may not -be available for use after the fused kernel completes execution. -Implementations may treat this as a hint to not write back the final result to -global memory. - -The property is not prescriptive, implementations are free to not perform -internalization and it is no error if they do not perform internalization. -Implementations can provide a diagnostic message in case internalization was -not performed through an implementation-specified mechanism, but are not -required to do so. - -In case the `fusion_internal_memory` property is attached to `annotated_ptr`, -the properties should be inspected by an implementation when the -`annotated_ptr` is captured by a kernel lambda or otherwise passed as an -argument to a kernel function. Implementations are not required to track -internalization-related information from other USM pointers that may be used by -a kernel, such as those stored inside of structs or other data structures. - - -==== Device aspect - -To support querying whether a SYCL device and the underlying platform support -kernel fusion for graphs, the following device aspect is added as part of this -extension proposal. - -```c++ -sycl::aspect::ext_oneapi_graph_fusion -``` - -Devices with `aspect::ext_oneapi_graph_fusion` support kernel fusion for graphs. - -=== Linearization - -In order to be able to perform kernel fusion, the commands in a graph must be -arranged in a valid sequential order. - -A valid _linearization_ of the graph is an order of the commands in the graph -such that each command in the linearization depends only on commands that appear -in the sequence before the command itself. - -The exact linearization of the dependency DAG (which generally only implies a -partial order) is implementation defined. The linearization should be -deterministic, i.e., it should yield the same sequence when presented with the -same DAG. - -=== Synchronization in kernels - -Group barriers semantics do not change in the fused kernel and barriers already -in the unfused kernels are preserved in the fused kernel. - -Despite this, it is worth noting that, in order to introduce synchronization -between work items in a same work-group executing a fused kernel, a work-group -barrier can added between each of the kernels being fused by applying the -`insert_barriers` property. - -As the fusion compiler can reason about the access behavior of the different -kernels only in a very limited fashion, **it's the user's responsibility to -make sure no data races occur in the fused kernel**. Data races could in -particular be introduced because the implicit device-wide synchronization -between the execution of two separate kernels is eliminated by fusion. The user -must ensure that the kernels combined during fusion do not rely on this -synchronization or introduce appropriate synchronization. - -Device-wide synchronization can be achieved by splitting the graph into multiple -subgraphs and fusing each separately, as described above. - -=== Limitations - -Some scenarios might require fusion to be cancelled if some undesired scenarios -arise. The required implementation behavior in this case depends on the -property that was used to initiate fusion. - -If the _descriptive_ `enable_fusion` property was used to initiate fusion, it -is not an error for an implementation to cancel fusion in those scenarios. A -valid recovery from such a scenario is to not perform fusion and rather -maintain the original graph, executing the kernels individually rather than in -a single fused kernel. Implementations can provide a diagnostic message in case -fusion was cancelled through an implementation-specified mechanism, but are not -required to do so. - -If, on the other hand, the _prescriptive_ `require_fusion` property was used to -initiate fusion, implementations must raise an error if they need to cancel -fusion in those scenarios. - -The following sections describe a number of scenarios that might require to -cancel fusion. Note that some implementations might be more capable/permissive -and might not abort fusion in all of these cases. - -==== Hierarchical Parallelism - -The extension does not support kernels using hierarchical parallelism. Although -some implementations might want to add support for this kind of kernels. - -==== Incompatible ND-ranges of the kernels to fuse - -Incompatibility of ND-ranges will be determined by the kernel fusion -implementation. All implementations should support fusing kernels with the exact -same ND-ranges, but implementations might cancel fusion as soon as a kernel with -a different ND-range is submitted. - -==== Kernels with different dimensions - -Similar to the previous one, it is implementation-defined whether or not to -support fusing kernels with different dimensionality. - -==== No intermediate representation - -In case any of the kernels to be fused does not come with an accessible -suitable intermediate representation, kernel fusion is canceled. - -==== Explicit memory operations and host tasks - -The graph proposal allows graphs to contain, next to device kernels, explicit -memory operations and host tasks. As both of these types of commands cannot be -integrated into a fused kernel, fusion must be cancelled, unless there is a -valid linearization (see above section on linearization) that allows all memory -operations and host tasks to execute either before or after all device kernels. -It is valid to execute some memory operations and host tasks before all device -kernels and some after all device kernels, as long as that sequence is a valid -linearization. - -==== Multi-device graph - -Attempting to fuse a graph containing device kernels for more than one device -may lead to fusion being cancelled, as kernel fusion across multiple devices -and/or backends is generally not possible. - -=== Internalization - -While avoiding repeated kernel launch overheads will most likely already improve -application performance, kernel fusion can deliver even higher performance gains -when internalizing dataflows. - -In a situation where data produced by one kernel is consumed by another kernel -and the two kernels are fused, the dataflow from the first kernel to the second -kernel can be made internal to the fused kernel. Instead of using time-consuming -reads and writes to/from global memory, the fused kernel can use much faster -mechanisms, e.g., registers or private memory to "communicate" the result. - -To achieve this result during fusion, a fusion compiler must establish some -additional context and information. - -First, the compiler must know that two arguments refer to the same underlying -memory. This can be inferred during runtime, so no additional user input is -required. - -For the remaining information that needs to be established, the necessity of -user-provided input depends on the individual capabilities of the -implementation. - -If the implementation's fusion compiler is not able to initialize the -internalized buffers or memories, elements of the internalized buffer or memory -being read by a kernel must have been written before (either in the same kernel -or in a previous one in the same graph). This behavior can be asserted by the -application by applying the `no_init` property (see -https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#_properties_2[section -4.7.6.4] of the SYCL specification) to the buffer or allocated device memory. - -To this end, this extension allows the use of the property in more places than -defined in Table 52 in the SYCL specification. More concretely, this extension -allows to use the property in the buffer constructor or the property list -parameter of `annotated_ptr<...>`. In case the `no_init` property is attached to -`annotated_ptr`, the properties should be inspected by an implementation when -the `annotated_ptr` is captured by a kernel lambda or otherwise passed as an -argument to a kernel function. Implementations are not required to track -internalization-related information from other USM pointers that may be used by -a kernel, such as those stored inside of structs or other data structures. - -If the implementation's fusion compiler is not able to guarantee write-back of -the final result after internalization, values stored to an internalized -buffer/memory must not be used by any other kernel not part of the graph, as -the data becomes unavailable to consumers through internalization. This is -knowledge that the compiler cannot deduce. Instead, the fact that the values -stored to an internalized buffer/memory are not used outside the fused kernel -must be provided by the user by applying the `fusion_internal_memory` property -to the buffer or allocated device memory as described above. - -The type of memory that can be used for internalization depends on the memory -access pattern of the fused kernel. Depending on the access pattern, the buffer -or allocated device memory can be classified as: - -* _Privately internalizable_: If not a single element of the buffer/memory is to - be accessed by more than one work-item; - -* _Locally internalizable_: If not a single element of the buffer/memory is to - be accessed by work items of different work groups. - -If the implementation's fusion compiler is not able to deduce the access -pattern, suitable information must be provided by the user. To this end, -specializations of the `access_scope` property template defined in this -proposal can be used to inform the fusion compiler about the access pattern of -the kernels involved in fusion. - -If an `annotated_ptr` is created with any of the properties relating to -internalization and captured by a kernel lambda or otherwise passed as an -argument to a kernel function participating in fusion, the underlying memory -must only be accessed via pointers that are also captured or passed as kernel -argument. Access to the underlying memory via a different pointer, such as -pointers stored inside of structs or other data structures results in undefined -behavior. - -As already stated above, it depends on the implementation's capabilities which -properties need to be applied to a buffer or allocated device memory to enable -dataflow internalization. Implementations should document the necessary -properties required to enable internalization in implementation documentation. - -All internalization-related properties are only _descriptive_, so it is not an -error if an implementation is unable to or for other reasons decides not to -perform internalization based on the specified properties. Implementations can -provide a diagnostic message in case the set of specified properties are not -sufficient to perform internalization, but are not required to do so. - -[NOTE] -==== -The current implementation in DPC++ requires the addition of the `no_init`, -`fusion_internal_memory` and one specialization of the `access_scope` property -to buffers or allocated device memory to enable internalization. -==== - -==== Buffer internalization - -In some cases, the user will specify different access scopes for a -buffer and accessors to such buffer. When incompatible combinations are used, an -`exception` with `errc::invalid` error code is thrown. Otherwise, these -properties must be combined as follows: - -[options="header"] -|=== -|Accessor Access Scope|Buffer Access Scope|Resulting Access Scope - -.3+.^|None -|None -|None - -|Work Group -|Work Group - -|Work Item -|Work Item - -.3+.^|Work Group -|None -|Work Group - -|Work Group -|Work Group - -|Work Item -|*Error* - -.3+.^|Work Item -|None -|Work Item - -|Work Group -|*Error* - -|Work Item -|Work Item -|=== - -In case different internalization targets are used for accessors to the same -buffer or for `annotated_ptr` pointing to the same underlying memory, the -following (commutative and associative) rules are followed: - -[options="header"] -|=== -|Accessor/Ptr~1~ Access Scope|Accessor/Ptr~2~ Access Scope|Resulting Access Scope - -|None -|_Any_ -|None - -.2+.^|Work Group -|Work Group -|Work Group - -|Work Item -|None - -|Work Item -|Work Item -|Work Item -|=== - -If no work-group size is specified or two kernels specify different -work-group sizes when attempting local internalization for any of the -kernels involved in the fusion, no internalization will be -performed. If there is a mismatch between the two accessors (access -range, access offset, number of dimensions, data type), no -internalization is performed. - -== Examples - -=== Buffer-based example - -```c++ -#include - -namespace sycl_ext = sycl::ext::oneapi::experimental; - -struct AddKernel { - sycl::accessor accIn1; - sycl::accessor accIn2; - sycl::accessor accOut; - - void operator()(sycl::id<1> i) const { accOut[i] = accIn1[i] + accIn2[i]; } -}; - -int main() { - constexpr size_t dataSize = 512; - int in1[dataSize], in2[dataSize], in3[dataSize], out[dataSize]; - - sycl::queue q{default_selector_v}; - - { - sycl::buffer bIn1{in1, sycl::range{dataSize}}; - bIn1.set_write_back(false); - sycl::buffer bIn2{in2, sycl::range{dataSize}}; - bIn2.set_write_back(false); - sycl::buffer bIn3{in3, sycl::range{dataSize}}; - bIn3.set_write_back(false); - buffer bTmp1{range{dataSize}}; - // Internalization specified on the buffer - sycl::buffer bTmp2{ - sycl::range{dataSize}, - {sycl_ext::property::access_scope_work_item{}, - sycl_ext::property::fusion_internal_memory{}, - sycl::no_init}}; - // Internalization specified on the buffer - sycl::buffer bTmp3{ - sycl::range{dataSize}, - {sycl_ext::property::access_scope_work_item{}, - sycl_ext::property::fusion_internal_memory{}, - sycl::no_init}}; - sycl::buffer bOut{out, sycl::range{dataSize}}; - bOut.set_write_back(false); - - sycl_ext::command_graph graph{ - q.get_context(), q.get_device(), - sycl_ext::property::graph::assume_buffer_outlives_graph{}}; - - graph.begin_recording(q); - - q.submit([&](sycl::handler &cgh) { - auto accIn1 = bIn1.get_access(cgh); - auto accIn2 = bIn2.get_access(cgh); - // Internalization specified on each accessor. - auto accTmp1 = bTmp1.get_access(cgh, - sycl_ext::property::access_scope_work_item{} - sycl_ext::property::fusion_internal_memory{}, - sycl::no_init); - cgh.parallel_for(dataSize, AddKernel{accIn1, accIn2, accTmp1}); - }); - - q.submit([&](sycl::handler &cgh) { - // Internalization specified on each accessor. - auto accTmp1 = bTmp1.get_access(cgh, - sycl_ext::property::access_scope_work_item{} - sycl_ext::property::fusion_internal_memory{}, - sycl::no_init); - auto accIn3 = bIn3.get_access(cgh); - auto accTmp2 = bTmp2.get_access(cgh); - cgh.parallel_for( - dataSize, [=](sycl::id<1> i) { accTmp2[i] = accTmp1[i] * accIn3[i]; }); - }); - - q.submit([&](sycl::handler &cgh) { - // Internalization specified on each accessor. - auto accTmp1 = bTmp1.get_access(cgh, - sycl_ext::property::access_scope_work_item{} - sycl_ext::property::fusion_internal_memory{}, - sycl::no_init); - auto accTmp3 = bTmp3.get_access(cgh); - cgh.parallel_for( - dataSize, [=](sycl::id<1> i) { accTmp3[i] = accTmp1[i] * 5; }); - }); - - q.submit([&](sycl::handler &cgh) { - auto accTmp2 = bTmp2.get_access(cgh); - auto accTmp3 = bTmp3.get_access(cgh); - auto accOut = bOut.get_access(cgh); - cgh.parallel_for(dataSize, - AddKernel{accTmp2, accTmp3, accOut}); - }); - - graph.end_recording(); - - // Trigger fusion during finalization. - auto exec_graph = - graph.finalize({sycl_ext::property::graph::require_fusion{}}); - - q.ext_oneapi_graph(exec_graph); - - q.wait(); - } - return 0; -} -``` - -=== USM-based example - -```c++ -#include - -namespace sycl_ext = sycl::ext::oneapi::experimental; - -int main() { - constexpr size_t dataSize = 512; - constexpr size_t numBytes = dataSize * sizeof(int); - - int in1[dataSize], in2[dataSize], in3[dataSize], out[dataSize]; - - sycl::queue q{default_selector_v}; - - sycl_ext::command_graph graph{q.get_context(), q.get_device()}; - - int *dIn1, dIn2, dIn3, dTmp, dOut; - - dIn1 = sycl::malloc_device(q, dataSize); - dIn2 = sycl::malloc_device(q, dataSize); - dIn3 = sycl::malloc_device(q, dataSize); - dOut = sycl::malloc_device(q, dataSize); - - // Specify internalization to local memory for an USM pointer - dTmp = sycl::malloc_device(q, dataSize) - auto annotatedTmp = sycl_ext::annotated_ptr( - dTmp, sycl_ext::property::access_scope_work_group{}, - sycl_ext::property::fusion_internal_memory{}, no_init); - - // This explicit memory operation is compatible with fusion, as it can be - // linearized before any device kernel in the graph. - auto copy_in1 = - graph.add([&](sycl::handler &cgh) { cgh.memcpy(dIn1, in1, numBytes); }); - - // This explicit memory operation is compatible with fusion, as it can be - // linearized before any device kernel in the graph. - auto copy_in2 = - graph.add([&](sycl::handler &cgh) { cgh.memcpy(dIn2, in2, numBytes); }); - - auto kernel1 = graph.add( - [&](sycl::handler &cgh) { - cgh.parallel_for( - dataSize, [=](sycl::id<1> i) { annotatedTmp[i] = in1[i] + in2[i]; }); - }, - {sycl_ext::property::node::depends_on(copy_in1, copy_in2)}); - - // This explicit memory operation is compatible with fusion, as it can be - // linearized before any device kernel in the graph. - auto copy_in3 = - graph.add([&](sycl::handler &cgh) { cgh.memcpy(dIn3, in3, numBytes); }); - - auto kernel2 = graph.add( - [&](sycl::handler &cgh) { - cgh.parallel_for( - dataSize, [=](sycl::id<1> i) { out[i] = annotatedTmp[i] * in3[i]; }); - }, - {sycl_ext::property::node::depends_on(copy_in3, kernel1)}); - - // This explicit memory operation is compatible with fusion, as it can be - // linearized after any device kernel in the graph. - auto copy_out = - graph.add([&](sycl::handler &cgh) { cgh.memcpy(out, dOut, numBytes); }, - {sycl_ext::property::node::depends_on(kernel2)}); - - // Trigger fusion during finalization. - auto exec = graph.finalize({sycl_ext::property::graph::require_fusion{}}); - - // use queue shortcut for graph submission - q.ext_oneapi_graph(exec).wait(); - - sycl::free(dIn1, q); - sycl::free(dIn2, q); - sycl::free(dIn3, q); - sycl::free(dOut, q); - sycl::free(dTmp, q); - - return 0; -} -``` - -== Revision History - -[cols="5,15,15,70"] -[grid="rows"] -[options="header"] -|======================================== -|Rev|Date|Authors|Changes -|1|2023-02-16|Lukas Sommer|*Initial draft* -|2|2023-03-16|Lukas Sommer|*Remove reference to outdated `add_malloc_device` API* -|3|2023-04-11|Lukas Sommer|*Update usage examples for graph API changes* -|4|2023-08-17|Lukas Sommer|*Update after graph extension has been merged* -|5|2023-09-01|Lukas Sommer|*Split internalization properties and change barrier* -|6|2023-09-13|Lukas Sommer|*Use annotated_ptr for USM internalization* -|======================================== diff --git a/sycl/doc/extensions/removed/sycl_ext_codeplay_kernel_fusion.asciidoc b/sycl/doc/extensions/removed/sycl_ext_codeplay_kernel_fusion.asciidoc index 5cab7c8412d16..76aed66a7e4dd 100644 --- a/sycl/doc/extensions/removed/sycl_ext_codeplay_kernel_fusion.asciidoc +++ b/sycl/doc/extensions/removed/sycl_ext_codeplay_kernel_fusion.asciidoc @@ -44,18 +44,6 @@ SYCL specification refer to that revision. This experimental extension is no longer supported. -[NOTE] -==== -There is a link:../proposed/sycl_ext_oneapi_graph_fusion.asciidoc[follow-up -proposal] for fusion based on the -https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc[SYCL graph API]. -That proposal continues some of the ideas presented in this proposal, but uses -the more versatile SYCL graphs API to define the sequence of kernels to -execute. - -Once accepted and implemented, the new proposal will supersede this proposal. -==== - [NOTE] ==== This is an experimental extension for the SYCL specification. diff --git a/sycl/doc/extensions/supported/sycl_ext_oneapi_prod.asciidoc b/sycl/doc/extensions/supported/sycl_ext_oneapi_prod.asciidoc index f8b5f2ebb726d..320feb2b4e3da 100644 --- a/sycl/doc/extensions/supported/sycl_ext_oneapi_prod.asciidoc +++ b/sycl/doc/extensions/supported/sycl_ext_oneapi_prod.asciidoc @@ -106,5 +106,5 @@ in such cases to override default behavior and encourage earlier execution. Although `ext_oneapi_prod` is a performance hint, its impact upon application performance will be highly dependent upon the combination of device, backend and application characteristics. In some cases, it may even degrade performance -(e.g. by interfering with batching or fusion optimizations). +(e.g. by interfering with batching optimization). ==== diff --git a/sycl/include/sycl/detail/property_helper.hpp b/sycl/include/sycl/detail/property_helper.hpp index 2896de2ec66a6..dc10a5f079d53 100644 --- a/sycl/include/sycl/detail/property_helper.hpp +++ b/sycl/include/sycl/detail/property_helper.hpp @@ -33,26 +33,19 @@ enum DataLessPropKind { UseDefaultStream = 8, DiscardEvents = 9, DeviceReadOnly = 10, - // TODO(#15184): Remove the following fusion-related entries in the next - // ABI-breaking window. - FusionPromotePrivate = 11, - FusionPromoteLocal = 12, - FusionNoBarrier = 13, - FusionEnable = 14, - FusionForce = 15, - QueuePriorityNormal = 16, - QueuePriorityLow = 17, - QueuePriorityHigh = 18, - GraphNoCycleCheck = 19, - QueueSubmissionBatched = 20, - QueueSubmissionImmediate = 21, - GraphAssumeDataOutlivesBuffer = 22, - GraphAssumeBufferOutlivesGraph = 23, - GraphDependOnAllLeaves = 24, - GraphUpdatable = 25, - GraphEnableProfiling = 26, + QueuePriorityNormal = 11, + QueuePriorityLow = 12, + QueuePriorityHigh = 13, + GraphNoCycleCheck = 14, + QueueSubmissionBatched = 15, + QueueSubmissionImmediate = 16, + GraphAssumeDataOutlivesBuffer = 17, + GraphAssumeBufferOutlivesGraph = 18, + GraphDependOnAllLeaves = 19, + GraphUpdatable = 20, + GraphEnableProfiling = 21, // Indicates the last known dataless property. - LastKnownDataLessPropKind = 26, + LastKnownDataLessPropKind = 21, // Exceeding 32 may cause ABI breaking change on some of OSes. DataLessPropKindSize = 32 }; diff --git a/sycl/include/sycl/ext/codeplay/experimental/fusion_wrapper.hpp b/sycl/include/sycl/ext/codeplay/experimental/fusion_wrapper.hpp deleted file mode 100644 index 3c837aa9aa0a7..0000000000000 --- a/sycl/include/sycl/ext/codeplay/experimental/fusion_wrapper.hpp +++ /dev/null @@ -1,97 +0,0 @@ -//==---- fusion_wrapper.hpp --- SYCL wrapper for queue for kernel fusion ---==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// TODO(#15184): Delete this file in the next ABI-breaking window. - -#pragma once - -#include // for __SYCL_EXPORT -#include // for event -#include // for property_list -#include // for queue - -#include // for shared_ptr - -namespace sycl { -inline namespace _V1 { - -namespace ext::codeplay::experimental { - -/// -/// A wrapper wrapping a sycl::queue to provide access to the kernel fusion API, -/// allowing to manage kernel fusion on the wrapped queue. -class __SYCL_EXPORT fusion_wrapper { - -public: - /// - /// Wrap a queue to get access to the kernel fusion API. - /// - /// @throw sycl::exception with errc::invalid if trying to construct a wrapper - /// on a queue which doesn't support fusion. - explicit fusion_wrapper(queue &q); - - /// - /// Access the queue wrapped by this fusion wrapper. - queue get_queue() const; - - /// - /// @brief Check whether the wrapped queue is in fusion mode or not. - bool is_in_fusion_mode() const; - - /// - /// @brief Set the wrapped queue into "fusion mode". This means that the - /// kernels that are submitted in subsequent calls to queue::submit() are not - /// submitted for execution right away, but rather added to a list of kernels - /// that should be fused. - /// - /// @throw sycl::exception with errc::invalid if this operation is called on a - /// queue which is already in fusion mode. - void start_fusion(); - - /// - /// @brief Cancel the fusion and submit all kernels submitted since the last - /// start_fusion() for immediate execution without fusion. The kernels are - /// executed in the same order as they were initially submitted to the wrapped - /// queue. - /// - /// This operation is asynchronous, i.e., it may return after the previously - /// submitted kernels have been passed to the scheduler, but before any of the - /// previously submitted kernel starts or completes execution. The events - /// returned by submit() since the last call to start_fusion remain valid and - /// can be used for synchronization. - /// - /// The queue is not in "fusion mode" anymore after this calls returns, until - /// the next start_fusion(). - void cancel_fusion(); - - /// - /// @brief Complete the fusion: JIT-compile a fused kernel from all kernels - /// submitted to the wrapped queue since the last start_fusion and submit the - /// fused kernel for execution. Inside the fused kernel, the per-work-item - /// effects are executed in the same order as the kernels were initially - /// submitted. - /// - /// This operation is asynchronous, i.e., it may return after the JIT - /// compilation is executed and the fused kernel is passed to the scheduler, - /// but before the fused kernel starts or completes execution. The returned - /// event allows to synchronize with the execution of the fused kernel. All - /// events returned by queue::submit since the last call to start_fusion - /// remain valid. - /// - /// The wrapped queue is not in "fusion mode" anymore after this calls - /// returns, until the next start_fusion(). - /// - /// @param properties Properties to take into account when performing fusion. - event complete_fusion(const property_list &propList = {}); - -private: - std::shared_ptr MQueue; -}; -} // namespace ext::codeplay::experimental -} // namespace _V1 -} // namespace sycl diff --git a/sycl/include/sycl/info/ext_codeplay_device_traits.def b/sycl/include/sycl/info/ext_codeplay_device_traits.def index 2a0a9a90450c2..f6bfaa9dd198f 100644 --- a/sycl/include/sycl/info/ext_codeplay_device_traits.def +++ b/sycl/include/sycl/info/ext_codeplay_device_traits.def @@ -2,8 +2,6 @@ #define __SYCL_PARAM_TRAITS_TEMPLATE_SPEC_NEEDS_UNDEF #define __SYCL_PARAM_TRAITS_TEMPLATE_SPEC __SYCL_PARAM_TRAITS_SPEC #endif -// TODO(#15184): Remove the fusion aspect in the next ABI-breaking window. -__SYCL_PARAM_TRAITS_SPEC(ext::codeplay::experimental,device, supports_fusion, bool, __SYCL_TRAIT_HANDLED_IN_RT) __SYCL_PARAM_TRAITS_SPEC( ext::codeplay::experimental, device, max_registers_per_work_group, uint32_t, UR_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 1e02a5fd930bc..6032ed720f628 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -3645,19 +3645,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { CodeLoc); } -#ifndef __INTEL_PREVIEW_BREAKING_CHANGES - /// @brief Returns true if the queue was created with the - /// ext::codeplay::experimental::property::queue::enable_fusion property. - /// - /// Equivalent to - /// `has_property()`. - /// - // TODO(#15184) Remove this function in the next ABI-breaking window. - __SYCL_DEPRECATED( - "Support for ext_codeplay_kernel_fusion extesnsion is dropped") - bool ext_codeplay_supports_fusion() const; -#endif - /// Shortcut for executing a graph of commands. /// /// \param Graph the graph of commands to execute diff --git a/sycl/include/sycl/sycl.hpp b/sycl/include/sycl/sycl.hpp index cf75d0c9c68ec..2c9e27261a386 100644 --- a/sycl/include/sycl/sycl.hpp +++ b/sycl/include/sycl/sycl.hpp @@ -90,7 +90,6 @@ can be disabled by setting SYCL_DISABLE_FSYCL_SYCLHPP_WARNING macro.") #include #include -#include #include #include #include diff --git a/sycl/source/CMakeLists.txt b/sycl/source/CMakeLists.txt index 7516865682f26..5d3069e402fdf 100644 --- a/sycl/source/CMakeLists.txt +++ b/sycl/source/CMakeLists.txt @@ -286,7 +286,6 @@ set(SYCL_COMMON_SOURCES "detail/error_handling/error_handling.cpp" "detail/event_impl.cpp" "detail/filter_selector_impl.cpp" - "detail/fusion/fusion_wrapper.cpp" "detail/global_handler.cpp" "detail/graph/graph_impl.cpp" "detail/graph/dynamic_impl.cpp" diff --git a/sycl/source/detail/device_impl.hpp b/sycl/source/detail/device_impl.hpp index 020aae35bb9a6..d9e3df72625e5 100644 --- a/sycl/source/detail/device_impl.hpp +++ b/sycl/source/detail/device_impl.hpp @@ -901,13 +901,6 @@ class device_impl : public std::enable_shared_from_this { SupportFlags & UR_KERNEL_LAUNCH_PROPERTIES_FLAG_CLUSTER_DIMENSION); } - // ext_codeplay_device_traits.def - - CASE(ext::codeplay::experimental::info::device::supports_fusion) { - // TODO(#15184): Remove this aspect in the next ABI-breaking window. - return false; - } - // ext_oneapi_device_traits.def CASE(ext::oneapi::experimental::info::device::max_global_work_groups) { diff --git a/sycl/source/detail/fusion/fusion_wrapper.cpp b/sycl/source/detail/fusion/fusion_wrapper.cpp deleted file mode 100644 index 4a6aae1548d24..0000000000000 --- a/sycl/source/detail/fusion/fusion_wrapper.cpp +++ /dev/null @@ -1,40 +0,0 @@ -//==------------ fusion_wrapper.cpp ----------------------------------------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// TODO(#15184): Delete this file in the next ABI-breaking window. - -#include - -#include - -namespace sycl { -inline namespace _V1 { -namespace ext::codeplay::experimental { - -fusion_wrapper::fusion_wrapper(queue &Queue) - : MQueue{detail::getSyclObjImpl(Queue)} {} - -queue fusion_wrapper::get_queue() const { - return detail::createSyclObjFromImpl(MQueue); -} - -bool fusion_wrapper::is_in_fusion_mode() const { return false; } - -void fusion_wrapper::start_fusion() {} - -void fusion_wrapper::cancel_fusion() {} - -event fusion_wrapper::complete_fusion(const property_list &PropList) { - (void)PropList; - throw sycl::exception(sycl::errc::feature_not_supported, - "Kernel fusion extension is no longer supported"); -} - -} // namespace ext::codeplay::experimental -} // namespace _V1 -} // namespace sycl diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp index 4da53bba6a18b..2b972ec81e449 100644 --- a/sycl/source/detail/jit_compiler.cpp +++ b/sycl/source/detail/jit_compiler.cpp @@ -137,7 +137,7 @@ static ::jit_compiler::BinaryFormat getTargetFormat(queue_impl &Queue) { default: throw sycl::exception( sycl::make_error_code(sycl::errc::feature_not_supported), - "Backend unsupported by kernel fusion"); + "Backend unsupported by JIT compiler"); } } #endif // _WIN32 diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index f36b1e269e009..152b36efa68f2 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -304,20 +304,6 @@ bool Command::isHostTask() const { CGType::CodeplayHostTask); } -#ifndef __INTEL_PREVIEW_BREAKING_CHANGES -// This function is unused and should be removed in the next ABI-breaking -// window. -bool Command::isFusable() const { - if ((MType != CommandType::RUN_CG)) { - return false; - } - const auto &CG = (static_cast(*this)).getCG(); - return (CG.getType() == CGType::Kernel) && - (!static_cast(CG).MKernelIsCooperative) && - (!static_cast(CG).MKernelUsesClusterLaunch); -} -#endif // __INTEL_PREVIEW_BREAKING_CHANGES - namespace { struct EnqueueNativeCommandData { diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index 1f3264c36a997..28b753b826857 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -252,12 +252,6 @@ class Command { bool isHostTask() const; -#ifndef __INTEL_PREVIEW_BREAKING_CHANGES - // This function is unused and should be removed in the next ABI-breaking - // window. - bool isFusable() const; -#endif // __INTEL_PREVIEW_BREAKING_CHANGES - protected: std::shared_ptr MQueue; EventImplPtr MEvent; diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index 78ad6cc4d9932..550fdba176868 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -282,9 +282,6 @@ bool queue::device_has(aspect Aspect) const { // avoid creating sycl object from impl return impl->getDeviceImpl().has(Aspect); } - -// TODO(#15184) Remove this function in the next ABI-breaking window. -bool queue::ext_codeplay_supports_fusion() const { return false; } #endif sycl::detail::optional queue::ext_oneapi_get_last_event_impl() const { diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 8805cdc726754..956fb7a053f96 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3143,11 +3143,6 @@ _ZN4sycl3_V13ext6oneapi12experimental9image_memC2ERKNS3_16image_descriptorERKNS0 _ZN4sycl3_V13ext6oneapi12experimental9image_memC2ERKNS3_16image_descriptorERKNS0_6deviceERKNS0_7contextE _ZN4sycl3_V13ext6oneapi15filter_selectorC1ENS0_6detail11string_viewE _ZN4sycl3_V13ext6oneapi15filter_selectorC2ENS0_6detail11string_viewE -_ZN4sycl3_V13ext8codeplay12experimental14fusion_wrapper12start_fusionEv -_ZN4sycl3_V13ext8codeplay12experimental14fusion_wrapper13cancel_fusionEv -_ZN4sycl3_V13ext8codeplay12experimental14fusion_wrapper15complete_fusionERKNS0_13property_listE -_ZN4sycl3_V13ext8codeplay12experimental14fusion_wrapperC1ERNS0_5queueE -_ZN4sycl3_V13ext8codeplay12experimental14fusion_wrapperC2ERNS0_5queueE _ZN4sycl3_V14freeEPvRKNS0_5queueERKNS0_6detail13code_locationE _ZN4sycl3_V14freeEPvRKNS0_7contextERKNS0_6detail13code_locationE _ZN4sycl3_V15event13get_wait_listEv @@ -3670,8 +3665,6 @@ _ZNK4sycl3_V13ext6oneapi12experimental9image_mem9get_rangeEv _ZNK4sycl3_V13ext6oneapi15filter_selector13select_deviceEv _ZNK4sycl3_V13ext6oneapi15filter_selector5resetEv _ZNK4sycl3_V13ext6oneapi15filter_selectorclERKNS0_6deviceE -_ZNK4sycl3_V13ext8codeplay12experimental14fusion_wrapper17is_in_fusion_modeEv -_ZNK4sycl3_V13ext8codeplay12experimental14fusion_wrapper9get_queueEv _ZNK4sycl3_V15event11get_backendEv _ZNK4sycl3_V15event15getNativeVectorEv _ZNK4sycl3_V15event18get_profiling_infoINS0_4info15event_profiling11command_endEEENS0_6detail28is_event_profiling_info_descIT_E11return_typeEv @@ -3693,7 +3686,6 @@ _ZNK4sycl3_V15queue20ext_oneapi_get_graphEv _ZNK4sycl3_V15queue20ext_oneapi_get_stateEv _ZNK4sycl3_V15queue22submit_with_event_implERKNS0_6detail19type_erased_cgfo_tyERKNS2_2v114SubmissionInfoERKNS2_13code_locationEb _ZNK4sycl3_V15queue25submit_without_event_implERKNS0_6detail19type_erased_cgfo_tyERKNS2_2v114SubmissionInfoERKNS2_13code_locationEb -_ZNK4sycl3_V15queue28ext_codeplay_supports_fusionEv _ZNK4sycl3_V15queue30ext_oneapi_get_last_event_implEv _ZNK4sycl3_V15queue3getEv _ZNK4sycl3_V15queue8get_infoINS0_4info5queue15reference_countEEENS0_6detail18is_queue_info_descIT_E11return_typeEv @@ -3815,7 +3807,6 @@ _ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device31w _ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device31work_item_progress_capabilitiesILNS5_15execution_scopeE3EEEEENS0_6detail11ABINeutralTINSB_19is_device_info_descIT_E11return_typeEE4typeEv _ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device32work_group_progress_capabilitiesILNS5_15execution_scopeE3EEEEENS0_6detail11ABINeutralTINSB_19is_device_info_descIT_E11return_typeEE4typeEv _ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi4info6device17num_compute_unitsEEENS0_6detail11ABINeutralTINS8_19is_device_info_descIT_E11return_typeEE4typeEv -_ZNK4sycl3_V16device13get_info_implINS0_3ext8codeplay12experimental4info6device15supports_fusionEEENS0_6detail11ABINeutralTINS9_19is_device_info_descIT_E11return_typeEE4typeEv _ZNK4sycl3_V16device13get_info_implINS0_3ext8codeplay12experimental4info6device28max_registers_per_work_groupEEENS0_6detail11ABINeutralTINS9_19is_device_info_descIT_E11return_typeEE4typeEv _ZNK4sycl3_V16device13get_info_implINS0_4info6device10extensionsEEENS0_6detail11ABINeutralTINS6_19is_device_info_descIT_E11return_typeEE4typeEv _ZNK4sycl3_V16device13get_info_implINS0_4info6device11device_typeEEENS0_6detail11ABINeutralTINS6_19is_device_info_descIT_E11return_typeEE4typeEv diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index 2104972ee6272..505ea8f7dfa3b 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -200,7 +200,6 @@ ??$get_info_impl@Usingle_fp_config@device@info@_V1@sycl@@@device@_V1@sycl@@AEBA?AV?$vector@W4fp_config@info@_V1@sycl@@V?$allocator@W4fp_config@info@_V1@sycl@@@std@@@std@@XZ ??$get_info_impl@Usub_group_independent_forward_progress@device@info@_V1@sycl@@@device@_V1@sycl@@AEBA_NXZ ??$get_info_impl@Usub_group_sizes@device@info@_V1@sycl@@@device@_V1@sycl@@AEBA?AV?$vector@_KV?$allocator@_K@std@@@std@@XZ -??$get_info_impl@Usupports_fusion@device@info@experimental@codeplay@ext@_V1@sycl@@@device@_V1@sycl@@AEBA_NXZ ??$get_info_impl@Uusm_device_allocations@device@info@_V1@sycl@@@device@_V1@sycl@@AEBA_NXZ ??$get_info_impl@Uusm_host_allocations@device@info@_V1@sycl@@@device@_V1@sycl@@AEBA_NXZ ??$get_info_impl@Uusm_restricted_shared_allocations@device@info@_V1@sycl@@@device@_V1@sycl@@AEBA_NXZ @@ -348,9 +347,6 @@ ??0filter_selector@oneapi@ext@_V1@sycl@@QEAA@$$QEAV01234@@Z ??0filter_selector@oneapi@ext@_V1@sycl@@QEAA@AEBV01234@@Z ??0filter_selector@oneapi@ext@_V1@sycl@@QEAA@AEBV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@@Z -??0fusion_wrapper@experimental@codeplay@ext@_V1@sycl@@QEAA@$$QEAV012345@@Z -??0fusion_wrapper@experimental@codeplay@ext@_V1@sycl@@QEAA@AEAVqueue@45@@Z -??0fusion_wrapper@experimental@codeplay@ext@_V1@sycl@@QEAA@AEBV012345@@Z ??0gpu_selector@_V1@sycl@@QEAA@$$QEAV012@@Z ??0gpu_selector@_V1@sycl@@QEAA@AEBV012@@Z ??0gpu_selector@_V1@sycl@@QEAA@XZ @@ -462,7 +458,6 @@ ??1executable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEAA@XZ ??1filter_selector@ONEAPI@_V1@sycl@@UEAA@XZ ??1filter_selector@oneapi@ext@_V1@sycl@@UEAA@XZ -??1fusion_wrapper@experimental@codeplay@ext@_V1@sycl@@QEAA@XZ ??1gpu_selector@_V1@sycl@@UEAA@XZ ??1handler@_V1@sycl@@AEAA@XZ ??1image_mem@experimental@oneapi@ext@_V1@sycl@@QEAA@XZ @@ -556,8 +551,6 @@ ??4filter_selector@ONEAPI@_V1@sycl@@QEAAAEAV0123@AEBV0123@@Z ??4filter_selector@oneapi@ext@_V1@sycl@@QEAAAEAV01234@$$QEAV01234@@Z ??4filter_selector@oneapi@ext@_V1@sycl@@QEAAAEAV01234@AEBV01234@@Z -??4fusion_wrapper@experimental@codeplay@ext@_V1@sycl@@QEAAAEAV012345@$$QEAV012345@@Z -??4fusion_wrapper@experimental@codeplay@ext@_V1@sycl@@QEAAAEAV012345@AEBV012345@@Z ??4gpu_selector@_V1@sycl@@QEAAAEAV012@$$QEAV012@@Z ??4gpu_selector@_V1@sycl@@QEAAAEAV012@AEBV012@@Z ??4image_mem@experimental@oneapi@ext@_V1@sycl@@QEAAAEAV012345@$$QEAV012345@@Z @@ -3743,14 +3736,12 @@ ?begin_recording@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEAAXAEBV?$vector@Vqueue@_V1@sycl@@V?$allocator@Vqueue@_V1@sycl@@@std@@@std@@AEBVproperty_list@67@@Z ?build_from_source@detail@experimental@oneapi@ext@_V1@sycl@@YA?AV?$kernel_bundle@$01@56@AEAV?$kernel_bundle@$02@56@AEBV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@std@@AEBV?$vector@Vstring_view@detail@_V1@sycl@@V?$allocator@Vstring_view@detail@_V1@sycl@@@std@@@std@@PEAVstring@156@2@Z ?build_impl@detail@_V1@sycl@@YA?AV?$shared_ptr@Vkernel_bundle_impl@detail@_V1@sycl@@@std@@AEBV?$kernel_bundle@$0A@@23@AEBV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@5@AEBVproperty_list@23@@Z -?cancel_fusion@fusion_wrapper@experimental@codeplay@ext@_V1@sycl@@QEAAXXZ ?category@exception@_V1@sycl@@QEBAAEBVerror_category@std@@XZ ?checkNodePropertiesAndThrow@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@KAXAEBVproperty_list@67@@Z ?close@ipc_memory@experimental@oneapi@ext@_V1@sycl@@YAXPEAXAEBVcontext@56@@Z ?code@exception@_V1@sycl@@QEBAAEBVerror_code@std@@XZ ?compile_from_source@detail@experimental@oneapi@ext@_V1@sycl@@YA?AV?$kernel_bundle@$00@56@AEAV?$kernel_bundle@$02@56@AEBV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@std@@AEBV?$vector@Vstring_view@detail@_V1@sycl@@V?$allocator@Vstring_view@detail@_V1@sycl@@@std@@@std@@PEAVstring@156@2@Z ?compile_impl@detail@_V1@sycl@@YA?AV?$shared_ptr@Vkernel_bundle_impl@detail@_V1@sycl@@@std@@AEBV?$kernel_bundle@$0A@@23@AEBV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@5@AEBVproperty_list@23@@Z -?complete_fusion@fusion_wrapper@experimental@codeplay@ext@_V1@sycl@@QEAA?AVevent@56@AEBVproperty_list@56@@Z ?computeFallbackKernelBounds@handler@_V1@sycl@@AEAA?AV?$id@$01@23@_K0@Z ?constructorNotification@buffer_plain@detail@_V1@sycl@@IEAAXAEBUcode_location@234@PEAXPEBX2IIQEA_K@Z ?constructorNotification@detail@_V1@sycl@@YAXPEAX0W4target@access@23@W4mode@523@AEBUcode_location@123@@Z @@ -3791,7 +3782,6 @@ ?export_device_mem_opaque_fd@detail@experimental@oneapi@ext@_V1@sycl@@YAHPEAXAEBVdevice@56@AEBVcontext@56@@Z ?export_device_mem_win32_nt_handle@detail@experimental@oneapi@ext@_V1@sycl@@YAPEAXPEAXAEBVdevice@56@AEBVcontext@56@@Z ?ext_codeplay_has_graph@interop_handle@_V1@sycl@@QEBA_NXZ -?ext_codeplay_supports_fusion@queue@_V1@sycl@@QEBA_NXZ ?ext_intel_read_host_pipe@handler@_V1@sycl@@AEAAXAEBV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@PEAX_K_N@Z ?ext_intel_read_host_pipe@handler@_V1@sycl@@AEAAXVstring_view@detail@23@PEAX_K_N@Z ?ext_intel_write_host_pipe@handler@_V1@sycl@@AEAAXAEBV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@PEAX_K_N@Z @@ -4128,7 +4118,6 @@ ?get_pointer_type@detail@_V1@sycl@@YA?AW4alloc@usm@23@PEBXAEAVcontext_impl@123@@Z ?get_precision@stream@_V1@sycl@@QEBA_KXZ ?get_predecessors@node@experimental@oneapi@ext@_V1@sycl@@QEBA?AV?$vector@Vnode@experimental@oneapi@ext@_V1@sycl@@V?$allocator@Vnode@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ -?get_queue@fusion_wrapper@experimental@codeplay@ext@_V1@sycl@@QEBA?AVqueue@56@XZ ?get_range@image_mem@experimental@oneapi@ext@_V1@sycl@@QEBA?AV?$range@$02@56@XZ ?get_range@image_plain@detail@_V1@sycl@@IEBA?AV?$range@$02@34@XZ ?get_required_mem_size@executable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEBA_KXZ @@ -4186,7 +4175,6 @@ ?is_compatible@_V1@sycl@@YA_NAEBV?$vector@Vkernel_id@_V1@sycl@@V?$allocator@Vkernel_id@_V1@sycl@@@std@@@std@@AEBVdevice@12@@Z ?is_cpu@device@_V1@sycl@@QEBA_NXZ ?is_gpu@device@_V1@sycl@@QEBA_NXZ -?is_in_fusion_mode@fusion_wrapper@experimental@codeplay@ext@_V1@sycl@@QEBA_NXZ ?is_in_order@queue@_V1@sycl@@QEBA_NXZ ?is_specialization_constant_set@kernel_bundle_plain@detail@_V1@sycl@@IEBA_NPEBD@Z ?join_impl@detail@_V1@sycl@@YA?AV?$shared_ptr@Vkernel_bundle_impl@detail@_V1@sycl@@@std@@AEBV?$vector@V?$shared_ptr@Vkernel_bundle_impl@detail@_V1@sycl@@@std@@V?$allocator@V?$shared_ptr@Vkernel_bundle_impl@detail@_V1@sycl@@@std@@@2@@5@W4bundle_state@23@@Z @@ -4358,7 +4346,6 @@ ?size@physical_mem@experimental@oneapi@ext@_V1@sycl@@QEBA_KXZ ?size@stream@_V1@sycl@@QEBA_KXZ ?start@HostProfilingInfo@detail@_V1@sycl@@QEAAXXZ -?start_fusion@fusion_wrapper@experimental@codeplay@ext@_V1@sycl@@QEAAXXZ ?storeRawArg@handler@_V1@sycl@@AEAAPEAXAEBVraw_kernel_arg@experimental@oneapi@ext@23@@Z ?storeRawArg@handler@_V1@sycl@@AEAAPEAXPEBX_K@Z ?submit_graph_direct_with_event_impl@_V1@sycl@@YA?AVevent@12@AEBVqueue@12@AEAV?$command_graph@$00@experimental@oneapi@ext@12@V?$span@$$CBVevent@_V1@sycl@@$0?0@12@AEBUcode_location@detail@12@@Z