diff --git a/sycl/CMakeLists.txt b/sycl/CMakeLists.txt index cb2220e9fbc12..a5920d2a9a718 100644 --- a/sycl/CMakeLists.txt +++ b/sycl/CMakeLists.txt @@ -249,7 +249,6 @@ install(FILES file(GLOB_RECURSE HEADERS_IN_SYCL_DIR CONFIGURE_DEPENDS "${sycl_inc_dir}/sycl/*") file(GLOB_RECURSE HEADERS_IN_CL_DIR CONFIGURE_DEPENDS "${sycl_inc_dir}/CL/*") file(GLOB_RECURSE HEADERS_IN_STD_DIR CONFIGURE_DEPENDS "${sycl_inc_dir}/std/*") -file(GLOB_RECURSE HEADERS_IN_SYCLCOMPAT_DIR CONFIGURE_DEPENDS "${sycl_inc_dir}/syclcompat/*" "${sycl_inc_dir}/syclcompat.hpp") string(REPLACE "${sycl_inc_dir}" "${SYCL_INCLUDE_BUILD_DIR}" OUT_HEADERS_IN_SYCL_DIR "${HEADERS_IN_SYCL_DIR}") @@ -293,8 +292,6 @@ add_custom_command( COMMAND ${CMAKE_COMMAND} -E copy_directory ${sycl_inc_dir}/sycl ${SYCL_INCLUDE_BUILD_DIR}/sycl COMMAND ${CMAKE_COMMAND} -E copy_directory ${sycl_inc_dir}/CL ${SYCL_INCLUDE_BUILD_DIR}/CL COMMAND ${CMAKE_COMMAND} -E copy_directory ${sycl_inc_dir}/std ${SYCL_INCLUDE_BUILD_DIR}/std - COMMAND ${CMAKE_COMMAND} -E copy_directory ${sycl_inc_dir}/syclcompat ${SYCL_INCLUDE_BUILD_DIR}/syclcompat - COMMAND ${CMAKE_COMMAND} -E copy ${sycl_inc_dir}/syclcompat.hpp ${SYCL_INCLUDE_BUILD_DIR}/syclcompat.hpp COMMAND ${CMAKE_COMMAND} -E copy ${UR_HEADERS_TO_COPY} ${SYCL_INCLUDE_BUILD_DIR} COMMENT "Copying SYCL headers ...") @@ -302,8 +299,6 @@ add_custom_command( install(DIRECTORY "${sycl_inc_dir}/sycl" DESTINATION ${SYCL_INCLUDE_DIR} COMPONENT sycl-headers) install(DIRECTORY "${sycl_inc_dir}/CL" DESTINATION ${SYCL_INCLUDE_DIR}/ COMPONENT sycl-headers) install(DIRECTORY "${sycl_inc_dir}/std" DESTINATION ${SYCL_INCLUDE_DIR} COMPONENT sycl-headers) -install(DIRECTORY "${sycl_inc_dir}/syclcompat" DESTINATION ${SYCL_INCLUDE_DIR} COMPONENT sycl-headers) -install(FILES "${sycl_inc_dir}/syclcompat.hpp" DESTINATION ${SYCL_INCLUDE_DIR} COMPONENT sycl-headers) install(FILES "${UNIFIED_RUNTIME_INCLUDE_DIR}/ur_api.h" DESTINATION ${SYCL_INCLUDE_DIR} COMPONENT sycl-headers) install(FILES 
"${UNIFIED_RUNTIME_INCLUDE_DIR}/ur_api_funcs.def" DESTINATION ${SYCL_INCLUDE_DIR} diff --git a/sycl/doc/index.rst b/sycl/doc/index.rst index fe3e1078514a8..fa885e8cdb000 100644 --- a/sycl/doc/index.rst +++ b/sycl/doc/index.rst @@ -14,7 +14,6 @@ Using oneAPI DPC++ for Application Development PreprocessorMacros cuda/contents Extensions - syclcompat/README.md FAQ EnvironmentVariables MultiTileCardWithLevelZero diff --git a/sycl/doc/syclcompat/README.md b/sycl/doc/syclcompat/README.md deleted file mode 100644 index 9835d325a966d..0000000000000 --- a/sycl/doc/syclcompat/README.md +++ /dev/null @@ -1,3503 +0,0 @@ -**⚠️ DEPRECATION NOTICE ⚠️** - -**SYCLcompat is deprecated and will be removed in a future release. Users are encouraged to migrate to native SYCL APIs or alternative compatibility solutions. The `syclcompat` namespace has been marked with `[[deprecated]]` attribute.** - -# SYCLcompat - -SYCLcompat is a header-only library that intends to help developers familiar -with other heterogeneous programming models (such as OpenMP, CUDA or HIP) to -familiarize themselves with the SYCL programming API while porting their -existing codes. Compatibility tools can also benefit from the reduced API size -when converting legacy codebases. - -SYCLcompat provides: - -* A high-level API that provides closer semantics to other programming models, -simplifying line by line conversions. -* Alternative submission APIs that encapsulate SYCL-specific "queue" and -"event" APIs for easier reference. -* Ability to gradually introduce other SYCL concepts as the user familiarizes -themselves with the core SYCL API. -* Clear distinction between core SYCL API and the compatibility interface via -separate namespaces. - -## Notice - -Copyright © 2023-2024 Codeplay Software Limited. All rights reserved. - -Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks of -The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by -permission by Khronos. 
- -## Support - -SYCLcompat depends on specific oneAPI DPC++ compiler extensions that may not be -available to all the SYCL 2020 specification implementations. - -Specifically, this library depends on the following SYCL extensions: - -* [sycl_ext_oneapi_local_memory]( - ../extensions/supported/sycl_ext_oneapi_local_memory.asciidoc) -* [sycl_ext_oneapi_complex]( - ../extensions/experimental/sycl_ext_oneapi_complex.asciidoc) -* [sycl_ext_oneapi_free_function_queries]( - ../extensions/supported/sycl_ext_oneapi_free_function_queries.asciidoc) -* [sycl_ext_oneapi_assert]( - ../extensions/supported/sycl_ext_oneapi_assert.asciidoc) -* [sycl_ext_oneapi_enqueue_barrier]( - ../extensions/supported/sycl_ext_oneapi_enqueue_barrier.asciidoc) -* [sycl_ext_oneapi_usm_device_read_only]( - ../extensions/supported/sycl_ext_oneapi_usm_device_read_only.asciidoc) -* [sycl_ext_oneapi_properties]( - ../extensions/experimental/sycl_ext_oneapi_properties.asciidoc) -* [sycl_ext_oneapi_enqueue_functions]( - ../extensions/experimental/sycl_ext_oneapi_enqueue_functions.asciidoc) -* [sycl_ext_oneapi_kernel_properties]( - ../extensions/experimental/sycl_ext_oneapi_kernel_properties.asciidoc) - -If available, the following extensions extend SYCLcompat functionality: - -* [sycl_ext_intel_device_info](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/supported/sycl_ext_intel_device_info.md) \[Optional\] -* [sycl_ext_oneapi_bfloat16_math_functions](../extensions/experimental/sycl_ext_oneapi_bfloat16_math_functions.asciidoc) \[Optional\] -* [sycl_ext_oneapi_max_work_group_query]( - https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_max_work_group_query.md) - \[Optional\] - -### Hardware Requirements - -Some of the functionalities provided by SYCLcompat rely on Unified Shared Memory (`aspect::usm_device_allocations`), though most of the USM-like memory APIs (malloc*, memcpy*, memset*) support hardware with only buffer/accessor support. 
See section [Buffer Support](#buffer-support) below. - -## Usage - -All functionality is available under the `syclcompat::` namespace, imported -through the main header, `syclcompat.hpp`. Note that `syclcompat.hpp` does not -import the `<sycl/sycl.hpp>` header. - -``` cpp -#include <syclcompat.hpp> -``` - -This document presents the public API under the [Features](#features) section, -and provides a working [Sample code](#sample-code) using this library. Refer to -those to learn to use the library. - -## Versioning - -SYCLcompat adopts [semantic versioning](https://semver.org/) -(`major.minor.patch`) in a manner which aligns with oneAPI releases. Each oneAPI -product release has an associated SYCLcompat release. Between oneAPI releases, -there will be at most one `major` or `minor` bump. In other words, if a given -oneAPI release has SYCLcompat version `1.0.0`, the next release will have either -`1.1.0` or, if breaking changes have been made, `2.0.0`. This guarantee has -implications for code merged to the `sycl` branch, described below. - -Between release cycles, ongoing updates to SYCLcompat (including possibly -breaking changes) are merged into DPC++ via PRs to the -[`sycl`](https://github.com/intel/llvm/tree/sycl) branch. If a PR introduces the -*first* breaking changes since the last release, that PR must bump to the next -`major` version. Otherwise, if the PR introduces *new functionality* and neither -the `major` nor `minor` have been bumped since the last release, it must bump to -the next `minor` release. If a PR introduces important bugfixes to existing -functionality, `patch` should be bumped, and there are no limits to how many -`patch` bumps can occur between release cycles. - -### Release Process - -Once all changes planned for a release have been merged, the release process is -defined as: - -1. Check the `major.minor` version associated with the *previous* release. -2. Confirm the version bump process outlined above has been followed. -3. 
If no version bump has occurred since the previous release, bump to next `minor`. -4. oneAPI release is delivered. -5. Tag the SYCLcompat release on DPC++ repo: `SYCLcompat-major.minor.0`. - -### Deprecation Process/Breaking Changes - -As outlined above, SYCLcompat may sometimes make API breaking changes, indicated -with a `major` version bump. Advance notice (at least one major oneAPI release) -will be provided via a deprecation warning on the relevant APIs, indicating to -the user which alternative API should be used instead. - -Note that SYCLcompat is currently in pre-release, and until version `1.0.0` we -do not consider our API to be stable, and may change it with shorter notice. - -### Changelog - -Since SYCLcompat releases are aligned with oneAPI product releases, the changelog for SYCLcompat is incorporated into [SYCL's Release Notes](https://github.com/intel/llvm/blob/sycl/sycl/ReleaseNotes.md). - -### Experimental Namespace - -SYCLcompat provides some new experimental features in the `syclcompat::experimental` namespace. This serves as a testing ground for new features which are expected to migrate to `syclcompat::` in time, but the developers do not guarantee either API stability or continued existence of these features; they may be modified or removed without notice. When features are migrated from `syclcompat::experimental` to `syclcompat::`, this will be treated as a `minor` version bump. - -## Features - -### dim3 - -SYCLcompat provides a `dim3` class akin to that of CUDA or HIP programming -models. `dim3` encapsulates other languages' iteration spaces that are -represented with coordinate letters (x, y, z). In SYCL, the fastest-moving -dimension is the one with the highest index, e.g. in a SYCL 2D range iteration -space, there are two dimensions, 0 and 1, and 1 will be the one that "moves -faster". For CUDA/HIP, the convention is reversed: `x` is the dimension which -moves fastest. 
`syclcompat::dim3` follows this convention, so that -`syclcompat::dim3(32, 4)` is equivalent to `sycl::range<2>(4, 32)`, and -`syclcompat::dim3(32, 4, 2)` is equivalent to `sycl::range<3>(2, 4, 32)`. - -```cpp -namespace syclcompat { - -class dim3 { -public: - unsigned int x, y, z; - dim3(const sycl::range<3> &r); - dim3(const sycl::range<2> &r); - dim3(const sycl::range<1> &r); - constexpr dim3(unsigned int x = 1, unsigned int y = 1, unsigned int z = 1); - - constexpr size_t size(); - - operator sycl::range<3>(); - operator sycl::range<2>(); - operator sycl::range<1>(); -}; - -// Element-wise operators -inline dim3 operator*(const dim3 &a, const dim3 &b); -inline dim3 operator+(const dim3 &a, const dim3 &b); -inline dim3 operator-(const dim3 &a, const dim3 &b); - -} // syclcompat -``` - -The compatibility headers for SYCL offer a number of convenience functions that -help the mapping between xyz-based coordinates to SYCL iteration spaces in the -different scopes available. In addition to the global range, the following -helper functions are also provided: - -``` c++ -namespace syclcompat { - -namespace local_id { -inline size_t x(); -inline size_t y(); -inline size_t z(); -} // namespace local_id - -namespace local_range { -inline size_t x(); -inline size_t y(); -inline size_t z(); -} // namespace local_range - -namespace work_group_id { -inline size_t x(); -inline size_t y(); -inline size_t z(); -} // namespace work_group_id - -namespace work_group_range { -inline size_t x(); -inline size_t y(); -inline size_t z(); -} // namespace work_group_range - -namespace global_range { -inline size_t x(); -inline size_t y(); -inline size_t z(); -} // namespace global_range - -namespace global_id { -inline size_t x(); -inline size_t y(); -inline size_t z(); -} // namespace global_id - -} // syclcompat -``` - -These translate any kernel dimensions from one convention to the other. 
An -example of an equivalent SYCL call for a 3D kernel using `compat` is -`syclcompat::global_id::x() == get_global_id(2)`. - -### launch - -SYCLcompat provides a kernel `launch` interface which accepts a function that -executes on the device (a.k.a "kernel") instead of a lambda/functor. It can be -called either by using a pair of "teams"/"blocks" and "threads", from -OpenMP/CUDA terminology, or using a `sycl::nd_range`. The interface accepts a -device _function_ with the use of an `auto F` template parameter, and a variadic -`Args` for the function's arguments. - -Various overloads for `launch` exist to permit the user to launch on a -specific `queue`, or to describe the range as either `nd_range` or `dim3, dim3`. - -``` c++ -namespace syclcompat { - -template -sycl::event launch(const dim3 &grid, const dim3 &threads, Args... args); - -template -sycl::event launch(const sycl::nd_range &range, Args... args); - -template -sycl::event launch(const sycl::nd_range &range, - sycl::queue q, Args... args); - -template -sycl::event launch(const dim3 &grid, const dim3 &threads, - sycl::queue q, Args... args); - -} // syclcompat -``` - -For example, if the user had an existing function named `vectorAdd` to execute -on a device such as follows: - -``` c++ -void vectorAdd(const float *A, const float *B, float *C, int n); -``` - -using SYCLcompat, the user can call it as follows: - -``` c++ -syclcompat::launch(blocksPerGrid, threadsPerBlock, d_A, d_B, d_C, n); -``` - -which would be equivalent to the following call using a `sycl::nd_range`: - -``` c++ -auto range = sycl::nd_range<3>{blocksPerGrid * threadsPerBlock, - threadsPerBlock}; -syclcompat::launch(range, d_A, d_B, d_C, n); -``` - -Note that since `syclcompat::launch` accepts a device function, the kernel -lambda is constructed by SYCLcompat internally. This means that, for -example, `sycl::local_accessor`s cannot be declared. 
Instead, users wishing to -use local memory should launch with a `launch_policy` object as described below. - -#### launch_policy - -In addition to the simple `syclcompat::launch` interface described above, -SYCLcompat provides a more flexible (`experimental`) interface to `launch` a -kernel with a given `launch_policy`. By constructing and passing a -`launch_policy`, users can pass `sycl::ext::oneapi::experimental::properties` -associated with the kernel or launch, as well as request **local memory** for -the kernel. - -In order to disambiguate the variadic constructor of `launch_policy`, the -following wrapper structs are defined. The `kernel_properties` and -`launch_properties` wrappers can be constructed *either* with a variadic set of -properties, or with an existing `sycl_exp::properties` object. - -```cpp -namespace syclcompat::experimental { -namespace sycl_exp = sycl::ext::oneapi::experimental; - -// Wrapper for kernel sycl_exp::properties -template struct kernel_properties { - using Props = Properties; - template - kernel_properties(Props... properties); - template - kernel_properties(sycl_exp::properties properties) - Properties props; -}; - -// Wrapper for launch sycl_exp::properties -template struct launch_properties { - using Props = Properties; - template - launch_properties(Props... 
properties); - template - launch_properties(sycl_exp::properties properties) - Properties props; -}; - -// Wrapper for local memory size -struct local_mem_size { - local_mem_size(size_t size = 0); - size_t size; -}; - -} //namespace syclcompat::experimental -``` - -The constructors of `launch_policy` are variadic, accepting any form of range -(`nd_range`, `range`, `dim3`, `dim3, dim3`), followed by zero or more of -`local_mem_size`, `kernel_properties`, and `launch_properties`: - -``` c++ -namespace syclcompat::experimental { -namespace sycl_exp = sycl::ext::oneapi::experimental; - -// launch_policy is constructed by the user & passed to `compat_exp::launch` -template -class launch_policy { -public: - using KPropsT = KProps; - using LPropsT = LProps; - using RangeT = Range; - static constexpr bool HasLocalMem = LocalMem; - - template - launch_policy(Range range, Ts... ts); - - template - launch_policy(dim3 global_range, Ts... ts); - - template - launch_policy(dim3 global_range, dim3 local_range, Ts... ts); - - KProps get_kernel_properties(); - LProps get_launch_properties(); - size_t get_local_mem_size(); - Range get_range(); -}; -} //namespace syclcompat::experimental -``` - -The `launch` overloads accepting a `launch_policy` are: - -```cpp -namespace syclcompat::experimental { - -template -sycl::event launch(LaunchPolicy launch_policy, sycl::queue q, Args... args); - -template -sycl::event launch(LaunchPolicy launch_policy, Args... args); -} //namespace syclcompat::experimental - -``` - -For local memory, `launch` injects a `char *` pointer to the beginning -of a local accessor of the requested `local_mem_size` as the last argument of -the kernel function. This `char *` can then be reinterpreted as the datatype -required by the user within the kernel function. 
- -For example, the previous function named `vectorAdd` can be modified -with the following signature, which adds a `char *` pointer to access local -memory inside the kernel: - -``` c++ -void vectorAdd(const float *A, const float *B, float *C, int n, - char *local_mem); -``` - -Then, the new `vectorAdd` can be launched like this: - -``` c++ -using syclcompat::experimental; -launch_policy policy{blocksPerGrid, threadsPerBlock, - local_mem_size(nbytes)}; -launch(policy, d_A, d_B, d_C, n); -``` - -To request a different cache/local memory split on supported hardware: - -```c++ -using syclcompat::experimental; -namespace sycl_intel_exp = sycl::ext::intel::experimental; - -sycl_intel_exp::cache_config cache_config{ - sycl_intel_exp::large_slm}; -kernel_properties kernel_props{cache_config}; -launch_policy policy{blocksPerGrid, threadsPerBlock, - local_mem_size(nbytes), kernel_props}; - -launch(policy, d_A, d_B, d_C, n); -``` - -To request a certain cluster dimension on supported hardware: - -```c++ -using syclcompat::experimental; -namespace sycl_exp = sycl::ext::oneapi::experimental; - -sycl_exp::cuda::cluster_size cluster_dims(cluster_range); -launch_policy policy{blocksPerGrid, threadsPerBlock, - local_mem_size(nbytes), - launch_properties{cluster_dims}}; - -launch(policy, d_A, d_B, d_C, n); -``` - -### Utilities - -SYCLcompat introduces a set of utility functions designed to streamline the -usage of the library and its `launch` mechanism. - -The first utility function is `syclcompat::wg_barrier()`, which provides a -concise work-group barrier. `syclcompat::wg_barrier()` uses the -_SYCL_INTEL_free_function_queries_ extension to provide this functionality. - -The second utility function, `syclcompat::compute_nd_range`, ensures that the -provided global size and work group sizes are appropriate for a given -dimensionality, and that global size is rounded up to a multiple of the work -group size in each dimension. 
- -```c++ -namespace syclcompat { - -inline void wg_barrier(); - -template -inline sycl::nd_range compute_nd_range(sycl::range global_size_in, - sycl::range work_group_size); -inline sycl::nd_range<1> compute_nd_range(int global_size_in, - int work_group_size); - -} // syclcompat -``` - -### Queues - -The design for this library assumes _in-order_ queues -(`sycl::property::queue::in_order()`). - -Many of the APIs accept an optional `queue` parameter, and this can be an -out-of-order queue, either created manually or retrieved via a call to -`syclcompat::create_queue()`, specifying `false` for the `in_order` parameter. - -```c++ -namespace syclcompat { - -inline sycl::queue create_queue(bool print_on_async_exceptions = false, - bool in_order = true); - -} // syclcompat -``` - -However, SYCLcompat does not implement any mechanisms to deal with this case. -The rationale for this is that a user wanting the full power of SYCL's -dependency management shouldn't be using this library. As such, support for -out-of-order queues is very limited. The only way to safely use an out-of-order -queue at present is to explicitly `q.wait()` or `e.wait()` where `e` is the -`sycl::event` returned through a `syclcompat::async` API. - -To facilitate machine translation from other heterogeneous programming models to -SYCL, SYCLcompat provides the following pointer aliases for `sycl::event` and -`sycl::queue`, and the function `destroy_event` which destroys an `event_ptr` -allocated on the heap. - -``` c++ -namespace syclcompat { - -using event_ptr = sycl::event *; - -using queue_ptr = sycl::queue *; - -static void destroy_event(event_ptr event); - -} // syclcompat -``` - -### Memory Operations - -This library provides interfaces to allocate memory to be accessed within kernel -functions and on the host. 
The `syclcompat::malloc` function allocates device -USM memory, the `syclcompat::malloc_host` function allocates host USM memory, -and the `syclcompat::malloc_shared` function allocates shared USM memory. - -In each case we provide a template and non-templated interface for allocating -memory, taking the number of elements or number of bytes respectively. - -The interface includes both synchronous and asynchronous `malloc`, `memcpy`, -`memset`, `fill`, and `free` operations. - -There is a helper class `pointer_attributes` to query allocation type for memory -pointers using SYCLcompat, through `sycl::usm::alloc` and -`sycl::get_pointer_device`. - -``` c++ -namespace syclcompat { - -// Expects number of elements -template -T *malloc(size_t count, sycl::queue q = get_default_queue()); -template -T *malloc_host(size_t count, sycl::queue q = get_default_queue()); -template -T *malloc_shared(size_t count, sycl::queue q = get_default_queue()); - -// Expects size of the memory in bytes -void *malloc(size_t num_bytes, sycl::queue q = get_default_queue()); -void *malloc_host(size_t num_bytes, sycl::queue q = get_default_queue()); -void *malloc_shared(size_t num_bytes, sycl::queue q = get_default_queue()); - -// 2D, 3D memory allocation wrappers -void *malloc(size_t &pitch, size_t x, size_t y, - sycl::queue q = get_default_queue()) -pitched_data malloc(sycl::range<3> size, sycl::queue q = get_default_queue()); - -// Blocking memcpy -void memcpy(void *to_ptr, const void *from_ptr, size_t size, - sycl::queue q = get_default_queue()); -void memcpy(T *to_ptr, const T *from_ptr, size_t count, - sycl::queue q = get_default_queue()); -void memcpy(void *to_ptr, size_t to_pitch, const void *from_ptr, - size_t from_pitch, size_t x, size_t y, - sycl::queue q = get_default_queue()); // 2D matrix -void memcpy(pitched_data to, sycl::id<3> to_pos, - pitched_data from, sycl::id<3> from_pos, - sycl::range<3> size, - sycl::queue q = get_default_queue()); // 3D matrix - -// Non-blocking memcpy 
-sycl::event memcpy_async(void *to_ptr, const void *from_ptr, size_t size, - sycl::queue q = get_default_queue()); -template -sycl::event memcpy_async(T *to_ptr, T void *from_ptr, size_t count, - sycl::queue q = get_default_queue()); -sycl::event memcpy_async(void *to_ptr, size_t to_pitch, - const void *from_ptr, size_t from_pitch, - size_t x, size_t y, - sycl::queue q = get_default_queue()); // 2D matrix -sycl::event memcpy_async(pitched_data to, sycl::id<3> to_pos, - pitched_data from, sycl::id<3> from_pos, - sycl::range<3> size, - sycl::queue q = get_default_queue()); // 3D matrix - -// Fill -template -void fill(void *dev_ptr, const T &pattern, size_t count, - sycl::queue q = get_default_queue()); -template -sycl::event fill_async(void *dev_ptr, const T &pattern, - size_t count, sycl::queue q = get_default_queue()); - -// Memset -void memset(void *dev_ptr, int value, size_t size, - sycl::queue q = get_default_queue()); -void memset(void *ptr, size_t pitch, int val, size_t x, size_t y, - sycl::queue q = get_default_queue()); // 2D matrix -void memset(pitched_data pitch, int val, sycl::range<3> size, - sycl::queue q = get_default_queue()); // 3D matrix -sycl::event memset_async(void *dev_ptr, int value, size_t size, - sycl::queue q = get_default_queue()); -sycl::event memset_async(void *ptr, size_t pitch, int val, - size_t x, size_t y, - sycl::queue q = get_default_queue()); // 2D matrix -sycl::event memset_async(pitched_data pitch, int val, - sycl::range<3> size, - sycl::queue q = get_default_queue()); // 3D matrix - -// Free -void wait_and_free(void *ptr, sycl::queue q = get_default_queue()); -void free(void *ptr, sycl::queue q = get_default_queue()); -sycl::event enqueue_free(const std::vector &pointers, - const std::vector &events, - sycl::queue q = get_default_queue()); - -// Queries pointer allocation type -class pointer_attributes { -public: - void init(const void *ptr, sycl::queue q = get_default_queue()); - sycl::usm::alloc get_memory_type(); - const void 
*get_device_pointer(); - const void *get_host_pointer(); - bool is_memory_shared(); - unsigned int get_device_id(); -}; - -} // syclcompat -``` - -The `syclcompat::experimental` namespace contains currently unsupported `memcpy` overloads which take a `syclcompat::experimental::memcpy_parameter` argument. These are included for forwards compatibility and currently throw a `std::runtime_error`. - -```cpp -namespace syclcompat { -namespace experimental { -// Forward declarations for types relating to unsupported memcpy_parameter API: - -#ifdef SYCL_EXT_ONEAPI_BINDLESS_IMAGES -class image_mem_wrapper; -#endif -class image_matrix; - -/// Memory copy parameters for 2D/3D memory data. -struct memcpy_parameter { - struct data_wrapper { - pitched_data pitched{}; - sycl::id<3> pos{}; -#ifdef SYCL_EXT_ONEAPI_BINDLESS_IMAGES - experimental::image_mem_wrapper *image_bindless{nullptr}; -#endif - image_matrix *image{nullptr}; - }; - data_wrapper from{}; - data_wrapper to{}; - sycl::range<3> size{}; -}; - -/// [UNSUPPORTED] Synchronously copies 2D/3D memory data specified by \p param . -/// The function will return after the copy is completed. -/// -/// \param param Memory copy parameters. -/// \param q Queue to execute the copy task. -/// \returns no return value. -static inline void memcpy(const memcpy_parameter &param, - sycl::queue q = get_default_queue()); - -/// [UNSUPPORTED] Asynchronously copies 2D/3D memory data specified by \p param -/// . The return of the function does NOT guarantee the copy is completed. -/// -/// \param param Memory copy parameters. -/// \param q Queue to execute the copy task. -/// \returns no return value. -static inline void memcpy_async(const memcpy_parameter &param, - sycl::queue q = get_default_queue()); - -} // namespace experimental -} // namespace syclcompat -``` - -Finally, the class `pitched_data` manages memory allocation for 3D -spaces, padded to avoid uncoalesced memory accesses. 
- -```c++ -namespace syclcompat { - -class pitched_data { -public: - pitched_data(); - pitched_data(void *data, size_t pitch, size_t x, size_t y); - - void *get_data_ptr(); - size_t get_pitch(); - size_t get_x(); - size_t get_y(); - - void set_data_ptr(void *data); - void set_pitch(size_t pitch); - void set_x(size_t x); - void set_y(size_t y); -}; - -} // syclcompat -``` - -There are various helper classes and aliases defined within SYCLcompat to -encapsulate and define memory operations and objects. These classes and aliases -are primarily designed to assist with machine translation from other -heterogeneous programming models. - -The wrapper class `device_memory` provides a unified representation for device -memory in various regions. The class provides methods to allocate memory for the -object (`init()`) and access the underlying memory in various ways (`get_ptr()`, -`get_access()`, `operator[]`). Aliases for global and USM shared specializations -are provided. - -The `memory_traits` class is provided as a traits helper for `device_memory`. -The `accessor` class template provides a 2D or 3D `sycl::accessor`-like wrapper -around raw pointers. - -```c++ -namespace syclcompat { - -enum class memory_region { - global = 0, // device global memory - constant, // device read-only memory - local, // device local memory - usm_shared, // memory which can be accessed by host and device -}; - -using byte_t = uint8_t; - -template class memory_traits { -public: - static constexpr sycl::access::address_space asp = - (Memory == memory_region::local) - ? sycl::access::address_space::local_space - : sycl::access::address_space::global_space; - static constexpr sycl::target target = - (Memory == memory_region::local) - ? sycl::target::local - : sycl::target::device; - static constexpr sycl::access_mode mode = - (Memory == memory_region::constant) - ? 
sycl::access_mode::read - : sycl::access_mode::read_write; - static constexpr size_t type_size = sizeof(T); - using element_t = - typename std::conditional_t; - using value_t = typename std::remove_cv_t; - template - using accessor_t = typename std::conditional_t< - target == sycl::target::local, - sycl::local_accessor, - sycl::accessor>; - using pointer_t = T *; -}; - -template class device_memory { -public: - using accessor_t = - typename memory_traits::template accessor_t; - using value_t = typename memory_traits::value_t; - using syclcompat_accessor_t = - syclcompat::accessor; - - device_memory(); - - device_memory(const sycl::range &in_range, - std::initializer_list &&init_list); - - template - device_memory( - const typename std::enable_if>::type &in_range, - std::initializer_list> &&init_list); - - device_memory(const sycl::range &range_in); - - // Variadic constructor taking 1, 2 or 3 integers to be interpreted as a - // sycl::range. - template - device_memory(Args... Arguments); - - ~device_memory(); - - // Allocate memory with default queue, and init memory if has initial value. - void init(); - // Allocate memory with specified queue, and init memory if has initial - // value. - void init(sycl::queue q); - - // The variable is assigned to a device pointer. - void assign(value_t *src, size_t size); - - // Get memory pointer of the memory object, which is virtual pointer when - // usm is not used, and device pointer when usm is used. - value_t *get_ptr(); - // Get memory pointer of the memory object, which is virtual pointer when - // usm is not used, and device pointer when usm is used. - value_t *get_ptr(sycl::queue q); - - // Get the device memory object size in bytes. - size_t get_size(); - - template - typename std::enable_if::type &operator[](size_t index); - - // Get accessor with dimension info for the device memory object - // when usm is used and dimension is greater than 1. 
- template - typename std::enable_if::type - get_access(sycl::handler &cgh); -}; - - -template -class device_memory : public device_memory { -public: - using base = device_memory; - using value_t = typename base::value_t; - using accessor_t = - typename memory_traits::template accessor_t<0>; - device_memory(const value_t &val); - device_memory(); -}; - -template -using global_memory = device_memory; -template -using constant_memory = detail::device_memory; -template -using shared_memory = device_memory; - - -template class accessor; - -template class accessor { -public: - using memory_t = memory_traits; - using element_t = typename memory_t::element_t; - using pointer_t = typename memory_t::pointer_t; - using accessor_t = typename memory_t::template accessor_t<3>; - - accessor(pointer_t data, const sycl::range<3> &in_range); - template - accessor(typename std::enable_if::type &acc); - accessor(const accessor_t &acc, const sycl::range<3> &in_range); - - accessor operator[](size_t index) const; - - pointer_t get_ptr() const; - -}; - -template class accessor { -public: - using memory_t = memory_traits; - using element_t = typename memory_t::element_t; - using pointer_t = typename memory_t::pointer_t; - using accessor_t = typename memory_t::template accessor_t<2>; - - accessor(pointer_t data, const sycl::range<2> &in_range); - template - accessor(typename std::enable_if::type &acc); - accessor(const accessor_t &acc, const sycl::range<2> &in_range); - - pointer_t operator[](size_t index); - - pointer_t get_ptr() const; -}; - -} // syclcompat -``` - -#### Buffer Support - -Although SYCLcompat is primarily designed around the Unified Shared Memory -model, there is (limited) support for the buffer/accessor model. This can be -enabled by setting the compiler define `SYCLCOMPAT_USM_LEVEL_NONE`. This macro -instructs SYCLcompat to effectively provide emulated USM pointers via a Memory -Manager singleton. 
- -Note that in `SYCLCOMPAT_USM_LEVEL_NONE` mode, the pointers returned by e.g. -`syclcompat::malloc`, and passed to `syclcompat::memcpy` can *only* interact -with `syclcompat` APIs. It is legal to perform pointer arithmetic on these -virtual pointers, but attempting to dereference them, passing them to `sycl` -APIs, or passing them into kernels will result in an error. - -The SYCLcompat tests with the suffix `_usmnone.cpp` provide examples of how to -use `SYCLCOMPAT_USM_LEVEL_NONE`. - -### ptr_to_int - -The following cuda backend specific function is introduced in order to -translate from local memory pointers to `uint32_t` or `size_t` variables that -contain a byte address to the local (local refers to`.shared` in nvptx) memory -state space. - -``` c++ -namespace syclcompat { -template -__syclcompat_inline__ - std::enable_if_t || std::is_same_v, - T> - ptr_to_int(void *ptr) -} // namespace syclcompat -``` - -These variables can be used in inline PTX instructions that take address -operands. Such inline PTX instructions are commonly used in optimized -libraries. A simplified example usage of the above functions is as follows: - -``` c++ - half *data = syclcompat::local_mem(); - // ... - // ... - T addr = - syclcompat::ptr_to_int(reinterpret_cast(data) + (id % 8) * 16); - uint32_t fragment; -#if defined(__NVPTX__) - asm volatile("ldmatrix.sync.aligned.m8n8.x1.shared.b16 {%0}, [%1];\n" - : "=r"(fragment) - : "r"(addr)); -#endif -``` - -### Device Information - -`sycl::device` properties are encapsulated using the `device_info` helper class. -The class is meant to be constructed and used through the extended device -implemented in SYCLcompat. 
- -This is the synopsis of `device_info`: - -```c++ -class device_info { -public: - const char *get_name(); - char *get_name(); - template , - std::enable_if_t> || - std::is_same_v, - int> = 0> - auto get_max_work_item_sizes() const; - - template , - std::enable_if_t> || - std::is_same_v, - int> = 0> - auto get_max_work_item_sizes() const; - bool get_host_unified_memory() const; - int get_major_version() const; - int get_minor_version() const; - int get_integrated() const; - int get_max_clock_frequency() const; - int get_max_compute_units() const; - int get_max_work_group_size() const; - int get_max_sub_group_size() const; - int get_max_work_items_per_compute_unit() const; - int get_max_register_size_per_work_group() const; - template || - std::is_same_v, - int> = 0> - auto get_max_nd_range_size() const; - template || - std::is_same_v, - int> = 0> - auto get_max_nd_range_size(); - size_t get_global_mem_size() const; - size_t get_local_mem_size() const; - - unsigned int get_memory_clock_rate() const; - unsigned int get_memory_bus_width() const; - uint32_t get_device_id() const; - std::array get_uuid() const; - unsigned int get_global_mem_cache_size() const; - int get_image1d_max() const; - auto get_image2d_max() const; - auto get_image2d_max(); - auto get_image3d_max() const; - auto get_image3d_max(); - - void set_name(const char *name); - void set_max_work_item_sizes(const sycl::range<3> max_work_item_sizes); - [[deprecated]] void - set_max_work_item_sizes(const sycl::id<3> max_work_item_sizes); - void set_host_unified_memory(bool host_unified_memory); - void set_major_version(int major); - void set_minor_version(int minor); - void set_integrated(int integrated); - void set_max_clock_frequency(int frequency); - void set_max_compute_units(int max_compute_units); - void set_global_mem_size(size_t global_mem_size); - void set_local_mem_size(size_t local_mem_size); - void set_max_work_group_size(int max_work_group_size); - void set_max_sub_group_size(int 
max_sub_group_size); - void - set_max_work_items_per_compute_unit(int max_work_items_per_compute_unit); - void set_max_nd_range_size(int max_nd_range_size[]); - void set_max_nd_range_size(sycl::id<3> max_nd_range_size); - void set_memory_clock_rate(unsigned int memory_clock_rate); - void set_memory_bus_width(unsigned int memory_bus_width); - void - set_max_register_size_per_work_group(int max_register_size_per_work_group); - void set_device_id(uint32_t device_id); - void set_uuid(std::array uuid); - void set_global_mem_cache_size(unsigned int global_mem_cache_size); - void set_image1d_max(size_t image_max_buffer_size); - void set_image2d_max(size_t image_max_width_buffer_size, - size_t image_max_height_buffer_size); - void set_image3d_max(size_t image_max_width_buffer_size, - size_t image_max_height_buffer_size, - size_t image_max_depth_buffer_size); -}; -``` - -### Device Management - -Multiple SYCL functionalities are exposed through utility functions to manage -the current `sycl::device`, `sycl::queue`, and `sycl::context`, exposed as -follows: - -```c++ -namespace syclcompat { - -// Util function to create a new queue for the current device -static inline sycl::queue create_queue(bool print_on_async_exceptions = false, - bool in_order = true); - -// Util function to get the default queue of current device in -// device manager. -static inline sycl::queue get_default_queue(); - -// Util function to set the default queue of the current device in the -// device manager. -// If the device extension saved queue is the default queue, -// the previous saved queue will be overwritten as well. -// This function will be blocking if there are submitted kernels in the -// previous default queue. -static inline void set_default_queue(const sycl::queue &q); - -// Util function to wait for the queued kernels. -static inline void wait(sycl::queue q = get_default_queue()); - -// Util function to wait for the queued kernels and throw unhandled errors. 
-static inline void wait_and_throw(sycl::queue q = get_default_queue()); - -// Util function to get the id of current device in -// device manager. -static inline unsigned int get_current_device_id(); - -// Util function to get the current device. -static inline device_ext &get_current_device(); - -// Util function to get a device by id. -static inline device_ext &get_device(unsigned int id); - -// Util function to get the context of the default queue of current -// device in device manager. -static inline sycl::context get_default_context(); - -// Util function to get a CPU device. -static inline device_ext &cpu_device(); - -/// Filter out devices; only keep the device whose name contains one of the -/// subname in \p dev_subnames. -/// May break device id mapping and change current device. It's better to be -/// called before other SYCLcompat or SYCL APIs. -static inline void filter_device(const std::vector &dev_subnames); - -/// Print all the devices (and their IDs) in the dev_mgr -static inline void list_devices(); - -// Util function to select a device by its id -static inline unsigned int select_device(unsigned int id); - -// Util function to get the device id from a device -static inline unsigned int get_device_id(const sycl::device &dev); - -// Util function to get the number of available devices -static inline unsigned int device_count(); - -// Util function to check whether a device supports some kinds of sycl::aspect. -static inline void -has_capability_or_fail(const sycl::device &dev, - const std::initializer_list &props); -} // syclcompat -``` - -The exposed functionalities include creation and destruction of queues, through -`syclcompat::create_queue` and `syclcompat::destroy_queue`, and providing the -ability to wait for submitted kernels using `syclcompat::wait` or -`syclcompat::wait_and_throw`. Any async errors will be output to `stderr` if -`print_on_async_exceptions` is set, and will have the default behavior otherwise, which -calls `std::terminate`.
Synchronous exceptions have to be managed by users -independently of what is set in this parameter. - -Devices are managed through a helper class, `device_ext`. The `device_ext` class -associates a vector of `sycl::queue`s with its `sycl::device`. The `device_ext` -destructor waits on a set of `sycl::event` which can be added to via -`add_event`. This is used, for example, to implement `syclcompat::enqueue_free` to -schedule release of memory after a kernel or `memcpy`. SYCL device properties -can be queried through `device_ext` as well. -`device_ext` also provides the `has_capability_or_fail` member function, which -throws a `sycl::exception` if the device does not have the specified list of -`sycl::aspect`. - -Devices can be listed and filtered using `syclcompat::list_devices()` and -`syclcompat::filter_device()`. If `SYCLCOMPAT_VERBOSE` is defined at compile -time, the available SYCL devices are printed to the standard output both at -initialization time, and when the device list is filtered using -`syclcompat::filter_device`. - -Users can manage queues through the `syclcompat::set_default_queue(sycl::queue -q)` free function, and the `device_ext` `set_saved_queue`, `set_default_queue`, -and `get_saved_queue` member functions. -`set_default_queue` is blocking, and overwrites the previous default queue with -a user-defined one, waiting for any submitted kernels to finish. -The `device_ext` automatically sets the saved queue to the default queue. -Therefore, it's important to note that if the previous default queue was the -device's saved queue, setting a new default queue will update the reference of -the saved queue to the new default one to keep the state of the class -consistent.
- -The class is exposed as follows: - -```c++ -namespace syclcompat { - -class device_ext : public sycl::device { - device_ext(); - device_ext(const sycl::device &base, bool print_on_async_exceptions = false, - bool in_order = true); - ~device_ext(); - - bool is_native_host_atomic_supported(); - int get_major_version() const; - int get_minor_version() const; - int get_max_compute_units() const; - int get_max_clock_frequency() const; - int get_integrated() const; - int get_max_sub_group_size() const; - int get_max_register_size_per_work_group() const; - int get_max_work_group_size() const; - int get_mem_base_addr_align() const; - size_t get_global_mem_size() const; - size_t get_local_mem_size() const; - void get_memory_info(size_t &free_memory, size_t &total_memory) const; - - void get_device_info(device_info &out) const; - device_info get_device_info() const; - void reset(bool print_on_async_exceptions = false, bool in_order = true); - - sycl::queue *default_queue(); - void set_default_queue(const sycl::queue &q); - void queues_wait_and_throw(); - sycl::queue *create_queue(bool print_on_async_exceptions = false, - bool in_order = true); - void destroy_queue(sycl::queue *&queue); - void set_saved_queue(sycl::queue *q); - sycl::queue *get_saved_queue(); - sycl::context get_context(); - - void - has_capability_or_fail(const std::initializer_list &props) const; -}; - -} // syclcompat -``` - -Free functions are provided for querying major and minor version directly from a `sycl::device`, equivalent to the methods of `device_ext` described above: - -```c++ -static int get_major_version(const sycl::device &dev); -static int get_minor_version(const sycl::device &dev); -``` - -#### Multiple devices - -SYCLcompat allows you to manage multiple devices through -`syclcompat::select_device` and `syclcompat::create_queue`. The library uses the -default SYCL device (i.e. 
the device returned by `sycl::default_selector_v`) as -the default device, and exposes all other devices available on the system -through the `syclcompat::select_device(unsigned int id)` free function. - -The interface uses the `syclcompat::device_ext::get_current_device_id()` to get -the current CPU thread, and returns the associated device stored internally as a -map with that thread. The map is constructed using calls to -`syclcompat::select_device(unsigned int id)`. Any thread which hasn't used this -function to select a device will be given the default device. Note that -this implies multiple threads on a single device by default. - -Be aware that targeting multiple devices may lead to unintended behavior caused -by developers, as SYCLcompat does not implement a mechanism to warn when the -wrong queue is used as an argument in any of the functions of the -`syclcompat` namespace. - -#### Atomic Operations - -SYCLcompat provides an interface for common atomic operations (`add`, `sub`, -`and`, `or`, `xor`, `min`, `max`, `inc`, `dec`, `exchange`, `compare_exchange`). -While SYCL exposes atomic operations through member functions of -`sycl::atomic_ref`, this library provides access via functions taking a standard -pointer argument. Template arguments control the `sycl::memory_scope`, -`sycl::memory_order` and `sycl::access::address_space` of these atomic -operations. SYCLcompat also exposes overloads for these atomic functions which -take a runtime memoryScope argument. Every atomic operation is implemented via -an API function taking a raw pointer as the target. Additional overloads for -`syclcompat::atomic_compare_exchange_strong` are provided which take a -`sycl::multi_ptr` instead of a raw pointer. The type of the operand for most -atomic operations is defined as `syclcompat::type_identity_t` to avoid -template deduction issues when an operand of a different type (e.g. double -literal) is supplied.
Atomic addition and subtraction free functions make use of -`syclcompat::arith_t` to differentiate between numeric and pointer -arithmetic. - -The available operations are exposed as follows: - -``` c++ -namespace syclcompat { - -template struct type_identity { - using type = T; -}; -template using type_identity_t = typename type_identity::type; - -template struct arith { - using type = std::conditional_t, std::ptrdiff_t, T>; -}; -template using arith_t = typename arith::type; - -template -T atomic_fetch_add(T *addr, arith_t operand); - -template -T atomic_fetch_sub(T *addr, arith_t operand); - -template -T atomic_fetch_and(T *addr, type_identity operand); - -template -T atomic_fetch_or(T *addr, type_identity operand); - -template -T atomic_fetch_xor(T *addr, type_identity operand); - -template -T atomic_fetch_min(T *addr, type_identity operand); - -template -T atomic_fetch_max(T *addr, type_identity operand); - -template -unsigned int atomic_fetch_compare_inc(unsigned int *addr, - unsigned int operand); - -template -unsigned int atomic_fetch_compare_dec(unsigned int *addr, - unsigned int operand); - -template -T atomic_exchange(T *addr, type_identity operand); - -template -T atomic_compare_exchange_strong( - sycl::multi_ptr addr, type_identity_t expected, - type_identity_t desired, - sycl::memory_order success = sycl::memory_order::relaxed, - sycl::memory_order fail = sycl::memory_order::relaxed); -template -T atomic_compare_exchange_strong( - T *addr, T expected, T desired, - sycl::memory_order success = sycl::memory_order::relaxed, - sycl::memory_order fail = sycl::memory_order::relaxed); - -} // namespace syclcompat -``` - -SYCLcompat also provides an atomic class with the `store`, `load`, `exchange`, -`compare_exchange_weak`, `fetch_add`, and `fetch_sub` operations. The atomic -class wrapper supports int, unsigned int, long, unsigned long, long long, -unsigned long long, float, double and pointer datatypes. 
- -```cpp -namespace syclcompat { - -template -class atomic { - static constexpr sycl::memory_order default_read_order = - sycl::atomic_ref::default_read_order; - static constexpr sycl::memory_order default_write_order = - sycl::atomic_ref::default_write_order; - static constexpr sycl::memory_scope default_scope = DefaultScope; - static constexpr sycl::memory_order default_read_modify_write_order = - DefaultOrder; - - constexpr atomic() noexcept = default; - - constexpr atomic(T d) noexcept; - - void store(T operand, sycl::memory_order memoryOrder = default_write_order, - sycl::memory_scope memoryScope = default_scope) noexcept; - - T load(sycl::memory_order memoryOrder = default_read_order, - sycl::memory_scope memoryScope = default_scope) const noexcept; - - T exchange(T operand, - sycl::memory_order memoryOrder = default_read_modify_write_order, - sycl::memory_scope memoryScope = default_scope) noexcept; - - bool compare_exchange_weak( - T &expected, T desired, sycl::memory_order success, - sycl::memory_order failure, - sycl::memory_scope memoryScope = default_scope) noexcept; - - bool compare_exchange_weak( - T &expected, T desired, - sycl::memory_order memoryOrder = default_read_modify_write_order, - sycl::memory_scope memoryScope = default_scope) noexcept; - - bool compare_exchange_strong( - T &expected, T desired, sycl::memory_order success, - sycl::memory_order failure, - sycl::memory_scope memoryScope = default_scope) noexcept; - - bool compare_exchange_strong( - T &expected, T desired, - sycl::memory_order memoryOrder = default_read_modify_write_order, - sycl::memory_scope memoryScope = default_scope) noexcept; - - T fetch_add(arith_t operand, - sycl::memory_order memoryOrder = default_read_modify_write_order, - sycl::memory_scope memoryScope = default_scope) noexcept; - - T fetch_sub(arith_t operand, - sycl::memory_order memoryOrder = default_read_modify_write_order, - sycl::memory_scope memoryScope = default_scope) noexcept; -}; - -} // namespace 
syclcompat -``` - -### Compatibility Utilities - -This library provides a number of small compatibility utilities which exist to -facilitate machine translation of code from other programming models to SYCL. -These functions are part of the public API, but they are not expected to be -useful to developers writing their own code. - -Functionality is provided to represent a pair of integers as a `double`. -`cast_ints_to_double(int, int)` returns a `double` containing the given integers -in the high & low 32-bits respectively. `cast_double_to_int` casts the high or -low 32-bits back into an integer. - -`reverse_bits` reverses the bits of a 32-bit unsigned integer, `ffs` returns the -position of the first least significant set bit in an integer. -`byte_level_permute` returns a byte-permutation of two input unsigned integers, -with bytes selected according to a third unsigned integer argument. -`match_all_over_sub_group` and `match_any_over_sub_group` allow comparison of -values across work-items within a sub-group. - -The function `ternary_logic_op` performs bitwise logical operations on three input values -`a`, `b` and `c` based on the specified 8-bit truth table `lut` and returns the -result. - -The functions `select_from_sub_group`, `shift_sub_group_left`, -`shift_sub_group_right` and `permute_sub_group_by_xor` provide equivalent -functionality to `sycl::select_from_group`, `sycl::shift_group_left`, -`sycl::shift_group_right` and `sycl::permute_group_by_xor`, respectively. -However, they provide an optional argument to represent the `logical_group` size -(default 32). - -`int_as_queue_ptr` helps with translation of code by reinterpret casting an -address to `sycl::queue *`, or returning a pointer to SYCLcompat's default queue -if the address is <= 2. -`args_selector` is a helper class for extracting arguments from an array of -pointers to arguments or buffer of arguments to pass to a kernel function.
-The class allows users to exclude parameters such as `sycl::nd_item`. -Experimental support for masked versions of `select_from_sub_group`, -`shift_sub_group_left`, `shift_sub_group_right` and `permute_sub_group_by_xor` -is provided only for SPIRV or CUDA devices. - -As part of the compatibility utilities to facilitate machine translation to -SYCL, two aliases for errors are provided, `err0` and `err1`. - -```c++ -namespace syclcompat { - -inline int cast_double_to_int(double d, bool use_high32 = true); - -inline double cast_ints_to_double(int high32, int low32); - -inline unsigned int byte_level_permute(unsigned int a, unsigned int b, - unsigned int s); - -inline uint32_t lop3(uint32_t a, uint32_t b, uint32_t c, uint8_t lut) - -template inline int ffs(ValueT a); - -template -unsigned int match_any_over_sub_group(sycl::sub_group g, unsigned member_mask, - T value); - -template -unsigned int match_all_over_sub_group(sycl::sub_group g, unsigned member_mask, - T value, int *pred); - -template -ValueT select_from_sub_group(sycl::sub_group g, ValueT x, int remote_local_id, - int logical_sub_group_size = 32); - -template -ValueT shift_sub_group_left(sycl::sub_group g, ValueT x, unsigned int delta, - int logical_sub_group_size = 32); - -template -ValueT shift_sub_group_right(sycl::sub_group g, ValueT x, unsigned int delta, - int logical_sub_group_size = 32); - -template -ValueT permute_sub_group_by_xor(sycl::sub_group g, ValueT x, unsigned int mask, - int logical_sub_group_size = 32); - -namespace experimental { - -template -ValueT select_from_sub_group(unsigned int member_mask, sycl::sub_group g, ValueT x, - int remote_local_id, int logical_sub_group_size = 32); - -template -ValueT shift_sub_group_left(unsigned int member_mask, sycl::sub_group g, ValueT x, - unsigned int delta, int logical_sub_group_size = 32); - -template -ValueT shift_sub_group_right(unsigned int member_mask, sycl::sub_group g, ValueT x, - unsigned int delta, int logical_sub_group_size = 32); - 
-template -ValueT permute_sub_group_by_xor(unsigned int member_mask, sycl::sub_group g, ValueT x, - unsigned int mask, int logical_sub_group_size = 32); - -} // namespace experimental - -inline sycl::queue *int_as_queue_ptr(uintptr_t x); - -using err0 = detail::generic_error_type; -using err1 = detail::generic_error_type; - -template -class args_selector; - -template -class args_selector { -public: - // Get the type of the ith argument of R(Ts...) - template - using arg_type = - std::tuple_element_t(), std::tuple>; - - // If kernel_params is nonnull, then args_selector will - // extract arguments from kernel_params. Otherwise, it - // will extract them from extra. - args_selector(void **kernel_params, void **extra) - : kernel_params(kernel_params), args_buffer(get_args_buffer(extra)) {} - - // Get a reference to the i-th argument extracted from kernel_params - // or extra. - template arg_type &get(); -}; - -} // namespace syclcompat -``` - -The function `experimental::nd_range_barrier` synchronizes work items from all -work groups within a SYCL kernel. This is not officially supported by the SYCL -spec, and so should be used with caution. -`experimental::calculate_max_active_wg_per_xecore` and -`experimental::calculate_max_potential_wg` are used for occupancy calculation. -There is also an `experimental::logical_group` class which allows -`sycl::sub_group`s to be further subdivided into 'logical' groups to perform -sub-group level operations. This class provides methods to get the local & group -id and range. `experimental::group_type`, `experimental::group` and -`experimental::group_base` are helper classes to manage the supported group -types.
- -```c++ -namespace syclcompat { -namespace experimental { - -#if defined(__AMDGPU__) || defined(__NVPTX__) -// seq_cst currently not working for AMD nor Nvidia -constexpr sycl::memory_order barrier_memory_order = sycl::memory_order::acq_rel; -#else -constexpr sycl::memory_order barrier_memory_order = sycl::memory_order::seq_cst; -#endif - -template -inline void nd_range_barrier( - sycl::nd_item item, - sycl::atomic_ref &counter); - -template <> -inline void nd_range_barrier( - sycl::nd_item<1> item, - sycl::atomic_ref &counter); - -template class logical_group { -public: - logical_group(sycl::nd_item item, sycl::group parent_group, - uint32_t size); - uint32_t get_local_linear_id() const; - uint32_t get_group_linear_id() const; - uint32_t get_local_linear_range() const; - uint32_t get_group_linear_range() const; -}; - -inline int calculate_max_active_wg_per_xecore(int *num_wg, int wg_size, - int slm_size = 0, - int sg_size = 32, - bool used_barrier = false, - bool used_large_grf = false); - -inline int calculate_max_potential_wg(int *num_wg, int *wg_size, - int max_wg_size_for_device_code, - int slm_size = 0, int sg_size = 32, - bool used_barrier = false, - bool used_large_grf = false); -// Supported group types -enum class group_type { work_group, sub_group, logical_group, root_group }; - -// The group_base will dispatch the function call to the specific interface -// based on the group type. -template class group_base { -public: - group_base(sycl::nd_item item); - - // Returns the number of work-items in the group. - size_t get_local_linear_range(); - // Returns the index of the work-item within the group. - size_t get_local_linear_id(); - - // Wait for all the elements within the group to complete their execution - // before proceeding. - void barrier(); -}; - -// Container type that can store supported group_types. 
-template -class group : public group_base { -public: - group(GroupT g, sycl::nd_item item); -}; - -} // namespace experimental -} // namespace syclcompat -``` - -SYCLcompat provides a wrapper API `max_active_work_groups_per_cu` providing -'work-groups per compute unit' semantics. It is templated on the kernel -functor, and takes a work-group size represented by either `sycl::range` -or `syclcompat::dim3`, the local memory size in bytes, and an optional queue. -The function returns the maximum number of work-groups which can be executed -per compute unit. May return *zero* even when below resource limits (i.e. -returning `0` does not imply the kernel cannot execute). -```cpp -namespace syclcompat{ -template -size_t max_active_work_groups_per_cu( - syclcompat::dim3 wg_dim3, size_t local_mem_size, - sycl::queue queue = syclcompat::get_default_queue()); - -template -size_t max_active_work_groups_per_cu( - sycl::range wg_range, size_t local_mem_size, - sycl::queue queue = syclcompat::get_default_queue()); -} -``` - -To assist machine translation, helper aliases are provided for inlining and -alignment attributes. The class template declarations `sycl_compat_kernel_name` -and `sycl_compat_kernel_scalar` are used to assist automatic generation of -kernel names during machine translation. - -`get_sycl_language_version` returns an integer representing the version of the -SYCL spec supported by the current SYCL compiler. - -The `SYCLCOMPAT_CHECK_ERROR` macro encapsulates an error-handling mechanism for -expressions that might throw `sycl::exception` and `std::runtime_error`. If no -exceptions are thrown, it returns `syclcompat::error_code::success`. If a -`sycl::exception` is caught, it returns `syclcompat::error_code::backend_error`. -If a `std::runtime_error` exception is caught, -`syclcompat::error_code::default_error` is returned instead. For both cases, it -prints the error message to the standard error stream. 
- -`get_error_string_dummy` is a dummy function introduced to assist auto -migration. The SYCLomatic user should replace it with a real error-handling -function. SYCL reports errors using exceptions and does not use error codes. - -``` c++ -namespace syclcompat { - -template class syclcompat_kernel_name; -template class syclcompat_kernel_scalar; - -#if defined(_MSC_VER) -#define __syclcompat_align__(n) __declspec(align(n)) -#define __syclcompat_inline__ __forceinline -#else -#define __syclcompat_align__(n) __attribute__((aligned(n))) -#define __syclcompat_inline__ __inline__ __attribute__((always_inline)) -#endif - -#if defined(_MSC_VER) -#define __syclcompat_noinline__ __declspec(noinline) -#else -#define __syclcompat_noinline__ __attribute__((noinline)) -#endif - -#define SYCLCOMPAT_COMPATIBILITY_TEMP (900) - -#ifdef _WIN32 -#define SYCLCOMPAT_EXPORT __declspec(dllexport) -#else -#define SYCLCOMPAT_EXPORT -#endif - - -namespace syclcompat { -enum error_code { success = 0, backend_error = 1, default_error = 999 }; -inline const char *get_error_string_dummy(int ec); -} - -#define SYCLCOMPAT_CHECK_ERROR(expr) - -int get_sycl_language_version(); - -} // namespace syclcompat -``` - -### Kernel Helper Functions - -Kernel helper functions provide a structure `kernel_function_info` to keep SYCL -kernel information, and provide a utility function `get_kernel_function_info()` -to get the kernel information. Overloads are provided to allow either returning -a `kernel_function_info` object, or to return by pointer argument. In the -current version, `kernel_function_info` describes only maximum work-group size. - -SYCLcompat also provides the `kernel_library` and `kernel_function` classes. -`kernel_library` facilitates the loading and unloading of kernel libraries. -`kernel_function` represents a specific kernel function within a loaded library -and can be invoked with specified arguments. 
-`load_kernel_library`, `load_kernel_library_mem`, and `unload_kernel_library` -are free functions to handle the loading and unloading of `kernel_library` -objects. `get_kernel_function`, and `invoke_kernel_function` offer a similar -functionality for `kernel_function` objects. - -``` c++ -namespace syclcompat { - -struct kernel_function_info { - int max_work_group_size = 0; -}; - -static void get_kernel_function_info(kernel_function_info *kernel_info, - const void *function); -static kernel_function_info get_kernel_function_info(const void *function); - -class kernel_library { - constexpr kernel_library(); - constexpr kernel_library(void *ptr); - operator void *() const; -}; - -static kernel_library load_kernel_library(const std::string &name); -static kernel_library load_kernel_library_mem(char const *const image); -static void unload_kernel_library(const kernel_library &library); - -class kernel_function { - constexpr kernel_function(); - constexpr kernel_function(kernel_functor ptr); - operator void *() const; - void operator()(sycl::queue &q, const sycl::nd_range<3> &range, - unsigned int local_mem_size, void **args, void **extra); -}; - -static kernel_function get_kernel_function(kernel_library &library, - const std::string &name); -static void invoke_kernel_function(kernel_function &function, - sycl::queue &queue, - sycl::range<3> group_range, - sycl::range<3> local_range, - unsigned int local_mem_size, - void **kernel_params, void **extra); - -} // namespace syclcompat -``` - -### Math Functions - -The `funnelshift_*` APIs perform a concatenate-shift operation on two 32-bit -values, and return a 32-bit result. The two unsigned integer arguments (`low` -and `high`) are concatenated to a 64-bit value which is then shifted left or -right by `shift` bits. The functions then return either the least- or -most-significant 32 bits. 
The `_l*` variants shift *left* and return the *most* -significant 32 bits, while the `_r*` variants shift *right* and return the -*least* significant 32 bits. The `_l`/`_r` APIs differ from the `_lc`/`_rc` APIs -in how they clamp the `shift` argument: `funnelshift_l` and `funnelshift_r` -shift the result by `shift & 31` bits, whereas `funnelshift_lc` and -`funnelshift_rc` shift the result by `min(shift, 32)` bits. - -`syclcompat::fast_length` provides a wrapper to SYCL's -`fast_length(sycl::vec)` that accepts arguments for a C++ array and a -length. `syclcompat::length` provides a templated version that wraps over -`sycl::length`. There are wrappers for `clamp`, `isnan`, `cbrt`, `min`, `max`, -`fmax_nan`, `fmin_nan`, and `pow`. An implementation of `relu` -saturation is also provided. - -`compare`, `unordered_compare`, `compare_both`, `unordered_compare_both`, -`compare_mask`, and `unordered_compare_mask` handle both ordered and unordered -comparisons. - -`vectorized_max` and `vectorized_min` are binary operations returning the -max/min of two arguments, where each argument is treated as a `sycl::vec` type. -`vectorized_isgreater` performs elementwise `isgreater`, treating each argument -as a vector of elements, and returning `0` for vector components for which -`isgreater` is false, and `-1` when true. -`vectorized_sum_abs_diff` calculates the absolute difference for two values -without modulo overflow for vector types. - -The functions `cmul`, `cdiv`, `cabs`, `cmul_add`, and `conj` define complex math -operations which accept `sycl::vec` arguments representing complex values. - -The `dp4a` function returns the 4-way 8-bit dot product accumulate for unsigned -and signed 32-bit integer values. The `dp2a_lo` and `dp2a_hi` functions return the -two-way 16-bit to 8-bit dot product using the second and first 16 bits of the -second operand, respectively.
These three APIs return a single 32-bit value with -the accumulated result, which is unsigned if both operands are `uint32_t` and -signed otherwise. - -Various maths functions are defined operate on any floating point types. -`syclcompat::is_floating_point_v` extends the standard library's -`std::is_floating_point_v` to include `sycl::half` and, where available, -`sycl::ext::oneapi::bfloat16`. The current version of SYCLcompat also provides -a specialization of `std::common_type_t` for `sycl::ext::oneapi::bfloat16`, -though this will be moved to the `sycl_ext_oneapi_bfloat16` extension in -future. - -```cpp -namespace std { -template <> struct common_type { - using type = sycl::ext::oneapi::bfloat16; -}; - -template <> -struct common_type { - using type = sycl::ext::oneapi::bfloat16; -}; - -template struct common_type { - using type = sycl::ext::oneapi::bfloat16; -}; - -template struct common_type { - using type = sycl::ext::oneapi::bfloat16; -}; -} // namespace std -``` - -```cpp -namespace syclcompat{ - -// Trait for extended floating point definition -template -struct is_floating_point : std::is_floating_point{}; - -template <> struct is_floating_point : std::true_type {}; - -#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS -template <> struct is_floating_point : std::true_type {}; -#endif -template - -inline constexpr bool is_floating_point_v = is_floating_point::value; - -inline unsigned int funnelshift_l(unsigned int low, unsigned int high, - unsigned int shift); - -inline unsigned int funnelshift_lc(unsigned int low, unsigned int high, - unsigned int shift); - -inline unsigned int funnelshift_r(unsigned int low, unsigned int high, - unsigned int shift); - -inline unsigned int funnelshift_rc(unsigned int low, unsigned int high, - unsigned int shift); - -inline float fast_length(const float *a, int len); - -template -inline ValueT length(const ValueT *a, const int len); - -inline ValueT clamp(ValueT val, ValueT min_val, ValueT max_val); - -// Determine whether 2 
element value is NaN. -template -inline std::enable_if_t isnan(const ValueT a); - -// cbrt function wrapper. -template -inline std::enable_if_t || - std::is_same_v, - ValueT> -cbrt(ValueT val); - -// For floating-point types, `float` or `double` arguments are acceptable. -// For integer types, `std::uint32_t`, `std::int32_t`, `std::uint64_t` or -// `std::int64_t` type arguments are acceptable. -// sycl::half supported as well. -template -std::enable_if_t && std::is_integral_v, - std::common_type_t> -min(T1 a, T2 b); -template -std::enable_if_t && std::is_floating_point_v, - std::common_type_t> -min(T1 a, T2 b); - -sycl::half min(sycl::half a, sycl::half b); - -template -std::enable_if_t && std::is_integral_v, - std::common_type_t> -max(T1 a, T2 b); -template -std::enable_if_t && std::is_floating_point_v, - std::common_type_t> -max(T1 a, T2 b); - -sycl::half max(sycl::half a, sycl::half b); - -// Performs 2 elements comparison and returns the bigger one. If either of -// inputs is NaN, then return NaN. -template -inline std::common_type_t fmax_nan(const ValueT a, - const ValueU b); - -template -inline sycl::vec, 2> -fmax_nan(const sycl::vec a, const sycl::vec b); - -template -inline sycl::marray, 2> -fmax_nan(const sycl::marray a, const sycl::marray b); - -// Performs 2 elements comparison and returns the smaller one. If either of -// inputs is NaN, then return NaN. -template -inline std::common_type_t fmin_nan(const ValueT a, - const ValueU b); -template -inline sycl::vec, 2> -fmin_nan(const sycl::vec a, const sycl::vec b); - -template -inline sycl::marray, 2> -fmin_nan(const sycl::marray a, const sycl::marray b); - -inline float pow(const float a, const int b) { return sycl::pown(a, b); } -inline double pow(const double a, const int b) { return sycl::pown(a, b); } - -template -inline typename std::enable_if_t, ValueT> -pow(const ValueT a, const ValueU b); - -// Requires aspect::fp64, as it casts to double internally. 
-template -inline typename std::enable_if_t, double> -pow(const ValueT a, const ValueU b); - -template inline ValueT relu(const ValueT a); - -template -inline sycl::vec -relu(const sycl::vec a); - -template -inline std::enable_if_t || - std::is_same_v, - sycl::marray> -relu(const sycl::marray a); - -// The following definition is enabled when BinaryOperation(ValueT, ValueT) returns bool -// std::enable_if_t, bool>, bool> -template -inline bool -compare(const ValueT a, const ValueT b, const BinaryOperation binary_op); -template -inline std::enable_if_t -compare(const ValueT a, const ValueT b, const BinaryOperation binary_op); - -// The following definition is enabled when BinaryOperation(ValueT, ValueT) returns bool -// std::enable_if_t, bool>, bool> -template -inline bool -unordered_compare(const ValueT a, const ValueT b, - const BinaryOperation binary_op); -template -inline std::enable_if_t -unordered_compare(const ValueT a, const ValueT b, - const BinaryOperation binary_op); - -template -inline std::enable_if_t -compare_both(const ValueT a, const ValueT b, const BinaryOperation binary_op); -template - -inline std::enable_if_t -unordered_compare_both(const ValueT a, const ValueT b, - const BinaryOperation binary_op); - -template -inline std::enable_if_t -compare_mask(const ValueT a, const ValueT b, const BinaryOperation binary_op); - -template -inline std::enable_if_t -unordered_compare_mask(const ValueT a, const ValueT b, - const BinaryOperation binary_op); - -template inline T vectorized_max(T a, T b); - -template inline T vectorized_min(T a, T b); - -template inline T vectorized_isgreater(T a, T b); - -template <> -inline unsigned vectorized_isgreater(unsigned a, - unsigned b); - -template -inline unsigned vectorized_sum_abs_diff(unsigned a, unsigned b); - -template -sycl::vec cmul(sycl::vec x, sycl::vec y); - -template -sycl::vec cdiv(sycl::vec x, sycl::vec y); - -template T cabs(sycl::vec x); - -template -inline sycl::vec cmul_add(const sycl::vec a, - const 
sycl::vec b, - const sycl::vec c); - -template -inline sycl::marray cmul_add(const sycl::marray a, - const sycl::marray b, - const sycl::marray c); - -template sycl::vec conj(sycl::vec x); - -template inline ValueT reverse_bits(ValueT a); - - -template -using dot_product_acc_t = - std::conditional_t && std::is_unsigned_v, - uint32_t, int32_t>; - -template -inline dot_product_acc_t dp2a_lo(T1 a, T2 b, - dot_product_acc_t c); - -template -inline dot_product_acc_t dp2a_hi(T1 a, T2 b, - dot_product_acc_t c); - -template -inline dot_product_acc_t dp4a(T1 a, T2 b, - dot_product_acc_t c); -} // namespace syclcompat -``` - -`vectorized_binary` computes the `BinaryOperation` for two operands, -with each value treated as a vector type. `vectorized_unary` offers the same -interface for operations with a single operand. `vectorized_ternary` offers the -interface for three operands with two `BinaryOperation`. -The implemented `BinaryOperation`s are `abs_diff`, `add_sat`, `rhadd`, `hadd`, -`maximum`, `minimum`, and `sub_sat`. -And the `vectorized_with_pred` offers the `BinaryOperation` for two operands, -meanwihle provides the pred of high/low halfword operation. - -```cpp -namespace syclcompat { - -template -inline unsigned vectorized_unary(unsigned a, const UnaryOperation unary_op); - -// A sycl::abs wrapper functor. -struct abs { - template auto operator()(const ValueT x) const; -}; - -template -inline unsigned vectorized_binary(unsigned a, unsigned b, - const BinaryOperation binary_op, - bool need_relu = false); - -template -inline unsigned vectorized_ternary(unsigned a, unsigned b, unsigned c, - const BinaryOperation1 binary_op1, - const BinaryOperation2 binary_op2, - bool need_relu = false); - -template -inline unsigned vectorized_with_pred(unsigned a, unsigned b, - const BinaryOperation binary_op, - bool *pred_hi, bool *pred_lo); - -// A sycl::abs_diff wrapper functor. 
-struct abs_diff { - template - auto operator()(const ValueT x, const ValueT y) const; -}; -// A sycl::add_sat wrapper functor. -struct add_sat { - template - auto operator()(const ValueT x, const ValueT y) const; -}; -// A sycl::rhadd wrapper functor. -struct rhadd { - template - auto operator()(const ValueT x, const ValueT y) const; -}; -// A sycl::hadd wrapper functor. -struct hadd { - template - auto operator()(const ValueT x, const ValueT y) const; -}; -// A sycl::max wrapper functor. -struct maximum { - template - auto operator()(const ValueT x, const ValueT y) const; - template - auto operator()(const ValueT x, const ValueT y, bool *pred) const; -}; -// A sycl::min wrapper functor. -struct minimum { - template - auto operator()(const ValueT x, const ValueT y) const; - template - auto operator()(const ValueT x, const ValueT y, bool *pred) const; -}; -// A sycl::sub_sat wrapper functor. -struct sub_sat { - template - auto operator()(const ValueT x, const ValueT y) const; -}; - -} // namespace syclcompat -``` - -`vectorized_binary` also supports comparison operators from the standard library (`std::equal_to`, `std::not_equal_to`, etc) -and the semantics can be modified by changing the comparison operator template instantiation. For example: - -```cpp -unsigned int Input1; -unsigned int Input2; -// initialize inputs... - -// Performs comparison on sycl::ushort2, following sycl::vec semantics -// Returns unsigned int containing, per vector element, 0xFFFF if true, and 0x0000 if false -syclcompat::vectorized_binary( - Input1, Input2, std::equal_to<>()); - -// Performs element-wise comparison on unsigned short -// Returns unsigned int containing, per vector element, 1 if true, and 0 if false -syclcompat::vectorized_binary( - Input1, Input2, std::equal_to()); -``` - -The math header provides a set of functions to extend 32-bit operations -to 33 bit, and handle sign extension internally. There is support for `add`, -`sub`, `absdiff`, `min` and `max` operations. 
Each operation provides overloads -to include a second, separate, `BinaryOperation` after the first, and include -the `_sat` variation, determines if the returning value is saturated or not. - -```cpp -template -inline constexpr RetT extend_add(AT a, BT b); - -template -inline constexpr RetT extend_add(AT a, BT b, CT c, BinaryOperation second_op); - -template -inline constexpr RetT extend_add_sat(AT a, BT b); - -template -inline constexpr RetT extend_add_sat(AT a, BT b, CT c, - BinaryOperation second_op); - -template -inline constexpr RetT extend_sub(AT a, BT b); - -template -inline constexpr RetT extend_sub(AT a, BT b, CT c, BinaryOperation second_op); - -template -inline constexpr RetT extend_sub_sat(AT a, BT b); - -template -inline constexpr RetT extend_sub_sat(AT a, BT b, CT c, - BinaryOperation second_op); - -template -inline constexpr RetT extend_absdiff(AT a, BT b); - -template -inline constexpr RetT extend_absdiff(AT a, BT b, CT c, - BinaryOperation second_op); - -template -inline constexpr RetT extend_absdiff_sat(AT a, BT b); - -template -inline constexpr RetT extend_absdiff_sat(AT a, BT b, CT c, - BinaryOperation second_op); - -template -inline constexpr RetT extend_min(AT a, BT b); - -template -inline constexpr RetT extend_min(AT a, BT b, CT c, BinaryOperation second_op); - -template -inline constexpr RetT extend_min_sat(AT a, BT b); - -template -inline constexpr RetT extend_min_sat(AT a, BT b, CT c, - BinaryOperation second_op); - -template -inline constexpr RetT extend_max(AT a, BT b); - -template -inline constexpr RetT extend_max(AT a, BT b, CT c, BinaryOperation second_op); - -template -inline constexpr RetT extend_max_sat(AT a, BT b); - -template -inline constexpr RetT extend_max_sat(AT a, BT b, CT c, - BinaryOperation second_op); -``` - -Another set of vectorized extend 32-bit operations is provided in the math -header.These APIs treat each of the 32-bit operands as 2-elements vector -(16-bits each) while handling sign extension to 17-bits 
internally. There is -support for `add`, `sub`, `absdiff`, `min`, `max` and `avg` binary operations. -Each operation has a `_sat` variant which determines if the returning -value is saturated or not, and a `_add` variant that computes the binary sum -of the initial operation outputs and a third operand. - -```cpp -/// Compute vectorized addition of \p a and \p b, with each value treated as a -/// 2 elements vector type and extend each element to 17 bit. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized addition of the two values -template -inline constexpr RetT extend_vadd2(AT a, BT b, RetT c); - -/// Compute vectorized addition of \p a and \p b, with each value treated as a 2 -/// elements vector type and extend each element to 17 bit. Then add each half -/// of the result and add with \p c. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The addition of each half of extend vectorized addition of the two -/// values and the third value -template -inline constexpr RetT extend_vadd2_add(AT a, BT b, RetT c); - -/// Compute vectorized addition of \p a and \p b with saturation, with each -/// value treated as a 2 elements vector type and extend each element to 17 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized addition of the two values with saturation -template -inline constexpr RetT extend_vadd2_sat(AT a, BT b, RetT c); - -/// Compute vectorized subtraction of \p a and \p b, with each value treated as -/// a 2 elements vector type and extend each element to 17 bit. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized subtraction of the two values -template -inline constexpr RetT extend_vsub2(AT a, BT b, RetT c); - -/// Compute vectorized subtraction of \p a and \p b, with each value treated as -/// a 2 elements vector type and extend each element to 17 bit. Then add each -/// half of the result and add with \p c. 
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The addition of each half of extend vectorized subtraction of the -/// two values and the third value -template -inline constexpr RetT extend_vsub2_add(AT a, BT b, RetT c); - -/// Compute vectorized subtraction of \p a and \p b with saturation, with each -/// value treated as a 2 elements vector type and extend each element to 17 bit. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized subtraction of the two values with saturation -template -inline constexpr RetT extend_vsub2_sat(AT a, BT b, RetT c); - -/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a 2 -/// elements vector type and extend each element to 17 bit. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized abs_diff of the two values -template -inline constexpr RetT extend_vabsdiff2(AT a, BT b, RetT c); - -/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a 2 -/// elements vector type and extend each element to 17 bit. 
Then add each half -/// of the result and add with \p c. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The addition of each half of extend vectorized abs_diff of the -/// two values and the third value -template -inline constexpr RetT extend_vabsdiff2_add(AT a, BT b, RetT c); - -/// Compute vectorized abs_diff of \p a and \p b with saturation, with each -/// value treated as a 2 elements vector type and extend each element to 17 bit. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized abs_diff of the two values with saturation -template -inline constexpr RetT extend_vabsdiff2_sat(AT a, BT b, RetT c); - -/// Compute vectorized minimum of \p a and \p b, with each value treated as a 2 -/// elements vector type and extend each element to 17 bit. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized minimum of the two values -template -inline constexpr RetT extend_vmin2(AT a, BT b, RetT c); - -/// Compute vectorized minimum of \p a and \p b, with each value treated as a 2 -/// elements vector type and extend each element to 17 bit. 
Then add each half -/// of the result and add with \p c. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The addition of each half of extend vectorized minimum of the -/// two values and the third value -template -inline constexpr RetT extend_vmin2_add(AT a, BT b, RetT c); - -/// Compute vectorized minimum of \p a and \p b with saturation, with each value -/// treated as a 2 elements vector type and extend each element to 17 bit. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized minimum of the two values with saturation -template -inline constexpr RetT extend_vmin2_sat(AT a, BT b, RetT c); - -/// Compute vectorized maximum of \p a and \p b, with each value treated as a 2 -/// elements vector type and extend each element to 17 bit. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized maximum of the two values -template -inline constexpr RetT extend_vmax2(AT a, BT b, RetT c); - -/// Compute vectorized maximum of \p a and \p b, with each value treated as a 2 -/// elements vector type and extend each element to 17 bit. 
Then add each half -/// of the result and add with \p c. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The addition of each half of extend vectorized maximum of the -/// two values and the third value -template -inline constexpr RetT extend_vmax2_add(AT a, BT b, RetT c); - -/// Compute vectorized maximum of \p a and \p b with saturation, with each value -/// treated as a 2 elements vector type and extend each element to 17 bit. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized maximum of the two values with saturation -template -inline constexpr RetT extend_vmax2_sat(AT a, BT b, RetT c); - -/// Compute vectorized average of \p a and \p b, with each value treated as a 2 -/// elements vector type and extend each element to 17 bit. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized average of the two values -template -inline constexpr RetT extend_vavrg2(AT a, BT b, RetT c); - -/// Compute vectorized average of \p a and \p b, with each value treated as a 2 -/// elements vector type and extend each element to 17 bit. 
Then add each half -/// of the result and add with \p c. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The addition of each half of extend vectorized average of the -/// two values and the third value -template -inline constexpr RetT extend_vavrg2_add(AT a, BT b, RetT c); - -/// Compute vectorized average of \p a and \p b with saturation, with each value -/// treated as a 2 elements vector type and extend each element to 17 bit. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized average of the two values with saturation -template -inline constexpr RetT extend_vavrg2_sat(AT a, BT b, RetT c); -``` - -Similarly, a set of vectorized extend 32-bit operations is provided in the math -header treating each of the 32-bit operands as 4-elements vector (8-bits each) -while handling sign extension to 9-bits internally. There is support for `add`, -`sub`, `absdiff`, `min`, `max` and `avg` binary operations. -Each operation has a `_sat` variant which determines if the returning -value is saturated or not, and a `_add` variant that computes the binary sum -of the initial operation outputs and a third operand. - -```cpp -/// Compute vectorized addition of \p a and \p b, with each value treated as a -/// 4 elements vector type and extend each element to 9 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized addition of the two values -template -inline constexpr RetT extend_vadd4(AT a, BT b, RetT c); - -/// Compute vectorized addition of \p a and \p b, with each value treated as a 4 -/// elements vector type and extend each element to 9 bit. Then add each half -/// of the result and add with \p c. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The addition of each half of extend vectorized addition of the two -/// values and the third value -template -inline constexpr RetT extend_vadd4_add(AT a, BT b, RetT c); - -/// Compute vectorized addition of \p a and \p b with saturation, with each -/// value treated as a 4 elements vector type and extend each element to 9 bit. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized addition of the two values with saturation -template -inline constexpr RetT extend_vadd4_sat(AT a, BT b, RetT c); - -/// Compute vectorized subtraction of \p a and \p b, with each value treated as -/// a 4 elements vector type and extend each element to 9 bit. 
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized subtraction of the two values -template -inline constexpr RetT extend_vsub4(AT a, BT b, RetT c); - -/// Compute vectorized subtraction of \p a and \p b, with each value treated as -/// a 4 elements vector type and extend each element to 9 bit. Then add each -/// half of the result and add with \p c. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The addition of each half of extend vectorized subtraction of the -/// two values and the third value -template -inline constexpr RetT extend_vsub4_add(AT a, BT b, RetT c); - -/// Compute vectorized subtraction of \p a and \p b with saturation, with each -/// value treated as a 4 elements vector type and extend each element to 9 bit. 
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized subtraction of the two values with saturation -template -inline constexpr RetT extend_vsub4_sat(AT a, BT b, RetT c); - -/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a 4 -/// elements vector type and extend each element to 9 bit. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized abs_diff of the two values -template -inline constexpr RetT extend_vabsdiff4(AT a, BT b, RetT c); - -/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a 4 -/// elements vector type and extend each element to 9 bit. Then add each half -/// of the result and add with \p c. 
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The addition of each half of extend vectorized abs_diff of the -/// two values and the third value -template -inline constexpr RetT extend_vabsdiff4_add(AT a, BT b, RetT c); - -/// Compute vectorized abs_diff of \p a and \p b with saturation, with each -/// value treated as a 4 elements vector type and extend each element to 9 bit. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized abs_diff of the two values with saturation -template -inline constexpr RetT extend_vabsdiff4_sat(AT a, BT b, RetT c); - -/// Compute vectorized minimum of \p a and \p b, with each value treated as a 4 -/// elements vector type and extend each element to 9 bit. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized minimum of the two values -template -inline constexpr RetT extend_vmin4(AT a, BT b, RetT c); - -/// Compute vectorized minimum of \p a and \p b, with each value treated as a 4 -/// elements vector type and extend each element to 9 bit. Then add each half -/// of the result and add with \p c. 
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The addition of each half of extend vectorized minimum of the -/// two values and the third value -template -inline constexpr RetT extend_vmin4_add(AT a, BT b, RetT c); - -/// Compute vectorized minimum of \p a and \p b with saturation, with each value -/// treated as a 4 elements vector type and extend each element to 9 bit. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized minimum of the two values with saturation -template -inline constexpr RetT extend_vmin4_sat(AT a, BT b, RetT c); - -/// Compute vectorized maximum of \p a and \p b, with each value treated as a 4 -/// elements vector type and extend each element to 9 bit. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized maximum of the two values -template -inline constexpr RetT extend_vmax4(AT a, BT b, RetT c); - -/// Compute vectorized maximum of \p a and \p b, with each value treated as a 4 -/// elements vector type and extend each element to 9 bit. Then add each half -/// of the result and add with \p c. 
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The addition of each half of extend vectorized maximum of the -/// two values and the third value -template -inline constexpr RetT extend_vmax4_add(AT a, BT b, RetT c); - -/// Compute vectorized maximum of \p a and \p b with saturation, with each value -/// treated as a 4 elements vector type and extend each element to 9 bit. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized maximum of the two values with saturation -template -inline constexpr RetT extend_vmax4_sat(AT a, BT b, RetT c); - -/// Compute vectorized average of \p a and \p b, with each value treated as a 4 -/// elements vector type and extend each element to 9 bit. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized average of the two values -template -inline constexpr RetT extend_vavrg4(AT a, BT b, RetT c); - -/// Compute vectorized average of \p a and \p b, with each value treated as a 4 -/// elements vector type and extend each element to 9 bit. Then add each half -/// of the result and add with \p c. 
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The addition of each half of extend vectorized average of the -/// two values and the third value -template -inline constexpr RetT extend_vavrg4_add(AT a, BT b, RetT c); - -/// Compute vectorized average of \p a and \p b with saturation, with each value -/// treated as a 4 elements vector type and extend each element to 9 bit. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized average of the two values with saturation -template -inline constexpr RetT extend_vavrg4_sat(AT a, BT b, RetT c); -``` - -Vectorized comparison APIs also provided in the math header behave similarly -and support a `std` comparison operator parameter which can be `greater`, -`less`, `greater_equal`, `less_equal`, `equal_to` or `not_equal_to`. These APIs -cover both the 2-elements *(16-bits each)* and 4-elements *(8-bits each)* -variants, as well as an additional `_add` variant that computes the sum of the -2/4 output elements. - -```cpp -/// Extend \p a and \p b to 33 bit and vectorized compare input values using -/// specified comparison \p cmp . 
-/// -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \tparam [in] BinaryOperation The type of the compare operation -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] cmp The comparison operator -/// \returns The comparison result of the two extended values. -template -inline constexpr unsigned extend_vcompare2(AT a, BT b, BinaryOperation cmp); - -/// Extend inputs to 33 bit, and vectorized compare input values using specified -/// comparison \p cmp , then add the result with \p c . -/// -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \tparam [in] BinaryOperation The type of the compare operation -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \param [in] cmp The comparison operator -/// \returns The comparison result of the two extended values, and add the -/// result with \p c . -template -inline constexpr unsigned extend_vcompare2_add(AT a, BT b, unsigned c, - BinaryOperation cmp); - -/// Extend \p a and \p b to 33 bit and vectorized compare input values using -/// specified comparison \p cmp . -/// -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \tparam [in] BinaryOperation The type of the compare operation -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] cmp The comparison operator -/// \returns The comparison result of the two extended values. -template -inline constexpr unsigned extend_vcompare4(AT a, BT b, BinaryOperation cmp); - -/// Extend inputs to 33 bit, and vectorized compare input values using specified -/// comparison \p cmp , then add the result with \p c .
-/// -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \tparam [in] BinaryOperation The type of the compare operation -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \param [in] cmp The comparison operator -/// \returns The comparison result of the two extended values, and add the -/// result with \p c . -template -inline constexpr unsigned extend_vcompare4_add(AT a, BT b, unsigned c, - BinaryOperation cmp); -``` - -The math header file provides APIs for bit-field insertion (`bfi_safe`) and -bit-field extraction (`bfe_safe`). These are bounds-checked variants of -underlying `detail` APIs (`detail::bfi`, `detail::bfe`) which, in future -releases, will be exposed to the user. - -```c++ - -/// Bitfield-insert with boundary checking. -/// -/// Align and insert a bit field from \p x into \p y . Source \p bit_start -/// gives the starting bit position for the insertion, and source -/// \p num_bits gives the bit field length in bits. -/// -/// \tparam T The type of \p x and \p y , must be an unsigned integer. -/// \param x The source of the bitfield. -/// \param y The source where bitfield is inserted. -/// \param bit_start The position to start insertion. -/// \param num_bits The number of bits to insert. -template -inline T bfi_safe(const T x, const T y, const uint32_t bit_start, - const uint32_t num_bits); - -/// Bitfield-extract with boundary checking. -/// -/// Extract bit field from \p source and return the zero or sign-extended -/// result. Source \p bit_start gives the bit field starting bit position, -/// and source \p num_bits gives the bit field length in bits. -/// -/// The result is padded with the sign bit of the extracted field. If `num_bits` -/// is zero, the result is zero.
If the start position is beyond the msb of the -/// input, the result is filled with the replicated sign bit of the extracted -/// field. -/// -/// \tparam T The type of the \p source value, must be an integer. -/// \param source The source value to extract from. -/// \param bit_start The position to start extracting. -/// \param num_bits The number of bits to extract. -template -inline T bfe_safe(const T source, const uint32_t bit_start, - const uint32_t num_bits); -``` - -### Group Utilities - -Group utility functions and classes optimize data movement, -processing, and communication within work-groups. The `exchange` class -facilitates rearranging data between blocked and striped layouts, improving -memory access patterns. The `group_radix_sort` class implements an efficient -radix sort for distributed data, supporting both ascending and descending -order. The `group_load` and `group_store` classes manage structured data -movement between linear memory and work-group arrangements, supporting -blocked and striped formats with optional range-guarding. The `group_shuffle` -class enables efficient inter-work-item communication through selective data -exchanges, shifting operations, and group-wide data movement. These utilities -collectively enhance parallel performance by improving memory efficiency, -load balancing, and computational throughput. - -``` c++ -namespace syclcompat { -/// Rearranges data partitioned across a work-group. -/// -/// \tparam T The type of the data elements. -/// \tparam ElementsPerWorkItem The number of data elements assigned to a -/// work-item. -template class exchange { -public: - exchange(uint8_t *local_memory); - - static size_t get_local_memory_size(size_t group_threads); - - /// Inplace rearrange elements from blocked order to striped order.
- /// - /// Suppose 512 integer data elements partitioned across 128 work-items, where - /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the - /// blocked \p input across the work-group is: - /// - /// {[0, 1, 2, 3], [4, 5, 6, 7], ..., [508, 509, 510, 511]}. - /// - /// The striped order output is: - /// - /// {[0, 128, 256, 384], [1, 129, 257, 385], ..., [127, 255, 383, 511]}. - /// - /// \tparam Item The work-item identifier type. - /// \param item The work-item identifier. - /// \param input The input data of each work-item. - template - __syclcompat_inline__ void - blocked_to_striped(Item item, T (&input)[ElementsPerWorkItem]); - - /// Inplace rearrange elements from striped order to blocked order. - /// - /// Suppose 512 integer data elements partitioned across 128 work-items, where - /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the - /// striped \p input across the work-group is: - /// - /// { [0, 128, 256, 384], [1, 129, 257, 385], ..., [127, 255, 383, 511] }. - /// - /// The blocked order output is: - /// - /// { [0, 1, 2, 3], [4, 5, 6, 7], ..., [508, 509, 510, 511] }. - /// - /// \tparam Item The work-item identifier type. - /// \param item The work-item identifier. - /// \param input The input data of each work-item. - template - __syclcompat_inline__ void - striped_to_blocked(Item item, T (&input)[ElementsPerWorkItem]); - - /// Rearrange elements from blocked order to striped order. - /// - /// Suppose 512 integer data elements partitioned across 128 work-items, where - /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the - /// blocked \p input across the work-group is: - /// - /// { [0, 1, 2, 3], [4, 5, 6, 7], ..., [508, 509, 510, 511] }. - /// - /// The striped order output is: - /// - /// { [0, 128, 256, 384], [1, 129, 257, 385], ..., [127, 255, 383, 511] }. - /// - /// \tparam Item The work-item identifier type. - /// \param item The work-item identifier. 
- /// \param input The input data of each work-item. - /// \param output The corresponding output data of each work-item. - template - __syclcompat_inline__ void - blocked_to_striped(Item item, T (&input)[ElementsPerWorkItem], - T (&output)[ElementsPerWorkItem]); - - /// Rearrange elements from striped order to blocked order. - /// - /// Suppose 512 integer data elements partitioned across 128 work-items, where - /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the - /// striped \p input across the work-group is: - /// - /// { [0, 128, 256, 384], [1, 129, 257, 385], ..., [127, 255, 383, 511] }. - /// - /// The blocked order output is: - /// - /// { [0, 1, 2, 3], [4, 5, 6, 7], ..., [508, 509, 510, 511] }. - /// - /// \tparam Item The work-item identifier type. - /// \param item The work-item identifier. - /// \param input The input data of each work-item. - /// \param output The corresponding output data of each work-item. - template - __syclcompat_inline__ void - striped_to_blocked(Item item, T (&input)[ElementsPerWorkItem], - T (&output)[ElementsPerWorkItem]); - - /// Inplace exchanges data items annotated by rank into blocked arrangement. - /// - /// Suppose 512 integer data elements partitioned across 128 work-items, where - /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the - /// striped \p input across the work-group is: - /// - /// { [0, 128, 256, 384], [1, 129, 257, 385], ..., [127, 255, 383, 511] }. - /// - /// The rank across the work-group is: - /// - /// { [0, 1, 2, 3], [4, 5, 6, 7], ..., [508, 509, 510, 511] }. - /// - /// The blocked order output is: - /// - /// { [0, 1, 2, 3], [4, 5, 6, 7], ..., [508, 509, 510, 511] }. - /// - /// \tparam Item The work-item identifier type. - /// \param item The work-item identifier. - /// \param input The input data of each work-item. - /// \param ranks The corresponding rank annotation of each work-item. 
- template - __syclcompat_inline__ void - scatter_to_blocked(Item item, T (&input)[ElementsPerWorkItem], - int (&ranks)[ElementsPerWorkItem]); - - /// Inplace exchanges data items annotated by rank into striped arrangement. - /// - /// Suppose 512 integer data elements partitioned across 128 work-items, where - /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the - /// blocked \p input across the work-group is: - /// - /// { [0, 1, 2, 3], [4, 5, 6, 7], ..., [508, 509, 510, 511] }. - /// - /// The rank across the work-group is: - /// - /// { [16, 20, 24, 28], [32, 36, 40, 44], ..., [499, 503, 507, 511] }. - /// - /// The striped order output of each work-item will be: - /// - /// { [0, 128, 256, 384], [1, 129, 257, 385], ..., [127, 255, 383, 511] }. - /// - /// \tparam Item The work-item identifier type. - /// \param item The work-item identifier. - /// \param input The input data of each work-item. - /// \param ranks The corresponding rank annotation of each work-item. - template - __syclcompat_inline__ void - scatter_to_striped(Item item, T (&input)[ElementsPerWorkItem], - int (&ranks)[ElementsPerWorkItem]); -}; - -/// The work-group wide radix sort to sort integer data elements -/// assigned to all work-items in the work-group. -/// -/// \tparam T The type of the data elements. -/// \tparam ElementsPerWorkItem The number of data elements assigned to -/// a work-item. -/// \tparam RADIX_BITS The number of radix bits per digit place. -template -class group_radix_sort { -public: - group_radix_sort(uint8_t *local_memory); - - static size_t get_local_memory_size(size_t group_threads); - - /// Performs an ascending work-group wide radix sort over a blocked - /// arrangement of input elements. 
- /// - /// Suppose 512 integer data elements partitioned across 128 work-items, where - /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the - /// \p input across the work-group is: - /// - /// { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. - /// - /// The ascending order output is: - /// - /// { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. - /// - /// \tparam Item The work-item identifier type. - /// \param item The work-item identifier. - /// \param input The input data of each work-item. - /// \param begin_bit The beginning (least-significant) bit index needed for - /// key comparison. - /// \param end_bit The past-the-end (most-significant) bit - /// index needed for key comparison. - template - __syclcompat_inline__ void - sort(const Item &item, T (&input)[ElementsPerWorkItem], int begin_bit = 0, - int end_bit = 8 * sizeof(T)); - - /// Performs a descending work-group wide radix sort over a blocked - /// arrangement of input elements. - /// - /// Suppose 512 integer data elements partitioned across 128 work-items, where - /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the - /// \p input across the work-group is: - /// - /// { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. - /// - /// The descending order output is: - /// - /// { [511,510,509,508], [507,506,505,504], [503,502,501,500], ..., [3,2,1,0] }. - /// - /// \tparam Item The work-item identifier type. - /// \param item The work-item identifier. - /// \param input The input data of each work-item. - /// \param begin_bit The beginning (least-significant) bit index needed for - /// key comparison. - /// \param end_bit The past-the-end (most-significant) bit - /// index needed for key comparison.
- template - __syclcompat_inline__ void - sort_descending(const Item &item, T (&input)[ElementsPerWorkItem], - int begin_bit = 0, int end_bit = 8 * sizeof(T)); - - /// Performs an ascending radix sort across a blocked arrangement of input - /// elements, leaving them in a striped arrangement. - /// - /// Suppose 512 integer data elements partitioned across 128 work-items, where - /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the - /// \p input across the work-group is: - /// - /// { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. - /// - /// The corresponding output of each work-item will be: - /// - /// { [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., - /// [127,255,383,511] }. - /// - /// \tparam Item The work-item identifier type. - /// \param item The work-item identifier. - /// \param input The input data of each work-item. - /// \param begin_bit The beginning (least-significant) bit index needed for - /// key comparison. - /// \param end_bit The past-the-end (most-significant) bit - /// index needed for key comparison. - template - __syclcompat_inline__ void - sort_blocked_to_striped(const Item &item, T (&input)[ElementsPerWorkItem], - int begin_bit = 0, int end_bit = 8 * sizeof(T)); - - /// Performs a descending radix sort across a blocked arrangement of input - /// elements, leaving them in a striped arrangement. - /// - /// Suppose 512 integer data elements partitioned across 128 work-items, where - /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the - /// \p input across the work-group is: - /// - /// { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. - /// - /// The descending striped order output is: - /// - /// { [511,383,255,127], [510,382,254,126], [509,381,253,125], ..., - /// [384,256,128,0] }. - /// - /// \tparam Item The work-item identifier type. - /// \param item The work-item identifier. - /// \param input The input data of each work-item.
- /// \param begin_bit The beginning (least-significant) bit index needed for - /// key comparison. - /// \param end_bit The past-the-end (most-significant) bit - /// index needed for key comparison. - template - __syclcompat_inline__ void sort_descending_blocked_to_striped( - const Item &item, T (&input)[ElementsPerWorkItem], int begin_bit = 0, - int end_bit = 8 * sizeof(T)); -}; - -/// Load linear segment items into block format across threads -/// Helper for Block Load -enum load_algorithm { - BLOCK_LOAD_DIRECT, - BLOCK_LOAD_STRIPED, -}; - -/// Load a linear segment of elements into a blocked arrangement across the -/// work-group. -/// -/// \tparam T The data type to load. -/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned -/// onto each work-item. -/// \tparam InputIteratorT The random-access iterator type for the input -/// iterator. -/// \tparam ItemT The sycl::nd_item index space class. -/// \param item The calling work-item. -/// \param input_iter The work-group's base input iterator for loading from. -/// \param data Data to load. -template -__syclcompat_inline__ void load_direct_blocked(const ItemT &item, - InputIteratorT input_iter, - T (&data)[ElementsPerWorkItem]); - -/// Load a linear segment of elements into a striped arrangement across the -/// work-group. -/// -/// \tparam T The data type to load. -/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned -/// onto each work-item. -/// \tparam InputIteratorT The random-access iterator type for the input -/// iterator. -/// \tparam ItemT The sycl::nd_item index space class. -/// \param item The calling work-item. -/// \param input_iter The work-group's base input iterator for loading from. -/// \param data Data to load.
-template -__syclcompat_inline__ void load_direct_striped(const ItemT &item, - InputIteratorT input_iter, - T (&data)[ElementsPerWorkItem]); - -/// Load a linear segment of elements into a blocked arrangement across the -/// work-group, guarded by range. -/// -/// \tparam T The data type to load. -/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned -/// onto each work-item. -/// \tparam InputIteratorT The random-access iterator type for the input -/// iterator. -/// \tparam ItemT The sycl::nd_item index space class. -/// \param item The calling work-item. -/// \param input_iter The work-group's base input iterator for loading from. -/// \param data Data to load. -/// \param valid_items Number of valid items to load -template -__syclcompat_inline__ void -load_direct_blocked(const ItemT &item, InputIteratorT input_iter, - T (&data)[ElementsPerWorkItem], int valid_items); - -/// Load a linear segment of elements into a striped arrangement across the -/// work-group, guarded by range. -/// -/// \tparam T The data type to load. -/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned -/// onto each work-item. -/// \tparam InputIteratorT The random-access iterator type for the input -/// iterator. -/// \tparam ItemT The sycl::nd_item index space class. -/// \param item The calling work-item. -/// \param input_iter The work-group's base input iterator for loading from. -/// \param data Data to load. -/// \param valid_items Number of valid items to load -template -__syclcompat_inline__ void -load_direct_striped(const ItemT &item, InputIteratorT input_iter, - T (&data)[ElementsPerWorkItem], int valid_items); - -/// Store a blocked arrangement of items across a work-group into a linear -/// segment of items. -/// -/// \tparam T The data type to store. -/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned -/// onto each work-item. -/// \tparam OutputIteratorT The random-access iterator type for the output
-/// iterator. -/// \tparam ItemT The sycl::nd_item index space class. -/// \param item The calling work-item. -/// \param output_iter The work-group's base output iterator for writing. -/// \param data Data to store. -template -__syclcompat_inline__ void -store_direct_blocked(const ItemT &item, OutputIteratorT output_iter, - T (&data)[ElementsPerWorkItem]); - -/// Store a striped arrangement of items across a work-group into a linear -/// segment of items. -/// -/// \tparam T The data type to store. -/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned -/// onto each work-item. -/// \tparam OutputIteratorT The random-access iterator type for the output -/// iterator. -/// \tparam ItemT The sycl::nd_item index space class. -/// \param item The calling work-item. -/// \param output_iter The work-group's base output iterator for writing. -/// \param data Data to store. -template -__syclcompat_inline__ void -store_direct_striped(const ItemT &item, OutputIteratorT output_iter, - T (&data)[ElementsPerWorkItem]); - -/// Store a blocked arrangement of items across a work-group into a linear -/// segment of items, guarded by range. -/// -/// \tparam T The data type to store. -/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned -/// onto each work-item. -/// \tparam OutputIteratorT The random-access iterator type for the output -/// iterator. -/// \tparam ItemT The sycl::nd_item index space class. -/// \param item The calling work-item. -/// \param output_iter The work-group's base output iterator for writing. -/// \param data Data to store. -/// \param valid_items Number of valid items to store -template -__syclcompat_inline__ void -store_direct_blocked(const ItemT &item, OutputIteratorT output_iter, - T (&data)[ElementsPerWorkItem], size_t valid_items); - -/// Store a striped arrangement of items across a work-group into a linear -/// segment of items, guarded by range. -/// -/// \tparam T The data type to store.
-/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned -/// onto each work-item. -/// \tparam OutputIteratorT The random-access iterator type for the output -/// iterator. -/// \tparam ItemT The sycl::nd_item index space class. -/// \param item The calling work-item. -/// \param output_iter The work-group's base output iterator for writing. -/// \param data Data to store. -/// \param valid_items Number of valid items to store -template -__syclcompat_inline__ void -store_direct_striped(const ItemT &item, OutputIteratorT output_iter, - T (&data)[ElementsPerWorkItem], size_t valid_items); - -/// Enumerates alternative algorithms for syclcompat::group::group_load to read -/// a linear segment of data from memory into a blocked arrangement across a -/// work-group. -enum class group_load_algorithm { - /// A blocked arrangement of data is read directly from memory. - blocked, - - /// A striped arrangement of data is read directly from memory. - striped -}; - -/// Provide methods for loading a linear segment of items from memory into a -/// blocked arrangement across a work-group. -/// -/// \tparam T The input data type. -/// \tparam ElementsPerWorkItem The number of data elements assigned to a -/// work-item. -/// \tparam LoadAlgorithm The data movement strategy, default is blocked. -template -class group_load { -public: - group_load(uint8_t *); - - static size_t get_local_memory_size([[maybe_unused]] size_t work_group_size); - - /// Load a linear segment of items from memory. - /// - /// Suppose 512 integer data elements partitioned across 128 work-items, where - /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the - /// \p input across the work-group is: - /// - /// 0, 1, 2, 3, 4, 5, 6, 7, ..., 508, 509, 510, 511. - /// - /// The blocked order \p data of each work-item will be: - /// - /// {[0,1,2,3], [4,5,6,7], ..., [508,509,510,511]}.
- /// - /// The striped order \p data of each work-item will be: - /// - /// {[0,128,256,384], [1,129,257,385], ..., [127,255,383,511]}. - /// - /// \tparam ItemT The sycl::nd_item index space class. - /// \tparam InputIteratorT The random-access iterator type for the input - /// iterator. - /// \param item The work-item identifier. - /// \param input_iter The work-group's base input iterator for loading from. - /// \param data The data to load. - template - __syclcompat_inline__ void load(const ItemT &item, InputIteratorT input_iter, - T (&data)[ElementsPerWorkItem]); - - /// Load a linear segment of items from memory, guarded by range. - /// - /// Suppose 512 integer data elements partitioned across 128 work-items, where - /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and - /// \p valid_items is 5, the \p input across the work-group is: - /// - /// 0, 1, 2, 3, 4, 5, 6, 7, ..., 508, 509, 510, 511. - /// - /// The blocked order \p data of each work-item will be: - /// - /// {[0,1,2,3], [4,?,?,?], ..., [?,?,?,?]}. - /// - /// The striped order \p data of each work-item will be: - /// - /// {[0,?,?,?], [1,?,?,?], [2,?,?,?], [3,?,?,?], [4,?,?,?], ..., [?,?,?,?]}. - /// - /// \tparam ItemT The sycl::nd_item index space class. - /// \tparam InputIteratorT The random-access iterator type for the input - /// iterator. - /// \param item The work-item identifier. - /// \param input_iter The work-group's base input iterator for loading from. - /// \param data The data to load. - /// \param valid_items Number of valid items to load - template - __syclcompat_inline__ void load(const ItemT &item, InputIteratorT input_iter, - T (&data)[ElementsPerWorkItem], - int valid_items); -}; - -/// Enumerates alternative algorithms for syclcompat::group::group_store to write -/// a blocked arrangement of items across a work-group to a linear segment of -/// memory. -enum class group_store_algorithm { - /// A blocked arrangement of data is written directly to memory.
- blocked, - - /// A striped arrangement of data is written directly to memory. - striped, -}; - -/// Provide methods for writing a blocked arrangement of elements partitioned -/// across a work-group to a linear segment of memory. -/// -/// \tparam T The output data type. -/// \tparam ElementsPerWorkItem The number of data elements assigned to a -/// work-item. -/// \tparam StoreAlgorithm The data movement strategy, default is blocked. -template -class group_store { -public: - group_store(uint8_t *); - - static size_t get_local_memory_size([[maybe_unused]] size_t work_group_size); - - /// Store items into a linear segment of memory. - /// - /// Suppose 512 integer data elements partitioned across 128 work-items, where - /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the - /// \p data across the work-group is: - /// - /// {[0,1,2,3], [4,5,6,7], ..., [508,509,510,511]}. - /// - /// The blocked order \p output will be: - /// - /// 0, 1, 2, 3, 4, 5, 6, 7, ..., 508, 509, 510, 511. - /// - /// The striped order \p output will be: - /// - /// 0, 4, 8, 12, 16, 20, 24, 28, ..., 499, 503, 507, 511. - /// - /// \tparam ItemT The sycl::nd_item index space class. - /// \tparam OutputIteratorT The random-access iterator type for \p output - /// iterator. - /// \param item The work-item identifier. - /// \param output_iter The work-group's base output iterator for writing. - /// \param data The data to store. - template - __syclcompat_inline__ void store(const ItemT &item, - OutputIteratorT output_iter, - T (&data)[ElementsPerWorkItem]); - - /// Store items into a linear segment of memory, guarded by range. - /// - /// Suppose 512 integer data elements partitioned across 128 work-items, where - /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and - /// \p valid_items is 5, the \p output across the work-group is: - /// - /// {[0,0,0,0], [0,0,0,0], ..., [0,0,0,0]}.
- /// - /// The blocked order \p output will be: - /// - /// 0, 1, 2, 3, 4, 0, 0, 0, ..., 0, 0, 0, 0. - /// - /// The striped order \p output will be: - /// - /// 0, 4, 8, 12, 16, 0, 0, 0, ..., 0, 0, 0, 0. - /// - /// \tparam ItemT The sycl::nd_item index space class. - /// \tparam OutputIteratorT The random-access iterator type for \p output - /// iterator. - /// \param item The work-item identifier. - /// \param output_iter The work-group's base output iterator for writing. - /// \param data The data to store. - /// \param valid_items Number of valid items to store - template - __syclcompat_inline__ void - store(const ItemT &item, OutputIteratorT output_iter, - T (&data)[ElementsPerWorkItem], size_t valid_items); -}; - -/// The work-group wide shuffle operations that allow work-items to exchange -/// data elements with other work-items within the same work-group. -/// -/// \tparam T The type of the data elements. -/// \tparam group_dim_0 The first dimension size of the work-group. -/// \tparam group_dim_1 The second dimension size of the work-group. -/// \tparam group_dim_2 The third dimension size of the work-group. -template -class group_shuffle { -public: - group_shuffle(uint8_t *local_memory); - - static constexpr size_t get_local_memory_size(size_t work_group_size); - - /// Selects a value from a work-item at a given distance in the work-group - /// and stores the value in the output. - /// - /// \tparam ItemT The work-item identifier type. - /// \param item The work-item identifier. - /// \param input The input from the calling work-item. - /// \param output The output where the selected data will be stored. - /// \param distance The distance of work-items to look ahead or behind in the - /// work-group. - template - __syclcompat_inline__ void select(const ItemT &item, T input, T &output, - int distance = 1); - /// Selects a value from a work-item at a given distance in the work-group - /// and stores the value in the output, using a wrapped index to handle - /// overflow.
- /// - /// \tparam ItemT The work-item identifier type. - /// \param item The work-item identifier. - /// \param input The input data to be selected. - /// \param output The output where the selected data will be stored. - /// \param distance The number of work-items to look ahead in the - /// work-group. - template - __syclcompat_inline__ void select2(const ItemT &item, T input, T &output, - unsigned int distance = 1); - /// Performs a shuffle operation to move data to the right across the - /// work-items, shifting elements in a work-item array by one position to the - /// right. - /// - /// \tparam ElementsPerWorkItem The number of data elements per work-item. - /// \tparam ItemT The work-item identifier type. - /// \param item The work-item identifier. - /// \param input The input data to be shuffled. - /// \param output The array that will store the shuffle result. - template - __syclcompat_inline__ void shuffle_right(const ItemT &item, - T (&input)[ElementsPerWorkItem], - T (&output)[ElementsPerWorkItem]); - - /// Performs a shuffle operation to move data to the right across the - /// work-items, storing the suffix of the group after the shuffle operation. - /// - /// \tparam ElementsPerWorkItem The number of data elements per work-item. - /// \tparam ItemT The work-item identifier type. - /// \param item The work-item identifier. - /// \param input The input data to be shuffled. - /// \param output The array that will store the shuffle result. - /// \param group_suffix The suffix of the group after the shuffle. - template - __syclcompat_inline__ void - shuffle_right(const ItemT &item, T (&input)[ElementsPerWorkItem], - T (&output)[ElementsPerWorkItem], T &group_suffix); - - /// Performs a shuffle operation to move data to the left across the - /// work-items, shifting elements in a work-item array by one position to the - /// left. - /// - /// \tparam ElementsPerWorkItem The number of data elements per work-item. 
- /// \tparam ItemT The work-item identifier type. - /// \param item The work-item identifier. - /// \param input The input data to be shuffled. - /// \param output The array that will store the shuffle result. - template - __syclcompat_inline__ void shuffle_left(const ItemT &item, - T (&input)[ElementsPerWorkItem], - T (&output)[ElementsPerWorkItem]); - - /// Performs a shuffle operation to move data to the left across the - /// work-items, storing the prefix of the group before the shuffle operation. - /// - /// \tparam ElementsPerWorkItem The number of data elements per work-item. - /// \tparam ItemT The work-item identifier type. - /// \param item The work-item identifier. - /// \param input The input data to be shuffled. - /// \param output The array that will store the shuffle result. - /// \param group_prefix The prefix of the group before the shuffle. - template - __syclcompat_inline__ void - shuffle_left(const ItemT &item, T (&input)[ElementsPerWorkItem], - T (&output)[ElementsPerWorkItem], T &group_prefix); -}; -} // namespace syclcompat -``` - -## Sample Code - -The file [helloworld.cpp](../../test-e2e/syclcompat/helloworld.cpp) contains -a simple example which computes `y = mx + b` implemented using this library. - -## Maintainers - -To report problems with this library, please open a new issue with the [COMPAT] -tag at: - - - -## Contributors - -Alberto Cabrera, Codeplay \ -Gordon Brown, Codeplay \ -Joe Todd, Codeplay \ -Pietro Ghiglio, Codeplay \ -Ruyman Reyes, Codeplay/Intel - -## Contributions - -This library is licensed under the Apache 2.0 license. If you have an idea for a -new sample, different build system integration or even a fix for something that -is broken, please get in contact. 
diff --git a/sycl/include/syclcompat.hpp b/sycl/include/syclcompat.hpp deleted file mode 100644 index c12ad8ef0cf89..0000000000000 --- a/sycl/include/syclcompat.hpp +++ /dev/null @@ -1,25 +0,0 @@ -/*************************************************************************** - * - * Copyright (C) Codeplay Software Ltd. - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM - * Exceptions. See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * SYCLcompat - * - * syclcompat.hpp - * - * Description: - * Main include header for SYCLcompat - **************************************************************************/ - -#pragma once - -#include diff --git a/sycl/include/syclcompat/atomic.hpp b/sycl/include/syclcompat/atomic.hpp deleted file mode 100644 index 85f5dab65f7f1..0000000000000 --- a/sycl/include/syclcompat/atomic.hpp +++ /dev/null @@ -1,473 +0,0 @@ -/*************************************************************************** - * - * Copyright (C) Codeplay Software Ltd. - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM - * Exceptions. See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - * SYCL compatibility extension - * - * atomic.hpp - * - * Description: - * Atomic functionality for the SYCL compatibility extension - **************************************************************************/ - -// The original source was under the license below: -//==---- atomic.hpp -------------------------------*- C++ -*----------------==// -// -// Copyright (C) Intel Corporation -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// See https://llvm.org/LICENSE.txt for license information. -// -//===----------------------------------------------------------------------===// - -#pragma once - -#include - -#include -#include -#include -#include - -#include - -namespace [[deprecated("syclcompat is deprecated")]] syclcompat { - -/// Atomically add the value operand to the value at the addr and assign the -/// result to the value at addr. -/// \param [in, out] addr The pointer to the data. -/// \param operand The value to add to the value at \p addr. -/// \param memoryOrder The memory ordering used. -/// \returns The value at the \p addr before the call. -template -inline T atomic_fetch_add(T *addr, arith_t operand) { - auto atm = - sycl::atomic_ref(addr[0]); - return atm.fetch_add(operand); -} - -/// Atomically subtract the value operand from the value at the addr and -/// assign the result to the value at addr. -/// \param [in, out] addr The pointer to the data. -/// \param operand The value to subtract from the value at \p addr. -/// \param memoryOrder The memory ordering used. -/// \returns The value at the \p addr before the call. -template -inline T atomic_fetch_sub(T *addr, arith_t operand) { - auto atm = - sycl::atomic_ref(addr[0]); - return atm.fetch_sub(operand); -} - -/// Atomically perform a bitwise AND between the value operand and the value -/// at the addr and assign the result to the value at addr. -/// \param [in, out] addr The pointer to the data. 
-/// \param operand The value to use in bitwise AND operation with the value at -/// the \p addr. -/// \param memoryOrder The memory ordering used. -/// \returns The value at the \p addr before the call. -template -inline T atomic_fetch_and(T *addr, type_identity_t operand) { - auto atm = - sycl::atomic_ref(addr[0]); - return atm.fetch_and(operand); -} - -/// Atomically or the value at the addr with the value operand, and assign -/// the result to the value at addr. -/// \param [in, out] addr The pointer to the data. -/// \param operand The value to use in bitwise OR operation with the value at -/// the \p addr. -/// \param memoryOrder The memory ordering used. -/// \returns The value at the \p addr before the call. -template -inline T atomic_fetch_or(T *addr, type_identity_t operand) { - auto atm = - sycl::atomic_ref(addr[0]); - return atm.fetch_or(operand); -} - -/// Atomically xor the value at the addr with the value operand, and assign -/// the result to the value at addr. -/// \param [in, out] addr The pointer to the data. -/// \param operand The value to use in bitwise XOR operation with the value at -/// the \p addr. -/// \param memoryOrder The memory ordering used. -/// \returns The value at the \p addr before the call. -template -inline T atomic_fetch_xor(T *addr, type_identity_t operand) { - auto atm = - sycl::atomic_ref(addr[0]); - return atm.fetch_xor(operand); -} - -/// Atomically calculate the minimum of the value at addr and the value -/// operand and assign the result to the value at addr. -/// \param [in, out] addr The pointer to the data. -/// \param operand. \param memoryOrder The memory ordering used. -/// \returns The value at the \p addr before the call. -template -inline T atomic_fetch_min(T *addr, type_identity_t operand) { - auto atm = - sycl::atomic_ref(addr[0]); - return atm.fetch_min(operand); -} - -/// Atomically calculate the maximum of the value at addr and the value -/// operand and assign the result to the value at addr. 
-/// \param [in, out] addr The pointer to the data. -/// \param operand. -/// \param memoryOrder The memory ordering used. -/// \returns The value at the \p addr before the call. -template -inline T atomic_fetch_max(T *addr, type_identity_t operand) { - auto atm = - sycl::atomic_ref(addr[0]); - return atm.fetch_max(operand); -} - -/// Atomically set \p operand to the value stored in \p addr, if old value -/// stored in \p addr is equal to zero or greater than \p operand, else decrease -/// the value stored in \p addr. \param [in, out] addr The pointer to the data. -/// \param operand The threshold value. -/// \param memoryOrder The memory ordering used. -/// \returns The old value stored in \p addr. -template -unsigned int atomic_fetch_compare_dec(unsigned int *addr, - unsigned int operand) { - auto atm = - sycl::atomic_ref( - addr[0]); - unsigned int old; - - while (true) { - old = atm.load(); - if (old == 0 || old > operand) { - if (atm.compare_exchange_strong(old, operand)) - break; - } else if (atm.compare_exchange_strong(old, old - 1)) - break; - } - - return old; -} - -/// Atomically increment the value stored in \p addr if old value stored in \p -/// addr is less than \p operand, else set 0 to the value stored in \p addr. -/// \param [in, out] addr The pointer to the data. -/// \param operand The threshold value. -/// \param memoryOrder The memory ordering used. -/// \returns The old value stored in \p addr. -template -inline unsigned int atomic_fetch_compare_inc(unsigned int *addr, - unsigned int operand) { - auto atm = - sycl::atomic_ref( - addr[0]); - unsigned int old; - while (true) { - old = atm.load(); - if (old >= operand) { - if (atm.compare_exchange_strong(old, 0)) - break; - } else if (atm.compare_exchange_strong(old, old + 1)) - break; - } - return old; -} - -/// Atomically exchange the value at the address addr with the value operand. -/// \param [in, out] addr The pointer to the data. 
-/// \param operand The value to be exchanged with the value pointed by \p addr. -/// \param memoryOrder The memory ordering used. -/// \returns The value at the \p addr before the call. -template -inline T atomic_exchange(T *addr, type_identity_t operand) { - auto atm = - sycl::atomic_ref(addr[0]); - return atm.exchange(operand); -} - -/// Atomically compare the value at \p addr to the value expected and exchange -/// with the value desired if the value at \p addr is equal to the value -/// expected. Returns the value at the \p addr before the call. -/// \param [in, out] addr Multi_ptr. -/// \param expected The value to compare against the value at \p addr. -/// \param desired The value to assign to \p addr if the value at \p addr -/// is expected. -/// \param success The memory ordering used when comparison succeeds. -/// \param fail The memory ordering used when comparison fails. -/// \returns The value at the \p addr before the call. -template -T atomic_compare_exchange_strong( - sycl::multi_ptr addr, type_identity_t expected, - type_identity_t desired, - sycl::memory_order success = sycl::memory_order::relaxed, - sycl::memory_order fail = sycl::memory_order::relaxed) { - auto atm = sycl::atomic_ref(*addr); - - atm.compare_exchange_strong(expected, desired, success, fail); - return expected; -} - -/// Atomically compare the value at \p addr to the value expected and exchange -/// with the value desired if the value at \p addr is equal to the value -/// expected. Returns the value at the \p addr before the call. -/// \param [in] addr The pointer to the data. -/// \param expected The value to compare against the value at \p addr. -/// \param desired The value to assign to \p addr if the value at \p addr is -/// expected. -/// \param success The memory ordering used when comparison succeeds. -/// \param fail The memory ordering used when comparison fails. -/// \returns The value at the \p addr before the call. 
-template -T atomic_compare_exchange_strong( - T *addr, type_identity_t expected, type_identity_t desired, - sycl::memory_order success = sycl::memory_order::relaxed, - sycl::memory_order fail = sycl::memory_order::relaxed) { - auto atm = - sycl::atomic_ref(addr[0]); - atm.compare_exchange_strong(expected, desired, success, fail); - return expected; -} - -/// Atomic extension to implement standard APIs in std::atomic -namespace detail { -template struct IsValidAtomicType { - static constexpr bool value = - (std::is_same::value || std::is_same::value || - std::is_same::value || std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value || std::is_same::value || - std::is_pointer::value); -}; -} // namespace detail - -template -class atomic { - static_assert( - detail::IsValidAtomicType::value, - "Invalid atomic type. Valid types are int, unsigned int, long, " - "unsigned long, long long, unsigned long long, float, double " - "and pointer types"); - T __d; - -public: - /// default memory synchronization order - static constexpr sycl::memory_order default_read_order = - sycl::atomic_ref::default_read_order; - static constexpr sycl::memory_order default_write_order = - sycl::atomic_ref::default_write_order; - static constexpr sycl::memory_scope default_scope = DefaultScope; - static constexpr sycl::memory_order default_read_modify_write_order = - DefaultOrder; - - /// Default constructor. - constexpr atomic() noexcept = default; - /// Constructor with initialize value. - constexpr atomic(T d) noexcept : __d(d){}; - - /// atomically replaces the value of the referenced object with a non-atomic - /// argument - /// \param operand The value to replace the pointed value. - /// \param memoryOrder The memory ordering used. - /// \param memoryScope The memory scope used. 
- void store(T operand, sycl::memory_order memoryOrder = default_write_order, - sycl::memory_scope memoryScope = default_scope) noexcept { - sycl::atomic_ref atm(__d); - atm.store(operand, memoryOrder, memoryScope); - } - - /// atomically obtains the value of the referenced object - /// \param memoryOrder The memory ordering used. - /// \param memoryScope The memory scope used. - /// \returns The value of the referenced object - T load(sycl::memory_order memoryOrder = default_read_order, - sycl::memory_scope memoryScope = default_scope) const noexcept { - sycl::atomic_ref atm( - const_cast(__d)); - return atm.load(memoryOrder, memoryScope); - } - - /// atomically replaces the value of the referenced object and obtains the - /// value held previously - /// \param operand The value to replace the pointed value. - /// \param memoryOrder The memory ordering used. - /// \param memoryScope The memory scope used. - /// \returns The value of the referenced object before the call. - T exchange(T operand, - sycl::memory_order memoryOrder = default_read_modify_write_order, - sycl::memory_scope memoryScope = default_scope) noexcept { - - sycl::atomic_ref atm(__d); - return atm.exchange(operand, memoryOrder, memoryScope); - } - - /// atomically compares the value of the referenced object with non-atomic - /// argument and performs atomic exchange if equal or atomic load if not - /// \param expected The value expected to be found in the object referenced by - /// the atomic_ref object - /// \param desired The value to store in the referenced object if it is as - /// expected - /// \param success The memory models for the read-modify-write - /// \param failure The memory models for load operations - /// \param memoryScope The memory scope used. - /// \returns true if the referenced object was successfully changed, false - /// otherwise. 
- bool compare_exchange_weak( - T &expected, T desired, sycl::memory_order success, - sycl::memory_order failure, - sycl::memory_scope memoryScope = default_scope) noexcept { - sycl::atomic_ref atm(__d); - return atm.compare_exchange_weak(expected, desired, success, failure, - memoryScope); - } - /// \param expected The value expected to be found in the object referenced by - /// the atomic_ref object - /// \param desired The value to store in the referenced - /// object if it is as expected - /// \param memoryOrder The memory synchronization ordering for - /// operations - /// \param memoryScope The memory scope used. - /// \returns true if the referenced object was successfully - /// changed, false otherwise. - bool compare_exchange_weak( - T &expected, T desired, - sycl::memory_order memoryOrder = default_read_modify_write_order, - sycl::memory_scope memoryScope = default_scope) noexcept { - sycl::atomic_ref atm(__d); - return atm.compare_exchange_weak(expected, desired, memoryOrder, - memoryScope); - } - - /// atomically compares the value of the referenced object with non-atomic - /// argument and performs atomic exchange if equal or atomic load if not - /// \param expected The value expected to be found in the object referenced by - /// the atomic_ref object - /// \param desired The value to store in the referenced - /// object if it is as expected - /// \param success The memory models for the - /// read-modify-write - /// \param failure The memory models for load operations - /// \param memoryScope The memory scope used. - /// \returns true if the referenced object was successfully changed, false - /// otherwise. 
- bool compare_exchange_strong( - T &expected, T desired, sycl::memory_order success, - sycl::memory_order failure, - sycl::memory_scope memoryScope = default_scope) noexcept { - - sycl::atomic_ref atm(__d); - return atm.compare_exchange_strong(expected, desired, success, failure, - memoryScope); - } - /// \param expected The value expected to be found in the object referenced by - /// the atomic_ref object - /// \param desired The value to store in the referenced - /// object if it is as expected - /// \param memoryOrder The memory synchronization ordering for - /// operations - /// \param memoryScope The memory scope used. - /// \returns true if the referenced object was successfully changed, false - /// otherwise. - bool compare_exchange_strong( - T &expected, T desired, - sycl::memory_order memoryOrder = default_read_modify_write_order, - sycl::memory_scope memoryScope = default_scope) noexcept { - sycl::atomic_ref atm(__d); - return atm.compare_exchange_strong(expected, desired, memoryOrder, - memoryScope); - } - - /// atomically adds the argument to the value stored in the atomic object and - /// obtains the value held previously - /// \param operand The other argument of arithmetic addition - /// \param memoryOrder The memory ordering used. - /// \param memoryScope The memory scope used. - /// \returns The value of the referenced object before the call. - T fetch_add(arith_t operand, - sycl::memory_order memoryOrder = default_read_modify_write_order, - sycl::memory_scope memoryScope = default_scope) noexcept { - - auto atm = sycl::atomic_ref(__d); - return atm.fetch_add(operand, memoryOrder, memoryScope); - } - - /// atomically subtracts the argument from the value stored in the atomic - /// object and obtains the value held previously - /// \param operand The other argument of arithmetic subtraction - /// \param memoryOrder The memory ordering used. - /// \param memoryScope The memory scope used. 
- /// \returns The value of the referenced object before the call. - T fetch_sub(arith_t operand, - sycl::memory_order memoryOrder = default_read_modify_write_order, - sycl::memory_scope memoryScope = default_scope) noexcept { - - auto atm = sycl::atomic_ref(__d); - return atm.fetch_sub(operand, memoryOrder, memoryScope); - } -}; - -} // namespace syclcompat diff --git a/sycl/include/syclcompat/defs.hpp b/sycl/include/syclcompat/defs.hpp deleted file mode 100644 index 32f0c2197bde7..0000000000000 --- a/sycl/include/syclcompat/defs.hpp +++ /dev/null @@ -1,93 +0,0 @@ -/*************************************************************************** - * - * Copyright (C) Codeplay Software Ltd. - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM - * Exceptions. See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * SYCLcompat - * - * defs.hpp - * - * Description: - * helper aliases and definitions for SYCLcompat - * - **************************************************************************/ - -// The original source was under the license below: -//==---- defs.hpp ---------------------------------*- C++ -*----------------==// -// -// Copyright (C) Intel Corporation -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// See https://llvm.org/LICENSE.txt for license information. 
-// -//===----------------------------------------------------------------------===// - -#pragma once - -#include - -template class syclcompat_kernel_name; -template class syclcompat_kernel_scalar; - -#if defined(_MSC_VER) -#define __syclcompat_align__(n) __declspec(align(n)) -#define __syclcompat_inline__ __forceinline -#define __syclcompat_noinline__ __declspec(noinline) -#else -#define __syclcompat_align__(n) __attribute__((aligned(n))) -#define __syclcompat_inline__ __inline__ __attribute__((always_inline)) -#define __syclcompat_noinline__ __attribute__((noinline)) -#endif - -#define SYCLCOMPAT_COMPATIBILITY_TEMP (900) - -#ifdef _WIN32 -#define SYCLCOMPAT_EXPORT __declspec(dllexport) -#else -#define SYCLCOMPAT_EXPORT -#endif - -#define SYCLCOMPAT_MAJOR_VERSION 0 -#define SYCLCOMPAT_MINOR_VERSION 2 -#define SYCLCOMPAT_PATCH_VERSION 0 - -#define SYCLCOMPAT_MAKE_VERSION(_major, _minor, _patch) \ - ((1E6 * _major) + (1E3 * _minor) + _patch) - -#define SYCLCOMPAT_VERSION \ - SYCLCOMPAT_MAKE_VERSION(SYCLCOMPAT_MAJOR_VERSION, SYCLCOMPAT_MINOR_VERSION, \ - SYCLCOMPAT_PATCH_VERSION) - -namespace [[deprecated("syclcompat is deprecated")]] syclcompat { -enum error_code { success = 0, backend_error = 1, default_error = 999 }; -/// A dummy function introduced to assist auto migration. -/// The SYCLomatic user should replace it with a real error-handling function. -/// SYCL reports errors using exceptions and does not use error codes. -inline const char *get_error_string_dummy(int ec) { - (void)ec; - return ""; // Return the error string for the error code - // ec. 
-} -} // namespace syclcompat - -#define SYCLCOMPAT_CHECK_ERROR(expr) \ - [&]() { \ - try { \ - expr; \ - return syclcompat::error_code::success; \ - } catch (sycl::exception const &e) { \ - std::cerr << e.what() << std::endl; \ - return syclcompat::error_code::backend_error; \ - } catch (std::runtime_error const &e) { \ - std::cerr << e.what() << std::endl; \ - return syclcompat::error_code::default_error; \ - } \ - }() diff --git a/sycl/include/syclcompat/device.hpp b/sycl/include/syclcompat/device.hpp deleted file mode 100644 index 5951b4fc6492c..0000000000000 --- a/sycl/include/syclcompat/device.hpp +++ /dev/null @@ -1,954 +0,0 @@ -/*************************************************************************** - * - * Copyright (C) Codeplay Software Ltd. - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM - * Exceptions. See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * SYCL compatibility extension - * - * device.hpp - * - * Description: - * Device functionality for the SYCL compatibility extension - **************************************************************************/ - -// The original source was under the license below: -//==---- device.hpp -------------------------------*- C++ -*----------------==// -// -// Copyright (C) Intel Corporation -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// See https://llvm.org/LICENSE.txt for license information. 
-// -//===----------------------------------------------------------------------===// - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#if defined(__linux__) -#include -#include -#endif -#if defined(_WIN64) -#ifndef NOMINMAX -#define NOMINMAX -#endif -#include -#endif - -#include -#include -#include -#include - -namespace [[deprecated("syclcompat is deprecated")]] syclcompat { - -namespace detail { -static void parse_version_string(const std::string &ver, int &major, - int &minor) { - // Version string has the following format: - // a. OpenCL - // b. - // c. e.g gfx1030 - std::string::size_type i = 0; - while (i < ver.size()) { - if (isdigit(ver[i])) - break; - i++; - } - if (i < ver.size()) - major = std::stoi(&(ver[i])); - else - major = 0; - while (i < ver.size()) { - if (ver[i] == '.') - break; - i++; - } - i++; - if (i < ver.size()) - minor = std::stoi(&(ver[i])); - else - minor = 0; -} - -static void get_version(const sycl::device &dev, int &major, int &minor) { - std::string ver = dev.get_info(); - parse_version_string(ver, major, minor); -} - -/// SYCL default exception handler -inline auto exception_handler = [](sycl::exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { - try { - std::rethrow_exception(e); - } catch (sycl::exception const &e) { - std::cerr << "[SYCLcompat] Caught asynchronous SYCL exception:" - << std::endl - << e.what() << std::endl - << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - } - } -}; - -} // namespace detail - -using event_ptr = sycl::event *; - -using queue_ptr = sycl::queue *; - -using device_ptr = char *; - -/// Destroy \p event pointed memory. -/// -/// \param event Pointer to the sycl::event address. 
-static void destroy_event(event_ptr event) { delete event; } - -class device_info { -public: - // get interface - const char *get_name() const { return _name; } - char *get_name() { return _name; } - template , - std::enable_if_t> || - std::is_same_v, - int> = 0> - auto get_max_work_item_sizes() const { - if constexpr (std::is_same_v>) - return _max_work_item_sizes; - else - return _max_work_item_sizes_i; - } - template , - std::enable_if_t> || - std::is_same_v, - int> = 0> - auto get_max_work_item_sizes() { - if constexpr (std::is_same_v>) - return _max_work_item_sizes; - else - return _max_work_item_sizes_i; - } - bool get_host_unified_memory() const { return _host_unified_memory; } - int get_major_version() const { return _major; } - int get_minor_version() const { return _minor; } - int get_integrated() const { return _integrated; } - int get_max_clock_frequency() const { return _frequency; } - int get_max_compute_units() const { return _max_compute_units; } - int get_max_work_group_size() const { return _max_work_group_size; } - int get_max_sub_group_size() const { return _max_sub_group_size; } - int get_max_work_items_per_compute_unit() const { - return _max_work_items_per_compute_unit; - } - int get_max_register_size_per_work_group() const { - return _max_register_size_per_work_group; - } - template || - std::is_same_v, - int> = 0> - auto get_max_nd_range_size() const { - if constexpr (std::is_same_v) - return _max_nd_range_size; - else - return _max_nd_range_size_i; - } - template || - std::is_same_v, - int> = 0> - auto get_max_nd_range_size() { - if constexpr (std::is_same_v) - return _max_nd_range_size; - else - return _max_nd_range_size_i; - } - size_t get_global_mem_size() const { return _global_mem_size; } - size_t get_local_mem_size() const { return _local_mem_size; } - /// Returns the maximum clock rate of device's global memory in kHz. If - /// compiler does not support this API then returns default value 3200000 kHz. 
- unsigned int get_memory_clock_rate() const { return _memory_clock_rate; } - /// Returns the maximum bus width between device and memory in bits. If - /// compiler does not support this API then returns default value 64 bits. - unsigned int get_memory_bus_width() const { return _memory_bus_width; } - uint32_t get_device_id() const { return _device_id; } - std::array get_uuid() const { return _uuid; } - /// Returns global memory cache size in bytes. - unsigned int get_global_mem_cache_size() const { - return _global_mem_cache_size; - } - int get_image1d_max() const { return _image1d_max; } - auto get_image2d_max() const { return _image2d_max; } - auto get_image2d_max() { return _image2d_max; } - auto get_image3d_max() const { return _image3d_max; } - auto get_image3d_max() { return _image3d_max; } - - // set interface - void set_name(const char *name) { - size_t length = strlen(name); - if (length < device_info::NAME_BUFFER_SIZE) { - std::memcpy(_name, name, length + 1); - } else { - std::memcpy(_name, name, device_info::NAME_BUFFER_SIZE - 1); - _name[255] = '\0'; - } - } - void set_max_work_item_sizes(const sycl::range<3> max_work_item_sizes) { - _max_work_item_sizes = max_work_item_sizes; - for (int i = 0; i < 3; ++i) - _max_work_item_sizes_i[i] = max_work_item_sizes[i]; - } - [[deprecated]] void - set_max_work_item_sizes(const sycl::id<3> max_work_item_sizes) { - for (int i = 0; i < 3; ++i) { - _max_work_item_sizes[i] = max_work_item_sizes[i]; - _max_work_item_sizes_i[i] = max_work_item_sizes[i]; - } - } - void set_host_unified_memory(bool host_unified_memory) { - _host_unified_memory = host_unified_memory; - } - void set_major_version(int major) { _major = major; } - void set_minor_version(int minor) { _minor = minor; } - void set_integrated(int integrated) { _integrated = integrated; } - void set_max_clock_frequency(int frequency) { _frequency = frequency; } - void set_max_compute_units(int max_compute_units) { - _max_compute_units = max_compute_units; - } - 
void set_global_mem_size(size_t global_mem_size) { - _global_mem_size = global_mem_size; - } - void set_local_mem_size(size_t local_mem_size) { - _local_mem_size = local_mem_size; - } - void set_max_work_group_size(int max_work_group_size) { - _max_work_group_size = max_work_group_size; - } - void set_max_sub_group_size(int max_sub_group_size) { - _max_sub_group_size = max_sub_group_size; - } - void - set_max_work_items_per_compute_unit(int max_work_items_per_compute_unit) { - _max_work_items_per_compute_unit = max_work_items_per_compute_unit; - } - void set_max_nd_range_size(int max_nd_range_size[]) { - for (int i = 0; i < 3; i++) { - _max_nd_range_size[i] = max_nd_range_size[i]; - _max_nd_range_size_i[i] = max_nd_range_size[i]; - } - } - void set_max_nd_range_size(sycl::id<3> max_nd_range_size) { - for (int i = 0; i < 3; i++) { - _max_nd_range_size[i] = max_nd_range_size[i]; - _max_nd_range_size_i[i] = max_nd_range_size[i]; - } - } - void set_memory_clock_rate(unsigned int memory_clock_rate) { - _memory_clock_rate = memory_clock_rate; - } - void set_memory_bus_width(unsigned int memory_bus_width) { - _memory_bus_width = memory_bus_width; - } - void - set_max_register_size_per_work_group(int max_register_size_per_work_group) { - _max_register_size_per_work_group = max_register_size_per_work_group; - } - void set_device_id(uint32_t device_id) { _device_id = device_id; } - void set_uuid(std::array uuid) { _uuid = std::move(uuid); } - void set_global_mem_cache_size(unsigned int global_mem_cache_size) { - _global_mem_cache_size = global_mem_cache_size; - } - void set_image1d_max(size_t image_max_buffer_size) { - _image1d_max = image_max_buffer_size; - } - void set_image2d_max(size_t image_max_width_buffer_size, - size_t image_max_height_buffer_size) { - _image2d_max[0] = image_max_width_buffer_size; - _image2d_max[1] = image_max_height_buffer_size; - } - void set_image3d_max(size_t image_max_width_buffer_size, - size_t image_max_height_buffer_size, - size_t 
image_max_depth_buffer_size) { - _image3d_max[0] = image_max_width_buffer_size; - _image3d_max[1] = image_max_height_buffer_size; - _image3d_max[2] = image_max_depth_buffer_size; - } - -private: - constexpr static size_t NAME_BUFFER_SIZE = 256; - - char _name[device_info::NAME_BUFFER_SIZE]; - sycl::range<3> _max_work_item_sizes; - int _max_work_item_sizes_i[3]; - bool _host_unified_memory = false; - int _major; - int _minor; - int _integrated = 0; - int _frequency; - // Set estimated value 3200000 kHz as default value. - unsigned int _memory_clock_rate = 3200000; - // Set estimated value 64 bits as default value. - unsigned int _memory_bus_width = 64; - unsigned int _global_mem_cache_size; - int _max_compute_units; - int _max_work_group_size; - int _max_sub_group_size; - int _max_work_items_per_compute_unit; - int _max_register_size_per_work_group; - size_t _global_mem_size; - size_t _local_mem_size; - size_t _max_nd_range_size[3]; - int _max_nd_range_size_i[3]; - uint32_t _device_id; - std::array _uuid; - int _image1d_max; - int _image2d_max[2]; - int _image3d_max[3]; -}; - -static int get_major_version(const sycl::device &dev) { - int major, minor; - detail::get_version(dev, major, minor); - return major; -} - -static int get_minor_version(const sycl::device &dev) { - int major, minor; - detail::get_version(dev, major, minor); - return minor; -} - -static inline void -has_capability_or_fail(const sycl::device &dev, - const std::initializer_list &props) { - for (const auto &it : props) { - if (dev.has(it)) - continue; - switch (it) { - case sycl::aspect::fp64: - throw sycl::exception(sycl::make_error_code(sycl::errc::runtime), - "[SYCLcompat] 'double' is not supported in '" + - dev.get_info() + - "' device"); - break; - case sycl::aspect::fp16: - throw sycl::exception(sycl::make_error_code(sycl::errc::runtime), - "[SYCLcompat] 'half' is not supported in '" + - dev.get_info() + - "' device"); - break; - default: -#define __SYCL_ASPECT(ASPECT, ID) \ - case 
sycl::aspect::ASPECT: \ - return #ASPECT; -#define __SYCL_ASPECT_DEPRECATED(ASPECT, ID, MESSAGE) __SYCL_ASPECT(ASPECT, ID) -#define __SYCL_ASPECT_DEPRECATED_ALIAS(ASPECT, ID, MESSAGE) - auto getAspectNameStr = [](sycl::aspect AspectNum) -> std::string { - switch (AspectNum) { -#include -#include - default: - return "unknown aspect"; - } - }; -#undef __SYCL_ASPECT_DEPRECATED_ALIAS -#undef __SYCL_ASPECT_DEPRECATED -#undef __SYCL_ASPECT - throw sycl::exception( - sycl::make_error_code(sycl::errc::runtime), - "[SYCLcompat] '" + getAspectNameStr(it) + "' is not supported in '" + - dev.get_info() + "' device"); - } - break; - } -} - -/// device extension -class device_ext : public sycl::device { -public: - device_ext() : sycl::device(), _ctx(*this) {} - ~device_ext() { - try { - std::lock_guard lock(m_mutex); - sycl::event::wait(_events); - _queues.clear(); - } catch (std::exception &e) { - __SYCL_REPORT_EXCEPTION_TO_STREAM("exception in ~device_ext", e); - } - } - device_ext(const sycl::device &base, bool print_on_async_exceptions = false, - bool in_order = true) - : sycl::device(base), _ctx(*this) { - if (!this->has(sycl::aspect::usm_device_allocations)) { - throw std::invalid_argument( - "Device does not support device USM allocations"); - } - // calls create_queue since we don't have a locked m_mutex - _default_queue = create_queue(print_on_async_exceptions, in_order); - _saved_queue = _default_queue; - } - - bool is_native_host_atomic_supported() { return false; } - int get_major_version() const { return syclcompat::get_major_version(*this); } - - int get_minor_version() const { return syclcompat::get_minor_version(*this); } - - int get_max_compute_units() const { - return get_device_info().get_max_compute_units(); - } - - /// Return the maximum clock frequency of this device in KHz. 
- int get_max_clock_frequency() const { - return get_device_info().get_max_clock_frequency(); - } - - int get_integrated() const { return get_device_info().get_integrated(); } - - int get_max_sub_group_size() const { - return get_device_info().get_max_sub_group_size(); - } - - int get_max_register_size_per_work_group() const { - return get_device_info().get_max_register_size_per_work_group(); - } - - int get_max_work_group_size() const { - return get_device_info().get_max_work_group_size(); - } - - int get_mem_base_addr_align() const { - return get_info(); - } - - size_t get_global_mem_size() const { - return get_device_info().get_global_mem_size(); - } - - size_t get_local_mem_size() const { - return get_device_info().get_local_mem_size(); - } - - /// Get the number of bytes of free and total memory on the SYCL device. - /// \param [out] free_memory The number of bytes of free memory on the SYCL - /// device. - /// \param [out] total_memory The number of bytes of total memory on the SYCL - /// device. - void get_memory_info(size_t &free_memory, size_t &total_memory) const { - if (!has(sycl::aspect::ext_intel_free_memory)) { - std::cerr << "[SYCLCompat] get_memory_info: ext_intel_free_memory is not " - "supported." 
- << std::endl; - free_memory = 0; - } else { - free_memory = get_info(); - } - total_memory = get_device_info().get_global_mem_size(); - } - - void get_device_info(device_info &out) const { - if (_dev_info) { - out = *_dev_info; - return; - } - - std::lock_guard lock(m_mutex); - device_info prop; - prop.set_name(get_info().c_str()); - - int major, minor; - get_version(major, minor); - prop.set_major_version(major); - prop.set_minor_version(minor); - - prop.set_max_work_item_sizes( - // SYCL 2020-conformant code, max_work_item_sizes is a struct - // templated by an int - get_info>()); - - prop.set_host_unified_memory(has(sycl::aspect::usm_host_allocations)); - - prop.set_max_clock_frequency( - get_info()); - prop.set_max_compute_units( - get_info()); - prop.set_max_work_group_size( - get_info()); - prop.set_global_mem_size(get_info()); - prop.set_local_mem_size(get_info()); - -#if (defined(SYCL_EXT_INTEL_DEVICE_INFO) && SYCL_EXT_INTEL_DEVICE_INFO >= 6) - if (has(sycl::aspect::ext_intel_memory_clock_rate)) { - unsigned int tmp = - get_info(); - if (tmp != 0) - prop.set_memory_clock_rate(1000 * tmp); - } - if (has(sycl::aspect::ext_intel_memory_bus_width)) { - prop.set_memory_bus_width( - get_info()); - } - if (has(sycl::aspect::ext_intel_device_id)) { - prop.set_device_id(get_info()); - } - if (has(sycl::aspect::ext_intel_device_info_uuid)) { - prop.set_uuid(get_info()); - } -#elif defined(_MSC_VER) && !defined(__clang__) -#pragma message("get_device_info: querying memory_clock_rate and \ -memory_bus_width are not supported by the compiler used. \ -Use 3200000 kHz as memory_clock_rate default value. \ -Use 64 bits as memory_bus_width default value.") -#else -#warning "get_device_info: querying memory_clock_rate and \ -memory_bus_width are not supported by the compiler used. \ -Use 3200000 kHz as memory_clock_rate default value. \ -Use 64 bits as memory_bus_width default value." 
-#endif - - size_t max_sub_group_size = 1; - std::vector sub_group_sizes = - get_info(); - - for (const auto &sub_group_size : sub_group_sizes) { - if (max_sub_group_size < sub_group_size) - max_sub_group_size = sub_group_size; - } - - prop.set_max_sub_group_size(max_sub_group_size); - - prop.set_max_work_items_per_compute_unit( - get_info()); -#ifdef SYCL_EXT_ONEAPI_MAX_WORK_GROUP_QUERY - prop.set_max_nd_range_size( - get_info>()); -#else -#if defined(_MSC_VER) && !defined(__clang__) -#pragma message("get_device_info: querying the maximum number \ - of work groups is not supported.") -#else -#warning "get_device_info: querying the maximum number of \ - work groups is not supported." -#endif - int max_nd_range_size[] = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF}; - prop.set_max_nd_range_size(max_nd_range_size); -#endif - - // Estimates max register size per work group, feel free to update the - // value according to device properties. - prop.set_max_register_size_per_work_group(65536); - - prop.set_global_mem_cache_size( - get_info()); - - prop.set_image1d_max(get_info()); - prop.set_image1d_max(get_info()); - prop.set_image2d_max(get_info(), - get_info()); - prop.set_image3d_max(get_info(), - get_info(), - get_info()); - - _dev_info = prop; - out = prop; - } - - device_info get_device_info() const { - if (!_dev_info) { - this->get_device_info(*_dev_info); - } - return _dev_info.value(); - } - - void reset(bool print_on_async_exceptions = false, bool in_order = true) { - std::lock_guard lock(m_mutex); - // The queues are shared_ptrs and the ref counts of the shared_ptrs increase - // only in wait_and_throw(). If there is no other thread calling - // wait_and_throw(), the queues will be destructed. The destructor waits for - // all commands executing on the queue to complete. It isn't possible to - // destroy a queue immediately. This is a synchronization point in SYCL. 
- _queues.clear(); - // create new default queue - // calls create_queue_impl since we already have a locked m_mutex - - _saved_queue = _default_queue = - in_order ? create_queue_impl(print_on_async_exceptions, - sycl::property::queue::in_order()) - : create_queue_impl(print_on_async_exceptions); - } - - void set_default_queue(const sycl::queue &q) { - std::lock_guard lock(m_mutex); - _queues.front().get()->wait_and_throw(); - _queues[0] = std::make_shared(q); - if (_saved_queue == _default_queue) - _saved_queue = _queues.front().get(); - _default_queue = _queues.front().get(); - } - - queue_ptr default_queue() { return _default_queue; } - - void queues_wait_and_throw() { - std::unique_lock lock(m_mutex); - std::vector> current_queues(_queues); - lock.unlock(); - for (const auto &q : current_queues) { - q->wait_and_throw(); - } - // Guard the destruct of current_queues to make sure the ref count is safe. - lock.lock(); - } - queue_ptr create_queue(bool print_on_async_exceptions = false, - bool in_order = true) { - std::lock_guard lock(m_mutex); - return in_order ? create_queue_impl(print_on_async_exceptions, - sycl::property::queue::in_order()) - : create_queue_impl(print_on_async_exceptions); - } - void destroy_queue(queue_ptr &queue) { - std::lock_guard lock(m_mutex); - _queues.erase( - std::remove_if(_queues.begin(), _queues.end(), - [=](const std::shared_ptr &q) -> bool { - return q.get() == queue; - }), - _queues.end()); - queue = nullptr; - } - void set_saved_queue(queue_ptr q) { - std::lock_guard lock(m_mutex); - _saved_queue = q; - } - queue_ptr get_saved_queue() const { - std::lock_guard lock(m_mutex); - return _saved_queue; - } - sycl::context get_context() const { return _ctx; } - - /// Util function to check whether a device supports some kinds of - /// sycl::aspect. 
- void has_capability_or_fail( - const std::initializer_list &props) const { - ::syclcompat::has_capability_or_fail(*this, props); - } - -private: - /// Caller should only be done from functions where the resource \p m_mutex - /// has been acquired. - template - queue_ptr create_queue_impl(bool print_on_async_exceptions = false, - PropertiesT... properties) { - sycl::property_list prop = sycl::property_list( -#ifdef SYCLCOMPAT_PROFILING_ENABLED - sycl::property::queue::enable_profiling(), -#endif - properties...); - if (print_on_async_exceptions) { - _queues.push_back(std::make_shared( - _ctx, *this, detail::exception_handler, prop)); - } else { - _queues.push_back(std::make_shared(_ctx, *this, prop)); - } - return _queues.back().get(); - } - - void get_version(int &major, int &minor) const { - detail::get_version(*this, major, minor); - } - void add_event(sycl::event event) { - std::lock_guard lock(m_mutex); - _events.push_back(event); - } - friend sycl::event enqueue_free(const std::vector &, - const std::vector &, - sycl::queue); - queue_ptr _default_queue; - queue_ptr _saved_queue; - sycl::context _ctx; - std::vector> _queues; - mutable std::mutex m_mutex; - std::vector _events; - mutable std::optional _dev_info; -}; - -namespace detail { - -static inline unsigned int get_tid() { -#if defined(__linux__) - return syscall(SYS_gettid); -#elif defined(_WIN64) - return GetCurrentThreadId(); -#else -#error "Only support Windows and Linux." 
-#endif -} - -/// device manager -class dev_mgr { -public: - device_ext ¤t_device() { - unsigned int dev_id = current_device_id(); - check_id(dev_id); - return *_devs[dev_id]; - } - device_ext &cpu_device() const { - std::lock_guard lock(m_mutex); - if (_cpu_device == -1) { - throw std::runtime_error("[SYCLcompat] No valid cpu device"); - } else { - return *_devs[_cpu_device]; - } - } - device_ext &get_device(unsigned int id) const { - std::lock_guard lock(m_mutex); - check_id(id); - return *_devs[id]; - } - unsigned int current_device_id() const { - std::lock_guard lock(m_mutex); - auto it = _thread2dev_map.find(get_tid()); - if (it != _thread2dev_map.end()) - return it->second; - return _default_device_id; - } - - /// Select device with a device ID. - /// \param [in] id The id of the device which can - /// be obtained through get_device_id(const sycl::device). - void select_device(unsigned int id) { - std::lock_guard lock(m_mutex); - check_id(id); - _thread2dev_map[get_tid()] = id; - } - unsigned int device_count() { return _devs.size(); } - - unsigned int get_device_id(const sycl::device &dev) { - if (!_devs.size()) { - throw std::runtime_error( - "[SYCLcompat] No SYCL devices found in the device list. Device list " - "may have been filtered by syclcompat::filter_device"); - } - unsigned int id = 0; - for (auto dev_item : _devs) { - if (*dev_item == dev) { - return id; - } - id++; - } - throw std::runtime_error("[SYCLcompat] The device[" + - dev.get_info() + - "] is filtered out by syclcompat::filter_device " - "in current device list!"); - } - - /// List all the devices with its id in dev_mgr. - void list_devices() const { - for (size_t i = 0; i < _devs.size(); ++i) { - std::cout << "Device " << i << ": " - << _devs[i]->get_info() << std::endl; - } - } - - /// Filter out devices; only keep the device whose name contains one of the - /// subname in \p dev_subnames. - /// May break device id mapping and change current device. 
It's better to be - /// called before other SYCLcompat/SYCL APIs. - void filter(const std::vector &dev_subnames) { - std::lock_guard lock(m_mutex); - auto iter = _devs.begin(); - while (iter != _devs.end()) { - std::string dev_name = (*iter)->get_info(); - bool matched = false; - for (const auto &name : dev_subnames) { - if (dev_name.find(name) != std::string::npos) { - matched = true; - break; - } - } - if (matched) - ++iter; - else - iter = _devs.erase(iter); - } - _cpu_device = -1; - for (unsigned i = 0; i < _devs.size(); ++i) { - if (_devs[i]->is_cpu()) { - _cpu_device = i; - break; - } - } - _thread2dev_map.clear(); -#ifdef SYCLCOMPAT_VERBOSE - list_devices(); -#endif - } - - /// Select device with a Device Selector - /// \param selector device selector to get the device id from. Defaults to - /// sycl::gpu_selector_v - template - std::enable_if_t< - std::is_invocable_r_v> - select_device(const DeviceSelector &selector = sycl::gpu_selector_v) { - sycl::device selected_device = sycl::device(selector); - unsigned int selected_device_id = get_device_id(selected_device); - select_device(selected_device_id); - } - - /// Returns the instance of device manager singleton. - static dev_mgr &instance() { - static dev_mgr d_m; - return d_m; - } - dev_mgr(const dev_mgr &) = delete; - dev_mgr &operator=(const dev_mgr &) = delete; - dev_mgr(dev_mgr &&) = delete; - dev_mgr &operator=(dev_mgr &&) = delete; - -private: - mutable std::mutex m_mutex; - - dev_mgr() { - sycl::device default_device = sycl::device(sycl::default_selector_v); - _devs.push_back(std::make_shared(default_device)); - - std::vector sycl_all_devs = - sycl::device::get_devices(sycl::info::device_type::all); - // Collect other devices except for the default device. 
- if (default_device.is_cpu()) - _cpu_device = 0; - for (auto &dev : sycl_all_devs) { - if (dev == default_device) { - continue; - } - _devs.push_back(std::make_shared(dev)); - if (_cpu_device == -1 && dev.is_cpu()) { - _cpu_device = _devs.size() - 1; - } - } -#ifdef SYCLCOMPAT_VERBOSE - list_devices(); -#endif - } - void check_id(unsigned int id) const { - if (id >= _devs.size()) { - throw std::runtime_error("invalid device id"); - } - } - std::vector> _devs; - /// DEFAULT_DEVICE_ID is used, if current_device_id() can not find current - /// thread id in _thread2dev_map, which means default device should be used - /// for the current thread. - const unsigned int _default_device_id = 0; - /// thread-id to device-id map. - std::map _thread2dev_map; - int _cpu_device = -1; -}; - -} // namespace detail - -static inline sycl::queue create_queue(bool print_on_async_exceptions = false, - bool in_order = true) { - return *detail::dev_mgr::instance().current_device().create_queue( - print_on_async_exceptions, in_order); -} - -/// Util function to get the default queue of current device in -/// device manager. -static inline sycl::queue get_default_queue() { - return *detail::dev_mgr::instance().current_device().default_queue(); -} - -/// Util function to change the default queue of the current device in the -/// device manager -/// If the device extension saved queue is the default queue, -/// the previous saved queue will be overwritten as well. -/// This function will be blocking if there are submitted kernels in the -/// previous default queue. -/// @param q New user-defined queue -static inline void set_default_queue(const sycl::queue &q) { - detail::dev_mgr::instance().current_device().set_default_queue(q); -} - -static inline void wait(sycl::queue q = get_default_queue()) { q.wait(); } - -static inline void wait_and_throw(sycl::queue q = get_default_queue()) { - q.wait_and_throw(); -} - -/// Util function to get the id of current device in -/// device manager. 
-static inline unsigned int get_current_device_id() { - return detail::dev_mgr::instance().current_device_id(); -} - -/// Util function to get the current device. -static inline device_ext &get_current_device() { - return detail::dev_mgr::instance().current_device(); -} - -/// Util function to get a device by id. -static inline device_ext &get_device(unsigned int id) { - return detail::dev_mgr::instance().get_device(id); -} - -/// Util function to get the context of the default queue of current -/// device in device manager. -static inline sycl::context get_default_context() { - return get_current_device().get_context(); -} - -/// Util function to get a CPU device. -static inline device_ext &cpu_device() { - return detail::dev_mgr::instance().cpu_device(); -} - -/// Filter out devices; only keep the device whose name contains one of the -/// subname in \p dev_subnames. -/// May break device id mapping and change current device. It's better to be -/// called before other SYCLcompat or SYCL APIs. -static inline void filter_device(const std::vector &dev_subnames) { - detail::dev_mgr::instance().filter(dev_subnames); -} - -/// List all the devices with its id in dev_mgr. 
-static inline void list_devices() { - detail::dev_mgr::instance().list_devices(); -} - -static inline unsigned int select_device(unsigned int id) { - detail::dev_mgr::instance().select_device(id); - return id; -} - -template -static inline std::enable_if_t< - std::is_invocable_r_v> -select_device(const DeviceSelector &selector = sycl::gpu_selector_v) { - detail::dev_mgr::instance().select_device(selector); -} - -static inline unsigned int get_device_id(const sycl::device &dev) { - return detail::dev_mgr::instance().get_device_id(dev); -} - -static inline unsigned int device_count() { - return detail::dev_mgr::instance().device_count(); -} -} // namespace syclcompat diff --git a/sycl/include/syclcompat/dims.hpp b/sycl/include/syclcompat/dims.hpp deleted file mode 100644 index 3af6c15f96d2a..0000000000000 --- a/sycl/include/syclcompat/dims.hpp +++ /dev/null @@ -1,74 +0,0 @@ -/*************************************************************************** - * - * Copyright (C) Codeplay Software Ltd. - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM - * Exceptions. See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - * SYCLcompat - * - * dims.hpp - * - * Description: - * dim3 functionality for SYCLcompat - **************************************************************************/ - -#pragma once - -#include -#include - -#include - -namespace [[deprecated("syclcompat is deprecated")]] syclcompat { - -class dim3 { -public: - unsigned int x, y, z; - - dim3(const sycl::range<3> &r) : x(r[2]), y(r[1]), z(r[0]) {} - - dim3(const sycl::range<2> &r) : x(r[1]), y(r[0]), z(1) {} - - dim3(const sycl::range<1> &r) : x(r[0]), y(1), z(1) {} - - constexpr dim3(unsigned int x = 1, unsigned int y = 1, unsigned int z = 1) - : x(x), y(y), z(z) {} - - constexpr size_t size() const { return x * y * z; } - - operator sycl::range<3>() const { return sycl::range<3>(z, y, x); } - operator sycl::range<2>() const { - if (z != 1) - throw std::invalid_argument( - "Attempting to convert a 3D dim3 into sycl::range<2>"); - return sycl::range<2>(y, x); - } - operator sycl::range<1>() const { - if (z != 1 || y != 1) - throw std::invalid_argument( - "Attempting to convert a 2D or 3D dim3 into sycl::range<1>"); - return sycl::range<1>(x); - } -}; // namespace dim3 - -inline dim3 operator*(const dim3 &a, const dim3 &b) { - return dim3{a.x * b.x, a.y * b.y, a.z * b.z}; -} - -inline dim3 operator+(const dim3 &a, const dim3 &b) { - return dim3{a.x + b.x, a.y + b.y, a.z + b.z}; -} - -inline dim3 operator-(const dim3 &a, const dim3 &b) { - return dim3{a.x - b.x, a.y - b.y, a.z - b.z}; -} - -} // namespace syclcompat diff --git a/sycl/include/syclcompat/group_utils.hpp b/sycl/include/syclcompat/group_utils.hpp deleted file mode 100644 index 52376fe7b45d7..0000000000000 --- a/sycl/include/syclcompat/group_utils.hpp +++ /dev/null @@ -1,1269 +0,0 @@ -/*************************************************************************** - * - * Copyright (C) Codeplay Software Ltd. - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM - * Exceptions. See https://llvm.org/LICENSE.txt for license information. 
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * SYCL compatibility extension - * - * group_utils.hpp - * - * Description: - * Group util functionality for the SYCL compatibility extension - **************************************************************************/ - -// The original source was under the license below: -//==---- group_utils.hpp ------------------*- C++ -*--------------------==// -// -// Copyright (C) Intel Corporation -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// See https://llvm.org/LICENSE.txt for license information. -// -//===------------------------------------------------------------------===// - -#pragma once - -#include -#include -#include - -#include -#include - -namespace [[deprecated("syclcompat is deprecated")]] syclcompat { -namespace group { -namespace detail { - -template -constexpr auto __reduce_over_group(_Args... __args) { - return sycl::reduce_over_group(__args...); -} - -template constexpr auto __group_broadcast(_Args... __args) { - return sycl::group_broadcast(__args...); -} - -template -constexpr auto __exclusive_scan_over_group(_Args... __args) { - return sycl::exclusive_scan_over_group(__args...); -} - -template -constexpr auto __inclusive_scan_over_group(_Args... 
__args) { - return sycl::inclusive_scan_over_group(__args...); -} - -template -__syclcompat_inline__ T -exclusive_scan(const Item &item, T input, BinaryOperation binary_op, - GroupPrefixCallbackOperation &prefix_callback_op) { - T group_aggregate; - - T output = - detail::__exclusive_scan_over_group(item.get_group(), input, binary_op); - if (item.get_local_linear_id() == item.get_local_range().size() - 1) { - group_aggregate = binary_op(output, input); - } - - group_aggregate = detail::__group_broadcast( - item.get_group(), group_aggregate, item.get_local_range().size() - 1); - - T group_prefix = prefix_callback_op(group_aggregate); - if (item.get_local_linear_id() == 0) { - output = group_prefix; - } else { - output = binary_op(group_prefix, output); - } - - return output; -} - -typedef uint16_t digit_counter_type; -typedef uint32_t packed_counter_type; - -template struct log2 { - enum { VALUE = log2> 1), COUNT + 1>::VALUE }; -}; - -template struct log2 { - enum { VALUE = (1 << (COUNT - 1) < N) ? 
COUNT : COUNT - 1 }; -}; - -template class radix_rank { -public: - static size_t get_local_memory_size(size_t group_threads) { - return group_threads * PADDED_COUNTER_LANES * sizeof(packed_counter_type); - } - - radix_rank(uint8_t *local_memory) : _local_memory(local_memory) {} - - template - __syclcompat_inline__ void - rank_keys(const Item &item, uint32_t (&keys)[VALUES_PER_THREAD], - int (&ranks)[VALUES_PER_THREAD], int current_bit, int num_bits) { - - digit_counter_type thread_prefixes[VALUES_PER_THREAD]; - digit_counter_type *digit_counters[VALUES_PER_THREAD]; - digit_counter_type *buffer = - reinterpret_cast(_local_memory); - auto g = item.get_group(); - reset_local_memory(item); - - sycl::group_barrier(g, sycl::memory_scope::work_group); - -#pragma unroll - for (int i = 0; i < VALUES_PER_THREAD; ++i) { - uint32_t digit = - ::syclcompat::detail::bfe(keys[i], current_bit, num_bits); - uint32_t sub_counter = digit >> LOG_COUNTER_LANES; - uint32_t counter_lane = digit & (COUNTER_LANES - 1); - - if (DESCENDING) { - sub_counter = PACKING_RATIO - 1 - sub_counter; - counter_lane = COUNTER_LANES - 1 - counter_lane; - } - - digit_counters[i] = - &buffer[counter_lane * item.get_local_range().size() * PACKING_RATIO + - item.get_local_linear_id() * PACKING_RATIO + sub_counter]; - thread_prefixes[i] = *digit_counters[i]; - *digit_counters[i] = thread_prefixes[i] + 1; - } - - sycl::group_barrier(g, sycl::memory_scope::work_group); - - scan_counters(item); - - sycl::group_barrier(g, sycl::memory_scope::work_group); - - for (int i = 0; i < VALUES_PER_THREAD; ++i) { - ranks[i] = thread_prefixes[i] + *digit_counters[i]; - } - } - -private: - template - __syclcompat_inline__ void reset_local_memory(const Item &item) { - packed_counter_type *ptr = - reinterpret_cast(_local_memory); - -#pragma unroll - for (int i = 0; i < PADDED_COUNTER_LANES; ++i) { - ptr[i * item.get_local_range().size() + item.get_local_linear_id()] = 0; - } - } - - template - __syclcompat_inline__ 
packed_counter_type upsweep(const Item &item) { - packed_counter_type sum = 0; - packed_counter_type *ptr = - reinterpret_cast(_local_memory); - -#pragma unroll - for (int i = 0; i < PADDED_COUNTER_LANES; i++) { - cached_segment[i] = - ptr[item.get_local_linear_id() * PADDED_COUNTER_LANES + i]; - } - -#pragma unroll - for (int i = 0; i < PADDED_COUNTER_LANES; ++i) { - sum += cached_segment[i]; - } - - return sum; - } - - template - __syclcompat_inline__ void - exclusive_downsweep(const Item &item, packed_counter_type raking_partial) { - packed_counter_type *ptr = - reinterpret_cast(_local_memory); - packed_counter_type sum = raking_partial; - -#pragma unroll - for (int i = 0; i < PADDED_COUNTER_LANES; ++i) { - packed_counter_type value = cached_segment[i]; - cached_segment[i] = sum; - sum += value; - } - -#pragma unroll - for (int i = 0; i < PADDED_COUNTER_LANES; ++i) { - ptr[item.get_local_linear_id() * PADDED_COUNTER_LANES + i] = - cached_segment[i]; - } - } - - struct prefix_callback { - __syclcompat_inline__ packed_counter_type - operator()(packed_counter_type block_aggregate) { - packed_counter_type block_prefix = 0; - -#pragma unroll - for (int packed = 1; packed < PACKING_RATIO; packed++) { - block_prefix += block_aggregate - << (sizeof(digit_counter_type) * 8 * packed); - } - - return block_prefix; - } - }; - - template - __syclcompat_inline__ void scan_counters(const Item &item) { - packed_counter_type raking_partial = upsweep(item); - - prefix_callback callback; - packed_counter_type exclusive_partial = exclusive_scan( - item, raking_partial, sycl::ext::oneapi::plus(), - callback); - - exclusive_downsweep(item, exclusive_partial); - } - -private: - static constexpr int PACKING_RATIO = - sizeof(packed_counter_type) / sizeof(digit_counter_type); - static constexpr int LOG_PACKING_RATIO = log2::VALUE; - static constexpr int LOG_COUNTER_LANES = RADIX_BITS - LOG_PACKING_RATIO; - static constexpr int COUNTER_LANES = 1 << LOG_COUNTER_LANES; - static constexpr 
int PADDED_COUNTER_LANES = COUNTER_LANES + 1; - - packed_counter_type cached_segment[PADDED_COUNTER_LANES]; - uint8_t *_local_memory; -}; - -template struct base_traits { - - static __syclcompat_inline__ U twiddle_in(U key) { - throw std::runtime_error("Not implemented"); - } - static __syclcompat_inline__ U twiddle_out(U key) { - throw std::runtime_error("Not implemented"); - } -}; - -template struct base_traits { - static __syclcompat_inline__ U twiddle_in(U key) { return key; } - static __syclcompat_inline__ U twiddle_out(U key) { return key; } -}; - -template struct base_traits { - static constexpr U HIGH_BIT = U(1) << ((sizeof(U) * 8) - 1); - static __syclcompat_inline__ U twiddle_in(U key) { return key ^ HIGH_BIT; } - static __syclcompat_inline__ U twiddle_out(U key) { return key ^ HIGH_BIT; } -}; - -template struct base_traits { - static constexpr U HIGH_BIT = U(1) << ((sizeof(U) * 8) - 1); - static __syclcompat_inline__ U twiddle_in(U key) { - U mask = (key & HIGH_BIT) ? U(-1) : HIGH_BIT; - return key ^ mask; - } - static __syclcompat_inline__ U twiddle_out(U key) { - U mask = (key & HIGH_BIT) ? HIGH_BIT : U(-1); - return key ^ mask; - } -}; - -template struct traits : base_traits {}; -template <> struct traits : base_traits {}; -template <> struct traits : base_traits {}; -template <> struct traits : base_traits {}; - -template struct power_of_two { - enum { VALUE = ((N & (N - 1)) == 0) }; -}; - -__syclcompat_inline__ uint32_t shr_add(uint32_t x, uint32_t shift, - uint32_t addend) { - return (x >> shift) + addend; -} - -} // namespace detail - -/// Rearranging data partitioned across a work-group. -/// -/// \tparam T The type of the data elements. -/// \tparam ElementsPerWorkItem The number of data elements assigned to a -/// work-item. -template class exchange { -public: - static size_t get_local_memory_size(size_t group_threads) { - size_t padding_values = - (INSERT_PADDING) - ? 
((group_threads * ElementsPerWorkItem) >> LOG_LOCAL_MEMORY_BANKS) - : 0; - return (group_threads * ElementsPerWorkItem + padding_values) * sizeof(T); - } - - exchange(uint8_t *local_memory) : _local_memory(local_memory) {} - - // TODO: Investigate if padding is required for performance, - // and if specializations are required for specific target hardware. - static size_t adjust_by_padding(size_t offset) { - - if constexpr (INSERT_PADDING) { - offset = detail::shr_add(offset, LOG_LOCAL_MEMORY_BANKS, offset); - } - return offset; - } - - struct blocked_offset { - template size_t operator()(Item item, size_t i) { - size_t offset = item.get_local_linear_id() * ElementsPerWorkItem + i; - return adjust_by_padding(offset); - } - }; - - struct striped_offset { - template size_t operator()(Item item, size_t i) { - size_t offset = i * item.get_local_range(2) * item.get_local_range(1) * - item.get_local_range(0) + - item.get_local_linear_id(); - return adjust_by_padding(offset); - } - }; - - template struct scatter_offset { - Iterator begin; - scatter_offset(const int (&ranks)[ElementsPerWorkItem]) { - begin = std::begin(ranks); - } - template size_t operator()(Item item, size_t i) const { - // iterator i is expected to be within bounds [0,VALUES_PER_THREAD) - return adjust_by_padding(begin[i]); - } - }; - - /// Inplace rearrange elements from blocked order to striped order. - /// - /// Suppose 512 integer data elements partitioned across 128 work-items, where - /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the - /// blocked \p input across the work-group is: - /// - /// {[0, 1, 2, 3], [4, 5, 6, 7], ..., [508, 509, 510, 511]}. - /// - /// The striped order output is: - /// - /// {[0, 128, 256, 384], [1, 129, 257, 385], ..., [127, 255, 383, 511]}. - /// - /// \tparam Item The work-item identifier type. - /// \param item The work-item identifier. - /// \param input The input data of each work-item. 
- template - __syclcompat_inline__ void - blocked_to_striped(Item item, T (&input)[ElementsPerWorkItem]) { - striped_offset get_striped_offset; - blocked_offset get_blocked_offset; - helper_exchange(item, input, input, get_blocked_offset, get_striped_offset); - } - - /// Inplace rearrange elements from striped order to blocked order. - /// - /// Suppose 512 integer data elements partitioned across 128 work-items, where - /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the - /// striped \p input across the work-group is: - /// - /// { [0, 128, 256, 384], [1, 129, 257, 385], ..., [127, 255, 383, 511] }. - /// - /// The blocked order output is: - /// - /// { [0, 1, 2, 3], [4, 5, 6, 7], ..., [508, 509, 510, 511] }. - /// - /// \tparam Item The work-item identifier type. - /// \param item The work-item identifier. - /// \param input The input data of each work-item. - template - __syclcompat_inline__ void - striped_to_blocked(Item item, T (&input)[ElementsPerWorkItem]) { - blocked_offset get_blocked_offset; - striped_offset get_striped_offset; - helper_exchange(item, input, input, get_striped_offset, get_blocked_offset); - } - - /// Rearrange elements from blocked order to striped order. - /// - /// Suppose 512 integer data elements partitioned across 128 work-items, where - /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the - /// blocked \p input across the work-group is: - /// - /// { [0, 1, 2, 3], [4, 5, 6, 7], ..., [508, 509, 510, 511] }. - /// - /// The striped order output is: - /// - /// { [0, 128, 256, 384], [1, 129, 257, 385], ..., [127, 255, 383, 511] }. - /// - /// \tparam Item The work-item identifier type. - /// \param item The work-item identifier. - /// \param input The input data of each work-item. - /// \param output The corresponding output data of each work-item. 
- template - __syclcompat_inline__ void - blocked_to_striped(Item item, T (&input)[ElementsPerWorkItem], - T (&output)[ElementsPerWorkItem]) { - striped_offset get_striped_offset; - blocked_offset get_blocked_offset; - helper_exchange(item, input, output, get_blocked_offset, - get_striped_offset); - } - - /// Rearrange elements from striped order to blocked order. - /// - /// Suppose 512 integer data elements partitioned across 128 work-items, where - /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the - /// striped \p input across the work-group is: - /// - /// { [0, 128, 256, 384], [1, 129, 257, 385], ..., [127, 255, 383, 511] }. - /// - /// The blocked order output is: - /// - /// { [0, 1, 2, 3], [4, 5, 6, 7], ..., [508, 509, 510, 511] }. - /// - /// \tparam Item The work-item identifier type. - /// \param item The work-item identifier. - /// \param input The input data of each work-item. - /// \param output The corresponding output data of each work-item. - template - __syclcompat_inline__ void - striped_to_blocked(Item item, T (&input)[ElementsPerWorkItem], - T (&output)[ElementsPerWorkItem]) { - blocked_offset get_blocked_offset; - striped_offset get_striped_offset; - helper_exchange(item, input, output, get_striped_offset, - get_blocked_offset); - } - - /// Inplace exchanges data items annotated by rank into blocked arrangement. - /// - /// Suppose 512 integer data elements partitioned across 128 work-items, where - /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the - /// striped \p input across the work-group is: - /// - /// { [0, 128, 256, 384], [1, 129, 257, 385], ..., [127, 255, 383, 511] }. - /// - /// The rank across the work-group is: - /// - /// { [0, 1, 2, 3], [4, 5, 6, 7], ..., [508, 509, 510, 511] }. - /// - /// The blocked order output is: - /// - /// { [0, 1, 2, 3], [4, 5, 6, 7], ..., [508, 509, 510, 511] }. - /// - /// \tparam Item The work-item identifier type. 
- /// \param item The work-item identifier. - /// \param input The input data of each work-item. - /// \param ranks The corresponding rank annotation of each work-item. - template - __syclcompat_inline__ void - scatter_to_blocked(Item item, T (&input)[ElementsPerWorkItem], - int (&ranks)[ElementsPerWorkItem]) { - scatter_offset get_scatter_offset(ranks); - blocked_offset get_blocked_offset; - helper_exchange(item, input, input, get_scatter_offset, get_blocked_offset); - } - - /// Inplace exchanges data items annotated by rank into striped arrangement. - /// - /// Suppose 512 integer data elements partitioned across 128 work-items, where - /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the - /// blocked \p input across the work-group is: - /// - /// { [0, 1, 2, 3], [4, 5, 6, 7], ..., [508, 509, 510, 511] }. - /// - /// The rank across the work-group is: - /// - /// { [16, 20, 24, 28], [32, 36, 40, 44], ..., [499, 503, 507, 511] }. - /// - /// The striped order output of each work-item will be: - /// - /// { [0, 128, 256, 384], [1, 129, 257, 385], ..., [127, 255, 383, 511] }. - /// - /// \tparam Item The work-item identifier type. - /// \param item The work-item identifier. - /// \param input The input data of each work-item. - /// \param ranks The corresponding rank annotation of each work-item. 
- template - __syclcompat_inline__ void - scatter_to_striped(Item item, T (&input)[ElementsPerWorkItem], - int (&ranks)[ElementsPerWorkItem]) { - scatter_offset get_scatter_offset(ranks); - striped_offset get_striped_offset; - helper_exchange(item, input, input, get_scatter_offset, get_striped_offset); - } - -private: - template - __syclcompat_inline__ void - helper_exchange(Item item, T (&input)[ElementsPerWorkItem], - T (&output)[ElementsPerWorkItem], - offsetFunctorTypeFW &offset_functor_fw, - offsetFunctorTypeRV &offset_functor_rv) { - T *buffer = reinterpret_cast(_local_memory); -#pragma unroll - for (size_t i = 0; i < ElementsPerWorkItem; i++) { - size_t offset = offset_functor_fw(item, i); - buffer[offset] = input[i]; - } - sycl::group_barrier(item.get_group()); -#pragma unroll - for (size_t i = 0; i < ElementsPerWorkItem; i++) { - size_t offset = offset_functor_rv(item, i); - output[i] = buffer[offset]; - } - } - - static constexpr int LOG_LOCAL_MEMORY_BANKS = 4; - static constexpr bool INSERT_PADDING = - (ElementsPerWorkItem > 4) && - (detail::power_of_two::VALUE); - - uint8_t *_local_memory; -}; - -/// The work-group wide radix sort to sort integer data elements -/// assigned to all work-items in the work-group. -/// -/// \tparam T The type of the data elements. -/// \tparam ElementsPerWorkItem The number of data elements assigned to -/// a work-item. -/// \tparam RADIX_BITS The number of radix bits per digit place. 
-template -class group_radix_sort { - uint8_t *_local_memory; - -public: - group_radix_sort(uint8_t *local_memory) : _local_memory(local_memory) {} - - static size_t get_local_memory_size(size_t group_threads) { - size_t ranks_size = - detail::radix_rank::get_local_memory_size(group_threads); - size_t exchange_size = - exchange::get_local_memory_size(group_threads); - return sycl::max(ranks_size, exchange_size); - } - -private: - template - __syclcompat_inline__ void - helper_sort(const Item &item, T (&keys)[ElementsPerWorkItem], - int begin_bit = 0, int end_bit = 8 * sizeof(T), - bool is_striped = false) { - - uint32_t(&unsigned_keys)[ElementsPerWorkItem] = - reinterpret_cast(keys); - -#pragma unroll - for (int i = 0; i < ElementsPerWorkItem; ++i) { - unsigned_keys[i] = detail::traits::twiddle_in(unsigned_keys[i]); - } - - for (int i = begin_bit; i < end_bit; i += RADIX_BITS) { - int pass_bits = sycl::min(RADIX_BITS, end_bit - begin_bit); - - int ranks[ElementsPerWorkItem]; - detail::radix_rank(_local_memory) - .template rank_keys(item, unsigned_keys, - ranks, i, pass_bits); - - sycl::group_barrier(item.get_group()); - - bool last_iter = i + RADIX_BITS >= end_bit; - if (last_iter && is_striped) { - exchange(_local_memory) - .scatter_to_striped(item, keys, ranks); - - } else { - exchange(_local_memory) - .scatter_to_blocked(item, keys, ranks); - } - - sycl::group_barrier(item.get_group()); - } - -#pragma unroll - for (int i = 0; i < ElementsPerWorkItem; ++i) { - unsigned_keys[i] = detail::traits::twiddle_out(unsigned_keys[i]); - } - } - -public: - /// Performs an ascending work-group wide radix sort over a blocked - /// arrangement of input elements. - /// - /// Suppose 512 integer data elements partitioned across 128 work-items, where - /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the - /// \p input across the work-group is: - /// - /// { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. 
- /// - /// The ascending order output is: - /// - /// { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. - /// - /// \tparam Item The work-item identifier type. - /// \param item The work-item identifier. - /// \param input The input data of each work-item. - /// \param begin_bit The beginning (least-significant) bit index needed for - /// key comparison. - /// \param end_bit The past-the-end (most-significant) bit - /// index needed for key comparison. - template - __syclcompat_inline__ void - sort(const Item &item, T (&input)[ElementsPerWorkItem], int begin_bit = 0, - int end_bit = 8 * sizeof(T)) { - helper_sort(item, input, begin_bit, end_bit); - } - - /// Performs an descending work-group wide radix sort over a blocked - /// arrangement of input elements. - /// - /// Suppose 512 integer data elements partitioned across 128 work-items, where - /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the - /// \p input across the work-group is: - /// - /// { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. - /// - /// The descending order output is: - /// - /// { [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }. - /// - /// \tparam Item The work-item identifier type. - /// \param item The work-item identifier. - /// \param input The input data of each work-item. - /// \param begin_bit The beginning (least-significant) bit index needed for - /// key comparison. - /// \param end_bit The past-the-end (most-significant) bit - /// index needed for key comparison. - template - __syclcompat_inline__ void - sort_descending(const Item &item, T (&input)[ElementsPerWorkItem], - int begin_bit = 0, int end_bit = 8 * sizeof(T)) { - helper_sort(item, input, begin_bit, end_bit); - } - - /// Performs an ascending radix sort across a blocked arrangement of input - /// elements, leaving them in a striped arrangement. 
- /// - /// Suppose 512 integer data elements partitioned across 128 work-items, where - /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the - /// \p input across the work-group is: - /// - /// { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. - /// - /// The corresponding output of each work-item will be: - /// - /// { [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., - /// [127,255,383,511] }. - /// - /// \tparam Item The work-item identifier type. - /// \param item The work-item identifier. - /// \param input The input data of each work-item. - /// \param begin_bit The beginning (least-significant) bit index needed for - /// key comparison. - /// \param end_bit The past-the-end (most-significant) bit - /// index needed for key comparison. - template - __syclcompat_inline__ void - sort_blocked_to_striped(const Item &item, T (&input)[ElementsPerWorkItem], - int begin_bit = 0, int end_bit = 8 * sizeof(T)) { - helper_sort(item, input, begin_bit, end_bit, - /*is_striped=*/true); - } - - /// Performs an descending radix sort across a blocked arrangement of input - /// elements, leaving them in a striped arrangement. - /// - /// Suppose 512 integer data elements partitioned across 128 work-items, where - /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the - /// \p input across the work-group is: - /// - /// { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. - /// - /// The descending striped order output is: - /// - /// { [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., - /// [127,255,383,511] }. - /// - /// \tparam Item The work-item identifier type. - /// \param item The work-item identifier. - /// \param input The input data of each work-item. - /// \param begin_bit The beginning (least-significant) bit index needed for - /// key comparison. - /// \param end_bit The past-the-end (most-significant) bit - /// index needed for key comparison. 
- template - __syclcompat_inline__ void sort_descending_blocked_to_striped( - const Item &item, T (&input)[ElementsPerWorkItem], int begin_bit = 0, - int end_bit = 8 * sizeof(T)) { - helper_sort(item, input, begin_bit, end_bit, - /*is_striped=*/true); - } -}; - -/// Load linear segment items into block format across threads -/// Helper for Block Load -enum load_algorithm { - BLOCK_LOAD_DIRECT, - BLOCK_LOAD_STRIPED, -}; - -/// Load a linear segment of elements into a blocked arrangement across the -/// work-group. -/// -/// \tparam T The data type to load. -/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned -/// onto each work-item. -/// \tparam InputIteratorT The random-access iterator type for input \iterator. -/// \tparam ItemT The sycl::nd_item index space class. -/// \param item The calling work-item. -/// \param input_iter The work-group's base input iterator for loading from. -/// \param data Data to load. -template -__syclcompat_inline__ void load_direct_blocked(const ItemT &item, - InputIteratorT input_iter, - T (&data)[ElementsPerWorkItem]) { - size_t work_item_id = item.get_local_linear_id(); -#pragma unroll - for (size_t i = 0; i < ElementsPerWorkItem; i++) - data[i] = input_iter[(work_item_id * ElementsPerWorkItem) + i]; -} - -/// Load a linear segment of elements into a striped arrangement across the -/// work-group. -/// -/// \tparam T The data type to load. -/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned -/// onto each work-item. -/// \tparam InputIteratorT The random-access iterator type for input \iterator. -/// \tparam ItemT The sycl::nd_item index space class. -/// \param item The calling work-item. -/// \param input_iter The work-group's base input iterator for loading from. -/// \param data Data to load. 
-template -__syclcompat_inline__ void load_direct_striped(const ItemT &item, - InputIteratorT input_iter, - T (&data)[ElementsPerWorkItem]) { - size_t work_group_size = item.get_group().get_local_linear_range(); - size_t work_item_id = item.get_local_linear_id(); -#pragma unroll - for (size_t i = 0; i < ElementsPerWorkItem; i++) - data[i] = input_iter[work_item_id + i * work_group_size]; -} - -/// Load a linear segment of elements into a blocked arrangement across the -/// work-group, guarded by range. -/// -/// \tparam T The data type to load. -/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned -/// onto each work-item. -/// \tparam InputIteratorT The random-access iterator type for input \iterator. -/// \tparam ItemT The sycl::nd_item index space class. -/// \param item The calling work-item. -/// \param input_iter The work-group's base input iterator for loading from. -/// \param data Data to load. -/// \param valid_items Number of valid items to load -template -__syclcompat_inline__ void -load_direct_blocked(const ItemT &item, InputIteratorT input_iter, - T (&data)[ElementsPerWorkItem], int valid_items) { - size_t work_item_id = item.get_local_linear_id(); -#pragma unroll - for (size_t i = 0; i < ElementsPerWorkItem; i++) - if ((work_item_id * ElementsPerWorkItem) + i < valid_items) - data[i] = input_iter[(work_item_id * ElementsPerWorkItem) + i]; -} - -/// Load a linear segment of elements into a striped arrangement across the -/// work-group, guarded by range. -/// -/// \tparam T The data type to load. -/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned -/// onto each work-item. -/// \tparam InputIteratorT The random-access iterator type for input \iterator. -/// \tparam ItemT The sycl::nd_item index space class. -/// \param item The calling work-item. -/// \param input_iter The work-group's base input iterator for loading from. -/// \param data Data to load. 
-/// \param valid_items Number of valid items to load -template -__syclcompat_inline__ void -load_direct_striped(const ItemT &item, InputIteratorT input_iter, - T (&data)[ElementsPerWorkItem], int valid_items) { - size_t work_group_size = item.get_group().get_local_linear_range(); - size_t work_item_id = item.get_local_linear_id(); -#pragma unroll - for (size_t i = 0; i < ElementsPerWorkItem; i++) - if (work_item_id + (i * work_group_size) < valid_items) - data[i] = input_iter[work_item_id + i * work_group_size]; -} - -/// Store a blocked arrangement of items across a work-group into a linear -/// segment of items. -/// -/// \tparam T The data type to store. -/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned -/// onto each work-item. -/// \tparam OutputIteratorT The random-access iterator type for output. -/// \iterator. -/// \tparam ItemT The sycl::nd_item index space class. -/// \param item The calling work-item. -/// \param output_iter The work-group's base output iterator for writing. -/// \param data Data to store. -template -__syclcompat_inline__ void -store_direct_blocked(const ItemT &item, OutputIteratorT output_iter, - T (&data)[ElementsPerWorkItem]) { - size_t work_item_id = item.get_local_linear_id(); - OutputIteratorT work_item_iter = - output_iter + (work_item_id * ElementsPerWorkItem); -#pragma unroll - for (size_t i = 0; i < ElementsPerWorkItem; i++) - work_item_iter[i] = data[i]; -} - -/// Store a striped arrangement of items across a work-group into a linear -/// segment of items. -/// -/// \tparam T The data type to store. -/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned -/// onto each work-item. -/// \tparam OutputIteratorT The random-access iterator type for output. -/// \iterator. -/// \tparam ItemT The sycl::nd_item index space class. -/// \param item The calling work-item. -/// \param output_iter The work-group's base output iterator for writing. -/// \param items Data to store. 
-template -__syclcompat_inline__ void -store_direct_striped(const ItemT &item, OutputIteratorT output_iter, - T (&data)[ElementsPerWorkItem]) { - size_t work_group_size = item.get_group().get_local_linear_range(); - size_t work_item_id = item.get_local_linear_id(); - OutputIteratorT work_item_iter = output_iter + work_item_id; -#pragma unroll - for (size_t i = 0; i < ElementsPerWorkItem; i++) - work_item_iter[i * work_group_size] = data[i]; -} - -/// Store a blocked arrangement of items across a work-group into a linear -/// segment of items, guarded by range. -/// -/// \tparam T The data type to store. -/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned -/// onto each work-item. -/// \tparam OutputIteratorT The random-access iterator type for output. -/// \iterator. -/// \tparam ItemT The sycl::nd_item index space class. -/// \param item The calling work-item. -/// \param output_iter The work-group's base output iterator for writing. -/// \param data Data to store. -/// \param valid_items Number of valid items to load -template -__syclcompat_inline__ void -store_direct_blocked(const ItemT &item, OutputIteratorT output_iter, - T (&data)[ElementsPerWorkItem], size_t valid_items) { - size_t work_item_id = item.get_local_linear_id(); - OutputIteratorT work_item_iter = - output_iter + (work_item_id * ElementsPerWorkItem); -#pragma unroll - for (size_t i = 0; i < ElementsPerWorkItem; i++) - if (i + (work_item_id * ElementsPerWorkItem) < valid_items) - work_item_iter[i] = data[i]; -} - -/// Store a striped arrangement of items across a work-group into a linear -/// segment of items, guarded by range. -/// -/// \tparam T The data type to store. -/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned -/// onto each work-item. -/// \tparam OutputIteratorT The random-access iterator type for output. -/// \iterator. -/// \tparam ItemT The sycl::nd_item index space class. -/// \param item The calling work-item. 
-/// \param output_iter The work-group's base output iterator for writing. -/// \param items Data to store. -/// \param valid_items Number of valid items to load -template -__syclcompat_inline__ void -store_direct_striped(const ItemT &item, OutputIteratorT output_iter, - T (&data)[ElementsPerWorkItem], size_t valid_items) { - size_t work_group_size = item.get_group().get_local_linear_range(); - size_t work_item_id = item.get_local_linear_id(); - OutputIteratorT work_item_iter = output_iter + work_item_id; -#pragma unroll - for (size_t i = 0; i < ElementsPerWorkItem; i++) - if ((i * work_group_size) + work_item_id < valid_items) - work_item_iter[i * work_group_size] = data[i]; -} - -/// Enumerates alternative algorithms for syclcompat::group::group_load to read -/// a linear segment of data from memory into a blocked arrangement across a -/// work-group. -enum class group_load_algorithm { - /// A blocked arrangement of data is read directly from memory. - blocked, - - /// A striped arrangement of data is read directly from memory. - striped -}; - -/// Provide methods for loading a linear segment of items from memory into a -/// blocked arrangement across a work-group. -/// -/// \tparam T The input data type. -/// \tparam ElementsPerWorkItem The number of data elements assigned to a -/// work-item. -/// \tparam LoadAlgorithm The data movement strategy, default is blocked. -template -class group_load { -public: - static size_t get_local_memory_size([[maybe_unused]] size_t work_group_size) { - return 0; - } - group_load(uint8_t *) {} - - /// Load a linear segment of items from memory. - /// - /// Suppose 512 integer data elements partitioned across 128 work-items, where - /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the - /// \p input across the work-group is: - /// - /// 1, 2, 3, 4, 5, 6, 7, ..., 508, 509, 510, 511. - /// - /// The blocked order \p data of each work-item will be: - /// - /// {[0,1,2,3], [4,5,6,7], ..., [508,509,510,511]}. 
- /// - /// The striped order \p output of each work-item will be: - /// - /// {[0,128,256,384], [1,129,257,385], ..., [127,255,383,511]}. - /// - /// \tparam ItemT The sycl::nd_item index space class. - /// \tparam InputIteratorT The random-access iterator type for input - /// \iterator. - /// \param item The work-item identifier. - /// \param input_iter The work-group's base input iterator for loading from. - /// \param data The data to load. - template - __syclcompat_inline__ void load(const ItemT &item, InputIteratorT input_iter, - T (&data)[ElementsPerWorkItem]) { - if constexpr (LoadAlgorithm == group_load_algorithm::blocked) { - load_direct_blocked( - item, input_iter, data); - } else if constexpr (LoadAlgorithm == group_load_algorithm::striped) { - load_direct_striped( - item, input_iter, data); - } - } - - /// Load a linear segment of items from memory, guarded by range. - /// - /// Suppose 512 integer data elements partitioned across 128 work-items, where - /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and - /// valid_items is 5, the \p input across the work-group is: - /// - /// 0, 1, 2, 3, 4, 5, 6, 7, ..., 508, 509, 510, 511. - /// - /// The blocked order \p data of each work-item will be: - /// - /// {[0,1,2,3], [4,?,?,?], ..., [?,?,?,?]}. - /// - /// The striped order \p output of each work-item will be: - /// - /// {[0,?,?,?], [1,?,?,?], [2,?,?,?], [3,?,?,?] ..., [?,?,?,?]}. - /// - /// \tparam ItemT The sycl::nd_item index space class. - /// \tparam InputIteratorT The random-access iterator type for input - /// \iterator. - /// \param item The work-item identifier. - /// \param input_iter The work-group's base input iterator for loading from. - /// \param data The data to load. 
- /// \param valid_items Number of valid items to load - template - __syclcompat_inline__ void load(const ItemT &item, InputIteratorT input_iter, - T (&data)[ElementsPerWorkItem], - int valid_items) { - if constexpr (LoadAlgorithm == group_load_algorithm::blocked) { - load_direct_blocked( - item, input_iter, data, valid_items); - } else if constexpr (LoadAlgorithm == group_load_algorithm::striped) { - load_direct_striped( - item, input_iter, data, valid_items); - } - } -}; - -/// Enumerates alternative algorithms for syclcompat::group::group_load to write -/// a blocked arrangement of items across a work-group to a linear segment of -/// memory. -enum class group_store_algorithm { - /// A blocked arrangement of data is written directly to memory. - blocked, - - /// A striped arrangement of data is written directly to memory. - striped, -}; - -/// Provide methods for writing a blocked arrangement of elements partitioned -/// across a work-group to a linear segment of memory. -/// -/// \tparam T The output data type. -/// \tparam ElementsPerWorkItem The number of data elements assigned to a -/// work-item. -/// \tparam StoreAlgorithm The data movement strategy, default is blocked. -template -class group_store { -public: - static size_t get_local_memory_size([[maybe_unused]] size_t work_group_size) { - return 0; - } - group_store(uint8_t *) {} - - /// Store items into a linear segment of memory. - /// - /// Suppose 512 integer data elements partitioned across 128 work-items, where - /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the - /// \p input across the work-group is: - /// - /// {[0,1,2,3], [4,5,6,7], ..., [508,509,510,511]}. - /// - /// The blocked order \p output will be: - /// - /// 1, 2, 3, 4, 5, 6, 7, ..., 508, 509, 510, 511. - /// - /// The striped order \p output will be: - /// - /// 0, 128, 256, 384, 1, 129, 257, 385, ..., 127, 255, 383, 511. - /// - /// \tparam ItemT The sycl::nd_item index space class. 
- /// \tparam OutputIteratorT The random-access iterator type for \p output - /// iterator. - /// \param item The work-item identifier. - /// \param input The input data of each work-item. - /// \param data The data to store. - template - __syclcompat_inline__ void store(const ItemT &item, - OutputIteratorT output_iter, - T (&data)[ElementsPerWorkItem]) { - if constexpr (StoreAlgorithm == group_store_algorithm::blocked) { - store_direct_blocked( - item, output_iter, data); - } else if constexpr (StoreAlgorithm == group_store_algorithm::striped) { - store_direct_striped( - item, output_iter, data); - } - } - - /// Store items into a linear segment of memory, guarded by range. - /// - /// Suppose 512 integer data elements partitioned across 128 work-items, where - /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and - /// \p valid_items is 5, the \p output across the work-group is: - /// - /// {[0,0,0,0], [0,0,0,0], ..., [0,0,0,0]}. - /// - /// The blocked order \p output will be: - /// - /// 0, 1, 2, 3, 4, 5, 0, 0, ..., 0, 0, 0, 0. - /// - /// The striped order \p output will be: - /// - /// 0, 4, 8, 12, 16, 0, 0, 0, ..., 0, 0, 0, 0. - /// - /// \tparam ItemT The sycl::nd_item index space class. - /// \tparam OutputIteratorT The random-access iterator type for \p output - /// iterator. - /// \param item The work-item identifier. - /// \param input The input data of each work-item. - /// \param data The data to store. 
- /// \param valid_items Number of valid items to load - template - __syclcompat_inline__ void - store(const ItemT &item, OutputIteratorT output_iter, - T (&data)[ElementsPerWorkItem], size_t valid_items) { - if constexpr (StoreAlgorithm == group_store_algorithm::blocked) { - store_direct_blocked( - item, output_iter, data, valid_items); - } else if constexpr (StoreAlgorithm == group_store_algorithm::striped) { - store_direct_striped( - item, output_iter, data, valid_items); - } - } -}; - -/// The work-group wide shuffle operations that allow work-items to exchange -/// data elements with other work-items within the same work-group. -/// -/// \tparam T The type of the data elements. -/// \tparam group_dim_0 The first dimension size of the work-group. -/// \tparam group_dim_1 The second dimension size of the work-group. -/// \tparam group_dim_2 The third dimension size of the work-group. -template -class group_shuffle { - T *_local_memory = nullptr; - static constexpr size_t group_work_items = - group_dim_0 * group_dim_1 * group_dim_2; - -public: - static constexpr size_t get_local_memory_size(size_t work_group_size) { - return sizeof(T) * work_group_size; - } - group_shuffle(uint8_t *local_memory) : _local_memory((T *)local_memory) {} - - /// Selects a value from a work-item at a given distance in the work-group - /// and stores the value in the output. - /// - /// \tparam ItemT The work-item identifier type. - /// \param item The work-item identifier. - /// \param input The input from the calling work-item. - /// \param output The output where the selected data will be stored. - /// \param distance The distance of work-items to look ahead or behind in the - /// work-group. 
- template - __syclcompat_inline__ void select(const ItemT &item, T input, T &output, - int distance = 1) { - auto g = item.get_group(); - size_t id = g.get_local_linear_id(); - _local_memory[id] = input; - - sycl::group_barrier(g, sycl::memory_scope::work_group); - - const int target_id = static_cast(id) + distance; - if ((target_id >= 0) && (target_id < group_work_items)) { - output = _local_memory[static_cast(target_id)]; - } - } - /// Selects a value from a work-item at a given distance in the work-group - /// and stores the value in the output, using a wrapped index to handle - /// overflow. - /// - /// \tparam ItemT The work-item identifier type. - /// \param item The work-item identifier. - /// \param input The input data to be selected. - /// \param output The output where the selected data will be stored. - /// \param distance The number of work-items to look ahead in the - /// work-group. - template - __syclcompat_inline__ void select2(const ItemT &item, T input, T &output, - unsigned int distance = 1) { - auto g = item.get_group(); - size_t id = g.get_local_linear_id(); - _local_memory[id] = input; - - sycl::group_barrier(g, sycl::memory_scope::work_group); - - unsigned int offset = id + distance; - if (offset >= group_work_items) - offset -= group_work_items; - - output = _local_memory[offset]; - } - /// Performs a shuffle operation to move data to the right across the - /// work-items, shifting elements in a work-item array by one position to the - /// right. - /// - /// \tparam ElementsPerWorkItem The number of data elements per work-item. - /// \tparam ItemT The work-item identifier type. - /// \param item The work-item identifier. - /// \param input The input data to be shuffled. - /// \param output The array that will store the shuffle result. 
- template - __syclcompat_inline__ void shuffle_right(const ItemT &item, - T (&input)[ElementsPerWorkItem], - T (&output)[ElementsPerWorkItem]) { - auto g = item.get_group(); - size_t id = g.get_local_linear_id(); - _local_memory[id] = input[ElementsPerWorkItem - 1]; - - sycl::group_barrier(g, sycl::memory_scope::work_group); - -#pragma unroll - for (int index = ElementsPerWorkItem - 1; index > 0; --index) - output[index] = input[index - 1]; - - if (id > 0) - output[0] = _local_memory[id - 1]; - } - /// Performs a shuffle operation to move data to the right across the - /// work-items, storing the suffix of the group after the shuffle operation. - /// - /// \tparam ElementsPerWorkItem The number of data elements per work-item. - /// \tparam ItemT The work-item identifier type. - /// \param item The work-item identifier. - /// \param input The input data to be shuffled. - /// \param output The array that will store the shuffle result. - /// \param group_suffix The suffix of the group after the shuffle. - template - __syclcompat_inline__ void - shuffle_right(const ItemT &item, T (&input)[ElementsPerWorkItem], - T (&output)[ElementsPerWorkItem], T &group_suffix) { - shuffle_right(item, input, output); - group_suffix = _local_memory[group_work_items - 1]; - } - /// Performs a shuffle operation to move data to the left across the - /// work-items, shifting elements in a work-item array by one position to the - /// left. - /// - /// \tparam ElementsPerWorkItem The number of data elements per work-item. - /// \tparam ItemT The work-item identifier type. - /// \param item The work-item identifier. - /// \param input The input data to be shuffled. - /// \param output The array that will store the shuffle result. 
- template - __syclcompat_inline__ void shuffle_left(const ItemT &item, - T (&input)[ElementsPerWorkItem], - T (&output)[ElementsPerWorkItem]) { - auto g = item.get_group(); - size_t id = g.get_local_linear_id(); - _local_memory[id] = input[0]; - - sycl::group_barrier(g, sycl::memory_scope::work_group); - -#pragma unroll - for (int index = 0; index < ElementsPerWorkItem - 1; index++) - output[index] = input[index + 1]; - - if (id < group_work_items - 1) - output[ElementsPerWorkItem - 1] = _local_memory[id + 1]; - } - /// Performs a shuffle operation to move data to the left across the - /// work-items, storing the prefix of the group before the shuffle operation. - /// - /// \tparam ElementsPerWorkItem The number of data elements per work-item. - /// \tparam ItemT The work-item identifier type. - /// \param item The work-item identifier. - /// \param input The input data to be shuffled. - /// \param output The array that will store the shuffle result. - /// \param group_prefix The prefix of the group before the shuffle. - template - __syclcompat_inline__ void - shuffle_left(const ItemT &item, T (&input)[ElementsPerWorkItem], - T (&output)[ElementsPerWorkItem], T &group_prefix) { - shuffle_left(item, input, output); - group_prefix = _local_memory[0]; - } -}; -} // namespace group -} // namespace syclcompat diff --git a/sycl/include/syclcompat/id_query.hpp b/sycl/include/syclcompat/id_query.hpp deleted file mode 100644 index 2a61ac7c2127f..0000000000000 --- a/sycl/include/syclcompat/id_query.hpp +++ /dev/null @@ -1,70 +0,0 @@ -/*************************************************************************** - * - * Copyright (C) Codeplay Software Ltd. - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM - * Exceptions. See https://llvm.org/LICENSE.txt for license information. 
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * SYCL compatibility extension - * - * id_query.hpp - * - * Description: - * id_query functionality for the SYCL compatibility extension - **************************************************************************/ - -#pragma once - -#include -#include - -namespace [[deprecated("syclcompat is deprecated")]] syclcompat { - -using sycl::ext::oneapi::this_work_item::get_nd_item; - -inline void wg_barrier() { get_nd_item<3>().barrier(); } - -namespace local_id { -inline size_t x() { return get_nd_item<3>().get_local_id(2); } -inline size_t y() { return get_nd_item<3>().get_local_id(1); } -inline size_t z() { return get_nd_item<3>().get_local_id(0); } -} // namespace local_id - -namespace local_range { -inline size_t x() { return get_nd_item<3>().get_local_range(2); } -inline size_t y() { return get_nd_item<3>().get_local_range(1); } -inline size_t z() { return get_nd_item<3>().get_local_range(0); } -} // namespace local_range - -namespace work_group_id { -inline size_t x() { return get_nd_item<3>().get_group(2); } -inline size_t y() { return get_nd_item<3>().get_group(1); } -inline size_t z() { return get_nd_item<3>().get_group(0); } -} // namespace work_group_id - -namespace work_group_range { -inline size_t x() { return get_nd_item<3>().get_group_range(2); } -inline size_t y() { return get_nd_item<3>().get_group_range(1); } -inline size_t z() { return get_nd_item<3>().get_group_range(0); } -} // namespace work_group_range - -namespace global_range { -inline size_t x() { return get_nd_item<3>().get_global_range(2); } -inline size_t y() { return 
get_nd_item<3>().get_global_range(1); } -inline size_t z() { return get_nd_item<3>().get_global_range(0); } -} // namespace global_range - -namespace global_id { -inline size_t x() { return get_nd_item<3>().get_global_id(2); } -inline size_t y() { return get_nd_item<3>().get_global_id(1); } -inline size_t z() { return get_nd_item<3>().get_global_id(0); } -} // namespace global_id - -} // namespace syclcompat diff --git a/sycl/include/syclcompat/kernel.hpp b/sycl/include/syclcompat/kernel.hpp deleted file mode 100644 index 286761fe343ce..0000000000000 --- a/sycl/include/syclcompat/kernel.hpp +++ /dev/null @@ -1,471 +0,0 @@ -/*************************************************************************** - * - * Copyright (C) Codeplay Software Ltd. - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM - * Exceptions. See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * SYCL compatibility extension - * - * kernel.hpp - * - * Description: - * kernel functionality for the SYCL compatibility extension. - **************************************************************************/ - -// The original source was under the license below: -//==---- kernel.hpp -------------------------------*- C++ -*----------------==// -// -// Copyright (C) Intel Corporation -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// See https://llvm.org/LICENSE.txt for license information. 
-// -//===----------------------------------------------------------------------===// - -#pragma once - -#ifdef _WIN32 -#include -#include -#else -#include -#endif - -#if defined(__has_include) && __has_include() -#include -#elif defined(__has_include) && __has_include() -#include -#else -#error "SYCLomatic runtime requires C++ filesystem support" -#endif - -#include -#include - -#include -#include -#include -#include - -namespace [[deprecated("syclcompat is deprecated")]] syclcompat { - -typedef void (*kernel_functor)(sycl::queue &, const sycl::nd_range<3> &, - unsigned int, void **, void **); - -struct kernel_function_info { - int max_work_group_size = 0; -}; - -static inline void get_kernel_function_info(kernel_function_info *kernel_info, - const void *function) { - kernel_info->max_work_group_size = - detail::dev_mgr::instance() - .current_device() - .get_info(); -} - -static inline kernel_function_info -get_kernel_function_info(const void *function) { - kernel_function_info kernel_info; - kernel_info.max_work_group_size = - detail::dev_mgr::instance() - .current_device() - .get_info(); - return kernel_info; -} - -namespace detail { - -#if defined(__has_include) && __has_include() -namespace fs = std::filesystem; -#else -namespace fs = std::experimental::filesystem; -#endif - -/// Write data to temporary file and return absolute path to temporary file. -/// Temporary file is created in a temporary directory both of which have random -/// names with only the user having access permissions. Only one temporary file -/// will be created in the temporary directory. 
-static inline fs::path write_data_to_file(char const *const data, size_t size) { - std::error_code ec; - - if (sizeof(size_t) >= sizeof(std::streamsize) && - size > (std::numeric_limits::max)()) - throw std::runtime_error("[SYCLcompat] data file too large"); - - // random number generator - std::random_device dev; - std::mt19937 prng(dev()); - std::uniform_int_distribution rand(0); - - // find temporary directory - auto tmp_dir = fs::temp_directory_path(ec); - if (ec) - throw std::runtime_error("[SYCLcompat] could not find temporary directory"); - - // create private directory - std::stringstream directory; - directory.imbue(std::locale::classic()); // avoid locale issues, like commas - fs::path directory_path; - constexpr int max_attempts = 5; - int i; - - for (i = 0; i < max_attempts; i++) { - directory << std::hex << rand(prng); - directory_path = tmp_dir / directory.str(); - if (fs::create_directory(directory_path)) { - break; - } - } - if (i == max_attempts) - throw std::runtime_error("[SYCLcompat] could not create directory"); - - // only allow owner permissions to private directory - fs::permissions(directory_path, fs::perms::owner_all, ec); - if (ec) - throw std::runtime_error( - "[SYCLcompat] could not set directory permissions"); - - // random filename in private directory - std::stringstream filename; - filename.imbue(std::locale::classic()); - filename << std::hex << rand(prng); -#ifdef _WIN32 - auto filepath = directory_path / (filename.str() + ".dll"); -#else - auto filepath = directory_path / filename.str(); -#endif - - // write data to temporary file - auto outfile = std::ofstream(filepath, std::ios::out | std::ios::binary); - if (outfile) { - // only allow program to write file - fs::permissions(filepath, fs::perms::owner_write, ec); - if (ec) - throw std::runtime_error("[SYCLcompat] could not set permissions"); - - outfile.write(data, size); - if (!outfile.good()) - throw std::runtime_error("[SYCLcompat] could not write data"); - outfile.close(); 
- - // only allow program to read/execute file - fs::permissions(filepath, fs::perms::owner_read | fs::perms::owner_exec, - ec); - if (ec) - throw std::runtime_error("[SYCLcompat] could not set permissions"); - } else - throw std::runtime_error("[SYCLcompat] could not write data"); - - // check temporary file contents - auto infile = std::ifstream(filepath, std::ios::in | std::ios::binary); - if (infile) { - bool mismatch = false; - size_t cnt = 0; - - while (1) { - char c; - infile.get(c); - if (infile.eof()) - break; - if (c != data[cnt++]) - mismatch = true; - } - if (cnt != size || mismatch) - throw std::runtime_error( - "[SYCLcompat] file contents not written correctly"); - } else - throw std::runtime_error("[SYCLcompat] could not validate file"); - - if (!filepath.is_absolute()) - throw std::runtime_error("[SYCLcompat] temporary filepath is not absolute"); - - return filepath; -} - -static inline uint16_t extract16(unsigned char const *const ptr) { - uint16_t ret = 0; - - ret |= static_cast(ptr[0]) << 0; - ret |= static_cast(ptr[1]) << 8; - - return (ret); -} - -static inline uint32_t extract32(unsigned char const *const ptr) { - uint32_t ret = 0; - - ret |= static_cast(ptr[0]) << 0; - ret |= static_cast(ptr[1]) << 8; - ret |= static_cast(ptr[2]) << 16; - ret |= static_cast(ptr[3]) << 24; - - return (ret); -} - -static inline uint64_t extract64(unsigned char const *const ptr) { - uint64_t ret = 0; - - ret |= static_cast(ptr[0]) << 0; - ret |= static_cast(ptr[1]) << 8; - ret |= static_cast(ptr[2]) << 16; - ret |= static_cast(ptr[3]) << 24; - ret |= static_cast(ptr[4]) << 32; - ret |= static_cast(ptr[5]) << 40; - ret |= static_cast(ptr[6]) << 48; - ret |= static_cast(ptr[7]) << 56; - - return (ret); -} - -static inline uint64_t get_lib_size(char const *const blob) { -#ifdef _WIN32 - /////////////////////////////////////////////////////////////////////// - // Analyze DOS stub - unsigned char const *const ublob = - reinterpret_cast(blob); - if (ublob[0] != 0x4d 
|| ublob[1] != 0x5a) { - throw std::runtime_error("[SYCLcompat] blob is not a Windows DLL."); - } - uint32_t pe_header_offset = extract32(ublob + 0x3c); - - /////////////////////////////////////////////////////////////////////// - // Ananlyze PE-header - unsigned char const *const pe_header = ublob + pe_header_offset; - - // signature - uint32_t pe_signature = extract32(pe_header + 0); - if (pe_signature != 0x00004550) { - throw std::runtime_error( - "[SYCLcompat] PE-header signature is not 0x00004550"); - } - - // machine - uint16_t machine = extract16(pe_header + 4); - if (machine != 0x8664) { - throw std::runtime_error("[SYCLcompat] only DLLs for x64 supported"); - } - - // number of sections - uint16_t number_of_sections = extract16(pe_header + 6); - - // sizeof optional header - uint16_t sizeof_optional_header = extract16(pe_header + 20); - - // magic - uint16_t magic = extract16(pe_header + 24); - if (magic != 0x10b && magic != 0x20b) { - throw std::runtime_error("[SYCLcompat] MAGIC is not 0x010b or 0x020b"); - } - - /////////////////////////////////////////////////////////////////////// - // Analyze tail of optional header - constexpr int coff_header_size = 24; - - unsigned char const *const tail_of_optional_header = - pe_header + coff_header_size + sizeof_optional_header; - if (extract64(tail_of_optional_header - 8) != 0) { - throw std::runtime_error("Optional header not zero-padded"); - } - - /////////////////////////////////////////////////////////////////////// - // Analyze last section header - constexpr int section_header_size = 40; - unsigned char const *const last_section_header = - tail_of_optional_header + section_header_size * (number_of_sections - 1); - - uint32_t sizeof_raw_data = extract32(last_section_header + 16); - uint32_t pointer_to_raw_data = extract32(last_section_header + 20); - - return sizeof_raw_data + pointer_to_raw_data; -#else - if (blob[0] != 0x7F || blob[1] != 'E' || blob[2] != 'L' || blob[3] != 'F') - throw 
std::runtime_error("[SYCLcompat] blob is not in ELF format"); - - if (blob[4] != 0x02) - throw std::runtime_error("[SYCLcompat] only 64-bit headers are supported"); - - if (blob[5] != 0x01) - throw std::runtime_error( - "[SYCLcompat] only little-endian headers are supported"); - - unsigned char const *const ublob = - reinterpret_cast(blob); - uint64_t e_shoff = extract64(ublob + 0x28); - uint16_t e_shentsize = extract16(ublob + 0x3A); - uint16_t e_shnum = extract16(ublob + 0x3C); - - return e_shoff + (e_shentsize * e_shnum); -#endif -} - -#ifdef _WIN32 -class path_lib_record { -public: - void operator=(const path_lib_record &) = delete; - ~path_lib_record() { - for (auto entry : lib_to_path) { - FreeLibrary(static_cast(entry.first)); - fs::permissions(entry.second, fs::perms::owner_all); - fs::remove_all(entry.second.remove_filename()); - } - } - static void record_lib_path(fs::path path, void *library) { - lib_to_path[library] = path; - } - static void remove_lib(void *library) { - auto path = lib_to_path[library]; - std::error_code ec; - - FreeLibrary(static_cast(library)); - fs::permissions(path, fs::perms::owner_all); - if (fs::remove_all(path.remove_filename(), ec) != 2 || ec) - // one directory and one temporary file should have been deleted - throw std::runtime_error("[SYCLcompat] directory delete failed"); - - lib_to_path.erase(library); - } - -private: - static inline std::unordered_map lib_to_path; -}; -#endif - -} // namespace detail - -class kernel_library { -public: - constexpr kernel_library() : ptr{nullptr} {} - constexpr kernel_library(void *ptr) : ptr{ptr} {} - - operator void *() const { return ptr; } - -private: - void *ptr; -#ifdef _WIN32 - static inline detail::path_lib_record single_instance_to_trigger_destructor; -#endif -}; - -namespace detail { - -static inline kernel_library load_dl_from_data(char const *const data, - size_t size) { - fs::path filename = write_data_to_file(data, size); -#ifdef _WIN32 - void *so = 
LoadLibraryW(filename.wstring().c_str()); -#else - void *so = dlopen(filename.c_str(), RTLD_LAZY); -#endif - if (so == nullptr) - throw std::runtime_error("[SYCLcompat] failed to load kernel library"); - -#ifdef _WIN32 - detail::path_lib_record::record_lib_path(filename, so); -#else - std::error_code ec; - - // Windows DLL cannot be deleted while in use - if (fs::remove_all(filename.remove_filename(), ec) != 2 || ec) - // one directory and one temporary file should have been deleted - throw std::runtime_error("[SYCLcompat] directory delete failed"); -#endif - - return so; -} - -} // namespace detail - -/// Load kernel library and return a handle to use the library. -/// \param [in] name The name of the library. -static inline kernel_library load_kernel_library(const std::string &name) { - std::ifstream ifs; - ifs.open(name, std::ios::in | std::ios::binary); - - std::stringstream buffer; - buffer << ifs.rdbuf(); - - const std::string buffer_string = buffer.str(); - return detail::load_dl_from_data(buffer_string.c_str(), buffer_string.size()); -} - -/// Load kernel library whose image is alreay in memory and return a handle to -/// use the library. -/// \param [in] image A pointer to the image in memory. -static inline kernel_library load_kernel_library_mem(char const *const image) { - const size_t size = detail::get_lib_size(image); - - return detail::load_dl_from_data(image, size); -} - -/// Unload kernel library. -/// \param [in,out] library Handle to the library to be closed. 
-static inline void unload_kernel_library(const kernel_library &library) { -#ifdef _WIN32 - detail::path_lib_record::remove_lib(library); -#else - dlclose(library); -#endif -} - -class kernel_function { -public: - constexpr kernel_function() : ptr{nullptr} {} - constexpr kernel_function(kernel_functor ptr) : ptr{ptr} {} - - operator void *() const { return ((void *)ptr); } - - void operator()(sycl::queue &q, const sycl::nd_range<3> &range, - unsigned int local_mem_size, void **args, void **extra) { - ptr(q, range, local_mem_size, args, extra); - } - -private: - kernel_functor ptr; -}; - -/// Find kernel function in a kernel library and return its address. -/// \param [in] library Handle to the kernel library. -/// \param [in] name Name of the kernel function. -static inline kernel_function get_kernel_function(kernel_library &library, - const std::string &name) { -#ifdef _WIN32 - kernel_functor fn = reinterpret_cast( - GetProcAddress(static_cast(static_cast(library)), - (name + std::string("_wrapper")).c_str())); -#else - kernel_functor fn = reinterpret_cast( - dlsym(library, (name + std::string("_wrapper")).c_str())); -#endif - if (fn == nullptr) - throw std::runtime_error("[SYCLcompat] failed to get function"); - return fn; -} - -/// Invoke a kernel function. -/// \param [in] function kernel function. -/// \param [in] queue SYCL queue used to execute kernel -/// \param [in] group_range SYCL group range -/// \param [in] local_range SYCL local range -/// \param [in] local_mem_size The size of local memory required by the kernel -/// function. -/// \param [in] kernel_params Array of pointers to kernel arguments. -/// \param [in] extra Extra arguments. 
-static inline void invoke_kernel_function(kernel_function &function, - sycl::queue &queue, - sycl::range<3> group_range, - sycl::range<3> local_range, - unsigned int local_mem_size, - void **kernel_params, void **extra) { - function(queue, sycl::nd_range<3>(group_range * local_range, local_range), - local_mem_size, kernel_params, extra); -} - -} // namespace syclcompat diff --git a/sycl/include/syclcompat/launch.hpp b/sycl/include/syclcompat/launch.hpp deleted file mode 100644 index 83234182c8fee..0000000000000 --- a/sycl/include/syclcompat/launch.hpp +++ /dev/null @@ -1,164 +0,0 @@ -/*************************************************************************** - * - * Copyright (C) Codeplay Software Ltd. - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM - * Exceptions. See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - * SYCL compatibility extension - * - * launch.hpp - * - * Description: - * launch functionality for the SYCL compatibility extension - **************************************************************************/ - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -namespace [[deprecated("syclcompat is deprecated")]] syclcompat { - -namespace detail { - -template -constexpr size_t getArgumentCount(R (*f)(Types...)) { - return sizeof...(Types); -} - -template -sycl::nd_range<3> transform_nd_range(const sycl::nd_range &range) { - sycl::range global_range = range.get_global_range(); - sycl::range local_range = range.get_local_range(); - if constexpr (Dim == 3) { - return range; - } else if constexpr (Dim == 2) { - return sycl::nd_range<3>{{1, global_range[0], global_range[1]}, - {1, local_range[0], local_range[1]}}; - } - return sycl::nd_range<3>{{1, 1, global_range[0]}, {1, 1, local_range[0]}}; -} - -template -std::enable_if_t, sycl::event> -launch(const sycl::nd_range<3> &range, sycl::queue q, Args... 
args) { - static_assert(detail::getArgumentCount(F) == sizeof...(args), - "Wrong number of arguments to SYCL kernel"); - static_assert( - std::is_same, void>::value, - "SYCL kernels should return void"); - - return q.parallel_for( - range, [=](sycl::nd_item<3>) { [[clang::always_inline]] F(args...); }); -} - -} // namespace detail - -template -inline sycl::nd_range compute_nd_range(sycl::range global_size_in, - sycl::range work_group_size) { - - if (global_size_in.size() == 0 || work_group_size.size() == 0) { - throw std::invalid_argument("Global or local size is zero!"); - } - for (size_t i = 0; i < Dim; ++i) { - if (global_size_in[i] < work_group_size[i]) - throw std::invalid_argument("Work group size larger than global size"); - } - - auto global_size = - ((global_size_in + work_group_size - 1) / work_group_size) * - work_group_size; - return {global_size, work_group_size}; -} - -inline sycl::nd_range<1> compute_nd_range(int global_size_in, - int work_group_size) { - return compute_nd_range<1>(global_size_in, work_group_size); -} - -template -std::enable_if_t, sycl::event> -launch(const sycl::nd_range &range, sycl::queue q, Args... args) { - return detail::launch(detail::transform_nd_range(range), q, args...); -} - -template -std::enable_if_t, sycl::event> -launch(const sycl::nd_range &range, Args... args) { - return launch(range, get_default_queue(), args...); -} - -// Alternative launch through dim3 objects -template -std::enable_if_t, sycl::event> -launch(const dim3 &grid, const dim3 &threads, sycl::queue q, Args... args) { - return launch(sycl::nd_range<3>{grid * threads, threads}, q, args...); -} - -template -std::enable_if_t, sycl::event> -launch(const dim3 &grid, const dim3 &threads, Args... 
args) { - return launch(grid, threads, get_default_queue(), args...); -} - -} // namespace syclcompat - -namespace [[deprecated("syclcompat is deprecated")]] syclcompat { -namespace experimental { -namespace detail { - -template -sycl::event launch(LaunchPolicy launch_policy, sycl::queue q, Args... args) { - static_assert(syclcompat::args_compatible, - "Mismatch between device function signature and supplied " - "arguments. Have you correctly handled local memory/char*?"); - - sycl_exp::launch_config config(launch_policy.get_range(), - launch_policy.get_launch_properties()); - - return sycl_exp::submit_with_event(q, [&](sycl::handler &cgh) { - auto KernelFunctor = build_kernel_functor(cgh, launch_policy, args...); - if constexpr (syclcompat::detail::is_range_v< - typename LaunchPolicy::RangeT>) { - parallel_for(cgh, config, KernelFunctor); - } else { - static_assert( - syclcompat::detail::is_nd_range_v); - nd_launch(cgh, config, KernelFunctor); - } - }); -} -} - - -template -sycl::event launch(LaunchPolicy launch_policy, sycl::queue q, Args... args) { - static_assert(detail::is_launch_policy_v); - return detail::launch(launch_policy, q, args...); -} - -template -sycl::event launch(LaunchPolicy launch_policy, Args... args) { - static_assert(detail::is_launch_policy_v); - return launch(launch_policy, get_default_queue(), args...); -} - -} // namespace experimental -} // namespace syclcompat diff --git a/sycl/include/syclcompat/launch_policy.hpp b/sycl/include/syclcompat/launch_policy.hpp deleted file mode 100644 index 13980d03c93c9..0000000000000 --- a/sycl/include/syclcompat/launch_policy.hpp +++ /dev/null @@ -1,272 +0,0 @@ -/*************************************************************************** - * - * Copyright (C) Codeplay Software Ltd. - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM - * Exceptions. See https://llvm.org/LICENSE.txt for license information. 
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * SYCL compatibility extension - * - * launch.hpp - * - * Description: - * launch functionality for the SYCL compatibility extension - **************************************************************************/ - -#pragma once - -#include "sycl/ext/oneapi/experimental/enqueue_functions.hpp" -#include "sycl/ext/oneapi/properties/properties.hpp" -#include -#include -#include -#include - -#include -#include -#include -#include - -namespace [[deprecated("syclcompat is deprecated")]] syclcompat { -namespace experimental { - -namespace sycl_exp = sycl::ext::oneapi::experimental; - -// Wrapper for kernel sycl_exp::properties -template struct kernel_properties { - static_assert(sycl_exp::is_property_list_v); - using Props = Properties; - - template - kernel_properties(Props... properties) : props{properties...} {} - - template - kernel_properties(sycl_exp::properties properties) - : props{properties} {} - - Properties props; -}; - -template ::value, void>> -kernel_properties(Props... props) - -> kernel_properties; - -template -kernel_properties(sycl_exp::properties props) - -> kernel_properties>; - -// Wrapper for launch sycl_exp::properties -template struct launch_properties { - static_assert(sycl_exp::is_property_list_v); - using Props = Properties; - - template - launch_properties(Props... properties) : props{properties...} {} - - template - launch_properties(sycl_exp::properties properties) - : props{properties} {} - - Properties props; -}; - -template ::value, void>> -launch_properties(Props... 
props) - -> launch_properties; - -template -launch_properties(sycl_exp::properties props) - -> launch_properties>; - -// Wrapper for local memory size -struct local_mem_size { - local_mem_size(size_t size = 0) : size{size} {}; - size_t size; -}; - -// launch_policy is constructed by the user & passed to `compat_exp::launch` -template -class launch_policy { - static_assert(sycl_exp::is_property_list_v); - static_assert(sycl_exp::is_property_list_v); - static_assert(syclcompat::detail::is_range_or_nd_range_v); - static_assert(syclcompat::detail::is_nd_range_v || !LocalMem, - "sycl::range kernel launches are incompatible with local " - "memory usage!"); - -public: - using KPropsT = KProps; - using LPropsT = LProps; - using RangeT = Range; - static constexpr bool HasLocalMem = LocalMem; - -private: - launch_policy() = default; - - template - launch_policy(Ts... ts) - : _kernel_properties{detail::property_getter< - kernel_properties, kernel_properties, std::tuple>()( - std::tuple(ts...))}, - _launch_properties{detail::property_getter< - launch_properties, launch_properties, std::tuple>()( - std::tuple(ts...))}, - _local_mem_size{ - detail::local_mem_getter>()( - std::tuple(ts...))} { - check_variadic_args(ts...); - } - - template void check_variadic_args(Ts...) { - static_assert( - std::conjunction_v, - detail::is_launch_properties, - detail::is_local_mem_size>...>, - "Received an unexpected argument to ctor. Did you forget to wrap " - "in " - "compat::kernel_properties, launch_properties, local_mem_size?"); - } - -public: - template - launch_policy(Range range, Ts... ts) : launch_policy(ts...) { - _range = range; - check_variadic_args(ts...); - } - - template - launch_policy(dim3 global_range, Ts... ts) : launch_policy(ts...) { - _range = Range{global_range}; - check_variadic_args(ts...); - } - - template - launch_policy(dim3 global_range, dim3 local_range, Ts... ts) - : launch_policy(ts...) 
{ - _range = Range{global_range * local_range, local_range}; - check_variadic_args(ts...); - } - - KProps get_kernel_properties() { return _kernel_properties.props; } - LProps get_launch_properties() { return _launch_properties.props; } - size_t get_local_mem_size() { return _local_mem_size.size; } - Range get_range() { return _range; } - -private: - Range _range; - kernel_properties _kernel_properties; - launch_properties _launch_properties; - local_mem_size _local_mem_size; -}; - -// Deduction guides for launch_policy -template -launch_policy(Range, Ts...) -> launch_policy< - Range, detail::properties_or_empty, - detail::properties_or_empty, - detail::has_type>::value>; - -template -launch_policy(sycl::range, sycl::range, Ts...) -> launch_policy< - sycl::nd_range, detail::properties_or_empty, - detail::properties_or_empty, - detail::has_type>::value>; - -template -launch_policy(dim3, Ts...) -> launch_policy< - sycl::range<3>, detail::properties_or_empty, - detail::properties_or_empty, - detail::has_type>::value>; - -template -launch_policy(dim3, dim3, Ts...) -> launch_policy< - sycl::nd_range<3>, detail::properties_or_empty, - detail::properties_or_empty, - detail::has_type>::value>; - -namespace detail { -// Custom std::apply helpers to enable inlining -template -__syclcompat_inline__ constexpr void apply_expand(F &&f, Tuple &&t, - std::index_sequence) { - [[clang::always_inline]] std::forward(f)( - get(std::forward(t))...); -} - -template -__syclcompat_inline__ constexpr void apply_helper(F &&f, Tuple &&t) { - apply_expand( - std::forward(f), std::forward(t), - std::make_index_sequence>>{}); -} - -template -struct KernelFunctor { - KernelFunctor(KProps kernel_props, Args... args) - : _kernel_properties{kernel_props}, - _argument_tuple(std::make_tuple(args...)) {} - - KernelFunctor(KProps kernel_props, sycl::local_accessor local_acc, - Args... 
args) - : _kernel_properties{kernel_props}, _local_acc{local_acc}, - _argument_tuple(std::make_tuple(args...)) {} - - auto get(sycl_exp::properties_tag) const { return _kernel_properties; } - - __syclcompat_inline__ void - operator()(syclcompat::detail::range_to_item_t) const { - if constexpr (HasLocalMem) { - char *local_mem_ptr = static_cast( - _local_acc.template get_multi_ptr() - .get()); - apply_helper( - [lmem_ptr = local_mem_ptr](auto &&...args) { - [[clang::always_inline]] F(args..., lmem_ptr); - }, - _argument_tuple); - } else { - apply_helper([](auto &&...args) { [[clang::always_inline]] F(args...); }, - _argument_tuple); - } - } - - KProps _kernel_properties; - std::tuple _argument_tuple; - std::conditional_t, std::monostate> - _local_acc; // monostate for empty type -}; - -//==================================================================== -// This helper function avoids 2 nested `if constexpr` in detail::launch -template -auto build_kernel_functor(sycl::handler &cgh, LaunchPolicy launch_policy, - Args... args) - -> KernelFunctor { - if constexpr (LaunchPolicy::HasLocalMem) { - sycl::local_accessor local_memory( - launch_policy.get_local_mem_size(), cgh); - return KernelFunctor( - launch_policy.get_kernel_properties(), local_memory, args...); - } else { - return KernelFunctor( - launch_policy.get_kernel_properties(), args...); - } -} - -} // namespace detail -} // namespace experimental -} // namespace syclcompat diff --git a/sycl/include/syclcompat/math.hpp b/sycl/include/syclcompat/math.hpp deleted file mode 100644 index f70a2b0dcb085..0000000000000 --- a/sycl/include/syclcompat/math.hpp +++ /dev/null @@ -1,2385 +0,0 @@ -/*************************************************************************** - * - * Copyright (C) Codeplay Software Ltd. - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM - * Exceptions. See https://llvm.org/LICENSE.txt for license information. 
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * SYCL compatibility extension - * - * math.hpp - * - * Description: - * math utilities for the SYCL compatibility extension. - **************************************************************************/ - -// The original source was under the license below: -//==---- math.hpp ---------------------------------*- C++ -*----------------==// -// -// Copyright (C) Intel Corporation -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// See https://llvm.org/LICENSE.txt for license information. -// -//===----------------------------------------------------------------------===// - -#pragma once - -#include -#include -#include - -// TODO(syclcompat-lib-reviewers): this should not be required -#ifndef SYCL_EXT_ONEAPI_COMPLEX -#define SYCL_EXT_ONEAPI_COMPLEX -#endif - -#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS -#include -#endif -#include -#include - -namespace [[deprecated("syclcompat is deprecated")]] syclcompat { -namespace detail { - -namespace complex_namespace = sycl::ext::oneapi::experimental; - -template -using complex_type = detail::complex_namespace::complex; - -template -constexpr bool is_int32_type = std::is_same_v, int32_t> || - std::is_same_v, uint32_t>; - -// Helper constexpr bool to avoid ugly macros where possible -#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS -constexpr bool support_bfloat16_math = true; -#else -constexpr bool support_bfloat16_math = false; -#endif - -template -inline ValueT clamp(ValueT val, ValueT min_val, ValueT max_val) { - return sycl::clamp(val, min_val, max_val); -} -#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS -// 
TODO(syclcompat-lib-reviewers): Follow the process to add this (& other math -// fns) to the bfloat16 math function extension. If added, remove this -// functionality from the header. -template <> -inline sycl::ext::oneapi::bfloat16 clamp(sycl::ext::oneapi::bfloat16 val, - sycl::ext::oneapi::bfloat16 min_val, - sycl::ext::oneapi::bfloat16 max_val) { - if (val < min_val) - return min_val; - if (val > max_val) - return max_val; - return val; -} - -template -inline std::enable_if_t, - sycl::vec> -clamp(sycl::vec val, sycl::vec min_val, - sycl::vec max_val) { - return [&val, &min_val, &max_val](std::integer_sequence) { - return sycl::vec{ - clamp(val[I], min_val[I], max_val[I])...}; - }(std::make_integer_sequence{}); -} - -template -inline std::enable_if_t, - sycl::marray> -clamp(sycl::marray val, sycl::marray min_val, - sycl::marray max_val) { - return [&val, &min_val, &max_val](std::index_sequence) { - return sycl::marray{ - clamp(val[I], min_val[I], max_val[I])...}; - }(std::make_index_sequence{}); -} -#endif - -template -class vectorized_binary { -public: - inline VecT operator()(VecT a, VecT b, const BinaryOperation binary_op) { - VecT v4; - for (size_t i = 0; i < v4.size(); ++i) { - v4[i] = binary_op(a[i], b[i]); - } - return v4; - } -}; - -template -class vectorized_binary< - VecT, BinaryOperation, - std::void_t>> { -public: - inline VecT operator()(VecT a, VecT b, const BinaryOperation binary_op) { - return binary_op(a, b).template as(); - } -}; - -/// Extend the 'val' to 'bit' size, zero extend for unsigned int and signed -/// extend for signed int. Returns a signed integer type. 
-template -inline auto zero_or_signed_extend(ValueT val, unsigned bit) { - static_assert(std::is_integral_v); - if constexpr (sizeof(ValueT) == 4) { - assert(bit < 64 && - "When extending int32 value, bit must be smaller than 64."); - if constexpr (std::is_signed_v) - return int64_t(val) << (64 - bit) >> (64 - bit); - else - return int64_t(val); - } else if constexpr (sizeof(ValueT) == 2) { - assert(bit < 32 && - "When extending int16 value, bit must be smaller than 32."); - if constexpr (std::is_signed_v) - return int32_t(val) << (32 - bit) >> (32 - bit); - else - return int32_t(val); - } else if constexpr (sizeof(ValueT) == 1) { - assert(bit < 16 && - "When extending int8 value, bit must be smaller than 16."); - if constexpr (std::is_signed_v) - return int16_t(val) << (16 - bit) >> (16 - bit); - else - return int16_t(val); - } else { - static_assert(sizeof(ValueT) == 8); - assert(bit < 64 && "Cannot extend int64 value."); - return static_cast(val); - } -} - -template -inline constexpr RetT extend_binary(AT a, BT b, BinaryOperation binary_op) { - const int64_t extend_a = zero_or_signed_extend(a, 33); - const int64_t extend_b = zero_or_signed_extend(b, 33); - const int64_t ret = binary_op(extend_a, extend_b); - if constexpr (needSat) - return detail::clamp(ret, std::numeric_limits::min(), - std::numeric_limits::max()); - return ret; -} - -template -inline constexpr RetT extend_binary(AT a, BT b, CT c, - BinaryOperation1 binary_op, - BinaryOperation2 second_op) { - const int64_t extend_a = zero_or_signed_extend(a, 33); - const int64_t extend_b = zero_or_signed_extend(b, 33); - int64_t extend_temp = - zero_or_signed_extend(binary_op(extend_a, extend_b), 34); - if constexpr (needSat) - extend_temp = - detail::clamp(extend_temp, std::numeric_limits::min(), - std::numeric_limits::max()); - const int64_t extend_c = zero_or_signed_extend(c, 33); - return second_op(extend_temp, extend_c); -} - -template sycl::vec extract_and_extend2(T a) { - sycl::vec ret; - sycl::vec 
va{a}; - using IntT = std::conditional_t, int16_t, uint16_t>; - auto v = va.template as>(); - ret[0] = zero_or_signed_extend(v[0], 17); - ret[1] = zero_or_signed_extend(v[1], 17); - return ret; -} - -template sycl::vec extract_and_extend4(T a) { - sycl::vec ret; - sycl::vec va{a}; - using IntT = std::conditional_t, int8_t, uint8_t>; - auto v = va.template as>(); - ret[0] = zero_or_signed_extend(v[0], 9); - ret[1] = zero_or_signed_extend(v[1], 9); - ret[2] = zero_or_signed_extend(v[2], 9); - ret[3] = zero_or_signed_extend(v[3], 9); - return ret; -} - -template -inline constexpr RetT extend_vbinary2(AT a, BT b, RetT c, - BinaryOperation binary_op) { - static_assert(is_int32_type && is_int32_type && is_int32_type); - sycl::vec extend_a = extract_and_extend2(a); - sycl::vec extend_b = extract_and_extend2(b); - sycl::vec temp{binary_op(extend_a[0], extend_b[0]), - binary_op(extend_a[1], extend_b[1])}; - using IntT = std::conditional_t, int16_t, uint16_t>; - - if constexpr (NeedSat) { - int32_t min_val = 0, max_val = 0; - min_val = std::numeric_limits::min(); - max_val = std::numeric_limits::max(); - temp = detail::clamp(temp, sycl::vec(min_val), - sycl::vec(max_val)); - } - if constexpr (NeedAdd) { - return temp[0] + temp[1] + c; - } - return sycl::vec{temp[0], temp[1]}.template as>(); -} - -template -inline constexpr RetT extend_vbinary4(AT a, BT b, RetT c, - BinaryOperation binary_op) { - static_assert(is_int32_type && is_int32_type && is_int32_type); - sycl::vec extend_a = extract_and_extend4(a); - sycl::vec extend_b = extract_and_extend4(b); - sycl::vec temp{ - binary_op(extend_a[0], extend_b[0]), binary_op(extend_a[1], extend_b[1]), - binary_op(extend_a[2], extend_b[2]), binary_op(extend_a[3], extend_b[3])}; - using IntT = std::conditional_t, int8_t, uint8_t>; - - if constexpr (NeedSat) { - int16_t min_val = 0, max_val = 0; - min_val = std::numeric_limits::min(); - max_val = std::numeric_limits::max(); - temp = detail::clamp(temp, sycl::vec(min_val), - 
sycl::vec(max_val)); - } - if constexpr (NeedAdd) { - return temp[0] + temp[1] + temp[2] + temp[3] + c; - } - - return sycl::vec{temp[0], temp[1], temp[2], temp[3]} - .template as>(); -} - -template inline bool isnan(const ValueT a) { - if constexpr (std::is_same_v) { - static_assert(detail::support_bfloat16_math); - return sycl::ext::oneapi::experimental::isnan(a); - } else { - return sycl::isnan(a); - } -} - -// FIXME(syclcompat-lib-reviewers): move bfe outside detail once perf is -// improved & semantics understood -/// Bitfield-extract. -/// -/// \tparam T The type of \param source value, must be an integer. -/// \param source The source value to extracting. -/// \param bit_start The position to start extracting. -/// \param num_bits The number of bits to extracting. -template -inline T bfe(const T source, const uint32_t bit_start, - const uint32_t num_bits) { - static_assert(std::is_unsigned_v); - // FIXME(syclcompat-lib-reviewers): This ternary was added to catch a case - // which may be undefined anyway. Consider that we are losing perf here. - const T mask = - num_bits >= std::numeric_limits::digits * sizeof(T) - ? static_cast(-1) - : ((static_cast(1) << num_bits) - 1); - return (source >> bit_start) & mask; -} - -} // namespace detail - -/// Bitfield-extract with boundary checking. -/// -/// Extract bit field from \param source and return the zero or sign-extended -/// result. Source \param bit_start gives the bit field starting bit position, -/// and source \param num_bits gives the bit field length in bits. -/// -/// The result is padded with the sign bit of the extracted field. If `num_bits` -/// is zero, the result is zero. If the start position is beyond the msb of the -/// input, the result is filled with the replicated sign bit of the extracted -/// field. -/// -/// \tparam T The type of \param source value, must be an integer. -/// \param source The source value to extracting. -/// \param bit_start The position to start extracting. 
-/// \param num_bits The number of bits to extracting. -template -inline T bfe_safe(const T source, const uint32_t bit_start, - const uint32_t num_bits) { - static_assert(std::is_integral_v); -#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) - if constexpr (std::is_same_v || std::is_same_v || - std::is_same_v) { - int32_t res{}; - asm volatile("bfe.s32 %0, %1, %2, %3;" - : "=r"(res) - : "r"((int32_t)source), "r"(bit_start), "r"(num_bits)); - return res; - } else if constexpr (std::is_same_v || - std::is_same_v || - std::is_same_v) { - uint32_t res{}; - asm volatile("bfe.u32 %0, %1, %2, %3;" - : "=r"(res) - : "r"((uint32_t)source), "r"(bit_start), "r"(num_bits)); - return res; - } else if constexpr (std::is_same_v) { - T res{}; - asm volatile("bfe.s64 %0, %1, %2, %3;" - : "=l"(res) - : "l"(source), "r"(bit_start), "r"(num_bits)); - return res; - } else if constexpr (std::is_same_v) { - T res{}; - asm volatile("bfe.u64 %0, %1, %2, %3;" - : "=l"(res) - : "l"(source), "r"(bit_start), "r"(num_bits)); - return res; - } -#endif - const uint32_t bit_width = - std::numeric_limits::digits * sizeof(T); - const uint32_t pos = std::min(bit_start, bit_width); - const uint32_t len = std::min(pos + num_bits, bit_width) - pos; - if constexpr (std::is_signed_v) { - // FIXME(syclcompat-lib-reviewers): As above, catching a case whose result - // is undefined and likely losing perf. - const T mask = len >= bit_width ? T{-1} : static_cast((T{1} << len) - 1); - - // Find the sign-bit, the result is padded with the sign bit of the - // extracted field. 
- // Note if requested num_bits==0, we return zero via sign_bit=0 - const uint32_t sign_bit_pos = std::min(pos + len - 1, bit_width - 1); - const T sign_bit = num_bits != 0 && ((source >> sign_bit_pos) & 1); - const T sign_bit_padding = (-sign_bit & ~mask); - return ((source >> pos) & mask) | sign_bit_padding; - } else { - return syclcompat::detail::bfe(source, pos, len); - } -} - -namespace detail { -// FIXME(syclcompat-lib-reviewers): move bfi outside detail once perf is -// improved & semantics understood -/// Bitfield-insert. -/// -/// \tparam T The type of \param x and \param y , must be an unsigned integer. -/// \param x The source of the bitfield. -/// \param y The source where bitfield is inserted. -/// \param bit_start The position to start insertion. -/// \param num_bits The number of bits to insertion. -template -inline T bfi(const T x, const T y, const uint32_t bit_start, - const uint32_t num_bits) { - static_assert(std::is_unsigned_v); - constexpr unsigned bit_width = - std::numeric_limits::digits * sizeof(T); - - // if bit_start > bit_width || len == 0, should return y. - const T ignore_bfi = static_cast(bit_start > bit_width || num_bits == 0); - T extract_bitfield_mask = (static_cast(~T{0}) >> (bit_width - num_bits)) - << bit_start; - T clean_bitfield_mask = ~extract_bitfield_mask; - return (y & (-ignore_bfi | clean_bitfield_mask)) | - (~-ignore_bfi & ((x << bit_start) & extract_bitfield_mask)); -} -} // namespace detail - -/// Bitfield-insert with boundary checking. -/// -/// Align and insert a bit field from \param x into \param y . Source \param -/// bit_start gives the starting bit position for the insertion, and source -/// \param num_bits gives the bit field length in bits. -/// -/// \tparam T The type of \param x and \param y , must be an unsigned integer. -/// \param x The source of the bitfield. -/// \param y The source where bitfield is inserted. -/// \param bit_start The position to start insertion. 
-/// \param num_bits The number of bits to insertion. -template -inline T bfi_safe(const T x, const T y, const uint32_t bit_start, - const uint32_t num_bits) { - static_assert(std::is_unsigned_v); -#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) - if constexpr (std::is_same_v || std::is_same_v || - std::is_same_v) { - uint32_t res{}; - asm volatile("bfi.b32 %0, %1, %2, %3, %4;" - : "=r"(res) - : "r"((uint32_t)x), "r"((uint32_t)y), "r"(bit_start), - "r"(num_bits)); - return res; - } else if constexpr (std::is_same_v) { - uint64_t res{}; - asm volatile("bfi.b64 %0, %1, %2, %3, %4;" - : "=l"(res) - : "l"(x), "l"(y), "r"(bit_start), "r"(num_bits)); - return res; - } -#endif - constexpr unsigned bit_width = - std::numeric_limits::digits * sizeof(T); - const uint32_t pos = std::min(bit_start, bit_width); - const uint32_t len = std::min(pos + num_bits, bit_width) - pos; - return syclcompat::detail::bfi(x, y, pos, len); -} - -/// Emulated function for __funnelshift_l -inline unsigned int funnelshift_l(unsigned int low, unsigned int high, - unsigned int shift) { - return (sycl::upsample(high, low) << (shift & 31U)) >> 32; -} - -/// Emulated function for __funnelshift_lc -inline unsigned int funnelshift_lc(unsigned int low, unsigned int high, - unsigned int shift) { - return (sycl::upsample(high, low) << sycl::min(shift, 32U)) >> 32; -} - -/// Emulated function for __funnelshift_r -inline unsigned int funnelshift_r(unsigned int low, unsigned int high, - unsigned int shift) { - return (sycl::upsample(high, low) >> (shift & 31U)) & 0xFFFFFFFF; -} - -/// Emulated function for __funnelshift_rc -inline unsigned int funnelshift_rc(unsigned int low, unsigned int high, - unsigned int shift) { - return (sycl::upsample(high, low) >> sycl::min(shift, 32U)) & 0xFFFFFFFF; -} - -/// Compute fast_length for variable-length array -/// \param [in] a The array -/// \param [in] len Length of the array -/// \returns The computed fast_length -inline float fast_length(const float *a, int 
len) { - switch (len) { - case 1: - return sycl::fast_length(a[0]); - case 2: - return sycl::fast_length(sycl::float2(a[0], a[1])); - case 3: - return sycl::fast_length(sycl::float3(a[0], a[1], a[2])); - case 4: - return sycl::fast_length(sycl::float4(a[0], a[1], a[2], a[3])); - case 0: - return 0; - default: - float f = 0; - for (int i = 0; i < len; ++i) - f += a[i] * a[i]; - return sycl::sqrt(f); - } -} - -/// Calculate the square root of the input array. -/// \param [in] a The array pointer -/// \param [in] len Length of the array -/// \returns The square root -template -inline ValueT length(const ValueT *a, const int len) { - switch (len) { - case 1: - return a[0]; - case 2: - return sycl::length(sycl::vec(a[0], a[1])); - case 3: - return sycl::length(sycl::vec(a[0], a[1], a[2])); - case 4: - return sycl::length(sycl::vec(a[0], a[1], a[2], a[3])); - default: - ValueT ret = 0; - for (int i = 0; i < len; ++i) - ret += a[i] * a[i]; - return sycl::sqrt(ret); - } -} - -/// Performs comparison. -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] binary_op functor that implements the binary operation -/// \returns the comparison result -template -inline std::enable_if_t< - std::is_same_v, bool>, - bool> -compare(const ValueT a, const ValueT b, const BinaryOperation binary_op) { - return binary_op(a, b); -} -template -inline std::enable_if_t< - std::is_same_v, ValueT, ValueT>, - bool>, - bool> -compare(const ValueT a, const ValueT b, const std::not_equal_to<> binary_op) { - return !detail::isnan(a) && !detail::isnan(b) && binary_op(a, b); -} - -/// Performs 2 element comparison. 
-/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] binary_op functor that implements the binary operation -/// \returns the comparison result -template -inline std::enable_if_t -compare(const ValueT a, const ValueT b, const BinaryOperation binary_op) { - return {compare(a[0], b[0], binary_op), compare(a[1], b[1], binary_op)}; -} - -/// Performs unordered comparison. -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] binary_op functor that implements the binary operation -/// \returns the comparison result -template -inline std::enable_if_t< - std::is_same_v, bool>, - bool> -unordered_compare(const ValueT a, const ValueT b, - const BinaryOperation binary_op) { - return detail::isnan(a) || detail::isnan(b) || binary_op(a, b); -} - -/// Performs 2 element unordered comparison. -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] binary_op functor that implements the binary operation -/// \returns the comparison result -template -inline std::enable_if_t -unordered_compare(const ValueT a, const ValueT b, - const BinaryOperation binary_op) { - return {unordered_compare(a[0], b[0], binary_op), - unordered_compare(a[1], b[1], binary_op)}; -} - -/// Performs 2 element comparison and return true if both results are true. -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] binary_op functor that implements the binary operation -/// \returns the comparison result -template -inline std::enable_if_t -compare_both(const ValueT a, const ValueT b, const BinaryOperation binary_op) { - return compare(a[0], b[0], binary_op) && compare(a[1], b[1], binary_op); -} - -/// Performs 2 element unordered comparison and return true if both results are -/// true. 
-/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] binary_op functor that implements the binary operation -/// \returns the comparison result -template -inline std::enable_if_t -unordered_compare_both(const ValueT a, const ValueT b, - const BinaryOperation binary_op) { - return unordered_compare(a[0], b[0], binary_op) && - unordered_compare(a[1], b[1], binary_op); -} - -/// Performs 2 elements comparison, compare result of each element is 0 (false) -/// or 0xffff (true), returns an unsigned int by composing compare result of two -/// elements. -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] binary_op functor that implements the binary operation -/// \returns the comparison result -template -inline std::enable_if_t -compare_mask(const ValueT a, const ValueT b, const BinaryOperation binary_op) { - // Since compare returns 0 or 1, -compare will be 0x00000000 or 0xFFFFFFFF - return ((-compare(a[0], b[0], binary_op)) & 0xFFFF) | - ((-compare(a[1], b[1], binary_op)) << 16u); -} - -/// Performs 2 elements unordered comparison, compare result of each element is -/// 0 (false) or 0xffff (true), returns an unsigned int by composing compare -/// result of two elements. 
-/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] binary_op functor that implements the binary operation -/// \returns the comparison result -template -inline std::enable_if_t -unordered_compare_mask(const ValueT a, const ValueT b, - const BinaryOperation binary_op) { - return ((-unordered_compare(a[0], b[0], binary_op)) & 0xFFFF) | - ((-unordered_compare(a[1], b[1], binary_op)) << 16); -} - -/// Compute vectorized max for two values, with each value treated as a vector -/// type \p S -/// \param [in] S The type of the vector -/// \param [in] T The type of the original values -/// \param [in] a The first value -/// \param [in] b The second value -/// \returns The vectorized max of the two values -template inline T vectorized_max(T a, T b) { - sycl::vec v0{a}, v1{b}; - auto v2 = v0.template as(); - auto v3 = v1.template as(); - v2 = sycl::max(v2, v3); - v0 = v2.template as>(); - return v0; -} - -/// Compute vectorized min for two values, with each value treated as a vector -/// type \p S -/// \param [in] S The type of the vector -/// \param [in] T The type of the original values -/// \param [in] a The first value -/// \param [in] b The second value -/// \returns The vectorized min of the two values -template inline T vectorized_min(T a, T b) { - sycl::vec v0{a}, v1{b}; - auto v2 = v0.template as(); - auto v3 = v1.template as(); - v2 = sycl::min(v2, v3); - v0 = v2.template as>(); - return v0; -} - -/// Compute vectorized unary operation for a value, with the value treated as a -/// vector type \p VecT. 
-/// \tparam [in] VecT The type of the vector -/// \tparam [in] UnaryOperation The unary operation class -/// \param [in] a The input value -/// \returns The vectorized unary operation value of the input value -template -inline unsigned vectorized_unary(unsigned a, const UnaryOperation unary_op) { - sycl::vec v0{a}; - auto v1 = v0.as(); - auto v2 = unary_op(v1); - v0 = v2.template as>(); - return v0; -} - -/// Compute vectorized absolute difference for two values without modulo -/// overflow, with each value treated as a vector type \p VecT. -/// \tparam [in] VecT The type of the vector -/// \param [in] a The first value -/// \param [in] b The second value -/// \returns The vectorized absolute difference of the two values -template -inline unsigned vectorized_sum_abs_diff(unsigned a, unsigned b) { - sycl::vec v0{a}, v1{b}; - // Need convert element type to wider signed type to avoid overflow. - auto v2 = v0.as().template convert(); - auto v3 = v1.as().template convert(); - auto v4 = sycl::abs_diff(v2, v3); - unsigned sum = 0; - for (size_t i = 0; i < v4.size(); ++i) { - sum += v4[i]; - } - return sum; -} - -/// Compute vectorized isgreater for two values, with each value treated as a -/// vector type \p S -/// \param [in] S The type of the vector -/// \param [in] T The type of the original values -/// \param [in] a The first value -/// \param [in] b The second value -/// \returns The vectorized greater than of the two values -template inline T vectorized_isgreater(T a, T b) { - sycl::vec v0{a}, v1{b}; - auto v2 = v0.template as(); - auto v3 = v1.template as(); - auto v4 = sycl::isgreater(v2, v3); - v0 = v4.template as>(); - return v0; -} - -/// Compute vectorized isgreater for two unsigned int values, with each value -/// treated as a vector of two unsigned short -/// \param [in] a The first value -/// \param [in] b The second value -/// \returns The vectorized greater than of the two values -template <> -inline unsigned vectorized_isgreater(unsigned a, - unsigned 
b) { - sycl::vec v0{a}, v1{b}; - auto v2 = v0.template as(); - auto v3 = v1.template as(); - sycl::ushort2 v4; - v4[0] = v2[0] > v3[0]; - v4[1] = v2[1] > v3[1]; - v0 = v4.template as>(); - return v0; -} - -/// Returns min(max(val, min_val), max_val) -/// \param [in] val The input value -/// \param [in] min_val The minimum value -/// \param [in] max_val The maximum value -/// \returns the value between min_val and max_val -template -inline ValueT clamp(ValueT val, ValueT min_val, ValueT max_val) { - return detail::clamp(val, min_val, max_val); -} - -/// Determine whether 2 element value is NaN. -/// \param [in] a The input value -/// \returns the comparison result -template -inline std::enable_if_t isnan(const ValueT a) { - return {detail::isnan(a[0]), detail::isnan(a[1])}; -} - -/// cbrt function wrapper. -template -inline std::enable_if_t || - std::is_same_v, - ValueT> -cbrt(ValueT val) { - return sycl::cbrt(static_cast(val)); -} - -// min/max function overloads. -// For floating-point types, `float` or `double` arguments are acceptable. -// For integer types, `std::uint32_t`, `std::int32_t`, `std::uint64_t` or -// `std::int64_t` type arguments are acceptable. -// sycl::half supported as well, and sycl::ext::oneapi::bfloat16 if available. 
-template -inline std::enable_if_t && - std::is_integral_v, - std::common_type_t> -min(ValueT a, ValueU b) { - return sycl::min(static_cast>(a), - static_cast>(b)); -} - -template -inline std::enable_if_t && - syclcompat::is_floating_point_v, - std::common_type_t> -min(ValueT a, ValueU b) { - if constexpr (std::is_same_v, - sycl::ext::oneapi::bfloat16>) { - static_assert(detail::support_bfloat16_math); - return sycl::ext::oneapi::experimental::fmin( - static_cast>(a), - static_cast>(b)); - } else { - return sycl::fmin(static_cast>(a), - static_cast>(b)); - } -} - -template -inline std::enable_if_t && - std::is_integral_v, - std::common_type_t> -max(ValueT a, ValueU b) { - return sycl::max(static_cast>(a), - static_cast>(b)); -} -template -inline std::enable_if_t && - syclcompat::is_floating_point_v, - std::common_type_t> -max(ValueT a, ValueU b) { - if constexpr (std::is_same_v, - sycl::ext::oneapi::bfloat16>) { - static_assert(detail::support_bfloat16_math); - return sycl::ext::oneapi::experimental::fmax( - static_cast>(a), - static_cast>(b)); - } else { - return sycl::fmax(static_cast>(a), - static_cast>(b)); - } -} - -/// Performs 2 elements comparison and returns the bigger one. If either of -/// inputs is NaN, then return NaN. -/// \param [in] a The first value -/// \param [in] b The second value -/// \returns the bigger value -template -inline std::common_type_t fmax_nan(const ValueT a, - const ValueU b) { - if (detail::isnan(a) || detail::isnan(b)) - return NAN; - return syclcompat::max(a, b); -} - -template -inline sycl::vec, 2> -fmax_nan(const sycl::vec a, const sycl::vec b) { - return {fmax_nan(a[0], b[0]), fmax_nan(a[1], b[1])}; -} - -template -inline sycl::marray, 2> -fmax_nan(const sycl::marray a, const sycl::marray b) { - return {fmax_nan(a[0], b[0]), fmax_nan(a[1], b[1])}; -} - -/// Performs 2 elements comparison and returns the smaller one. If either of -/// inputs is NaN, then return NaN. 
-/// \param [in] a The first value -/// \param [in] b The second value -/// \returns the smaller value -template -inline std::common_type_t fmin_nan(const ValueT a, - const ValueU b) { - if (detail::isnan(a) || detail::isnan(b)) - return NAN; - return syclcompat::min(a,b); -} - -template -inline sycl::vec, 2> -fmin_nan(const sycl::vec a, const sycl::vec b) { - return {fmin_nan(a[0], b[0]), fmin_nan(a[1], b[1])}; -} - -template -inline sycl::marray, 2> -fmin_nan(const sycl::marray a, const sycl::marray b) { - return {fmin_nan(a[0], b[0]), fmin_nan(a[1], b[1])}; -} - -// pow functions overload. -inline float pow(const float a, const int b) { return sycl::pown(a, b); } -inline double pow(const double a, const int b) { return sycl::pown(a, b); } - -template -inline typename std::enable_if_t, ValueT> -pow(const ValueT a, const ValueU b) { - return sycl::pow(a, static_cast(b)); -} -// TODO(syclcompat-lib-reviewers) calling pow with non-floating point values -// is currently defaulting to double, which fails on devices without -// aspect::fp64. This has to be properly documented, and maybe changed to -// support all devices. -template -inline typename std::enable_if_t, double> -pow(const ValueT a, const ValueU b) { - return sycl::pow(static_cast(a), static_cast(b)); -} - -/// Performs relu saturation. -/// \param [in] a The input value -/// \returns the relu saturation result -template inline ValueT relu(const ValueT a) { - if constexpr (syclcompat::is_floating_point_v) - if (detail::isnan(a)) - return a; - if (a < ValueT(0)) - return ValueT(0); - return a; -} -template -inline sycl::vec -relu(const sycl::vec a) { - sycl::vec ret; - for (int i = 0; i < NumElements; ++i) - ret[i] = relu(a[i]); - return ret; -} -template -inline sycl::marray relu(const sycl::marray a) { - return {relu(a[0]), relu(a[1])}; -} - -/// Computes the multiplication of two complex numbers. 
-/// \tparam T Complex element type -/// \param [in] x The first input complex number -/// \param [in] y The second input complex number -/// \returns The result -template -sycl::vec cmul(sycl::vec x, sycl::vec y) { - sycl::ext::oneapi::experimental::complex t1(x[0], x[1]), t2(y[0], y[1]); - t1 = t1 * t2; - return sycl::vec(t1.real(), t1.imag()); -} - -/// Computes the division of two complex numbers. -/// \tparam T Complex element type -/// \param [in] x The first input complex number -/// \param [in] y The second input complex number -/// \returns The result -template -sycl::vec cdiv(sycl::vec x, sycl::vec y) { - sycl::ext::oneapi::experimental::complex t1(x[0], x[1]), t2(y[0], y[1]); - t1 = t1 / t2; - return sycl::vec(t1.real(), t1.imag()); -} - -/// Computes the magnitude of a complex number. -/// \tparam T Complex element type -/// \param [in] x The input complex number -/// \returns The result -template T cabs(sycl::vec x) { - sycl::ext::oneapi::experimental::complex t(x[0], x[1]); - return sycl::ext::oneapi::experimental::abs(t); -} - -/// Computes the complex conjugate of a complex number. -/// \tparam T Complex element type -/// \param [in] x The input complex number -/// \returns The result -template sycl::vec conj(sycl::vec x) { - sycl::ext::oneapi::experimental::complex t(x[0], x[1]); - t = conj(t); - return sycl::vec(t.real(), t.imag()); -} - -/// Performs complex number multiply addition. 
-/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns the operation result -template -inline sycl::vec cmul_add(const sycl::vec a, - const sycl::vec b, - const sycl::vec c) { - sycl::ext::oneapi::experimental::complex t(a[0], a[1]); - sycl::ext::oneapi::experimental::complex u(b[0], b[1]); - sycl::ext::oneapi::experimental::complex v(c[0], c[1]); - t = t * u + v; - return sycl::vec{t.real(), t.imag()}; -} -template -inline sycl::marray cmul_add(const sycl::marray a, - const sycl::marray b, - const sycl::marray c) { - sycl::ext::oneapi::experimental::complex t(a[0], a[1]); - sycl::ext::oneapi::experimental::complex u(b[0], b[1]); - sycl::ext::oneapi::experimental::complex v(c[0], c[1]); - t = t * u + v; - return sycl::marray{t.real(), t.imag()}; -} - -/// A sycl::abs wrapper functors. -struct abs { - template auto operator()(const ValueT x) const { - return sycl::abs(x); - } -}; - -/// A sycl::abs_diff wrapper functors. -struct abs_diff { - template - auto operator()(const ValueT x, const ValueT y) const { - return sycl::abs_diff(x, y); - } -}; - -/// A sycl::add_sat wrapper functors. -struct add_sat { - template - auto operator()(const ValueT x, const ValueT y) const { - return sycl::add_sat(x, y); - } -}; - -/// A sycl::rhadd wrapper functors. -struct rhadd { - template - auto operator()(const ValueT x, const ValueT y) const { - return sycl::rhadd(x, y); - } -}; - -/// A sycl::hadd wrapper functors. -struct hadd { - template - auto operator()(const ValueT x, const ValueT y) const { - return sycl::hadd(x, y); - } -}; - -/// A sycl::max wrapper functors. -struct maximum { - template - auto operator()(const ValueT x, const ValueT y) const { - return sycl::max(x, y); - } - template - auto operator()(const ValueT x, const ValueT y, bool *pred) const { - return (x >= y) ? ((*pred = true), x) : ((*pred = false), y); - } -}; - -/// A sycl::min wrapper functors. 
-struct minimum { - template - auto operator()(const ValueT x, const ValueT y) const { - return sycl::min(x, y); - } - template - auto operator()(const ValueT x, const ValueT y, bool *pred) const { - return (x <= y) ? ((*pred = true), x) : ((*pred = false), y); - } -}; - -/// A sycl::sub_sat wrapper functors. -struct sub_sat { - template - auto operator()(const ValueT x, const ValueT y) const { - return sycl::sub_sat(x, y); - } -}; - -namespace detail { -struct shift_left { - template - auto operator()(const T x, const uint32_t offset) const { - return x << offset; - } -}; - -struct shift_right { - template - auto operator()(const T x, const uint32_t offset) const { - return x >> offset; - } -}; - -struct average { - template auto operator()(const T x, const T y) const { - return (x + y + (x + y >= 0)) >> 1; - } -}; - -} // namespace detail - -/// Compute vectorized binary operation value for two/four values, with each -/// treated as a vector type \p VecT. -/// \tparam [in] VecT The type of the vector -/// \tparam [in] BinaryOperation The binary operation class -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] binary_op The operation to do with the two values -/// \param [in] need_relu Whether the result need relu saturation -/// \returns The vectorized binary operation value of the two values -template -inline unsigned vectorized_binary(unsigned a, unsigned b, - const BinaryOperation binary_op, - [[maybe_unused]] bool need_relu = false) { - sycl::vec v0{a}, v1{b}; - auto v2 = v0.as(); - auto v3 = v1.as(); - auto v4 = - detail::vectorized_binary()(v2, v3, binary_op); - if (need_relu) - v4 = relu(v4); - v0 = v4.template as>(); - return v0; -} - -/// Compute two vectorized binary operation value with pred for three values, -/// with each value treated as a 2 \p T type elements vector type. 
-/// -/// \tparam [in] VecT The type of the vector -/// \tparam [in] BinaryOperation1 The first binary operation class -/// \tparam [in] BinaryOperation2 The second binary operation class -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \param [in] binary_op1 The first operation to do with the first two values -/// \param [in] binary_op2 The second operation to do with the third values -/// \param [in] need_relu Whether the result need relu saturation -/// \returns The two vectorized binary operation value of the three values -template -inline unsigned vectorized_ternary(unsigned a, unsigned b, unsigned c, - const BinaryOperation1 binary_op1, - const BinaryOperation2 binary_op2, - bool need_relu = false) { - const auto v1 = sycl::vec(a).as(); - const auto v2 = sycl::vec(b).as(); - const auto v3 = sycl::vec(c).as(); - auto v4 = - detail::vectorized_binary()(v1, v2, binary_op1); - v4 = detail::vectorized_binary()(v4, v3, binary_op2); - if (need_relu) - v4 = relu(v4); - return v4.template as>(); -} - -/// Compute vectorized binary operation value with pred for two values, with -/// each value treated as a 2 \p T type elements vector type. 
-/// -/// \tparam [in] VecT The type of the vector -/// \tparam [in] BinaryOperation The binary operation class -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] binary_op The operation with pred to do with the two values -/// \param [out] pred_hi The pred pointer that pass into high halfword operation -/// \param [out] pred_lo The pred pointer that pass into low halfword operation -/// \returns The vectorized binary operation value of the two values -template -inline unsigned vectorized_binary_with_pred(unsigned a, unsigned b, - const BinaryOperation binary_op, - bool *pred_hi, bool *pred_lo) { - auto v1 = sycl::vec(a).as(); - auto v2 = sycl::vec(b).as(); - VecT ret; - ret[0] = binary_op(v1[0], v2[0], pred_lo); - ret[1] = binary_op(v1[1], v2[1], pred_hi); - return ret.template as>(); -} - -template -using dot_product_acc_t = - std::conditional_t && std::is_unsigned_v, - uint32_t, int32_t>; - -namespace detail { - -template sycl::vec extract_and_sign_or_zero_extend4(T val) { - return sycl::vec(val) - .template as, int8_t, uint8_t>, 4>>() - .template convert(); -} - -template sycl::vec extract_and_sign_or_zero_extend2(T val) { - return sycl::vec(val) - .template as, int16_t, uint16_t>, 2>>() - .template convert(); -} - -} // namespace detail - -/// Two-way dot product-accumulate. Calculate and return integer_vector2( -/// \param a) dot product integer_vector2(low16_bit( \param b)) + \param c -/// -/// \tparam [in] T1 The type of first value. -/// \tparam [in] T2 The type of second value. -/// \param [in] a The first value. -/// \param [in] b The second value. -/// \param [in] c The third value. It has type uint32_t if both T1 and T1 are -/// uint32_t else has type int32_t. -/// \return Two-way 16-bit to 8-bit dot product which is accumulated in 32-bit -/// result. 
-template -inline dot_product_acc_t dp2a_lo(T1 a, T2 b, - dot_product_acc_t c) { - static_assert(detail::is_int32_type && detail::is_int32_type, - "[SYCLcompat] dp2a_lo expects 32-bit integers as operands."); -#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && \ - defined(__SYCL_CUDA_ARCH__) && __SYCL_CUDA_ARCH__ >= 610 - dot_product_acc_t res; - if constexpr (std::is_signed_v && std::is_signed_v) { - asm volatile("dp2a.lo.s32.s32 %0, %1, %2, %3;" - : "=r"(res) - : "r"(a), "r"(b), "r"(c)); - } else if constexpr (std::is_signed_v && std::is_unsigned_v) { - asm volatile("dp2a.lo.s32.u32 %0, %1, %2, %3;" - : "=r"(res) - : "r"(a), "r"(b), "r"(c)); - } else if constexpr (std::is_unsigned_v && std::is_signed_v) { - asm volatile("dp2a.lo.u32.s32 %0, %1, %2, %3;" - : "=r"(res) - : "r"(a), "r"(b), "r"(c)); - } else { - asm volatile("dp2a.lo.u32.u32 %0, %1, %2, %3;" - : "=r"(res) - : "r"(a), "r"(b), "r"(c)); - } - return res; -#else - dot_product_acc_t res = c; - auto va = detail::extract_and_sign_or_zero_extend2(a); - auto vb = detail::extract_and_sign_or_zero_extend4(b); - res += va[0] * vb[0]; - res += va[1] * vb[1]; - return res; -#endif -} - -/// Two-way dot product-accumulate. Calculate and return integer_vector2( -/// \param a) dot product integer_vector2(high_16bit( \param b)) + \param c -/// -/// \tparam [in] T1 The type of first value. -/// \tparam [in] T2 The type of second value. -/// \param [in] a The first value. -/// \param [in] b The second value. -/// \param [in] c The third value. uint32_t if both T1 and T1 are -/// uint32_t else has type int32_t. -/// \return Two-way 16-bit to 8-bit dot product which is accumulated in 32-bit -/// result. 
-template -inline dot_product_acc_t dp2a_hi(T1 a, T2 b, - dot_product_acc_t c) { - static_assert(detail::is_int32_type && detail::is_int32_type, - "[SYCLcompat] dp2a_hi expects 32-bit integers as operands."); -#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && \ - defined(__SYCL_CUDA_ARCH__) && __SYCL_CUDA_ARCH__ >= 610 - dot_product_acc_t res; - if constexpr (std::is_signed_v && std::is_signed_v) { - asm volatile("dp2a.hi.s32.s32 %0, %1, %2, %3;" - : "=r"(res) - : "r"(a), "r"(b), "r"(c)); - } else if constexpr (std::is_signed_v && std::is_unsigned_v) { - asm volatile("dp2a.hi.s32.u32 %0, %1, %2, %3;" - : "=r"(res) - : "r"(a), "r"(b), "r"(c)); - } else if constexpr (std::is_unsigned_v && std::is_signed_v) { - asm volatile("dp2a.hi.u32.s32 %0, %1, %2, %3;" - : "=r"(res) - : "r"(a), "r"(b), "r"(c)); - } else { - asm volatile("dp2a.hi.u32.u32 %0, %1, %2, %3;" - : "=r"(res) - : "r"(a), "r"(b), "r"(c)); - } - return res; -#else - dot_product_acc_t res = c; - auto va = detail::extract_and_sign_or_zero_extend2(a); - auto vb = detail::extract_and_sign_or_zero_extend4(b); - res += va[0] * vb[2]; - res += va[1] * vb[3]; - return res; -#endif -} - -/// Four-way byte dot product-accumulate. Calculate and return integer_vector4( -/// \param a) dot product integer_vector4( \param b) + \param c -/// -/// \tparam [in] T1 The type of first value. -/// \tparam [in] T2 The type of second value. -/// \param [in] a The first value. -/// \param [in] b The second value. -/// \param [in] c The third value. It has type uint32_t if both T1 and T1 are -/// uint32_t else has type int32_t. -/// \return Four-way byte dot product which is accumulated in 32-bit result. 
-template -inline dot_product_acc_t dp4a(T1 a, T2 b, dot_product_acc_t c) { - static_assert(detail::is_int32_type && detail::is_int32_type, - "[SYCLcompat] dp4a expects 32-bit integers as operands."); -#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && \ - defined(__SYCL_CUDA_ARCH__) && __SYCL_CUDA_ARCH__ >= 610 - dot_product_acc_t res; - if constexpr (std::is_signed_v && std::is_signed_v) { - asm volatile("dp4a.s32.s32 %0, %1, %2, %3;" - : "=r"(res) - : "r"(a), "r"(b), "r"(c)); - } else if constexpr (std::is_signed_v && std::is_unsigned_v) { - asm volatile("dp4a.s32.u32 %0, %1, %2, %3;" - : "=r"(res) - : "r"(a), "r"(b), "r"(c)); - } else if constexpr (std::is_unsigned_v && std::is_signed_v) { - asm volatile("dp4a.u32.s32 %0, %1, %2, %3;" - : "=r"(res) - : "r"(a), "r"(b), "r"(c)); - } else { - asm volatile("dp4a.u32.u32 %0, %1, %2, %3;" - : "=r"(res) - : "r"(a), "r"(b), "r"(c)); - } - return res; -#else - dot_product_acc_t res = c; - auto va = detail::extract_and_sign_or_zero_extend4(a); - auto vb = detail::extract_and_sign_or_zero_extend4(b); - res += va[0] * vb[0]; - res += va[1] * vb[1]; - res += va[2] * vb[2]; - res += va[3] * vb[3]; - return res; -#endif -} - -/// Extend \p a and \p b to 33 bit and add them. -/// \tparam [in] RetT The type of the return value -/// \tparam [in] AT The type of the first value -/// \tparam [in] BT The type of the second value -/// \param [in] a The first value -/// \param [in] b The second value -/// \returns The extend addition of the two values -template -inline constexpr RetT extend_add(AT a, BT b) { - return detail::extend_binary(a, b, std::plus()); -} - -/// Extend Inputs to 33 bit, add \p a, \p b, then do \p second_op with \p c. 
-/// \tparam [in] RetT The type of the return value -/// \tparam [in] AT The type of the first value -/// \tparam [in] BT The type of the second value -/// \tparam [in] CT The type of the third value -/// \tparam [in] BinaryOperation The type of the second operation -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \param [in] second_op The operation to do with the third value -/// \returns The extend addition of \p a, \p b and \p second_op with \p c -template -inline constexpr RetT extend_add(AT a, BT b, CT c, BinaryOperation second_op) { - return detail::extend_binary(a, b, c, std::plus(), second_op); -} - -/// Extend \p a and \p b to 33 bit and add them with saturation. -/// \tparam [in] RetT The type of the return value -/// \tparam [in] AT The type of the first value -/// \tparam [in] BT The type of the second value -/// \param [in] a The first value -/// \param [in] b The second value -/// \returns The extend addition of the two values with saturation -template -inline constexpr RetT extend_add_sat(AT a, BT b) { - return detail::extend_binary(a, b, std::plus()); -} - -/// Extend Inputs to 33 bit, add \p a, \p b with saturation, then do \p -/// second_op with \p c. 
-/// \tparam [in] RetT The type of the return value -/// \tparam [in] AT The type of the first value -/// \tparam [in] BT The type of the second value -/// \tparam [in] CT The type of the third value -/// \tparam [in] BinaryOperation The type of the second operation -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \param [in] second_op The operation to do with the third value -/// \returns The extend addition of \p a, \p b with saturation and \p second_op -/// with \p c -template -inline constexpr RetT extend_add_sat(AT a, BT b, CT c, - BinaryOperation second_op) { - return detail::extend_binary(a, b, c, std::plus(), second_op); -} - -/// Extend \p a and \p b to 33 bit and minus them. -/// \tparam [in] RetT The type of the return value -/// \tparam [in] AT The type of the first value -/// \tparam [in] BT The type of the second value -/// \param [in] a The first value -/// \param [in] b The second value -/// \returns The extend subtraction of the two values -template -inline constexpr RetT extend_sub(AT a, BT b) { - return detail::extend_binary(a, b, std::minus()); -} - -/// Extend Inputs to 33 bit, minus \p a, \p b, then do \p second_op with \p c. -/// \tparam [in] RetT The type of the return value -/// \tparam [in] AT The type of the first value -/// \tparam [in] BT The type of the second value -/// \tparam [in] CT The type of the third value -/// \tparam [in] BinaryOperation The type of the second operation -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \param [in] second_op The operation to do with the third value -/// \returns The extend subtraction of \p a, \p b and \p second_op with \p c -template -inline constexpr RetT extend_sub(AT a, BT b, CT c, BinaryOperation second_op) { - return detail::extend_binary(a, b, c, std::minus(), second_op); -} - -/// Extend \p a and \p b to 33 bit and minus them with saturation. 
-/// \tparam [in] RetT The type of the return value -/// \tparam [in] AT The type of the first value -/// \tparam [in] BT The type of the second value -/// \param [in] a The first value -/// \param [in] b The second value -/// \returns The extend subtraction of the two values with saturation -template -inline constexpr RetT extend_sub_sat(AT a, BT b) { - return detail::extend_binary(a, b, std::minus()); -} - -/// Extend Inputs to 33 bit, minus \p a, \p b with saturation, then do \p -/// second_op with \p c. -/// \tparam [in] RetT The type of the return value -/// \tparam [in] AT The type of the first value -/// \tparam [in] BT The type of the second value -/// \tparam [in] CT The type of the third value -/// \tparam [in] BinaryOperation The type of the second operation -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \param [in] second_op The operation to do with the third value -/// \returns The extend subtraction of \p a, \p b with saturation and \p -/// second_op with \p c -template -inline constexpr RetT extend_sub_sat(AT a, BT b, CT c, - BinaryOperation second_op) { - return detail::extend_binary(a, b, c, std::minus(), second_op); -} - -/// Extend \p a and \p b to 33 bit and do abs_diff. -/// \tparam [in] RetT The type of the return value -/// \tparam [in] AT The type of the first value -/// \tparam [in] BT The type of the second value -/// \param [in] a The first value -/// \param [in] b The second value -/// \returns The extend abs_diff of the two values -template -inline constexpr RetT extend_absdiff(AT a, BT b) { - return detail::extend_binary(a, b, abs_diff()); -} - -/// Extend Inputs to 33 bit, abs_diff \p a, \p b, then do \p second_op with \p -/// c. 
-/// \tparam [in] RetT The type of the return value -/// \tparam [in] AT The type of the first value -/// \tparam [in] BT The type of the second value -/// \tparam [in] CT The type of the third value -/// \tparam [in] BinaryOperation The type of the second operation -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \param [in] second_op The operation to do with the third value -/// \returns The extend abs_diff of \p a, \p b and \p second_op with \p c -template -inline constexpr RetT extend_absdiff(AT a, BT b, CT c, - BinaryOperation second_op) { - return detail::extend_binary(a, b, c, abs_diff(), second_op); -} - -/// Extend \p a and \p b to 33 bit and do abs_diff with saturation. -/// \tparam [in] RetT The type of the return value -/// \tparam [in] AT The type of the first value -/// \tparam [in] BT The type of the second value -/// \param [in] a The first value -/// \param [in] b The second value -/// \returns The extend abs_diff of the two values with saturation -template -inline constexpr RetT extend_absdiff_sat(AT a, BT b) { - return detail::extend_binary(a, b, abs_diff()); -} - -/// Extend Inputs to 33 bit, abs_diff \p a, \p b with saturation, then do \p -/// second_op with \p c. 
-/// \tparam [in] RetT The type of the return value -/// \tparam [in] AT The type of the first value -/// \tparam [in] BT The type of the second value -/// \tparam [in] CT The type of the third value -/// \tparam [in] BinaryOperation The type of the second operation -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \param [in] second_op The operation to do with the third value -/// \returns The extend abs_diff of \p a, \p b with saturation and \p -/// second_op with \p c -template -inline constexpr RetT extend_absdiff_sat(AT a, BT b, CT c, - BinaryOperation second_op) { - return detail::extend_binary(a, b, c, abs_diff(), second_op); -} - -/// Extend \p a and \p b to 33 bit and return smaller one. -/// \tparam [in] RetT The type of the return value -/// \tparam [in] AT The type of the first value -/// \tparam [in] BT The type of the second value -/// \param [in] a The first value -/// \param [in] b The second value -/// \returns The smaller one of the two extended values -template -inline constexpr RetT extend_min(AT a, BT b) { - return detail::extend_binary(a, b, minimum()); -} - -/// Extend Inputs to 33 bit, find the smaller one in \p a, \p b, then do \p -/// second_op with \p c. 
-/// \tparam [in] RetT The type of the return value -/// \tparam [in] AT The type of the first value -/// \tparam [in] BT The type of the second value -/// \tparam [in] CT The type of the third value -/// \tparam [in] BinaryOperation The type of the second operation -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \param [in] second_op The operation to do with the third value -/// \returns The smaller one of \p a, \p b and \p second_op with \p c -template -inline constexpr RetT extend_min(AT a, BT b, CT c, BinaryOperation second_op) { - return detail::extend_binary(a, b, c, minimum(), second_op); -} - -/// Extend \p a and \p b to 33 bit and return smaller one with saturation. -/// \tparam [in] RetT The type of the return value -/// \tparam [in] AT The type of the first value -/// \tparam [in] BT The type of the second value -/// \param [in] a The first value -/// \param [in] b The second value -/// \returns The smaller one of the two extended values with saturation -template -inline constexpr RetT extend_min_sat(AT a, BT b) { - return detail::extend_binary(a, b, minimum()); -} - -/// Extend Inputs to 33 bit, find the smaller one in \p a, \p b with saturation, -/// then do \p second_op with \p c. 
-/// \tparam [in] RetT The type of the return value -/// \tparam [in] AT The type of the first value -/// \tparam [in] BT The type of the second value -/// \tparam [in] CT The type of the third value -/// \tparam [in] BinaryOperation The type of the second operation -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \param [in] second_op The operation to do with the third value -/// \returns The smaller one of \p a, \p b with saturation and \p -/// second_op with \p c -template -inline constexpr RetT extend_min_sat(AT a, BT b, CT c, - BinaryOperation second_op) { - return detail::extend_binary(a, b, c, minimum(), second_op); -} - -/// Extend \p a and \p b to 33 bit and return bigger one. -/// \tparam [in] RetT The type of the return value -/// \tparam [in] AT The type of the first value -/// \tparam [in] BT The type of the second value -/// \param [in] a The first value -/// \param [in] b The second value -/// \returns The bigger one of the two extended values -template -inline constexpr RetT extend_max(AT a, BT b) { - return detail::extend_binary(a, b, maximum()); -} - -/// Extend Inputs to 33 bit, find the bigger one in \p a, \p b, then do \p -/// second_op with \p c. -/// \tparam [in] RetT The type of the return value -/// \tparam [in] AT The type of the first value -/// \tparam [in] BT The type of the second value -/// \tparam [in] CT The type of the third value -/// \tparam [in] BinaryOperation The type of the second operation -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \param [in] second_op The operation to do with the third value -/// \returns The bigger one of \p a, \p b and \p second_op with \p c -template -inline constexpr RetT extend_max(AT a, BT b, CT c, BinaryOperation second_op) { - return detail::extend_binary(a, b, c, maximum(), second_op); -} - -/// Extend \p a and \p b to 33 bit and return bigger one with saturation. 
-/// \tparam [in] RetT The type of the return value -/// \tparam [in] AT The type of the first value -/// \tparam [in] BT The type of the second value -/// \param [in] a The first value -/// \param [in] b The second value -/// \returns The bigger one of the two extended values with saturation -template -inline constexpr RetT extend_max_sat(AT a, BT b) { - return detail::extend_binary(a, b, maximum()); -} - -/// Extend Inputs to 33 bit, find the bigger one in \p a, \p b with saturation, -/// then do \p second_op with \p c. -/// \tparam [in] RetT The type of the return value -/// \tparam [in] AT The type of the first value -/// \tparam [in] BT The type of the second value -/// \tparam [in] CT The type of the third value -/// \tparam [in] BinaryOperation The type of the second operation -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \param [in] second_op The operation to do with the third value -/// \returns The bigger one of \p a, \p b with saturation and \p -/// second_op with \p c -template -inline constexpr RetT extend_max_sat(AT a, BT b, CT c, - BinaryOperation second_op) { - return detail::extend_binary(a, b, c, maximum(), second_op); -} - -/// Extend \p a and \p b to 33 bit and return a << clamp(b, 0, 32). -/// \param [in] a The source value -/// \param [in] b The offset to shift -/// \returns a << clamp(b, 0, 32) -template -inline constexpr RetT extend_shl_clamp(T a, uint32_t b) { - return detail::extend_binary(a, sycl::clamp(b, 0u, 32u), - detail::shift_left()); -} - -/// Extend Inputs to 33 bit, and return second_op(a << clamp(b, 0, 32), c). 
-/// \param [in] a The source value -/// \param [in] b The offset to shift -/// \param [in] c The value to merge -/// \param [in] second_op The operation to do with the third value -/// \returns second_op(a << clamp(b, 0, 32), c) -template -inline constexpr RetT extend_shl_clamp(T a, uint32_t b, uint32_t c, - BinaryOperation second_op) { - return detail::extend_binary(a, sycl::clamp(b, 0u, 32u), c, - detail::shift_left(), second_op); -} - -/// Extend \p a and \p b to 33 bit and return sat(a << clamp(b, 0, 32)). -/// \param [in] a The source value -/// \param [in] b The offset to shift -/// \returns sat(a << clamp(b, 0, 32)) -template -inline constexpr RetT extend_shl_sat_clamp(T a, uint32_t b) { - return detail::extend_binary(a, sycl::clamp(b, 0u, 32u), - detail::shift_left()); -} - -/// Extend Inputs to 33 bit, and return second_op(sat(a << clamp(b, 0, 32)), c). -/// \param [in] a The source value -/// \param [in] b The offset to shift -/// \param [in] c The value to merge -/// \param [in] second_op The operation to do with the third value -/// \returns second_op(sat(a << clamp(b, 0, 32)), c) -template -inline constexpr RetT extend_shl_sat_clamp(T a, uint32_t b, uint32_t c, - BinaryOperation second_op) { - return detail::extend_binary(a, sycl::clamp(b, 0u, 32u), c, - detail::shift_left(), second_op); -} - -/// Extend \p a and \p b to 33 bit and return a << (b & 0x1F). -/// \param [in] a The source value -/// \param [in] b The offset to shift -/// \returns a << (b & 0x1F) -template -inline constexpr RetT extend_shl_wrap(T a, uint32_t b) { - return detail::extend_binary(a, b & 0x1F, detail::shift_left()); -} - -/// Extend Inputs to 33 bit, and return second_op(a << (b & 0x1F), c). 
-/// \param [in] a The source value -/// \param [in] b The offset to shift -/// \param [in] c The value to merge -/// \param [in] second_op The operation to do with the third value -/// \returns second_op(a << (b & 0x1F), c) -template -inline constexpr RetT extend_shl_wrap(T a, uint32_t b, uint32_t c, - BinaryOperation second_op) { - return detail::extend_binary(a, b & 0x1F, c, - detail::shift_left(), second_op); -} - -/// Extend \p a and \p b to 33 bit and return sat(a << (b & 0x1F)). -/// \param [in] a The source value -/// \param [in] b The offset to shift -/// \returns sat(a << (b & 0x1F)) -template -inline constexpr RetT extend_shl_sat_wrap(T a, uint32_t b) { - return detail::extend_binary(a, b & 0x1F, detail::shift_left()); -} - -/// Extend Inputs to 33 bit, and return second_op(sat(a << (b & 0x1F)), c). -/// \param [in] a The source value -/// \param [in] b The offset to shift -/// \param [in] c The value to merge -/// \param [in] second_op The operation to do with the third value -/// \returns second_op(sat(a << (b & 0x1F)), c) -template -inline constexpr RetT extend_shl_sat_wrap(T a, uint32_t b, uint32_t c, - BinaryOperation second_op) { - return detail::extend_binary(a, b & 0x1F, c, detail::shift_left(), - second_op); -} - -/// Extend \p a and \p b to 33 bit and return a >> clamp(b, 0, 32). -/// \param [in] a The source value -/// \param [in] b The offset to shift -/// \returns a >> clamp(b, 0, 32) -template -inline constexpr RetT extend_shr_clamp(T a, uint32_t b) { - return detail::extend_binary(a, sycl::clamp(b, 0u, 32u), - detail::shift_right()); -} - -/// Extend Inputs to 33 bit, and return second_op(a >> clamp(b, 0, 32), c). 
-/// \param [in] a The source value -/// \param [in] b The offset to shift -/// \param [in] c The value to merge -/// \param [in] second_op The operation to do with the third value -/// \returns second_op(a >> clamp(b, 0, 32), c) -template -inline constexpr RetT extend_shr_clamp(T a, uint32_t b, uint32_t c, - BinaryOperation second_op) { - return detail::extend_binary(a, sycl::clamp(b, 0u, 32u), c, - detail::shift_right(), second_op); -} - -/// Extend \p a and \p b to 33 bit and return sat(a >> clamp(b, 0, 32)). -/// \param [in] a The source value -/// \param [in] b The offset to shift -/// \returns sat(a >> clamp(b, 0, 32)) -template -inline constexpr RetT extend_shr_sat_clamp(T a, uint32_t b) { - return detail::extend_binary(a, sycl::clamp(b, 0u, 32u), - detail::shift_right()); -} - -/// Extend Inputs to 33 bit, and return second_op(sat(a >> clamp(b, 0, 32)), c). -/// \param [in] a The source value -/// \param [in] b The offset to shift -/// \param [in] c The value to merge -/// \param [in] second_op The operation to do with the third value -/// \returns second_op(sat(a >> clamp(b, 0, 32)), c) -template -inline constexpr RetT extend_shr_sat_clamp(T a, uint32_t b, uint32_t c, - BinaryOperation second_op) { - return detail::extend_binary(a, sycl::clamp(b, 0u, 32u), c, - detail::shift_right(), second_op); -} - -/// Extend \p a and \p b to 33 bit and return a >> (b & 0x1F). -/// \param [in] a The source value -/// \param [in] b The offset to shift -/// \returns a >> (b & 0x1F) -template -inline constexpr RetT extend_shr_wrap(T a, uint32_t b) { - return detail::extend_binary(a, b & 0x1F, detail::shift_right()); -} - -/// Extend Inputs to 33 bit, and return second_op(a >> (b & 0x1F), c). 
-/// \param [in] a The source value -/// \param [in] b The offset to shift -/// \param [in] c The value to merge -/// \param [in] second_op The operation to do with the third value -/// \returns second_op(a >> (b & 0x1F), c) -template -inline constexpr RetT extend_shr_wrap(T a, uint32_t b, uint32_t c, - BinaryOperation second_op) { - return detail::extend_binary(a, b & 0x1F, c, - detail::shift_right(), second_op); -} - -/// Extend \p a and \p b to 33 bit and return sat(a >> (b & 0x1F)). -/// \param [in] a The source value -/// \param [in] b The offset to shift -/// \returns sat(a >> (b & 0x1F)) -template -inline constexpr RetT extend_shr_sat_wrap(T a, uint32_t b) { - return detail::extend_binary(a, b & 0x1F, detail::shift_right()); -} - -/// Extend Inputs to 33 bit, and return second_op(sat(a >> (b & 0x1F)), c). -/// \param [in] a The source value -/// \param [in] b The offset to shift -/// \param [in] c The value to merge -/// \param [in] second_op The operation to do with the third value -/// \returns second_op(sat(a >> (b & 0x1F)), c) -template -inline constexpr RetT extend_shr_sat_wrap(T a, uint32_t b, uint32_t c, - BinaryOperation second_op) { - return detail::extend_binary(a, b & 0x1F, c, - detail::shift_right(), second_op); -} - -/// Compute vectorized addition of \p a and \p b, with each value treated as a -/// 2 elements vector type and extend each element to 17 bit. 
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized addition of the two values -template -inline constexpr RetT extend_vadd2(AT a, BT b, RetT c) { - return detail::extend_vbinary2(a, b, c, std::plus()); -} - -/// Compute vectorized addition of \p a and \p b, with each value treated as a 2 -/// elements vector type and extend each element to 17 bit. Then add each half -/// of the result and add with \p c. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The addition of each half of extend vectorized addition of the two -/// values and the third value -template -inline constexpr RetT extend_vadd2_add(AT a, BT b, RetT c) { - return detail::extend_vbinary2(a, b, c, std::plus()); -} - -/// Compute vectorized addition of \p a and \p b with saturation, with each -/// value treated as a 2 elements vector type and extend each element to 17 bit. 
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized addition of the two values with saturation -template -inline constexpr RetT extend_vadd2_sat(AT a, BT b, RetT c) { - return detail::extend_vbinary2(a, b, c, std::plus()); -} - -/// Compute vectorized subtraction of \p a and \p b, with each value treated as -/// a 2 elements vector type and extend each element to 17 bit. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized subtraction of the two values -template -inline constexpr RetT extend_vsub2(AT a, BT b, RetT c) { - return detail::extend_vbinary2(a, b, c, std::minus()); -} - -/// Compute vectorized subtraction of \p a and \p b, with each value treated as -/// a 2 elements vector type and extend each element to 17 bit. Then add each -/// half of the result and add with \p c. 
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The addition of each half of extend vectorized subtraction of the -/// two values and the third value -template -inline constexpr RetT extend_vsub2_add(AT a, BT b, RetT c) { - return detail::extend_vbinary2(a, b, c, std::minus()); -} - -/// Compute vectorized subtraction of \p a and \p b with saturation, with each -/// value treated as a 2 elements vector type and extend each element to 17 bit. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized subtraction of the two values with saturation -template -inline constexpr RetT extend_vsub2_sat(AT a, BT b, RetT c) { - return detail::extend_vbinary2(a, b, c, std::minus()); -} - -/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a 2 -/// elements vector type and extend each element to 17 bit. 
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized abs_diff of the two values -template -inline constexpr RetT extend_vabsdiff2(AT a, BT b, RetT c) { - return detail::extend_vbinary2(a, b, c, abs_diff()); -} - -/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a 2 -/// elements vector type and extend each element to 17 bit. Then add each half -/// of the result and add with \p c. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The addition of each half of extend vectorized abs_diff of the -/// two values and the third value -template -inline constexpr RetT extend_vabsdiff2_add(AT a, BT b, RetT c) { - return detail::extend_vbinary2(a, b, c, abs_diff()); -} - -/// Compute vectorized abs_diff of \p a and \p b with saturation, with each -/// value treated as a 2 elements vector type and extend each element to 17 bit. 
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized abs_diff of the two values with saturation -template -inline constexpr RetT extend_vabsdiff2_sat(AT a, BT b, RetT c) { - return detail::extend_vbinary2(a, b, c, abs_diff()); -} - -/// Compute vectorized minimum of \p a and \p b, with each value treated as a 2 -/// elements vector type and extend each element to 17 bit. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized minimum of the two values -template -inline constexpr RetT extend_vmin2(AT a, BT b, RetT c) { - return detail::extend_vbinary2(a, b, c, minimum()); -} - -/// Compute vectorized minimum of \p a and \p b, with each value treated as a 2 -/// elements vector type and extend each element to 17 bit. Then add each half -/// of the result and add with \p c. 
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The addition of each half of extend vectorized minimum of the -/// two values and the third value -template -inline constexpr RetT extend_vmin2_add(AT a, BT b, RetT c) { - return detail::extend_vbinary2(a, b, c, minimum()); -} - -/// Compute vectorized minimum of \p a and \p b with saturation, with each value -/// treated as a 2 elements vector type and extend each element to 17 bit. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized minimum of the two values with saturation -template -inline constexpr RetT extend_vmin2_sat(AT a, BT b, RetT c) { - return detail::extend_vbinary2(a, b, c, minimum()); -} - -/// Compute vectorized maximum of \p a and \p b, with each value treated as a 2 -/// elements vector type and extend each element to 17 bit. 
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized maximum of the two values -template -inline constexpr RetT extend_vmax2(AT a, BT b, RetT c) { - return detail::extend_vbinary2(a, b, c, maximum()); -} - -/// Compute vectorized maximum of \p a and \p b, with each value treated as a 2 -/// elements vector type and extend each element to 17 bit. Then add each half -/// of the result and add with \p c. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The addition of each half of extend vectorized maximum of the -/// two values and the third value -template -inline constexpr RetT extend_vmax2_add(AT a, BT b, RetT c) { - return detail::extend_vbinary2(a, b, c, maximum()); -} - -/// Compute vectorized maximum of \p a and \p b with saturation, with each value -/// treated as a 2 elements vector type and extend each element to 17 bit. 
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized maximum of the two values with saturation -template -inline constexpr RetT extend_vmax2_sat(AT a, BT b, RetT c) { - return detail::extend_vbinary2(a, b, c, maximum()); -} - -/// Compute vectorized average of \p a and \p b, with each value treated as a 2 -/// elements vector type and extend each element to 17 bit. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized average of the two values -template -inline constexpr RetT extend_vavrg2(AT a, BT b, RetT c) { - return detail::extend_vbinary2(a, b, c, - detail::average()); -} - -/// Compute vectorized average of \p a and \p b, with each value treated as a 2 -/// elements vector type and extend each element to 17 bit. Then add each half -/// of the result and add with \p c. 
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The addition of each half of extend average maximum of the -/// two values and the third value -template -inline constexpr RetT extend_vavrg2_add(AT a, BT b, RetT c) { - return detail::extend_vbinary2(a, b, c, detail::average()); -} - -/// Compute vectorized average of \p a and \p b with saturation, with each value -/// treated as a 2 elements vector type and extend each element to 17 bit. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized average of the two values with saturation -template -inline constexpr RetT extend_vavrg2_sat(AT a, BT b, RetT c) { - return detail::extend_vbinary2(a, b, c, detail::average()); -} - -/// Extend \p a and \p b to 33 bit and vectorized compare input values using -/// specified comparison \p cmp . -/// -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \tparam [in] BinaryOperation The type of the compare operation -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] cmp The comparsion operator -/// \returns The comparison result of the two extended values. 
-template -inline constexpr unsigned extend_vcompare2(AT a, BT b, BinaryOperation cmp) { - return detail::extend_vbinary2(a, b, 0, cmp); -} - -/// Extend Inputs to 33 bit, and vectorized compare input values using specified -/// comparison \p cmp , then add the result with \p c . -/// -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \tparam [in] BinaryOperation The type of the compare operation -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \param [in] cmp The comparsion operator -/// \returns The comparison result of the two extended values, and add the -/// result with \p c . -template -inline constexpr unsigned extend_vcompare2_add(AT a, BT b, unsigned c, - BinaryOperation cmp) { - return detail::extend_vbinary2(a, b, c, cmp); -} - -/// Compute vectorized addition of \p a and \p b, with each value treated as a -/// 4 elements vector type and extend each element to 9 bit. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized addition of the two values -template -inline constexpr RetT extend_vadd4(AT a, BT b, RetT c) { - return detail::extend_vbinary4(a, b, c, std::plus()); -} - -/// Compute vectorized addition of \p a and \p b, with each value treated as a 4 -/// elements vector type and extend each element to 9 bit. Then add each half -/// of the result and add with \p c. 
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The addition of each half of extend vectorized addition of the two -/// values and the third value -template -inline constexpr RetT extend_vadd4_add(AT a, BT b, RetT c) { - return detail::extend_vbinary4(a, b, c, std::plus()); -} - -/// Compute vectorized addition of \p a and \p b with saturation, with each -/// value treated as a 4 elements vector type and extend each element to 9 bit. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized addition of the two values with saturation -template -inline constexpr RetT extend_vadd4_sat(AT a, BT b, RetT c) { - return detail::extend_vbinary4(a, b, c, std::plus()); -} - -/// Compute vectorized subtraction of \p a and \p b, with each value treated as -/// a 4 elements vector type and extend each element to 9 bit. 
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized subtraction of the two values -template -inline constexpr RetT extend_vsub4(AT a, BT b, RetT c) { - return detail::extend_vbinary4(a, b, c, std::minus()); -} - -/// Compute vectorized subtraction of \p a and \p b, with each value treated as -/// a 4 elements vector type and extend each element to 9 bit. Then add each -/// half of the result and add with \p c. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The addition of each half of extend vectorized subtraction of the -/// two values and the third value -template -inline constexpr RetT extend_vsub4_add(AT a, BT b, RetT c) { - return detail::extend_vbinary4(a, b, c, std::minus()); -} - -/// Compute vectorized subtraction of \p a and \p b with saturation, with each -/// value treated as a 4 elements vector type and extend each element to 9 bit. 
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized subtraction of the two values with saturation -template -inline constexpr RetT extend_vsub4_sat(AT a, BT b, RetT c) { - return detail::extend_vbinary4(a, b, c, std::minus()); -} - -/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a 4 -/// elements vector type and extend each element to 9 bit. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized abs_diff of the two values -template -inline constexpr RetT extend_vabsdiff4(AT a, BT b, RetT c) { - return detail::extend_vbinary4(a, b, c, abs_diff()); -} - -/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a 4 -/// elements vector type and extend each element to 9 bit. Then add each half -/// of the result and add with \p c. 
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The addition of each half of extend vectorized abs_diff of the -/// two values and the third value -template -inline constexpr RetT extend_vabsdiff4_add(AT a, BT b, RetT c) { - return detail::extend_vbinary4(a, b, c, abs_diff()); -} - -/// Compute vectorized abs_diff of \p a and \p b with saturation, with each -/// value treated as a 4 elements vector type and extend each element to 9 bit. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized abs_diff of the two values with saturation -template -inline constexpr RetT extend_vabsdiff4_sat(AT a, BT b, RetT c) { - return detail::extend_vbinary4(a, b, c, abs_diff()); -} - -/// Compute vectorized minimum of \p a and \p b, with each value treated as a 4 -/// elements vector type and extend each element to 9 bit. 
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized minimum of the two values -template -inline constexpr RetT extend_vmin4(AT a, BT b, RetT c) { - return detail::extend_vbinary4(a, b, c, minimum()); -} - -/// Compute vectorized minimum of \p a and \p b, with each value treated as a 4 -/// elements vector type and extend each element to 9 bit. Then add each half -/// of the result and add with \p c. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The addition of each half of extend vectorized minimum of the -/// two values and the third value -template -inline constexpr RetT extend_vmin4_add(AT a, BT b, RetT c) { - return detail::extend_vbinary4(a, b, c, minimum()); -} - -/// Compute vectorized minimum of \p a and \p b with saturation, with each value -/// treated as a 4 elements vector type and extend each element to 9 bit. 
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized minimum of the two values with saturation -template -inline constexpr RetT extend_vmin4_sat(AT a, BT b, RetT c) { - return detail::extend_vbinary4(a, b, c, minimum()); -} - -/// Compute vectorized maximum of \p a and \p b, with each value treated as a 4 -/// elements vector type and extend each element to 9 bit. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized maximum of the two values -template -inline constexpr RetT extend_vmax4(AT a, BT b, RetT c) { - return detail::extend_vbinary4(a, b, c, maximum()); -} - -/// Compute vectorized maximum of \p a and \p b, with each value treated as a 4 -/// elements vector type and extend each element to 9 bit. Then add each half -/// of the result and add with \p c. 
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The addition of each half of extend vectorized maximum of the -/// two values and the third value -template -inline constexpr RetT extend_vmax4_add(AT a, BT b, RetT c) { - return detail::extend_vbinary4(a, b, c, maximum()); -} - -/// Compute vectorized maximum of \p a and \p b with saturation, with each value -/// treated as a 4 elements vector type and extend each element to 9 bit. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized maximum of the two values with saturation -template -inline constexpr RetT extend_vmax4_sat(AT a, BT b, RetT c) { - return detail::extend_vbinary4(a, b, c, maximum()); -} - -/// Compute vectorized average of \p a and \p b, with each value treated as a 4 -/// elements vector type and extend each element to 9 bit. 
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized average of the two values -template -inline constexpr RetT extend_vavrg4(AT a, BT b, RetT c) { - return detail::extend_vbinary4(a, b, c, - detail::average()); -} - -/// Compute vectorized average of \p a and \p b, with each value treated as a 4 -/// elements vector type and extend each element to 9 bit. Then add each half -/// of the result and add with \p c. -/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The addition of each half of extend vectorized average of the -/// two values and the third value -template -inline constexpr RetT extend_vavrg4_add(AT a, BT b, RetT c) { - return detail::extend_vbinary4(a, b, c, detail::average()); -} - -/// Compute vectorized average of \p a and \p b with saturation, with each value -/// treated as a 4 elements vector type and extend each element to 9 bit. 
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \returns The extend vectorized average of the two values with saturation -template -inline constexpr RetT extend_vavrg4_sat(AT a, BT b, RetT c) { - return detail::extend_vbinary4(a, b, c, detail::average()); -} - -/// Extend \p a and \p b to 33 bit and vectorized compare input values using -/// specified comparison \p cmp . -/// -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \tparam [in] BinaryOperation The type of the compare operation -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] cmp The comparsion operator -/// \returns The comparison result of the two extended values. -template -inline constexpr unsigned extend_vcompare4(AT a, BT b, BinaryOperation cmp) { - return detail::extend_vbinary4(a, b, 0, cmp); -} - -/// Extend Inputs to 33 bit, and vectorized compare input values using specified -/// comparison \p cmp , then add the result with \p c . -/// -/// \tparam [in] AT The type of the first value, can only be 32 bit integer -/// \tparam [in] BT The type of the second value, can only be 32 bit integer -/// \tparam [in] BinaryOperation The type of the compare operation -/// \param [in] a The first value -/// \param [in] b The second value -/// \param [in] c The third value -/// \param [in] cmp The comparsion operator -/// \returns The comparison result of the two extended values, and add the -/// result with \p c . 
-template -inline constexpr unsigned extend_vcompare4_add(AT a, BT b, unsigned c, - BinaryOperation cmp) { - return detail::extend_vbinary4(a, b, c, cmp); -} - -} // namespace syclcompat diff --git a/sycl/include/syclcompat/memory.hpp b/sycl/include/syclcompat/memory.hpp deleted file mode 100644 index 7fc21fec8d2d4..0000000000000 --- a/sycl/include/syclcompat/memory.hpp +++ /dev/null @@ -1,1883 +0,0 @@ -/*************************************************************************** - * - * Copyright (C) Codeplay Software Ltd. - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM - * Exceptions. See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * SYCL compatibility extension - * - * memory.hpp - * - * Description: - * memory functionality for the SYCL compatibility extension - **************************************************************************/ - -// The original source was under the license below: -//==---- memory.hpp -------------------------------*- C++ -*----------------==// -// -// Copyright (C) Intel Corporation -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// See https://llvm.org/LICENSE.txt for license information. 
-// -//===----------------------------------------------------------------------===// - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#ifdef SYCL_EXT_ONEAPI_USM_DEVICE_READ_ONLY -#include -#endif - -#include -#include -#include - -#if defined(__linux__) -#include -#elif defined(_WIN64) -#ifndef NOMINMAX -#define NOMINMAX -#endif -#include -#else -#error "Only support Windows and Linux." -#endif - -namespace [[deprecated("syclcompat is deprecated")]] syclcompat { - -template -#ifdef __SYCL_DEVICE_ONLY__ -[[__sycl_detail__::add_ir_attributes_function("sycl-forceinline", true)]] -#endif -__SYCL_ALWAYS_INLINE auto *local_mem() { - sycl::multi_ptr - As_multi_ptr = - sycl::ext::oneapi::group_local_memory_for_overwrite( - sycl::ext::oneapi::this_work_item::get_work_group<3>()); - auto *As = *As_multi_ptr; - return As; -} - -namespace detail { -enum memcpy_direction { - host_to_host, - host_to_device, - device_to_host, - device_to_device, - automatic -}; -} // namespace detail - -template -__syclcompat_inline__ - std::enable_if_t || std::is_same_v, - T> - ptr_to_int(void *ptr) { -#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) - if constexpr (std::is_same_v) { - return (intptr_t)(sycl::decorated_local_ptr::pointer)ptr; - } else { - return (size_t)(sycl::decorated_local_ptr::pointer)ptr; - } -#else - throw sycl::exception(make_error_code(sycl::errc::runtime), - "ptr_to_int is only supported on Nvidia devices."); -#endif -} - -enum class memory_region { - global = 0, // device global memory - constant, // device read-only memory - local, // device local memory - usm_shared, // memory which can be accessed by host and device -}; - -using byte_t = uint8_t; - -/// Buffer type to be used in Memory Management runtime. -typedef sycl::buffer buffer_t; - -/// Pitched 2D/3D memory data. 
-class pitched_data { -public: - pitched_data() : pitched_data(nullptr, 0, 0, 0) {} - pitched_data(void *data, size_t pitch, size_t x, size_t y) - : _data(data), _pitch(pitch), _x(x), _y(y) {} - - void *get_data_ptr() { return _data; } - void set_data_ptr(void *data) { _data = data; } - - size_t get_pitch() { return _pitch; } - void set_pitch(size_t pitch) { _pitch = pitch; } - - size_t get_x() { return _x; } - void set_x(size_t x) { _x = x; }; - - size_t get_y() { return _y; } - void set_y(size_t y) { _y = y; } - -private: - void *_data; - size_t _pitch, _x, _y; -}; - -namespace experimental { -#ifdef SYCL_EXT_ONEAPI_BINDLESS_IMAGES -class image_mem_wrapper; -namespace detail { -static sycl::event memcpy(const image_mem_wrapper *src, - const sycl::id<3> &src_id, pitched_data &dest, - const sycl::id<3> &dest_id, - const sycl::range<3> ©_extend, sycl::queue q); -static sycl::event memcpy(const pitched_data src, const sycl::id<3> &src_id, - image_mem_wrapper *dest, const sycl::id<3> &dest_id, - const sycl::range<3> ©_extend, sycl::queue q); -} // namespace detail -#endif -class image_matrix; -namespace detail { -static pitched_data to_pitched_data(image_matrix *image); -} - -/// Memory copy parameters for 2D/3D memory data. -struct memcpy_parameter { - struct data_wrapper { - pitched_data pitched{}; - sycl::id<3> pos{}; -#ifdef SYCL_EXT_ONEAPI_BINDLESS_IMAGES - experimental::image_mem_wrapper *image_bindless{nullptr}; -#endif - image_matrix *image{nullptr}; - }; - data_wrapper from{}; - data_wrapper to{}; - sycl::range<3> size{}; -}; -} // namespace experimental - -namespace detail { -class mem_mgr { - mem_mgr() { - // Reserved address space, no real memory allocation happens here. 
-#if defined(__linux__) - mapped_address_space = - (byte_t *)mmap(nullptr, mapped_region_size, PROT_NONE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); -#elif defined(_WIN64) - mapped_address_space = (byte_t *)VirtualAlloc( - NULL, // NULL specified as the base address parameter - mapped_region_size, // Size of allocation - MEM_RESERVE, // Allocate reserved pages - PAGE_NOACCESS); // Protection = no access -#else -#error "Only support Windows and Linux." -#endif - next_free = mapped_address_space; - }; - -public: - using buffer_id_t = int; - - struct allocation { - buffer_t buffer; - byte_t *alloc_ptr; - size_t size; - }; - - ~mem_mgr() { -#if defined(__linux__) - munmap(mapped_address_space, mapped_region_size); -#elif defined(_WIN64) - VirtualFree(mapped_address_space, 0, MEM_RELEASE); -#else -#error "Only support Windows and Linux." -#endif - }; - - mem_mgr(const mem_mgr &) = delete; - mem_mgr &operator=(const mem_mgr &) = delete; - mem_mgr(mem_mgr &&) = delete; - mem_mgr &operator=(mem_mgr &&) = delete; - - /// Allocate - void *mem_alloc(size_t size) { - if (!size) - return nullptr; - std::lock_guard lock(m_mutex); - if (next_free + size > mapped_address_space + mapped_region_size) { - throw std::runtime_error( - "[SYCLcompat] malloc: out of memory for virtual memory pool"); - } - // Allocation - sycl::range<1> buffer_range(size); - buffer_t buf(buffer_range); - allocation alloc{buf, next_free, size}; - // Map allocation to device pointer - void *result = next_free; - m_map.emplace(next_free + size, alloc); - // Update pointer to the next free space. 
- next_free += (size + extra_padding + alignment - 1) & ~(alignment - 1); - - return result; - } - - /// Deallocate - void mem_free(const void *ptr) { - if (!ptr) - return; - std::lock_guard lock(m_mutex); - auto it = get_map_iterator(ptr); - m_map.erase(it); - } - - /// map: device pointer -> allocation(buffer, alloc_ptr, size) - allocation translate_ptr(const void *ptr) { - std::lock_guard lock(m_mutex); - auto it = get_map_iterator(ptr); - return it->second; - } - - /// Check if the pointer represents device pointer or not. - bool is_device_ptr(const void *ptr) const { - std::lock_guard lock(m_mutex); - return (mapped_address_space <= ptr) && - (ptr < mapped_address_space + mapped_region_size); - } - - /// Returns the instance of memory manager singleton. - static mem_mgr &instance() { - static mem_mgr m; - return m; - } - -private: - std::map m_map; - mutable std::mutex m_mutex; - byte_t *mapped_address_space; - byte_t *next_free; - const size_t mapped_region_size = 128ull * 1024 * 1024 * 1024; - const size_t alignment = 256; - /// This padding may be defined to some positive value to debug - /// out of bound accesses. - const size_t extra_padding = 0; - - std::map::iterator get_map_iterator(const void *ptr) { - auto it = m_map.upper_bound((byte_t *)ptr); - if (it == m_map.end()) { - // Not a virtual pointer. - throw std::runtime_error("[SYCLcompat] can not get buffer from non-virtual pointer"); - } - const allocation &alloc = it->second; - if (ptr < alloc.alloc_ptr) { - // Out of bound. - // This may happen if there's a gap between allocations due to alignment - // or extra padding and pointer points to this gap. - throw std::runtime_error("[SYCLcompat] invalid virtual pointer"); - } - return it; - } -}; - -template class accessor; -template class memory_traits { -public: - static constexpr sycl::access::address_space asp = - (Memory == memory_region::local) - ? 
sycl::access::address_space::local_space - : sycl::access::address_space::global_space; - static constexpr sycl::target target = (Memory == memory_region::local) - ? sycl::target::local - : sycl::target::device; - static constexpr sycl::access_mode mode = (Memory == memory_region::constant) - ? sycl::access_mode::read - : sycl::access_mode::read_write; - static constexpr size_t type_size = sizeof(T); - using element_t = - typename std::conditional_t; - using value_t = typename std::remove_cv_t; - template - using accessor_t = - typename std::conditional_t, - sycl::accessor>; - using pointer_t = - typename std::conditional_t; -}; - -static inline void *malloc(size_t size, sycl::queue q) { -#ifdef SYCLCOMPAT_USM_LEVEL_NONE - return mem_mgr::instance().mem_alloc(size * sizeof(byte_t)); -#else - return sycl::malloc_device(size, q.get_device(), q.get_context()); -#endif // SYCLCOMPAT_USM_LEVEL_NONE -} - -/// Calculate pitch (padded length of major dimension \p x) by rounding up to -/// multiple of 32. -/// \param x The dimension to be padded (in bytes) -/// \returns size_t representing pitched length of dimension x (in bytes). -static inline constexpr size_t get_pitch(size_t x) { - return ((x) + 31) & ~(0x1F); -} - -/// \brief Malloc pitched 3D data -/// \param [out] pitch returns the calculated pitch (in bytes) -/// \param [in] x width of the allocation (in bytes) -/// \param [in] y height of the allocation -/// \param [in] z depth of the allocation -/// \param [in] q The queue in which the operation is done. -/// \returns A pointer to the allocated memory -static inline void *malloc(size_t &pitch, size_t x, size_t y, size_t z, - sycl::queue q) { - pitch = get_pitch(x); - return malloc(pitch * y * z, q); -} - -/// \brief Set \p pattern to the first \p count elements of type \p T -/// starting from \p dev_ptr. -/// -/// \tparam T Datatype of the pattern to be set. -/// \param q The queue in which the operation is done. 
-/// \param dev_ptr Pointer to the device memory address. -/// \param pattern Pattern of type T to be set. -/// \param count Number of elements to be set to the patten. -/// \returns An event representing the fill operation. -template -static inline sycl::event fill(sycl::queue q, void *dev_ptr, const T &pattern, - size_t count) { -#ifdef SYCLCOMPAT_USM_LEVEL_NONE - auto &mm = mem_mgr::instance(); - assert(mm.is_device_ptr(dev_ptr)); - auto alloc = mm.translate_ptr(dev_ptr); - size_t offset = (T *)dev_ptr - (T *)alloc.alloc_ptr; - - return q.submit([&](sycl::handler &cgh) { - auto r = sycl::range<1>(count); - auto o = sycl::id<1>(offset); - auto new_buffer = - alloc.buffer.reinterpret(sycl::range<1>(alloc.size / sizeof(T))); - sycl::accessor - acc(new_buffer, cgh, r, o); - cgh.fill(acc, pattern); - }); -#else - return q.fill(dev_ptr, pattern, count); -#endif -} - -/// Set \p value to the first \p size bytes starting from \p dev_ptr in \p q. -/// -/// \param q The queue in which the operation is done. -/// \param dev_ptr Pointer to the device memory address. -/// \param value Value to be set. -/// \param size Number of bytes to be set to the value. -/// \returns An event representing the memset operation. -static inline sycl::event memset(sycl::queue q, void *dev_ptr, int value, - size_t size) { -#ifdef SYCLCOMPAT_USM_LEVEL_NONE - auto &mm = mem_mgr::instance(); - assert(mm.is_device_ptr(dev_ptr)); - auto alloc = mm.translate_ptr(dev_ptr); - size_t offset = (byte_t *)dev_ptr - (byte_t *)alloc.alloc_ptr; - - return q.submit([&](sycl::handler &cgh) { - auto r = sycl::range<1>(size); - auto o = sycl::id<1>(offset); - auto new_buffer = alloc.buffer.reinterpret( - sycl::range<1>(alloc.size / sizeof(byte_t))); - sycl::accessor - acc(new_buffer, cgh, r, o); - cgh.fill(acc, static_cast(value)); - }); -#else - return q.memset(dev_ptr, value, size); -#endif // SYCLCOMPAT_USM_LEVEL_NONE -} - -/// \brief Sets \p value to the 3D memory region pointed by \p data in \p q. 
-/// \tparam T The type of the element to be set. -/// \param [in] q The queue in which the operation is done. -/// \param [in] data Pointer to the pitched device memory region. -/// \param [in] value The value to be set. -/// \param [in] size 3D memory region by number of elements. -/// \return An event list representing the memset operations. -template -static inline std::vector -memset(sycl::queue q, pitched_data data, const T &value, sycl::range<3> size) { - std::vector event_list; - size_t slice = data.get_pitch() * data.get_y(); - unsigned char *data_surface = (unsigned char *)data.get_data_ptr(); - for (size_t z = 0; z < size.get(2); ++z) { - unsigned char *data_ptr = data_surface; - for (size_t y = 0; y < size.get(1); ++y) { - event_list.push_back(detail::fill(q, data_ptr, value, size.get(0))); - data_ptr += data.get_pitch(); - } - data_surface += slice; - } - return event_list; -} - -/// \brief Sets \p val to the pitched 2D memory region pointed by \p ptr in \p -/// q. -/// \tparam T The type of the element to be set. -/// \param [in] q The queue in which the operation is done. -/// \param [in] ptr Pointer to the virtual device memory. -/// \param [in] pitch The pitch size by number of elements, including padding. -/// \param [in] value The value to be set. -/// \param [in] x The width of memory region by number of elements. -/// \param [in] y The height of memory region by number of elements. -/// \return An event list representing the memset operations. -template -static inline std::vector memset(sycl::queue q, void *ptr, - size_t pitch, const T &value, - size_t x, size_t y) { - return memset(q, pitched_data(ptr, pitch, x, 1), value, - sycl::range<3>(x, y, 1)); -} - -enum class pointer_access_attribute { - host_only = 0, - device_only, - host_device, - end -}; - -static pointer_access_attribute get_pointer_attribute(sycl::queue q, - const void *ptr) { -#ifdef SYCLCOMPAT_USM_LEVEL_NONE - return mem_mgr::instance().is_device_ptr(ptr) - ? 
pointer_access_attribute::device_only - : pointer_access_attribute::host_only; -#else - switch (sycl::get_pointer_type(ptr, q.get_context())) { - case sycl::usm::alloc::unknown: - return pointer_access_attribute::host_only; - case sycl::usm::alloc::device: - return pointer_access_attribute::device_only; - case sycl::usm::alloc::shared: - case sycl::usm::alloc::host: - return pointer_access_attribute::host_device; - } -#endif // SYCLCOMPAT_USM_LEVEL_NONE -} - -static memcpy_direction -deduce_memcpy_direction(sycl::queue q, void *to_ptr, const void *from_ptr) { - // table[to_attribute][from_attribute] - static const memcpy_direction - direction_table[static_cast(pointer_access_attribute::end)] - [static_cast(pointer_access_attribute::end)] = { - {host_to_host, device_to_host, host_to_host}, - {host_to_device, device_to_device, device_to_device}, - {host_to_host, device_to_device, device_to_device}}; - return direction_table[static_cast(get_pointer_attribute( - q, to_ptr))][static_cast(get_pointer_attribute(q, from_ptr))]; -} - -static sycl::event memcpy(sycl::queue q, void *to_ptr, const void *from_ptr, - size_t size, - const std::vector &dep_events = {}) { - if (!size) - return sycl::event{}; -#ifdef SYCLCOMPAT_USM_LEVEL_NONE - auto &mm = mem_mgr::instance(); - auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr); - - switch (real_direction) { - case host_to_host: - return q.submit([&](sycl::handler &cgh) { - cgh.depends_on(dep_events); - cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); }); - }); - case host_to_device: { - auto alloc = mm.translate_ptr(to_ptr); - size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr; - return q.submit([&](sycl::handler &cgh) { - cgh.depends_on(dep_events); - auto r = sycl::range<1>(size); - auto o = sycl::id<1>(offset); - sycl::accessor - acc(alloc.buffer, cgh, r, o); - cgh.copy(from_ptr, acc); - }); - } - case device_to_host: { - auto alloc = mm.translate_ptr(from_ptr); - size_t offset = (byte_t *)from_ptr - 
alloc.alloc_ptr; - return q.submit([&](sycl::handler &cgh) { - cgh.depends_on(dep_events); - auto r = sycl::range<1>(size); - auto o = sycl::id<1>(offset); - sycl::accessor - acc(alloc.buffer, cgh, r, o); - cgh.copy(acc, to_ptr); - }); - } - case device_to_device: { - auto to_alloc = mm.translate_ptr(to_ptr); - auto from_alloc = mm.translate_ptr(from_ptr); - size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr; - size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr; - return q.submit([&](sycl::handler &cgh) { - cgh.depends_on(dep_events); - auto r = sycl::range<1>(size); - auto to_o = sycl::id<1>(to_offset); - auto from_o = sycl::id<1>(from_offset); - sycl::accessor - to_acc(to_alloc.buffer, cgh, r, to_o); - sycl::accessor - from_acc(from_alloc.buffer, cgh, r, from_o); - cgh.copy(from_acc, to_acc); - }); - } - default: - throw std::runtime_error("[SYCLcompat] memcpy: invalid direction value"); - } -#else - return q.memcpy(to_ptr, from_ptr, size, dep_events); -#endif // SYCLCOMPAT_USM_LEVEL_NONE -} - -// Get actual copy range and make sure it will not exceed range. 
-static inline size_t get_copy_range(sycl::range<3> size, size_t slice, - size_t pitch) { - return slice * (size.get(2) - 1) + pitch * (size.get(1) - 1) + size.get(0); -} - -static inline size_t get_offset(sycl::id<3> id, size_t slice, size_t pitch) { - return slice * id.get(2) + pitch * id.get(1) + id.get(0); -} - -// RAII for host pointer -class host_buffer { - void *_buf; - size_t _size; - sycl::queue _q; - const std::vector &_deps; // free operation depends - -public: - host_buffer(size_t size, sycl::queue q, const std::vector &deps) - : _buf(std::malloc(size)), _size(size), _q(q), _deps(deps) {} - void *get_ptr() const { return _buf; } - size_t get_size() const { return _size; } - ~host_buffer() { - if (_buf) { - _q.submit([&](sycl::handler &cgh) { - cgh.depends_on(_deps); - cgh.host_task([buf = _buf] { std::free(buf); }); - }); - } - } -}; - -/// copy 3D matrix specified by \p size from 3D matrix specified by \p from_ptr -/// and \p from_range to another specified by \p to_ptr and \p to_range. 
-template -static inline std::vector -memcpy(sycl::queue q, void *to_ptr, const void *from_ptr, - sycl::range<3> to_range, sycl::range<3> from_range, sycl::id<3> to_id, - sycl::id<3> from_id, sycl::range<3> size, - const std::vector &dep_events = {}) { - static_assert( - std::is_same_v, - "This syclcompat::detail::memcpy overload only accepts a dummy template " - "parameter, T = void, which prevents SYCL kernel generation by default."); - std::vector event_list; - - size_t to_slice = to_range.get(1) * to_range.get(0); - size_t from_slice = from_range.get(1) * from_range.get(0); - unsigned char *to_surface = - (unsigned char *)to_ptr + get_offset(to_id, to_slice, to_range.get(0)); - const unsigned char *from_surface = - (const unsigned char *)from_ptr + - get_offset(from_id, from_slice, from_range.get(0)); - - if (to_slice == from_slice && to_slice == size.get(1) * size.get(0)) { - return {memcpy(q, to_surface, from_surface, to_slice * size.get(2), - dep_events)}; - } - using namespace experimental; // for memcpy_direction - memcpy_direction direction = deduce_memcpy_direction(q, to_ptr, from_ptr); - size_t size_slice = size.get(1) * size.get(0); - switch (direction) { - case host_to_host: - for (size_t z = 0; z < size.get(2); ++z) { - unsigned char *to_ptr = to_surface; - const unsigned char *from_ptr = from_surface; - if (to_range.get(0) == from_range.get(0) && - to_range.get(0) == size.get(0)) { - event_list.push_back( - memcpy(q, to_ptr, from_ptr, size_slice, dep_events)); - } else { - for (size_t y = 0; y < size.get(1); ++y) { - event_list.push_back( - memcpy(q, to_ptr, from_ptr, size.get(0), dep_events)); - to_ptr += to_range.get(0); - from_ptr += from_range.get(0); - } - } - to_surface += to_slice; - from_surface += from_slice; - } - break; - case host_to_device: { - host_buffer buf(get_copy_range(size, to_slice, to_range.get(0)), q, - event_list); - std::vector host_events; - if (to_slice == size_slice) { - // Copy host data to a temp host buffer with the 
shape of target. - host_events = - memcpy(q, buf.get_ptr(), from_surface, to_range, from_range, - sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, dep_events); - } else { - // Copy host data to a temp host buffer with the shape of target. - host_events = - memcpy(q, buf.get_ptr(), from_surface, to_range, from_range, - sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, - // If has padding data, not sure whether it is useless. So fill - // temp buffer with it. - std::vector{memcpy(q, buf.get_ptr(), to_surface, - buf.get_size(), dep_events)}); - } - // Copy from temp host buffer to device with only one submit. - event_list.push_back( - memcpy(q, to_surface, buf.get_ptr(), buf.get_size(), host_events)); - break; - } - case device_to_host: { - host_buffer buf(get_copy_range(size, from_slice, from_range.get(0)), q, - event_list); - // Copy from host temp buffer to host target with reshaping. - event_list = - memcpy(q, to_surface, buf.get_ptr(), to_range, from_range, - sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, - // Copy from device to temp host buffer with only one submit. 
- std::vector{memcpy(q, buf.get_ptr(), from_surface, - buf.get_size(), dep_events)}); - break; - } - case device_to_device: -#ifdef SYCLCOMPAT_USM_LEVEL_NONE - { - auto &mm = mem_mgr::instance(); - auto to_alloc = mm.translate_ptr(to_surface); - auto from_alloc = mm.translate_ptr(from_surface); - size_t to_offset = (byte_t *)to_surface - to_alloc.alloc_ptr; - size_t from_offset = (byte_t *)from_surface - from_alloc.alloc_ptr; - event_list.push_back(q.submit([&](sycl::handler &cgh) { - cgh.depends_on(dep_events); - auto to_o = sycl::id<1>(to_offset); - auto from_o = sycl::id<1>(from_offset); - sycl::accessor - to_acc(to_alloc.buffer, cgh, - get_copy_range(size, to_slice, to_range.get(0)), to_o); - sycl::accessor - from_acc(from_alloc.buffer, cgh, - get_copy_range(size, from_slice, from_range.get(0)), from_o); - cgh.parallel_for( - size, [=](sycl::id<3> id) { - to_acc[get_offset(id, to_slice, to_range.get(0))] = - from_acc[get_offset(id, from_slice, from_range.get(0))]; - }); - })); - } -#else - event_list.push_back(q.submit([&](sycl::handler &cgh) { - cgh.depends_on(dep_events); - cgh.parallel_for(size, [=](sycl::id<3> id) { - to_surface[get_offset(id, to_slice, to_range.get(0))] = - from_surface[get_offset(id, from_slice, from_range.get(0))]; - }); - })); -#endif // SYCLCOMPAT_USM_LEVEL_NONE - break; - default: - throw std::runtime_error("[SYCLcompat] memcpy: invalid direction value"); - } - return event_list; -} - -/// memcpy 2D/3D matrix specified by pitched_data. 
-template -static inline std::vector -memcpy(sycl::queue q, pitched_data to, sycl::id<3> to_id, pitched_data from, - sycl::id<3> from_id, sycl::range<3> size) { - static_assert( - std::is_same_v, - "This syclcompat::detail::memcpy overload only accepts a dummy template " - "parameter, T = void, which prevents SYCL kernel generation by default."); - return memcpy(q, to.get_data_ptr(), from.get_data_ptr(), - sycl::range<3>(to.get_pitch(), to.get_y(), 1), - sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id, - from_id, size); -} - -/// memcpy 2D matrix with pitch. -template -static inline std::vector -memcpy(sycl::queue q, void *to_ptr, const void *from_ptr, size_t to_pitch, - size_t from_pitch, size_t x, size_t y) { - static_assert( - std::is_same_v, - "This syclcompat::detail::memcpy overload only accepts a dummy template " - "parameter, T = void, which prevents SYCL kernel generation by default."); - return memcpy(q, to_ptr, from_ptr, sycl::range<3>(to_pitch, y, 1), - sycl::range<3>(from_pitch, y, 1), sycl::id<3>(0, 0, 0), - sycl::id<3>(0, 0, 0), sycl::range<3>(x, y, 1)); -} - -// Takes a std::vector & returns a single event -// which simply depends on all of them -static sycl::event combine_events(std::vector &events, - sycl::queue q) { - return q.submit([&events](sycl::handler &cgh) { - cgh.depends_on(events); - cgh.host_task([]() {}); - }); -} -} // namespace detail - -#ifdef SYCLCOMPAT_USM_LEVEL_NONE -/// Check if the pointer \p ptr represents device pointer or not. -/// -/// \param ptr The pointer to be checked. -/// \returns true if \p ptr is a device pointer. -template static inline bool is_device_ptr(T ptr) { - if constexpr (std::is_pointer::value) { - return detail::mem_mgr::instance().is_device_ptr(ptr); - } - return false; -} -#endif - -/// Get the buffer and the offset of a piece of memory pointed to by \p ptr. -/// -/// \param ptr Pointer to a piece of memory. -/// If NULL is passed as an argument, an exception will be thrown. 
-/// \returns a pair containing both the buffer and the offset. -static std::pair get_buffer_and_offset(const void *ptr) { - if (ptr) { - auto alloc = detail::mem_mgr::instance().translate_ptr(ptr); - size_t offset = (byte_t *)ptr - alloc.alloc_ptr; - return std::make_pair(alloc.buffer, offset); - } else { - throw std::runtime_error( - "[SYCLcompat] NULL pointer argument in get_buffer_and_offset function is invalid"); - } -} - -/// Get the data pointed from \p ptr as a 1D buffer reinterpreted as type T. -template static sycl::buffer get_buffer(const void *ptr) { - if (!ptr) - return sycl::buffer(sycl::range<1>(0)); - auto alloc = detail::mem_mgr::instance().translate_ptr(ptr); - return alloc.buffer.reinterpret(sycl::range<1>(alloc.size / sizeof(T))); -} - -/// Get the buffer of a piece of memory pointed to by \p ptr. -/// -/// \param ptr Pointer to a piece of memory. -/// \returns the buffer. -static buffer_t get_buffer(const void *ptr) { - return detail::mem_mgr::instance().translate_ptr(ptr).buffer; -} - -/// Get the host pointer from a buffer that is mapped to virtual pointer ptr. -/// \param ptr Virtual Pointer mapped to device buffer -/// \returns A host pointer -template static inline T *get_host_ptr(const void *ptr) { - auto BufferOffset = get_buffer_and_offset(ptr); - auto host_ptr = BufferOffset.first.get_host_access() - .get_multi_ptr(); - return (T *)(host_ptr + BufferOffset.second); -} - -/// A wrapper class contains an accessor and an offset. -template -class access_wrapper { - sycl::accessor accessor; - size_t offset; - -public: - /// Construct the accessor wrapper for memory pointed by \p ptr. - /// - /// \param ptr Pointer to memory. - /// \param cgh The command group handler. - access_wrapper(const void *ptr, sycl::handler &cgh) - : accessor(get_buffer(ptr).get_access(cgh)), offset(0) { - auto alloc = detail::mem_mgr::instance().translate_ptr(ptr); - offset = (byte_t *)ptr - alloc.alloc_ptr; - } - - /// Get the device pointer. 
- /// - /// \returns a device pointer with offset. - dataT get_raw_pointer() const { return (dataT)(&accessor[0] + offset); } -}; - -/// Get the accessor for memory pointed by \p ptr. -/// -/// \param ptr Pointer to memory. -/// If NULL is passed as an argument, an exception will be thrown. -/// \param cgh The command group handler. -/// \returns an accessor. -template -static sycl::accessor get_access(const void *ptr, - sycl::handler &cgh) { - if (ptr) { - auto alloc = detail::mem_mgr::instance().translate_ptr(ptr); - return alloc.buffer.get_access(cgh); - } else { - throw std::runtime_error( - "[SYCLcompat] NULL pointer argument in get_access function is invalid"); - } -} - -namespace experimental { -namespace detail { -template -static inline std::vector -memcpy(sycl::queue q, const experimental::memcpy_parameter ¶m) { - static_assert(std::is_same_v, - "This syclcompat::experimental::detail::memcpy overload only " - "accepts a dummy template parameter, T = void, which prevents " - "SYCL kernel generation by default."); - auto to = param.to.pitched; - auto from = param.from.pitched; -#ifdef SYCL_EXT_ONEAPI_BINDLESS_IMAGES - if (param.to.image_bindless != nullptr && - param.from.image_bindless != nullptr) { - throw std::runtime_error( - "[SYCLcompat] memcpy: Unsupported bindless_image API."); - // TODO: Need change logic when sycl support image_mem to image_mem copy. 
- std::vector event_list; - syclcompat::detail::host_buffer buf(param.size.size(), q, event_list); - to.set_data_ptr(buf.get_ptr()); - experimental::detail::memcpy(param.from.image_bindless, param.from.pos, to, - sycl::id<3>(0, 0, 0), param.size, q); - from.set_data_ptr(buf.get_ptr()); - event_list.push_back(experimental::detail::memcpy( - from, sycl::id<3>(0, 0, 0), param.to.image_bindless, param.to.pos, - param.size, q)); - return event_list; - } else if (param.to.image_bindless != nullptr) { - throw std::runtime_error( - "[SYCLcompat] memcpy: Unsupported bindless_image API."); - return {experimental::detail::memcpy(from, param.from.pos, - param.to.image_bindless, param.to.pos, - param.size, q)}; - } else if (param.from.image_bindless != nullptr) { - throw std::runtime_error( - "[SYCLcompat] memcpy: Unsupported bindless_image API."); - return {experimental::detail::memcpy(param.from.image_bindless, - param.from.pos, to, param.to.pos, - param.size, q)}; - } -#endif - if (param.to.image != nullptr) { - throw std::runtime_error("[SYCLcompat] memcpy: Unsupported image API."); - to = experimental::detail::to_pitched_data(param.to.image); - } - if (param.from.image != nullptr) { - throw std::runtime_error("[SYCLcompat] memcpy: Unsupported image API."); - from = experimental::detail::to_pitched_data(param.from.image); - } - return syclcompat::detail::memcpy(q, to, param.to.pos, from, param.from.pos, - param.size); -} -} // namespace detail -} // namespace experimental - -/// Allocate memory block on the device. -/// \param num_bytes Number of bytes to allocate. -/// \param q Queue to execute the allocate task. -/// \returns A pointer to the newly allocated memory. -static inline void *malloc(size_t num_bytes, - sycl::queue q = get_default_queue()) { - return detail::malloc(num_bytes, q); -} - -/// Allocate memory block on the device. -/// \param T Datatype to allocate -/// \param count Number of elements to allocate. -/// \param q Queue to execute the allocate task. 
-/// \returns A pointer to the newly allocated memory. -template -static inline T *malloc(size_t count, sycl::queue q = get_default_queue()) { - return static_cast(detail::malloc(count * sizeof(T), q)); -} - -/// Allocate memory block on the host. -/// \param num_bytes Number of bytes to allocate. -/// \param q Queue to execute the allocate task. -/// \returns A pointer to the newly allocated memory. -static inline void *malloc_host(size_t num_bytes, - sycl::queue q = get_default_queue()) { - return sycl::malloc_host(num_bytes, q); -} - -/// Allocate memory block on the host. -/// \param T Datatype to allocate -/// \param num_bytes Number of bytes to allocate. -/// \param q Queue to execute the allocate task. -/// \returns A pointer to the newly allocated memory. -template -static inline T *malloc_host(size_t count, - sycl::queue q = get_default_queue()) { - return static_cast(sycl::malloc_host(count * sizeof(T), q)); -} - -/// Allocate memory block of usm_shared memory. -/// \param num_bytes Number of bytes to allocate. -/// \param q Queue to execute the allocate task. -/// \returns A pointer to the newly allocated memory. -static inline void *malloc_shared(size_t num_bytes, - sycl::queue q = get_default_queue()) { - return sycl::malloc_shared(num_bytes, q); -} - -/// Allocate memory block of usm_shared memory. -/// \param num_bytes Number of bytes to allocate. -/// \param q Queue to execute the allocate task. -/// \returns A pointer to the newly allocated memory. -template -static inline T *malloc_shared(size_t count, - sycl::queue q = get_default_queue()) { - return static_cast(sycl::malloc_shared(count * sizeof(T), q)); -} - -/// Allocate memory block for 3D array on the device. -/// \param size Size of the memory block, in bytes. -/// \param q Queue to execute the allocate task. -/// \returns A pitched_data object which stores the memory info. 
-static inline pitched_data malloc(sycl::range<3> size, - sycl::queue q = get_default_queue()) { - pitched_data pitch(nullptr, 0, size.get(0), size.get(1)); - size_t pitch_size; - pitch.set_data_ptr( - detail::malloc(pitch_size, size.get(0), size.get(1), size.get(2), q)); - pitch.set_pitch(pitch_size); - return pitch; -} - -/// Allocate memory block for 2D array on the device. -/// \param [out] pitch Aligned size of x in bytes. -/// \param x Range in dim x. -/// \param y Range in dim y. -/// \param q Queue to execute the allocate task. -/// \returns A pointer to the newly allocated memory. -static inline void *malloc(size_t &pitch, size_t x, size_t y, - sycl::queue q = get_default_queue()) { - return detail::malloc(pitch, x, y, 1, q); -} - -namespace detail { - -inline void free(void *ptr, const sycl::queue &q) { - if (ptr) { -#ifdef SYCLCOMPAT_USM_LEVEL_NONE - detail::mem_mgr::instance().mem_free(ptr); -#else - sycl::free(ptr, q.get_context()); -#endif // SYCLCOMPAT_USM_LEVEL_NONE - } -} -} // namespace detail - -/// Wait on the queue \p q and free the memory \p ptr. -/// \param ptr Point to free. -/// \param q Queue to execute the free task. -/// \returns no return value. -static inline void wait_and_free(void *ptr, - sycl::queue q = get_default_queue()) { - get_current_device().queues_wait_and_throw(); - q.wait(); - if (ptr) { - detail::free(ptr, q); - } -} - -// Anonymous namespace to disable ADL for functions which might clash (memcpy, -// memset, free) -namespace { -/// Free the memory \p ptr on the default queue without synchronizing -/// \param ptr Point to free. -/// \returns no return value. -static inline void free(void *ptr, sycl::queue q = get_default_queue()) { - detail::free(ptr, q); -} -} // namespace - -/// Enqueues the release of all pointers in /p pointers on the /p q. -/// The command waits on all passed /p events and returns an event that -/// track the commands execution on the queue. 
-/// -/// \param pointers The pointers point to the device memory requested to be -/// freed. -/// \param events The events to be waited on. -/// \param q The sycl::queue the memory relates to. -// Can't be static due to the friend declaration in the memory header. -inline sycl::event enqueue_free(const std::vector &pointers, - const std::vector &events, - sycl::queue q = get_default_queue()) { - auto event = q.submit( - [&pointers, &events, &q](sycl::handler &cgh) { - cgh.depends_on(events); - cgh.host_task([=]() { - for (auto p : pointers) - detail::free(p, q); - }); - }); - get_current_device().add_event(event); - return event; -} - -namespace { -/// Synchronously copies \p size bytes from the address specified by \p from_ptr -/// to the address specified by \p to_ptr. The function will -/// return after the copy is completed. -/// -/// \param to_ptr Pointer to destination memory address. -/// \param from_ptr Pointer to source memory address. -/// \param size Number of bytes to be copied. -/// \param q Queue to execute the copy task. -/// \returns no return value. -static void memcpy(void *to_ptr, const void *from_ptr, size_t size, - sycl::queue q = get_default_queue()) { - detail::memcpy(q, to_ptr, from_ptr, size).wait(); -} - -} // namespace - -/// Asynchronously copies \p size bytes from the address specified by \p -/// from_ptr to the address specified by \p to_ptr. The return of the function -/// does NOT guarantee the copy is completed. -/// -/// \param to_ptr Pointer to destination memory address. -/// \param from_ptr Pointer to source memory address. -/// \param size Number of bytes to be copied. -/// \param q Queue to execute the copy task. -/// \returns no return value. 
-static sycl::event memcpy_async(void *to_ptr, const void *from_ptr, size_t size, - sycl::queue q = get_default_queue()) { - return detail::memcpy(q, to_ptr, from_ptr, size); -} - -/// Asynchronously copies \p count T's from the address specified by \p -/// from_ptr to the address specified by \p to_ptr. The return of the function -/// does NOT guarantee the copy is completed. -/// -/// \tparam T Datatype to be copied. -/// \param to_ptr Pointer to destination memory address. -/// \param from_ptr Pointer to source memory address. -/// \param count Number of T to be copied. -/// \param q Queue to execute the copy task. -/// \returns no return value. -template -static sycl::event -memcpy_async(type_identity_t *to_ptr, const type_identity_t *from_ptr, - size_t count, sycl::queue q = get_default_queue()) { - return detail::memcpy(q, static_cast(to_ptr), - static_cast(from_ptr), count * sizeof(T)); -} - -namespace { -/// Synchronously copies \p count T's from the address specified by \p from_ptr -/// to the address specified by \p to_ptr. The function will -/// return after the copy is completed. -/// -/// \tparam T Datatype to be copied. -/// \param to_ptr Pointer to destination memory address. -/// \param from_ptr Pointer to source memory address. -/// \param count Number of T to be copied. -/// \param q Queue to execute the copy task. -/// \returns no return value. -template -static void memcpy(type_identity_t *to_ptr, - const type_identity_t *from_ptr, size_t count, - sycl::queue q = get_default_queue()) { - detail::memcpy(q, static_cast(to_ptr), - static_cast(from_ptr), count * sizeof(T)) - .wait(); -} - -/// Synchronously copies 2D matrix specified by \p x and \p y from the address -/// specified by \p from_ptr to the address specified by \p to_ptr, while \p -/// from_pitch and \p to_pitch are the range of dim x in bytes of the matrix -/// specified by \p from_ptr and \p to_ptr. The function will return after the -/// copy is completed. 
-/// -/// \tparam T Dummy template parameter to delay SYCL kernel instantiation -/// \param to_ptr Pointer to destination memory address. -/// \param to_pitch Range of dim x in bytes of destination matrix. -/// \param from_ptr Pointer to source memory address. -/// \param from_pitch Range of dim x in bytes of source matrix. -/// \param x Range of dim x of matrix to be copied. -/// \param y Range of dim y of matrix to be copied. -/// \param q Queue to execute the copy task. -/// \returns no return value. -template -static inline void memcpy(void *to_ptr, size_t to_pitch, const void *from_ptr, - size_t from_pitch, size_t x, size_t y, - sycl::queue q = get_default_queue()) { - static_assert( - std::is_same_v, - "This syclcompat::memcpy overload only accepts a dummy template " - "parameter, T = void, which prevents SYCL kernel generation by default."); - sycl::event::wait( - detail::memcpy(q, to_ptr, from_ptr, to_pitch, from_pitch, x, y)); -} - -} // namespace - -/// Asynchronously copies 2D matrix specified by \p x and \p y from the address -/// specified by \p from_ptr to the address specified by \p to_ptr, while \p -/// \p from_pitch and \p to_pitch are the range of dim x in bytes of the matrix -/// specified by \p from_ptr and \p to_ptr. The return of the function does NOT -/// guarantee the copy is completed. -/// -/// \tparam T Dummy template parameter to delay SYCL kernel instantiation -/// \param to_ptr Pointer to destination memory address. -/// \param to_pitch Range of dim x in bytes of destination matrix. -/// \param from_ptr Pointer to source memory address. -/// \param from_pitch Range of dim x in bytes of source matrix. -/// \param x Range of dim x of matrix to be copied. -/// \param y Range of dim y of matrix to be copied. -/// \param q Queue to execute the copy task. -/// \returns An event representing the memcpy operation. 
-template -static inline sycl::event memcpy_async(void *to_ptr, size_t to_pitch, - const void *from_ptr, size_t from_pitch, - size_t x, size_t y, - sycl::queue q = get_default_queue()) { - static_assert( - std::is_same_v, - "This syclcompat::memcpy overload only accepts a dummy template " - "parameter, T = void, which prevents SYCL kernel generation by default."); - auto events = detail::memcpy(q, to_ptr, from_ptr, to_pitch, from_pitch, x, y); - return detail::combine_events(events, q); -} - -namespace { -/// Synchronously copies a subset of a 3D matrix specified by \p to to another -/// 3D matrix specified by \p from. The from and to position info are specified -/// by \p from_pos and \p to_pos The copied matrix size is specified by \p size. -// The function will return after the copy is completed. -/// -/// \tparam T Dummy template parameter to delay SYCL kernel instantiation -/// \param to Destination matrix info. -/// \param to_pos Position of destination. -/// \param from Source matrix info. -/// \param from_pos Position of destination. -/// \param size Range of the submatrix to be copied. -/// \param q Queue to execute the copy task. -/// \returns no return value. -template -static inline void memcpy(pitched_data to, sycl::id<3> to_pos, - pitched_data from, sycl::id<3> from_pos, - sycl::range<3> size, - sycl::queue q = get_default_queue()) { - static_assert( - std::is_same_v, - "This syclcompat::memcpy overload only accepts a dummy template " - "parameter, T = void, which prevents SYCL kernel generation by default."); - sycl::event::wait(detail::memcpy(q, to, to_pos, from, from_pos, size)); -} -} // namespace - -/// Asynchronously copies a subset of a 3D matrix specified by \p to to another -/// 3D matrix specified by \p from. The from and to position info are specified -/// by \p from_pos and \p to_pos The copied matrix size is specified by \p size. -/// The return of the function does NOT guarantee the copy is completed. 
-/// -/// \param to Destination matrix info. -/// \param to_pos Position of destination. -/// \param from Source matrix info. -/// \param from_pos Position of destination. -/// \param size Range of the submatrix to be copied. -/// \param q Queue to execute the copy task. -/// \returns An event representing the memcpy operation. -template -static inline sycl::event memcpy_async(pitched_data to, sycl::id<3> to_pos, - pitched_data from, sycl::id<3> from_pos, - sycl::range<3> size, - sycl::queue q = get_default_queue()) { - static_assert( - std::is_same_v, - "This syclcompat::memcpy overload only accepts a dummy template " - "parameter, T = void, which prevents SYCL kernel generation by default."); - auto events = detail::memcpy(q, to, to_pos, from, from_pos, size); - return detail::combine_events(events, q); -} - -namespace { -/// Synchronously sets \p pattern to the first \p count elements starting from -/// \p dev_ptr. The function will return after the fill operation is completed. -/// -/// \tparam T Datatype of the value to be set. -/// \param dev_ptr Pointer to the device memory address. -/// \param pattern Pattern of type \p T to be set. -/// \param count Number of elements to be set to the patten. -/// \param q The queue in which the operation is done. -/// \returns no return value. -template -static void inline fill(void *dev_ptr, const T &pattern, size_t count, - sycl::queue q = get_default_queue()) { - detail::fill(q, dev_ptr, pattern, count).wait(); -} -} // namespace - -/// Asynchronously sets \p pattern to the first \p count elements starting from -/// \p dev_ptr. -/// The return of the function does NOT guarantee the fill operation is -/// completed. -/// -/// \tparam T Datatype of the pattern to be set. -/// \param dev_ptr Pointer to the device memory address. -/// \param pattern Pattern of type \p T to be set. -/// \param count Number of elements to be set to the patten. -/// \param q The queue in which the operation is done. 
-/// \returns An event representing the fill operation. -template -static sycl::event inline fill_async(void *dev_ptr, const T &pattern, - size_t count, - sycl::queue q = get_default_queue()) { - return detail::fill(q, dev_ptr, pattern, count); -} - -namespace experimental { - -/// [UNSUPPORTED] Synchronously copies 2D/3D memory data specified by \p param . -/// The function will return after the copy is completed. -/// -/// \tparam T Dummy template parameter to delay SYCL kernel instantiation -/// \param param Memory copy parameters. -/// \param q Queue to execute the copy task. -/// \returns no return value. -template -static inline void memcpy(const memcpy_parameter ¶m, - sycl::queue q = get_default_queue()) { - static_assert( - std::is_same_v, - "This syclcompat::memcpy overload only accepts a dummy template " - "parameter, T = void, which prevents SYCL kernel generation by default."); - sycl::event::wait(syclcompat::experimental::detail::memcpy(q, param)); -} - -/// [UNSUPPORTED] Asynchronously copies 2D/3D memory data specified by \p param -/// . The return of the function does NOT guarantee the copy is completed. -/// -/// \param param Memory copy parameters. -/// \param q Queue to execute the copy task. -/// \returns no return value. -template -static inline void memcpy_async(const memcpy_parameter ¶m, - sycl::queue q = get_default_queue()) { - static_assert( - std::is_same_v, - "This syclcompat::memcpy overload only accepts a dummy template " - "parameter, T = void, which prevents SYCL kernel generation by default."); - syclcompat::experimental::detail::memcpy(q, param); -} -} // namespace experimental - -namespace { -/// Synchronously sets \p value to the first \p size bytes starting from \p -/// dev_ptr. The function will return after the memset operation is completed. -/// -/// \param dev_ptr Pointer to the device memory address. -/// \param value Value to be set. -/// \param size Number of bytes to be set to the value. 
-/// \param q The queue in which the operation is done. -/// \returns no return value. -static void memset(void *dev_ptr, int value, size_t size, - sycl::queue q = get_default_queue()) { - detail::memset(q, dev_ptr, value, size).wait(); -} -} // namespace - -/// \brief Sets 2 bytes data \p value to the first \p size elements starting -/// from \p dev_ptr in \p q synchronously. -/// \tparam T Dummy template parameter to delay SYCL kernel instantiation -/// \param [in] dev_ptr Pointer to the virtual device memory address. -/// \param [in] value The value to be set. -/// \param [in] size Number of elements to be set to the value. -/// \param [in] q The queue in which the operation is done. -template -static inline void memset_d16(void *dev_ptr, unsigned short value, size_t size, - sycl::queue q = get_default_queue()) { - static_assert( - std::is_same_v, - "syclcompat::memset_d16 only accepts a dummy template parameter, T = " - "void, which prevents SYCL kernel generation by default."); - detail::fill(q, dev_ptr, value, size).wait(); -} - -/// \brief Sets 4 bytes data \p value to the first \p size elements starting -/// from \p dev_ptr in \p q synchronously. -/// \tparam T Dummy template parameter to delay SYCL kernel instantiation -/// \param [in] dev_ptr Pointer to the virtual device memory address. -/// \param [in] value The value to be set. -/// \param [in] size Number of elements to be set to the value. -/// \param [in] q The queue in which the operation is done. -template -static inline void memset_d32(void *dev_ptr, unsigned int value, size_t size, - sycl::queue q = get_default_queue()) { - static_assert( - std::is_same_v, - "syclcompat::memset_d32 only accepts a dummy template parameter, T = " - "void, which prevents SYCL kernel generation by default."); - detail::fill(q, dev_ptr, value, size).wait(); -} - -/// \brief Sets 1 byte data \p value to the first \p size elements starting -/// from \p dev_ptr in \p q asynchronously. 
-/// \param dev_ptr Pointer to the device memory address. -/// \param value Value to be set. -/// \param size Number of bytes to be set to the value. -/// \returns An event representing the memset operation. -static inline sycl::event memset_async(void *dev_ptr, int value, size_t size, - sycl::queue q = get_default_queue()) { - return detail::memset(q, dev_ptr, value, size); -} - -/// \brief Sets 2 bytes data \p value to the first \p size elements starting -/// from \p dev_ptr in \p q asynchronously. -/// \tparam T Dummy template parameter to delay SYCL kernel instantiation -/// \param [in] dev_ptr Pointer to the virtual device memory address. -/// \param [in] value The value to be set. -/// \param [in] size Number of elements to be set to the value. -/// \param [in] q The queue in which the operation is done. -/// \returns An event representing the memset operation. -template -static inline sycl::event -memset_d16_async(void *dev_ptr, unsigned short value, size_t size, - sycl::queue q = get_default_queue()) { - static_assert( - std::is_same_v, - "syclcompat::memset_d16_async only accepts a dummy template parameter, T " - "= void, which prevents SYCL kernel generation by default."); - return detail::fill(q, dev_ptr, value, size); -} - -/// \brief Sets 4 bytes data \p value to the first \p size elements starting -/// from \p dev_ptr in \p q asynchronously. -/// \tparam T Dummy template parameter to delay SYCL kernel instantiation -/// \param [in] dev_ptr Pointer to the virtual device memory address. -/// \param [in] value The value to be set. -/// \param [in] size Number of elements to be set to the value. -/// \param [in] q The queue in which the operation is done. -/// \returns An event representing the memset operation. 
-template -static inline sycl::event -memset_d32_async(void *dev_ptr, unsigned int value, size_t size, - sycl::queue q = get_default_queue()) { - static_assert( - std::is_same_v, - "syclcompat::memset_d32_async only accepts a dummy template parameter, T " - "= void, which prevents SYCL kernel generation by default."); - return detail::fill(q, dev_ptr, value, size); -} - -namespace { -/// \brief Sets 1 byte data \p val to the pitched 2D memory region pointed by \p -/// ptr in \p q synchronously. -/// \tparam T Dummy template parameter to delay SYCL kernel instantiation -/// \param [in] ptr Pointer to the virtual device memory. -/// \param [in] pitch The pitch size by number of elements, including padding. -/// \param [in] val The value to be set. -/// \param [in] x The width of memory region by number of elements. -/// \param [in] y The height of memory region by number of elements. -/// \param [in] q The queue in which the operation is done. -template -static inline void memset(void *ptr, size_t pitch, int val, size_t x, size_t y, - sycl::queue q = get_default_queue()) { - static_assert( - std::is_same_v, - "This syclcompat::memset overload only accepts a dummy template " - "parameter, T = void, which prevents SYCL kernel generation by default."); - sycl::event::wait(detail::memset(q, ptr, pitch, val, x, y)); -} -} // namespace - -/// \brief Sets 2 bytes data \p val to the pitched 2D memory region pointed by -/// ptr in \p q synchronously. -/// \tparam T Dummy template parameter to delay SYCL kernel instantiation -/// \param [in] ptr Pointer to the virtual device memory. -/// \param [in] pitch The pitch size by number of elements, including padding. -/// \param [in] val The value to be set. -/// \param [in] x The width of memory region by number of elements. -/// \param [in] y The height of memory region by number of elements. -/// \param [in] q The queue in which the operation is done. 
-template -static inline void memset_d16(void *ptr, size_t pitch, unsigned short val, - size_t x, size_t y, - sycl::queue q = get_default_queue()) { - static_assert( - std::is_same_v, - "syclcompat::memset_d16 only accepts a dummy template parameter, T = " - "void, which prevents SYCL kernel generation by default."); - sycl::event::wait(detail::memset(q, ptr, pitch, val, x, y)); -} - -/// \brief Sets 4 bytes data \p val to the pitched 2D memory region pointed by -/// ptr in \p q synchronously. -/// \tparam T Dummy template parameter to delay SYCL kernel instantiation -/// \param [in] ptr Pointer to the virtual device memory. -/// \param [in] pitch The pitch size by number of elements, including padding. -/// \param [in] val The value to be set. -/// \param [in] x The width of memory region by number of elements. -/// \param [in] y The height of memory region by number of elements. -/// \param [in] q The queue in which the operation is done. -template -static inline void memset_d32(void *ptr, size_t pitch, unsigned int val, - size_t x, size_t y, - sycl::queue q = get_default_queue()) { - static_assert( - std::is_same_v, - "syclcompat::memset_d32 only accepts a dummy template parameter, T = " - "void, which prevents SYCL kernel generation by default."); - sycl::event::wait(detail::memset(q, ptr, pitch, val, x, y)); -} - -/// \brief Sets 1 byte data \p val to the pitched 2D memory region pointed by \p -/// ptr in \p q asynchronously. -/// \tparam T Dummy template parameter to delay SYCL kernel instantiation -/// \param [in] ptr Pointer to the virtual device memory. -/// \param [in] pitch The pitch size by number of elements, including padding. -/// \param [in] val The value to be set. -/// \param [in] x The width of memory region by number of elements. -/// \param [in] y The height of memory region by number of elements. -/// \param [in] q The queue in which the operation is done. -/// \returns An event representing the memset operation. 
-template -static inline sycl::event memset_async(void *ptr, size_t pitch, int val, - size_t x, size_t y, - sycl::queue q = get_default_queue()) { - static_assert( - std::is_same_v, - "syclcompat::memset_async only accepts a dummy template parameter, T = " - "void, which prevents SYCL kernel generation by default."); - auto events = detail::memset(q, ptr, pitch, val, x, y); - return detail::combine_events(events, q); -} - -/// \brief Sets 2 bytes data \p val to the pitched 2D memory region pointed by -/// \p ptr in \p q asynchronously. -/// \tparam T Dummy template parameter to delay SYCL kernel instantiation -/// \param [in] ptr Pointer to the virtual device memory. -/// \param [in] pitch The pitch size by number of elements, including padding. -/// \param [in] val The value to be set. -/// \param [in] x The width of memory region by number of elements. -/// \param [in] y The height of memory region by number of elements. -/// \param [in] q The queue in which the operation is done. -/// \returns An event representing the memset operation. -template -static inline sycl::event -memset_d16_async(void *ptr, size_t pitch, unsigned short val, size_t x, - size_t y, sycl::queue q = get_default_queue()) { - static_assert( - std::is_same_v, - "syclcompat::memset_d16_async only accepts a dummy template parameter, T " - "= void, which prevents SYCL kernel generation by default."); - auto events = detail::memset(q, ptr, pitch, val, x, y); - return detail::combine_events(events, q); -} - -/// \brief Sets 4 bytes data \p val to the pitched 2D memory region pointed by -/// \p ptr in \p q asynchronously. -/// \tparam T Dummy template parameter to delay SYCL kernel instantiation -/// \param [in] ptr Pointer to the virtual device memory. -/// \param [in] pitch The pitch size by number of elements, including padding. -/// \param [in] val The value to be set. -/// \param [in] x The width of memory region by number of elements. 
-/// \param [in] y The height of memory region by number of elements. -/// \param [in] q The queue in which the operation is done. -/// \returns An event representing the memset operation. -template -static inline sycl::event -memset_d32_async(void *ptr, size_t pitch, unsigned int val, size_t x, size_t y, - sycl::queue q = get_default_queue()) { - static_assert( - std::is_same_v, - "syclcompat::memset_d32_async only accepts a dummy template parameter, T " - "= void, which prevents SYCL kernel generation by default."); - auto events = detail::memset(q, ptr, pitch, val, x, y); - return detail::combine_events(events, q); -} - -namespace { -/// Sets \p value to the 3D memory region specified by \p pitch in \p q. \p size -/// specify the setted 3D memory size. The function will return after the -/// memset operation is completed. -/// -/// \tparam T Dummy template parameter to delay SYCL kernel instantiation -/// \param pitch Specify the 3D memory region. -/// \param value Value to be set. -/// \param size The setted 3D memory size. -/// \param q The queue in which the operation is done. -/// \returns no return value. -template -static inline void memset(pitched_data pitch, int val, sycl::range<3> size, - sycl::queue q = get_default_queue()) { - static_assert(std::is_same_v, - "syclcompat::memset only accepts a dummy template parameter, T " - "= void, which prevents SYCL kernel generation by default."); - sycl::event::wait(detail::memset(q, pitch, val, size)); -} -} // namespace - -/// Sets \p value to the 3D memory region specified by \p pitch in \p q. \p size -/// specify the setted 3D memory size. The return of the function does NOT -/// guarantee the memset operation is completed. -/// -/// \tparam T Dummy template parameter to delay SYCL kernel instantiation -/// \param pitch Specify the 3D memory region. -/// \param value Value to be set. -/// \param size The setted 3D memory size. -/// \param q The queue in which the operation is done. 
-/// \returns An event representing the memset operation. -template -static inline sycl::event memset_async(pitched_data pitch, int val, - sycl::range<3> size, - sycl::queue q = get_default_queue()) { - static_assert( - std::is_same_v, - "syclcompat::memset_async only accepts a dummy template parameter, T = " - "void, which prevents SYCL kernel generation by default."); - auto events = detail::memset(q, pitch, val, size); - return detail::combine_events(events, q); -} - -/// accessor used as device function parameter. -template class accessor; -template class accessor { -public: - using memory_t = detail::memory_traits; - using element_t = typename memory_t::element_t; - using pointer_t = typename memory_t::pointer_t; - using accessor_t = typename memory_t::template accessor_t<3>; - accessor(pointer_t data, const sycl::range<3> &in_range) - : _data(data), _range(in_range) {} - template - accessor(typename std::enable_if::type &acc) - : accessor(acc, acc.get_range()) {} - accessor(const accessor_t &acc, const sycl::range<3> &in_range) - : accessor( - acc.template get_multi_ptr().get(), - in_range) {} - accessor operator[](size_t index) const { - sycl::range<2> sub(_range.get(1), _range.get(2)); - return accessor(_data + index * sub.size(), sub); - } - - pointer_t get_ptr() const { return _data; } - -private: - pointer_t _data; - sycl::range<3> _range; -}; -template class accessor { -public: - using memory_t = detail::memory_traits; - using element_t = typename memory_t::element_t; - using pointer_t = typename memory_t::pointer_t; - using accessor_t = typename memory_t::template accessor_t<2>; - accessor(pointer_t data, const sycl::range<2> &in_range) - : _data(data), _range(in_range) {} - template - accessor(typename std::enable_if::type &acc) - : accessor(acc, acc.get_range()) {} - accessor(const accessor_t &acc, const sycl::range<2> &in_range) - : accessor( - acc.template get_multi_ptr().get(), - in_range) {} - - pointer_t operator[](size_t index) const { - return 
_data + _range.get(1) * index; - } - - pointer_t get_ptr() const { return _data; } - -private: - pointer_t _data; - sycl::range<2> _range; -}; - -/// Device variable with address space of shared or global. -// TODO(syclcompat-lib-reviewers): This doesn't yet support multi-device (ptr -// per device) -template class device_memory { -public: - using accessor_t = - typename detail::memory_traits::template accessor_t; - using value_t = typename detail::memory_traits::value_t; - using syclcompat_accessor_t = syclcompat::accessor; - - device_memory(sycl::queue q = get_default_queue()) - : device_memory(sycl::range(1), q) {} - - /// Constructor of 1-D array with initializer list - device_memory(const sycl::range &in_range, - std::initializer_list &&init_list, - sycl::queue q = get_default_queue()) - : device_memory(in_range, q) { - assert(init_list.size() <= in_range.size()); - _host_ptr = (value_t *)std::malloc(_size); - std::memset(_host_ptr, 0, _size); - std::memcpy(_host_ptr, init_list.begin(), init_list.size() * sizeof(T)); - } - - /// Constructor of 2-D array with initializer list - template - device_memory( - const typename std::enable_if>::type &in_range, - std::initializer_list> &&init_list, - sycl::queue q = get_default_queue()) - : device_memory(in_range, q) { - assert(init_list.size() <= in_range[0]); - _host_ptr = (value_t *)std::malloc(_size); - std::memset(_host_ptr, 0, _size); - auto tmp_data = _host_ptr; - for (auto sub_list : init_list) { - assert(sub_list.size() <= in_range[1]); - std::memcpy(tmp_data, sub_list.begin(), sub_list.size() * sizeof(T)); - tmp_data += in_range[1]; - } - } - - /// Constructor with range - device_memory(const sycl::range &range_in, - sycl::queue q = get_default_queue()) - : _size(range_in.size() * sizeof(T)), _range(range_in), _reference(false), - _host_ptr(nullptr), _device_ptr(nullptr), _q(q) { - static_assert((Memory == memory_region::global) || - (Memory == memory_region::constant) || - (Memory == 
memory_region::usm_shared), - "device memory region should be global, constant or shared"); - // Make sure that singleton class dev_mgr will destruct later than this. - detail::dev_mgr::instance(); -#ifdef SYCLCOMPAT_USM_LEVEL_NONE - detail::mem_mgr::instance(); -#endif - } - - /// Constructor with range - // enable_if_t SFINAE to avoid ambiguity with - // device_memory(Args... Arguments, sycl::queue q) - template > - device_memory(Args... Arguments) - : device_memory(sycl::range(Arguments...), - get_default_queue()) {} - - /// Constructor with range and queue - template - device_memory(Args... Arguments, sycl::queue q) - : device_memory(sycl::range(Arguments...), q) {} - - ~device_memory() { - if (_device_ptr && !_reference) - syclcompat::free(_device_ptr, _q); - if (_host_ptr) - std::free(_host_ptr); - } - - /// Allocate memory with the queue specified in the constuctor, and init - /// memory if has initial value - void init() { init(_q); } - /// Allocate memory with specified queue, and init memory if has initial - /// value. - void init(sycl::queue q) { - if (_device_ptr) - return; - if (!_size) - return; - allocate_device(q); - if (_host_ptr) - detail::memcpy(q, _device_ptr, _host_ptr, _size); - } - - /// The variable is assigned to a device pointer. - void assign(value_t *src, size_t size) { - this->~device_memory(); - new (this) device_memory(src, size, _q); - } - - // Get memory pointer of the memory object, a device USM pointer. - value_t *get_ptr() { return get_ptr(_q); } - - // Get memory pointer of the memory object, a device USM pointer. - value_t *get_ptr(sycl::queue q) { - init(q); - return _device_ptr; - } - - /// Get the device memory object size in bytes. 
- size_t get_size() { return _size; } - - template - typename std::enable_if::type &operator[](size_t index) { - init(); -#ifdef SYCLCOMPAT_USM_LEVEL_NONE - return syclcompat::get_buffer::type>( - _device_ptr) - .template get_access()[index]; -#else - return _device_ptr[index]; -#endif // SYCLCOMPAT_USM_LEVEL_NONE - } - -#ifdef SYCLCOMPAT_USM_LEVEL_NONE - /// Get sycl::accessor for the device memory object when usm is not used. - accessor_t get_access(sycl::handler &cgh) { - return get_buffer(_device_ptr) - .template reinterpret(_range) - .template get_access::mode, - detail::memory_traits::target>(cgh); - } -#else - /// Get compat_accessor with dimension info for the device memory object - /// when usm is used and dimension is greater than 1. - template - typename std::enable_if::type - get_access(sycl::handler &cgh) { - return syclcompat_accessor_t((T *)_device_ptr, _range); - } -#endif // SYCLCOMPAT_USM_LEVEL_NONE - -private: - device_memory(value_t *memory_ptr, size_t size, - sycl::queue q = get_default_queue()) - : _size(size), _range(size / sizeof(T)), _reference(true), - _device_ptr(memory_ptr), _q(q) {} - - void allocate_device(sycl::queue q) { -#ifndef SYCLCOMPAT_USM_LEVEL_NONE - if (Memory == memory_region::usm_shared) { - _device_ptr = (value_t *)sycl::malloc_shared(_size, q.get_device(), - q.get_context()); - return; - } -#ifdef SYCL_EXT_ONEAPI_USM_DEVICE_READ_ONLY - if (Memory == memory_region::constant) { - _device_ptr = (value_t *)sycl::malloc_device( - _size, q.get_device(), q.get_context(), - sycl::ext::oneapi::property::usm::device_read_only()); - return; - } -#endif -#endif - _device_ptr = (value_t *)detail::malloc(_size, q); - } - - size_t _size; - sycl::range _range; - bool _reference; - value_t *_host_ptr; - value_t *_device_ptr; - sycl::queue _q; -}; -template -class device_memory : public device_memory { -public: - using base = device_memory; - using value_t = typename base::value_t; - using accessor_t = - typename 
detail::memory_traits::template accessor_t<0>; - - /// Constructor with initial value. - device_memory(const value_t &val, sycl::queue q = get_default_queue()) - : base(sycl::range<1>(1), {val}, q) {} - - /// Default constructor - device_memory(sycl::queue q = get_default_queue()) : base(1, q) {} -#ifdef SYCLCOMPAT_USM_LEVEL_NONE - /// Get sycl::accessor for the device memory object when usm is not used. - accessor_t get_access(sycl::handler &cgh) { - auto buf = get_buffer(base::get_ptr()) - .template reinterpret(sycl::range<1>(1)); - return accessor_t(buf, cgh); - } -#endif // SYCLCOMPAT_USM_LEVEL_NONE -}; - -template -using global_memory = device_memory; -template -using constant_memory = device_memory; -template -using shared_memory = device_memory; - -class pointer_attributes { -public: - void init(const void *ptr, sycl::queue q = get_default_queue()) { -#ifdef SYCLCOMPAT_USM_LEVEL_NONE - throw std::runtime_error( - "[SYCLcompat] pointer_attributes: only works for USM pointer."); -#else - memory_type = sycl::get_pointer_type(ptr, q.get_context()); - device_pointer = (memory_type != sycl::usm::alloc::unknown) ? ptr : nullptr; - host_pointer = (memory_type != sycl::usm::alloc::unknown) && - (memory_type != sycl::usm::alloc::device) - ? 
ptr - : nullptr; - sycl::device device_obj = sycl::get_pointer_device(ptr, q.get_context()); - device_id = detail::dev_mgr::instance().get_device_id(device_obj); -#endif // SYCLCOMPAT_USM_LEVEL_NONE - } - - sycl::usm::alloc get_memory_type() { return memory_type; } - - const void *get_device_pointer() { return device_pointer; } - - const void *get_host_pointer() { return host_pointer; } - - bool is_memory_shared() { return memory_type == sycl::usm::alloc::shared; } - - unsigned int get_device_id() { return device_id; } - -private: - sycl::usm::alloc memory_type = sycl::usm::alloc::unknown; - const void *device_pointer = nullptr; - const void *host_pointer = nullptr; - unsigned int device_id = 0; -}; - -} // namespace syclcompat diff --git a/sycl/include/syclcompat/syclcompat.hpp b/sycl/include/syclcompat/syclcompat.hpp deleted file mode 100644 index 93a3eb81d0f15..0000000000000 --- a/sycl/include/syclcompat/syclcompat.hpp +++ /dev/null @@ -1,53 +0,0 @@ -/*************************************************************************** - * - * Copyright (C) Codeplay Software Ltd. - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM - * Exceptions. See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * SYCLcompat - * - * syclcompat.hpp - * - * Description: - * Main include internal header for SYCLcompat - **************************************************************************/ - -#pragma once - -// MSVC ignores [[deprecated]] attribute on namespace unless compiled with -// /W3 or above. 
-#ifdef _MSC_VER -#define __SYCLCOMPAT_STRINGIFY(x) #x -#define __SYCLCOMPAT_TOSTRING(x) __SYCLCOMPAT_STRINGIFY(x) - -#define __SYCLCOMPAT_WARNING(msg) \ - __pragma(message(__FILE__ \ - "(" __SYCLCOMPAT_TOSTRING(__LINE__) "): warning: " msg)) - -__SYCLCOMPAT_WARNING("syclcompat is deprecated and the deprecation warnings " - "are ignored unless compiled with /W3 or above.") - -#undef __SYCLCOMPAT_WARNING -#undef __SYCLCOMPAT_TOSTRING -#undef __SYCLCOMPAT_STRINGIFY -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include diff --git a/sycl/include/syclcompat/traits.hpp b/sycl/include/syclcompat/traits.hpp deleted file mode 100644 index 502fd979d0066..0000000000000 --- a/sycl/include/syclcompat/traits.hpp +++ /dev/null @@ -1,295 +0,0 @@ -/*************************************************************************** - * - * Copyright (C) Codeplay Software Ltd. - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM - * Exceptions. See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - * SYCL compatibility extension - * - * traits.hpp - * - * Description: - * Type traits for the SYCL compatibility extension - **************************************************************************/ - -#pragma once - -#include -#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS -#include -#endif -#include -#include -#include -#include -#include -#include -#include -#include - -namespace [[deprecated("syclcompat is deprecated")]] syclcompat { - -// Equivalent to C++20's std::type_identity (used to create non-deduced -// contexts) -template struct type_identity { - using type = T; -}; -template using type_identity_t = typename type_identity::type; - -// Defines the operand type for arithemtic operations on T. This is identity -// for all types except pointers, for which it is std::ptrdiff_t -template struct arith { - using type = std::conditional_t, std::ptrdiff_t, T>; -}; -template using arith_t = typename arith::type; - -// Traits to check device function signature matches args (with or without local -// mem) -template -struct device_fn_invocable : std::is_invocable {}; - -template -struct device_fn_lmem_invocable - : std::is_invocable {}; - -template -constexpr inline bool args_compatible = - std::conditional_t, - device_fn_invocable>::value; - -namespace detail { - -// Trait for identifying sycl::range and sycl::nd_range. 
-template struct is_range : std::false_type {}; -template struct is_range> : std::true_type {}; - -template constexpr bool is_range_v = is_range::value; - -template struct is_nd_range : std::false_type {}; -template struct is_nd_range> : std::true_type {}; - -template constexpr bool is_nd_range_v = is_nd_range::value; - -template -constexpr bool is_range_or_nd_range_v = - std::disjunction_v, is_nd_range>; - -// Trait range_to_item_t to convert nd_range -> nd_item, range -> item -template struct range_to_item_map; -template struct range_to_item_map> { - using ItemT = sycl::nd_item; -}; -template struct range_to_item_map> { - using ItemT = sycl::item; -}; - -template -using range_to_item_t = typename range_to_item_map::ItemT; - -} // namespace detail - -// Forward decls -namespace experimental { - -template struct kernel_properties; -template struct launch_properties; -struct local_mem_size; - -template -class launch_policy; -} // namespace experimental - -namespace experimental::detail { - -// Helper for tuple_template_index -template